nn.CrossEntropyLoss AttributeError while trying to develop a video-to-caption generator in PyTorch

I am getting:
AttributeError: 'CrossEntropyLoss' object has no attribute 'dim'
This is the code block that I run and that raises the error:
hidden_size = 256

encoder1 = VideoEncoderGRU(hidden_size)
encoder1 = accelerator.prepare(encoder1)

decoder1 = DecoderRNN(hidden_size, vcd.lang_object.n_words).to(device)
decoder1 = accelerator.prepare(decoder1)

encoder_hidden = encoder1.initHidden()
trainIters_modified(encoder1, decoder1, encoder_hidden)
More detailed code for each part follows.
trainIters_modified is just a function that runs the training loop a number of times:
def trainIters_modified(encoder, decoder, encoder_hidden, print_every=10, plot_every=10):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0   # reset every plot_every

    encoder_optimizer = optim.Adagrad(encoder.parameters())
    decoder_optimizer = optim.Adagrad(decoder.parameters())
    encoder_optimizer = accelerator.prepare(encoder_optimizer)
    decoder_optimizer = accelerator.prepare(decoder_optimizer)

    criterion = nn.CrossEntropyLoss()

    for ep in range(1):
        for vid, lab in train_loader:
            n_iters = vid.shape[0]
            for iter in range(1, n_iters + 1):
                input_tensor = vid[iter - 1]
                target_tensor = lab[iter - 1]
                loss, encoder_hidden = train(input_tensor, target_tensor, encoder, decoder,
                                             encoder_optimizer, decoder_optimizer, criterion, encoder_hidden)
                print_loss_total += loss
                plot_loss_total += loss

                if iter % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                                 iter, iter / n_iters * 100, print_loss_avg))

                if iter % plot_every == 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0
                #break
    showPlot(plot_losses)
train function
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, encoder_hidden, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder_hidden
    encoder_optimizer.zero_grad()  # sets encoder gradients to zero
    decoder_optimizer.zero_grad()  # sets decoder gradients to zero

    input_length = input_tensor.size(0)   # number of frames
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    for ei in range(input_length):
        in_tensor = torch.permute(input_tensor[ei], (2, 1, 0))
        # encoder_hidden has shape [1, 1, 256]
        encoder_output, encoder_hidden = encoder(in_tensor, encoder_hidden)
        # the outputs are stored in the encoder_outputs matrix
        encoder_outputs[ei] = encoder_output[0, 0]

    # after the encoder has consumed the whole clip
    decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS_token and EOS_token were defined above
    # use the encoder output as the decoder's hidden state
    decoder_hidden = encoder_output

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        # teacher forcing: feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        # without teacher forcing: use the model's own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    # loss.backward()
    accelerator.backward(loss)
    encoder_optimizer.step()
    decoder_optimizer.step()

    return (loss.item() / target_length), encoder_hidden
VideoEncoderGRU
class VideoEncoderGRU(nn.Module):
    def __init__(self, hidden_size):
        super(VideoEncoderGRU, self).__init__()
        self.vgg = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
        # drop VGG16's classifier head: the empty Sequential leaves only the conv features
        self.vgg.classifier = nn.Sequential(*list(self.vgg.classifier.children())[0:0])
        self.vegru_classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024)
        )
        self.gru = nn.GRU(1024, hidden_size)
        self.hidden_size = hidden_size

    def initHidden(self):
        return torch.rand(1, 1, self.hidden_size, device=device)

    def forward(self, input, hidden):
        out = self.vgg(input)
        out = torch.reshape(out, (-1,))
        out = self.vegru_classifier(out)
        out = out.view(1, 1, -1)
        out, hidden = self.gru(out, hidden)
        return out, hidden
DecoderRNN
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.rand(1, 1, self.hidden_size, device=device)
My dataset consists of videos stored in a folder and a CSV file containing the names of the videos along with their captions. I think I have loaded my data correctly: each video is converted into 8 frames, and each caption is converted to a tensor of max_length = 8 containing the index of each word in the caption.
You can look at my whole source code in this notebook file.
There might be a simple logical error; I only knew the theory behind this problem and coded it from that.
I tried to implement a GRU encoder that takes the frames of a video as input. Each frame is first passed through a VGG16 network, which encodes it; the encoded representation of each frame is fed to the GRU, and the final output of the encoder is passed as the initial hidden state to the decoder, along with SOS_token as the decoder's first input. I want this model to work and predict output.

Related

Multi-agent DQN: learning a single model for all agents

I'm trying to run a DQN for a multi-agent system, so there is one DNN for each agent.
It takes as input the state [batch, state size, #time steps, #nodes], where for simplicity we assume #time steps = 1 and #nodes is the number of agents, and it outputs the Q-values for each agent.
The problem is that when I test various things with this network, it returns inconsistent results. I suspect it has to do with me running the DQN separately for each agent but learning via the same model: I sum the losses of all agents into one loss and then divide it by their number.
I'm not sure this is correct. I'd be grateful for any help.
Here's my code:
class DQN(nn.Module):
    def __init__(self, args):  # node_size, inputs, outputs, layers=[128, 64, 16]
        # state_size, n_actions = inputs, outputs
        super(DQN, self).__init__()
        self.model_type = args.model_type
        if args.model_type == "seperate_state_DNN":
            out_size = args.num_of_actions
            self.shared_model = nn.Sequential()
            h_sizes = [args.input_state_size] + args.layers
            for k in range(len(h_sizes) - 1):
                self.shared_model.add_module('k1' + str(k), nn.Linear(h_sizes[k], h_sizes[k + 1]))
                self.shared_model.add_module('k2' + str(k), args.activations[args.layers_nl[k]])
            self.shared_model.add_module('final', nn.Linear(h_sizes[-1], out_size))

    def forward(self, input, i=None):
        # input state dimension: [batch, state size, #time steps, #nodes]
        if self.model_type == "seperate_state_DNN":
            if i is None:
                final_output = torch.zeros_like(input)
            else:
                final_output = self.shared_model(input)  # [:, :, :, i].unsqueeze(3))
            return final_output
And here is the calling function:
def select_action(self, state, edge_state):
    # self.policy_net.eval()
    sample = random.random()
    if self.configuration == 2:
        self.eps_threshold = 0.0  # no exploration at all, only optimal values!
    else:
        self.eps_threshold = self.decay_functionn()
    self.steps_done += 1
    if sample > self.eps_threshold:
        self.last_exploration = False
        with torch.no_grad():
            # t.max(1) will return the largest column value of each row.
            # The second column of the max result is the index of where the max
            # element was found, so we pick the action with the larger expected reward.
            state = state.to(self.device)  # torch.from_numpy(state).float().to(self.device)
            state = state.unsqueeze(0)  # add batch dimension (also to action below): [batch=1, #time steps, #nodes, state size]
            final_output = []
            x1 = self.policy_net(state, None)  # .detach()
            for i in range(self.node_size):
                final_output.append(self.policy_net(x1[:, :, -1, i] + state[:, :, -1, i], i)
                                    .max(1)[1].detach().cpu().view(state.shape[0], -1))
            # action dimension: [batch=1, #nodes]
            return torch.cat(final_output, dim=1)
    else:
        self.last_exploration = True
        return torch.randint(0, self.n_actions, (1, self.node_size))
And this is the main RL training loop:
for epi in range(self.episodes):
    print("### Starting Episode: ", epi, ' ### in index=', self.run_index)
    state = env.reset(self, heatup=self.sim_heatup)  # single-step state
    done = False
    while not done:
        action = agent.select_action(state)  # .to(device)
        next_state1, reward, done = env.do_step(action)
        agent.add_to_memory(state, action, next_state, reward)
        agent.optimize_model()
        state = next_state
    agent.curr_episode += 1
    # Plot and dump statistics and learning curves.
    agent.dump_data_on_episode_end(plot=True)
    env.capture_episode()
env.close()
Finally, this is the optimization, executed in "agent.optimize_model()" above, including the functions it uses:
def optimize_model(self):
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    # This converts a batch-array of Transitions to a Transition of batch-arrays.
    batch = Transition(*zip(*transitions))
    next_states_batch = torch.stack(batch.next_state).to(self.device)
    state_batch = torch.stack(batch.state).to(self.device)
    action_batch = torch.cat(batch.action).view(self.batch_size, -1).to(self.device)
    reward_batch = torch.cat(batch.reward).view(self.batch_size, -1).to(self.device)
    # dims: states=[batch, steps, nodes, state size]; action=[batch, nodes]; reward=[batch, nodes]
    loss = torch.tensor(0., device=self.device)
    self.policy_net.train()  # I'M NOT SURE IF IT SHOULD BE HERE...
    x1 = self.policy_net(state_batch, None)
    x2 = self.policy_net(next_states_batch, None)
    for i in range(self.node_size):
        action_batch1 = action_batch[:, i].unsqueeze(1).reshape(-1, 1)  # action=[batch x nodes, 1]
        reward_batch1 = reward_batch[:, i].unsqueeze(1).view(-1, 1)     # reward=[batch x nodes, 1]
        # Compute loss
        loss += self._compute_loss(i, x1[:, :, -1, i] + state_batch[:, :, -1, i], edge_state_batch, action_batch1,
                                   x2[:, :, -1, i] + next_states_batch[:, :, -1, i], next_edge_state_batch, reward_batch1)
    # Optimize the model
    loss.div_(self.node_size)
    self.optimizer.zero_grad()
    loss.backward()
    # clip gradients
    if self.grad_clip is not None:
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-self.grad_clip, self.grad_clip)
    # update policy net weights
    self.optimizer.step()
    # del loss
    self.losses.append(loss.detach().cpu().numpy())
    # update target net weights
    self._update_target()

def _compute_loss(self, i, state_batch, edge_state_batch, action_batch,
                  next_states_batch, next_edge_state_batch, reward_batch):
    # Q{policy net}(s, a): [batch x nodes, actions] --gather--> [batch x nodes, 1 = q-values under this policy]
    state_action_q_values = self.policy_net(state_batch, i).gather(1, action_batch)
    # argmax{a} Q{policy net}(s', a'): [batch x nodes, actions] --argmax--> [batch x nodes] --unsqueeze--> [batch x nodes, 1]
    next_state_actions = torch.argmax(self.policy_net(next_states_batch, i), dim=1).unsqueeze(1)
    # Q{target net}(s', argmax{a} Q{policy net}(s', a')): [batch x nodes, actions] --gather--> [batch x nodes, 1]
    next_state_q_values = self.target_net(next_states_batch, i).gather(1, next_state_actions)
    # Q* = discount * Q(s', argmax(..)) + R: [batch x nodes, 1]
    expected_state_action_values = (next_state_q_values.detach() * self.discount) + reward_batch
    loss = F.smooth_l1_loss(state_action_q_values, expected_state_action_values)
    return loss

def _update_target(self):
    if self.target_net is None:
        # There is nothing to update.
        return
    # Update the target network, copying all weights and biases of the DQN.
    if self.target_update > 1:
        # Hard copy of weights.
        if self.steps_done % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        return
    elif self.target_update < 1 and self.target_update > 0:
        # Polyak averaging:
        tau = self.target_update
        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(tau * param + (1 - tau) * target_param)
        return
    else:
        raise NotImplementedError
Sorry for the large question, I just wanted to supply all the necessary information.
If more information is needed I'd be happy to give it.
Any suggestion is much appreciated.
Thanks,
Shimon

How to define supervised contrastive loss for a semantic segmentation model?

I have found the code below, which defines a supervised contrastive loss for a classification task.
class SupConLoss(nn.Module):
    def __init__(self, temperature=0.07, contrast_mode='all',
                 base_temperature=0.07):
        super(SupConLoss, self).__init__()
        self.temperature = temperature
        self.contrast_mode = contrast_mode
        self.base_temperature = base_temperature

    def forward(self, features, labels=None, mask=None):
        """Args:
            features: hidden vector of shape [bsz, n_views, ...].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
        Returns:
            A loss scalar.
        """
        device = (torch.device('cuda')
                  if features.is_cuda
                  else torch.device('cpu'))

        if len(features.shape) < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...],'
                             'at least 3 dimensions are required')
        if len(features.shape) > 3:
            features = features.view(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError('Num of labels does not match num of features')
            mask = torch.eq(labels, labels.T).float().to(device)
        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            self.temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)
        # mask out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positives
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)

        # loss
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()
        return loss
My question is how I can use this loss for a semantic segmentation task at the pixel level, where the input of the model is of size (batch, channels, height, width) and the labels are masks of size (batch, height, width).

GPU memory increasing at each batch (PyTorch)

I am trying to build a convolutional network using a ConvLSTM layer (an LSTM cell with convolutions instead of matrix multiplications), but the problem is that my GPU memory increases at each batch, even though I delete variables and take the scalar value of the loss (not the graph) at each iteration. I may be doing something wrong, but the exact same script ran without issues with another model (with more parameters, also using a ConvLSTM layer).
Each batch is composed of num_batch x 3 grayscale images, and I'm trying to predict the difference |Im(t+1) - Im(t)| from the input Im(t).
def main():
    config = Config()
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size,
                                                   num_workers=0, shuffle=True, drop_last=True)
    nb_img = len(train_dataset)
    util.clear_progress_dir()
    step_tensorboard = 0

    ###################################
    #           Model Setup           #
    ###################################
    model = fully_convLSTM()
    if torch.cuda.is_available():
        model = model.float().cuda()
    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    util.enumerate_params([model])

    ###################################
    #          Training Loop          #
    ###################################
    model.train()  # put model in training mode
    train_loss_recon = []
    train_loss_recon2 = []

    for epoch in tqdm(range(config.num_epochs)):
        running_loss1 = 0.0
        running_loss2 = 0.0

        for i, (inputs, outputs) in enumerate(train_dataloader, 0):
            print(i)
            torch.cuda.empty_cache()
            gc.collect()

            # if torch.cuda.is_available():
            inputs = autograd.Variable(inputs.float()).cuda()
            outputs = autograd.Variable(outputs.float()).cuda()

            im1 = inputs[:, 0, :, :, :]
            im2 = inputs[:, 1, :, :, :]
            im3 = inputs[:, 2, :, :, :]

            diff1 = torch.abs(im2 - im1).cuda().float()
            diff2 = torch.abs(im3 - im2).cuda().float()

            model.initialize_hidden()

            optimizer.zero_grad()
            pred1 = model.forward(im1)
            loss = reconstruction_loss(diff1, pred1)
            loss.backward()
            # optimizer.step()
            model.update_hidden()

            optimizer.zero_grad()
            pred2 = model.forward(im2)
            loss2 = reconstruction_loss(diff2, pred2)
            loss2.backward()
            optimizer.step()
            model.update_hidden()

            ## print statistics
            running_loss1 += loss.detach().data
            running_loss2 += loss2.detach().data

            if i == 0:
                with torch.no_grad():
                    img_grid_diff_true = (diff2).cpu()
                    img_grid_diff_pred = (pred2).cpu()

                    f, axes = plt.subplots(2, 4, figsize=(48, 48))
                    for l in range(4):
                        axes[0, l].imshow(img_grid_diff_true[l].squeeze(0).squeeze(0), cmap='gray')
                        axes[1, l].imshow(img_grid_diff_pred[l].squeeze(0).squeeze(0), cmap='gray')
                    plt.show()
                    plt.close()

                    writer_recon_loss.add_scalar('Reconstruction loss', running_loss1, step_tensorboard)
                    writer_recon_loss2.add_scalar('Reconstruction loss2', running_loss2, step_tensorboard)
                    step_tensorboard += 1

            del pred1
            del pred2
            del im1
            del im2
            del im3
            del diff1
            del diff2  # , im1_noised, im2_noised
            del inputs
            del outputs
            del loss
            del loss2
            for obj in gc.get_objects():
                if torch.is_tensor(obj):
                    del obj
            torch.cuda.empty_cache()
            gc.collect()

        epoch_loss = running_loss1 / len(train_dataloader.dataset)
        epoch_loss2 = running_loss2 / len(train_dataloader.dataset)
        print(f"Epoch {epoch} loss reconstruction1: {epoch_loss:.6f}")
        print(f"Epoch {epoch} loss reconstruction2: {epoch_loss2:.6f}")

        train_loss_recon.append(epoch_loss)
        train_loss_recon2.append(epoch_loss2)

        del running_loss1, running_loss2, epoch_loss, epoch_loss2
Here is the model used:
class ConvLSTMCell(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size):
        super(ConvLSTMCell, self).__init__()
        # assert hidden_channels % 2 == 0
        self.input_channels = input_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        # self.num_features = 4
        self.padding = 1
        self.Wxi = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whi = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxf = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whf = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxc = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whc = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxo = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Who = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wci = None
        self.Wcf = None
        self.Wco = None

    def forward(self, x, h, c):
        # Equation (3) from "Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting"
        ci = torch.sigmoid(self.Wxi(x) + self.Whi(h) + c * self.Wci)
        cf = torch.sigmoid(self.Wxf(x) + self.Whf(h) + c * self.Wcf)
        cc = cf * c + ci * torch.tanh(self.Wxc(x) + self.Whc(h))  # gt = tanh(cc)
        co = torch.sigmoid(self.Wxo(x) + self.Who(h) + cc * self.Wco)  # output channels = hidden channels
        ch = co * torch.tanh(cc)
        return ch, cc  # short-term memory, long-term memory

    def init_hidden(self, batch_size, hidden, shape):
        if self.Wci is None:
            self.Wci = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wcf = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wco = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
        else:
            assert shape[0] == self.Wci.size()[2], 'Input Height Mismatched!'
            assert shape[1] == self.Wci.size()[3], 'Input Width Mismatched!'
        return (autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda(),
                autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda())
class fully_convLSTM(nn.Module):
    def __init__(self):
        super(fully_convLSTM, self).__init__()
        layers = []
        self.hidden_list = [1, 32, 32, 1]  # ,32,64,32,
        for k in range(len(self.hidden_list) - 1):  # define blocks of [ConvLSTM, BatchNorm, ReLU]
            name_conv = "self.convLSTM" + str(k)
            cell_conv = ConvLSTMCell(self.hidden_list[k], self.hidden_list[k + 1], 3)
            setattr(self, name_conv, cell_conv)
            name_batchnorm = "self.batchnorm" + str(k)
            batchnorm = nn.BatchNorm2d(self.hidden_list[k + 1])
            setattr(self, name_batchnorm, batchnorm)
            name_relu = " self.relu" + str(k)
            relu = nn.ReLU()
            setattr(self, name_relu, relu)
        self.sigmoid = nn.Sigmoid()
        self.internal_state = []

    def initialize_hidden(self):
        for k in range(len(self.hidden_list) - 1):
            name_conv = "self.convLSTM" + str(k)
            (h, c) = getattr(self, name_conv).init_hidden(config.batch_size, self.hidden_list[k + 1], (256, 256))
            self.internal_state.append((h, c))
        self.internal_state_new = []

    def update_hidden(self):
        for i, hidden in enumerate(self.internal_state_new):
            self.internal_state[i] = (hidden[0].detach(), hidden[1].detach())
        self.internal_state_new = []

    def forward(self, input):
        x = input
        for k in range(len(self.hidden_list) - 1):
            name_conv = "self.convLSTM" + str(k)
            name_batchnorm = "self.batchnorm" + str(k)
            name_relu = " self.relu" + str(k)
            x, c = getattr(self, name_conv)(x, self.internal_state[k][1], self.internal_state[k][0])
            self.internal_state_new.append((x.detach(), c.detach()))
            x = getattr(self, name_batchnorm)(x)
            if k != len(self.hidden_list) - 2:
                x = getattr(self, name_relu)(x)
            else:
                x = self.sigmoid(x)
        return x
So my question is, what in my code is causing memory to accumulate during the training phase?
A few quick notes about the training code:
torch.autograd.Variable has been deprecated for at least 8 minor versions (see here); don't use it.
gc.collect() is pointless; Python runs its garbage collector on its own.
Don't call torch.cuda.empty_cache() for each batch. PyTorch reserves some GPU memory without giving it back to the OS precisely so it doesn't have to re-allocate it for every batch; releasing it only makes your code slower. Honestly, don't use this function at all, PyTorch handles the caching itself (see the snippet after this list).
Don't spam random memory cleaning; that's most probably not where the error is.
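As a quick illustration of the caching behaviour (my own snippet, not from the original post), compare the memory PyTorch has handed out to tensors with the memory it keeps reserved:
import torch

x = torch.randn(1024, 1024, device="cuda")  # allocate a ~4 MB float32 tensor
print(torch.cuda.memory_allocated())         # bytes currently used by live tensors
print(torch.cuda.memory_reserved())          # bytes held by PyTorch's caching allocator

del x
print(torch.cuda.memory_allocated())         # drops back towards zero
print(torch.cuda.memory_reserved())          # stays up: the cache is kept for reuse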
Model
Yes, this is probably where the problem is (although the model's code is hard to read).
Take note of the self.internal_state list, and of the self.internal_state_new list as well.
Each time you call model.initialize_hidden(), a new set of tensors is appended to self.internal_state (and never cleaned up, as far as I can tell).
self.internal_state_new seems to be cleaned in update_hidden; maybe self.internal_state should be as well (a sketch of that fix follows below)?
In essence, check the self.internal_state property of your model: from what I can see, the list grows indefinitely. Initializing with zeros everywhere is also quite strange; there is probably no need to do that (e.g. PyTorch's RNNs are initialized with zeros by default, and this is probably similar).
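A minimal sketch of the fix being suggested (assuming the rest of fully_convLSTM stays as posted, with the same config.batch_size and (256, 256) shape): rebuild the list instead of appending to it on every call, so the old hidden states and the graphs they reference can be freed:
def initialize_hidden(self):
    # reset instead of append: the old (h, c) tuples are released here
    self.internal_state = []
    for k in range(len(self.hidden_list) - 1):
        name_conv = "self.convLSTM" + str(k)
        h, c = getattr(self, name_conv).init_hidden(
            config.batch_size, self.hidden_list[k + 1], (256, 256))
        self.internal_state.append((h, c))
    self.internal_state_new = []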

PyTorch NLP: sequence length of the target in a Transformer

I'm trying to understand the code of the Transformer at https://github.com/SamLynnEvans/Transformer.
Looking at the train_model function in the "train" script, I wonder why trg_input needs a different sequence length from trg:
trg_input = trg[:, :-1]
In this case, the sequence length of trg_input is seq_len(trg) - 1. It means that trg is like:
<sos> tok1 tok2 ... tokn <eos>
and trg_input is like:
<sos> tok1 tok2 ... tokn (no <eos> token)
Please let me know the reason.
Thank you.
The related code is like below:
for i, batch in enumerate(opt.train):
    src = batch.src.transpose(0, 1).to('cuda')
    trg = batch.trg.transpose(0, 1).to('cuda')
    trg_input = trg[:, :-1]
    src_mask, trg_mask = create_masks(src, trg_input, opt)
    preds = model(src, trg_input, src_mask, trg_mask)
    ys = trg[:, 1:].contiguous().view(-1)
    opt.optimizer.zero_grad()
    loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=opt.trg_pad)
    loss.backward()
    opt.optimizer.step()
def create_masks(src, trg, opt):
    src_mask = (src != opt.src_pad).unsqueeze(-2)
    if trg is not None:
        trg_mask = (trg != opt.trg_pad).unsqueeze(-2)
        size = trg.size(1)  # get seq_len for the matrix
        np_mask = nopeak_mask(size, opt)
        if trg.is_cuda:
            np_mask.cuda()
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None
    return src_mask, trg_mask
That's because the entire aim is to generate the next token based on the tokens we've seen so far. Take a look at the input to the model when we get our predictions: we're not just feeding the source sequence, but also the target sequence up to our current step. The model inside Models.py looks like:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output
So you can see that the forward method receives src and trg, which are fed into the encoder and decoder respectively. This is a bit easier to grasp if you look at the model architecture from the original paper: the "Outputs (shifted right)" input corresponds to trg[:, :-1] in the code.

Some parameters are not getting saved when saving a model in PyTorch

I have built an encoder-decoder model with attention for morphological inflection generation. I am able to train the model and predict on test data, but I get wrong predictions after loading a saved model.
I don't get any errors during saving or loading, but when I load a saved model its predictions are completely wrong. It looks like some parameters are not getting saved.
I have tried saving and loading the model using both techniques:
using state_dict(), e.g. torch.save(encoder.state_dict(), 'path')
saving the complete model, e.g. torch.save(encoder, 'path')
I have also tried saving the different classes one by one, and making a superclass that instantiates all those classes and saving just that superclass, but nothing seems to work.
Encoder class
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, encoder_hid_dem, decoder_hid_dem, bidirectional, dropout):
        super().__init__()
        self.encoder_hid_dem = encoder_hid_dem
        self.encoder_n_direction = 1
        self.bias = False
        self.dropout = dropout
        if bidirectional == True:
            self.encoder_n_direction = 2
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, padding_idx=0)
        self.GRU_layer = nn.GRU(input_size=embedding_size, hidden_size=encoder_hid_dem, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(encoder_hid_dem * self.encoder_n_direction, decoder_hid_dem)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_word):
        # input_word: [batch_size, src_sent_len]
        embed_out = self.embedding_layer(input_word)
        # embed_out: [batch_size, src_sent_len, embedding_dim]
        embed_out = F.relu(embed_out)
        embed_out = self.dropout(embed_out)
        self.batch = embed_out.size()[0]
        # hidden = self.init_hidden()
        GRU_out, hidden = self.GRU_layer(embed_out)
        # GRU_out: [batch_size, src_sent_len, n_direction * hid_dem]
        # hidden:  [n_layer * n_direction, batch_size, hid_dem]
        # The first hid_dim elements in the third axis are the hidden states of the
        # top-layer forward RNN; the last hid_dim elements are the hidden states of
        # the top-layer backward RNN.
        # hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        # hidden[-2, :, :] is the last hidden state of the forward RNN
        # hidden[-1, :, :] is the last hidden state of the backward RNN
        GRU_out = F.relu(GRU_out)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        # GRU_out: [batch_size, src_sent_len, encoder_hid_dim * n_direction]
        # hidden:  [batch_size, dec_hid_dim]
        return GRU_out, hidden

    def init_hidden(self):
        return Variable(torch.eye(1, self.encoder_hid_dem)).unsqueeze(1).repeat(2, self.batch, 1).to(self.device)
Attention class
class Attention(nn.Module):
    def __init__(self, encoder_hid_dem, decoder_hid_dem, bidirectional):
        super().__init__()
        self.enc_hid_dim = encoder_hid_dem
        self.dec_hid_dim = decoder_hid_dem
        self.encoder_n_direction = 1
        if bidirectional == True:
            self.encoder_n_direction = 2
        self.attn = nn.Linear((encoder_hid_dem * self.encoder_n_direction) + decoder_hid_dem, decoder_hid_dem)
        self.v = nn.Parameter(torch.rand(decoder_hid_dem))

    def forward(self, hidden, encoder_outputs):
        # hidden:          [batch_size, dec_hid_dim]
        # encoder_outputs: [batch_size, src_sent_len, enc_hid_dim * encoder_n_direction]
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # hidden: [batch_size, src_sent_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy: [batch_size, src_sent_len, dec_hid_dim]
        energy = energy.permute(0, 2, 1)
        # energy: [batch_size, dec_hid_dim, src_sent_len]
        # v: [dec_hid_dim]
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        # v: [batch_size, 1, dec_hid_dim]
        attention = torch.bmm(v, energy).squeeze(1)
        # attention: [batch_size, src_len]
        return F.softmax(attention, dim=1)
Decoder class
class Decoder(nn.Module):
    def __init__(self, decoder_hid_dem, encoder_hid_dem, vocab_size, embedding_dim, attention,
                 decoder_input_size, linear_input_size, bidirectional, dropout):
        super().__init__()
        self.encoder_hid_dem = encoder_hid_dem
        self.decoder_hid_dem = decoder_hid_dem
        self.attention = attention
        self.dropout = dropout
        self.output_dim = vocab_size
        self.decoder_n_direction = 1
        if bidirectional == True:
            self.decoder_n_direction = 2
        self.GRU_layer_out = nn.GRU(decoder_input_size, decoder_hid_dem)
        self.out_layer = nn.Linear(in_features=linear_input_size, out_features=vocab_size)
        self.dropout = nn.Dropout(dropout)
        # self.GRU_layer_out.bias = torch.nn.Parameter(torch.zeros(decoder_input_size))

    def forward(self, feature, hidden, actual_word, encoder_outputs):
        feature = feature.unsqueeze(1)
        # feature:         [batch_size, src_sent_len=1, feat_size=6]
        # hidden:          [batch_size, dec_hid_dim]
        # actual_word:     [batch_size, src_sent_len=1, embedding_dim]
        # encoder_outputs: [batch_size, src_sent_len, encoder_hid_dim * encoder_n_direction]
        a = self.attention(hidden, encoder_outputs)
        # a: [batch_size, src_sent_len]
        a = a.unsqueeze(1)
        # a: [batch_size, 1, src_len]
        weighted = torch.bmm(a, encoder_outputs)
        # weighted: [batch_size, 1, enc_hid_dim * encoder_n_direction]
        # if len(actual_word.size()) != 0:
        input_char = torch.cat((actual_word, feature, weighted), 2)
        # else:
        #     input_char = torch.cat((feature, weighted), 2)
        input_char = input_char.permute(1, 0, 2)
        # input_char: [1, batch_size, decoder_input_size]
        hidden = hidden.unsqueeze(0)
        # hidden: [1, batch_size, decoder_hid_dem]
        output, hidden = self.GRU_layer_out(input_char, hidden)
        # output: [sent_len=1, batch_size, decoder_n_direction * decoder_hid_dem]
        # hidden: [n_layer * n_direction, batch_size, hid_dem]
        output = F.leaky_relu(output)
        output = self.dropout(output)
        output = torch.cat((output.squeeze(0), weighted.squeeze(1), actual_word.squeeze(1)), dim=1)
        pre_out = self.out_layer(output)
        predicted_output = F.log_softmax(pre_out, dim=1)
        # predicted_output: [batch_size, vocab_size]
        return predicted_output, hidden.squeeze(0)

    def init_hidden(self, batch):
        return (Variable(torch.eye(1, self.decoder_hid_dem)).unsqueeze(1).repeat(1, batch, 1).to(self.device),
                Variable(torch.eye(1, self.decoder_hid_dem)).unsqueeze(1).repeat(1, batch, 1).to(self.device))
Seq2Seq class
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input_word, output_word, features_word, teaching_forcing_ratio, limit):
        input_word = input_word.to(self.device)
        output_word = output_word.to(self.device)
        features_word = features_word.to(self.device)
        batch_size = input_word.size()[0]
        if limit == 0:
            max_len = input_word.size()[1]
        else:
            max_len = limit
        vocabsize = self.decoder.output_dim

        actual_word = self.encoder.embedding_layer(
            torch.tensor(char_to_index['<sos>']).view(1, -1).to(self.device)).repeat(batch_size, 1, 1)
        encoder_outputs, hidden = self.encoder(input_word)
        features = features_word[:, :]
        predicted_word = torch.zeros(max_len, batch_size, vocabsize).to(self.device)

        for t in range(1, max_len):
            output, hidden = self.decoder(features, hidden, actual_word, encoder_outputs)
            predicted_word[t] = output
            topv, topi = output.topk(1)
            bs = topi.size()[0]
            temp2 = torch.zeros(0, 1, 300).to(self.device)
            for row in range(bs):
                index = topi[row][0].item()
                temp = self.encoder.embedding_layer(torch.tensor(index).view(1, -1).to(self.device))
                temp2 = torch.cat((temp2, temp))
            teacher_force = random.random() < teaching_forcing_ratio
            if teacher_force == 1:
                actual_word = self.encoder.embedding_layer(output_word[:, t]).unsqueeze(1)
            else:
                actual_word = temp2
        return predicted_word
And this code is used to save and load the model:
torch.save(model.state_dict(), 'model.pt')
model.load_state_dict(torch.load('model.pt'))
I want the model, when run with pre-trained weights, to predict correctly according to those weights.
Your provided code for saving/loading parameters is wrong. Loading and saving model parameters is pretty straightforward. In your case, it should be:
# loading
saved_params = torch.load(
    filename, map_location=lambda storage, loc: storage
)
s2s.load_state_dict(saved_params)

# saving
params = s2s.state_dict()
torch.save(params, filename)
[Update]
You need to make the Seq2Seq class a subclass of PyTorch's nn.Module, just like your encoder/decoder classes. Otherwise, you can't use the state_dict() method. You can think of the Seq2Seq class as a container that holds your whole network, even though it has no learnable weights of its own.
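A minimal sketch of that point (illustrative stand-ins, not the original model): once the encoder and decoder are assigned as attributes of an nn.Module subclass, their parameters are registered and appear in the container's state_dict(), so one save/load round-trip covers the whole network:
import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        # assigning nn.Module attributes registers their parameters
        self.encoder = encoder
        self.decoder = decoder

enc = nn.GRU(10, 16)    # stand-ins for the real encoder/decoder
dec = nn.Linear(16, 10)
s2s = Seq2Seq(enc, dec)

# keys like 'encoder.weight_ih_l0' and 'decoder.weight' are all present
print(list(s2s.state_dict().keys()))

torch.save(s2s.state_dict(), 'model.pt')
s2s.load_state_dict(torch.load('model.pt'))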
