Converting spinning up policy gradient to pytorch - python-3.x

I'm trying to learn deep reinforcement learning through OpenAI spinning up. To do this, I want to rewrite some of their code using pytorch instead of tensorflow.
Currently I'm trying to convert the code for basic policy gradient (link with explanations) and this is my code so far:
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax
from torch.distributions import Categorical
import torch.optim as optim
import numpy as np
import gym
from gym.spaces import Discrete, Box
class Policy(nn.Module):
def __init__(self, sizes, activation=nn.Tanh(), output_activation=None):
# Build a feedforward neural network.
super(Policy, self).__init__()
self.layers=nn.ModuleList([nn.Linear(sizes[i],sizes[i+1]) for i in
range(len(sizes)-1)])
self.activation=activation
self.output_activation=output_activation
self.returns=[] # for R(tau) weighting in policy gradient
self.rewards=[] # list for rewards accrued throughout ep
self.logits=[] # for measuring episode logits
def forward(self,x):
for layer in self.layers[:-1]:
x=self.activation(layer(x))
x=self.layers[-1](x)
if not self.output_activation==None:
x=self.output_activation(self.layers[-1](x))
return x
# make action selection op (outputs int actions, sampled from policy)
def select_action(logits):
return Categorical(logits=logits).sample()
# make loss function whose gradient, for the right data, is policy gradient
def loss(action_logits,tau_rets):
return torch.sum(torch.dot(log_softmax(action_logits),tau_rets))
def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2,
epochs=50, batch_size=5000, render=False):
# make environment, check spaces, get obs / act dims
env = gym.make(env_name)
assert isinstance(env.observation_space, Box), \
"This example only works for envs with continuous state spaces."
assert isinstance(env.action_space, Discrete), \
"This example only works for envs with discrete action spaces."
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
# make core of policy network
policy = Policy(sizes=[obs_dim]+hidden_sizes+[n_acts])
# make train op
train_op = optim.Adam(policy.parameters(), lr=lr)
# for training policy
def train_one_epoch():
# make some empty lists for logging.
batch_returns = [] # for measuring episode returns
batch_lens = [] # for measuring episode lengths
# reset episode-specific variables
obs = torch.from_numpy(env.reset()).type(torch.FloatTensor) # first obs comes from starting distribution
done = False # signal from environment that episode is over
num_obs=0 # to measure the number of observations
# render first episode of each epoch
finished_rendering_this_epoch = False
# collect experience by acting in the environment with current policy
while True:
# rendering
if (not finished_rendering_this_epoch) and render:
env.render()
# act in the environment
act_logit=policy.forward(obs)
act = select_action(act_logit)
tmp, reward, done, _ = env.step(act.numpy())
obs=torch.from_numpy(tmp).type(torch.FloatTensor)
num_obs+=1
# save logit, reward
policy.rewards.append(reward)
policy.logits.append(act_logit[act].item())
if done:
# if episode is over, record info about episode
ep_ret, ep_len = sum(policy.rewards), len(policy.rewards)
batch_returns.append(ep_ret)
batch_lens.append(ep_len)
# the weight for each logprob(a|s) is R(tau)
policy.returns+= [ep_ret] * ep_len
# reset episode-specific variables
tmp, done, policy.rewards = env.reset(), False, []
obs=torch.from_numpy(tmp).type(torch.FloatTensor)
# won't render again this epoch
finished_rendering_this_epoch = True
# end experience loop if we have enough of it
if num_obs > batch_size:
break
# take a single policy gradient update step
print (len(policy.returns),len(policy.rewards),len(policy.logits))
batch_loss = loss(torch.tensor(policy.logits),torch.tensor(policy.returns))
batch_loss.backward()
return batch_loss, batch_returns, batch_lens
# training loop
for i in range(epochs):
batch_loss, batch_rets, batch_lens = train_one_epoch()
print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
(i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
When I run train(), I get the following error:
RuntimeError Traceback (most recent call last)
<ipython-input-163-2da0ffaf5447> in <module>()
----> 1 train()
<ipython-input-162-560e772be08b> in train(env_name, hidden_sizes, lr, epochs,
batch_size, render)
114 # training loop
115 for i in range(epochs):
--> 116 batch_loss, batch_rets, batch_lens = train_one_epoch()
117 print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
118 (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
<ipython-input-162-560e772be08b> in train_one_epoch()
109 print (len(policy.returns),len(policy.rewards),len(policy.logits))
110 batch_loss = loss(torch.tensor(policy.logits),torch.tensor(policy.returns))
--> 111 batch_loss.backward()
112 return batch_loss, batch_returns, batch_lens
113
~\Anaconda3\lib\site-packages\torch\tensor.py in backward(self, gradient,
retain_graph, create_graph)
91 products. Defaults to ``False``.
92 """
---> 93 torch.autograd.backward(self, gradient, retain_graph, create_graph)
94
95 def register_hook(self, hook):
~\Anaconda3\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
88 Variable._execution_engine.run_backward(
89 tensors, grad_tensors, retain_graph, create_graph,
---> 90 allow_unreachable=True) # allow_unreachable flag
91
92
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I don't understand why this happens since my code is similar to other rl pytorch code such as this.

Related

PyTorch CNN doesn't update weights while training

I want to predict a 8x8 matrix with the original 8x8 matrix. But the weights DO NOT update in the training process.
I use two simple conv layers to conv input matrix from 1x8x8 to 2x8x8. Then I used another conv layer to convert 2x8x8 to 1x8x8. The inputs and outputs in the data folder are generated randomly. The pytorch codes are shown as follows.
I have already checked some posts about weights not update issues. I think there must be some wrong with "requires_grad = True" of data or loss.backward().
Any suggestions about the codes would be grateful. Thanks in advance.
M
Tue Sep 6 15:34:17 CST 2022
The data input folder is in
data/CM10_1/CM_1.txt
data/CM10_1/CM_2.txt
data/CM10_1/CM_3.txt
data/CM10_1/CM_4.txt
The data output folder is in
data/CM10_2/CM_1.txt
data/CM10_2/CM_2.txt
data/CM10_2/CM_3.txt
data/CM10_2/CM_4.txt
CM_i.txt is shown as
207 244 107 173 70 111 180 244
230 246 233 193 11 97 192 86
32 40 202 189 24 195 70 149
232 247 244 100 209 202 173 57
161 244 167 167 177 47 167 191
24 123 9 43 80 124 41 65
71 204 216 180 242 113 30 129
139 36 238 8 8 164 127 178
data/CM_info_tr.csv
CMname,
CM_1.txt,
CM_2.txt,
CM_3.txt,
CM_4.txt,
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# outline###############################################################
#
# CM10_1/CM_i.txt to predict CM10_2/CM_i.txt
#
# data pair example
# CM10_1/CM_1.txt -> CM10_2/CM_1.txt
#
# CM10_1/CM_1.txt is 8x8 matrix with random int
# CM10_2/CM_1.txt is 8x8 matrix with random int
#
# The model uses two conv layers
# layer 01 : 1x8x8 -> 2x8x8
# layer 02 : 2x8x8 -> 1x8x8
#
# The loss is the difference between
# CM10_2/CM_1.txt(predicted) and CM10_2/CM_1.txt
#
# main ###############################################################
from __future__ import print_function, division
import os
import sys
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.autograd import Variable
torch.cuda.empty_cache()
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
# test CM parameters
n_Ca = 8
batch_size = 4
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
# define class dataset CMDataset ###################################################
class CMDataset(Dataset):
"""CM dataset"""
def __init__(self,csv_CM,CM_beg_dir,CM_end_dir,n_Ca=n_Ca):
"""
Args:
csv_CM (string): Path to the csv file with CM class.
CM_beg_dir (string): Directory with all the CM begin data.
CM_end_dir (string): Directory with all the CM end data.
"""
self.CM_info = pd.read_csv(csv_CM)
self.CM_beg_dir = CM_beg_dir
self.CM_end_dir = CM_end_dir
def __len__(self):
return len(self.CM_info)# the number of the samples
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
#load and convert CM begin data ---------------------------------------
CM_beg_path = os.path.join(self.CM_beg_dir, self.CM_info.iloc[idx, 0])
CM_beg_data = np.loadtxt(CM_beg_path)
CM_beg_data = CM_beg_data.reshape(1,n_Ca,n_Ca)
CM_beg_data = CM_beg_data.astype(np.float32)
CM_beg_data = torch.from_numpy(CM_beg_data)
CM_beg_data = CM_beg_data.to(device)
#load and convert CM endin data ---------------------------------------
CM_end_path = os.path.join(self.CM_end_dir, self.CM_info.iloc[idx, 0])
CM_end_data = np.loadtxt(CM_end_path)
CM_end_data = CM_end_data.reshape(1,n_Ca,n_Ca)
CM_end_data = CM_end_data.astype(np.float32)
CM_end_data = torch.from_numpy(CM_end_data)
CM_end_data = CM_end_data.to(device)
return CM_beg_data, CM_end_data
# define class model CMNet ###################################################
class CMNet(nn.Module):
def __init__(self):
super(CMNet, self).__init__()
self.lay_CM_01 = nn.Conv2d(in_channels=1,out_channels=2,kernel_size=1,stride=1,bias=True)
self.lay_CM_02 = nn.Conv2d(in_channels=2,out_channels=1,kernel_size=1,stride=1,bias=True)
def forward(self, CM_data):
[n_in_batch,n_in_chan,n_in_hei,n_in_wid]=CM_data.shape
n_Ca = n_in_hei
out1_1 = self.lay_CM_01(CM_data)
out1_2 = out1_1
out1_3 = self.lay_CM_02(out1_2)
out = out1_3
return out
# load data for training and validing
CM_dataset_train = CMDataset(csv_CM = 'data/CM_info_tr.csv',
CM_beg_dir = 'data/CM10_1/',
CM_end_dir = 'data/CM10_2/',
n_Ca = n_Ca)
train_dataloader = DataLoader(CM_dataset_train,
batch_size=batch_size,
shuffle=True)
# training parameter
learning_rate = 2
epochs = 5
model = CMNet()
model = model.to(device)
# Initialize the loss function
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# define train loop ###############################################################
def train_loop(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X1,Y) in enumerate(dataloader):
X1=X1.to(torch.float32)
Y = Y.to(torch.float32)
# Compute prediction and loss
X1=torch.autograd.Variable(X1)
pred = model(X1)
pred = torch.autograd.Variable(pred)
# compute loss
loss = loss_fn(pred,Y)
loss = Variable(loss, requires_grad = True)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
loss, current = loss.item(), batch * len(X1)
print(f" loss:{loss:>15f}, [{current:>5d}/{size:>5d}]")
# Train ###############################################################
for t in range(epochs):
print(f"Epoch {t+1}\n----------------------------------------------")
# print(list(model.parameters()))
train_loop(train_dataloader, model, loss_fn, optimizer)
#print("Train and Valid Done!")
What pytorch version are you using? Variable is depracated for 5 years now. Remove the lines loss = Variable(loss, requires_grad = True) and pred = torch.autograd.Variable(pred), that should do the trick. Try and read the current documentation and don't rely on archaic tutorials.

"RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn " error BertFoeSequenceClassification

I am trying to build Bert model for Arabic Text classification task using pretrained model from https://github.com/alisafaya/Arabic-BERT
i want to know the exact difference between the two statement:
model_name = 'kuisailab/albert-large-arabic'
model = AutoModel.from_pretrained(model_name)
model = BertForSequenceClassification .from_pretrained(model_name)
I fine-tuned the model by adding the following layers on top of the model:
for param in model.parameters():
param.requires_grad = False
model.classifier = nn.Sequential(
nn.Dropout(0.5),
nn.ReLU(),
nn.Linear(768,512),
nn.Linear(512,2),
nn.LogSoftmax(dim=1),
nn.Softmax(dim=1)
)
model = model.to(device)
and used the optimizer:
optimizer = AdamW(model.parameters(),
lr = 2e-5)
finally this is my training loop:
model.train()
for idx, row in train_data.iterrows():
text_parts = preprocess_text(str(row['sentence']))
label = torch.tensor([row['label']]).long().to(device)
optimizer.zero_grad()
overall_output = torch.zeros((1, 2)).float().to(device)
for part in text_parts:
if len(part) > 0:
try:
input = part.reshape(-1)[:512].reshape(1, -1)
# print(input.shape)
overall_output += model(input, labels=label)[1].float().to(device)
except Exception as e:
print(str(e))
# overall_output /= len(text_parts)
overall_output = F.softmax(overall_output[0], dim=-1)
if label == 0:
label = torch.tensor([1.0, 0.0]).float().to(device)
elif label == 1:
label = torch.tensor([0.0, 1.0]).float().to(device)
# print(overall_output, label)
loss = criterion(overall_output, label)
total_loss += loss.item()
loss.backward()
optimizer.step()
and i get the error:
mat1 dim 1 must match mat2 dim 0
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-33-5c2f0fea6c1f> in <module>()
39 total_loss += loss.item()
40
---> 41 loss.backward()
42 optimizer.step()
43
1 frames
/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
130 Variable._execution_engine.run_backward(
131 tensors, grad_tensors_, retain_graph, create_graph,
--> 132 allow_unreachable=True) # allow_unreachable flag
133
134
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
any idea how to solve this error
BertForSequenceClassification is the class which extends the BertModel, i.e, BertForSequenceClassification defines a logistic regression layer, for the task of classificiation, with cross-entropy loss, to be jointly fine-tuned or trained on the existing Bert Model.
AutoModel, is a class provided in the library that allows to automatically identify the model class based on it's name or the model file contents.
Since you already know that you need a model for classification, you can directly use BertForSequenceClassification

Removing last 2 layers from a BERT classifier results in " 'tuple' object has no attribute 'dim' " error. Why?

I fine tuned a huggingface transformer using Keras (with ktrain) and then reloaded the model in Pytorch.
I want to access the third to last layer (pre_classifier), so I removed the two last layers:
BERT2 = torch.nn.Sequential(*(list(BERT.children())[:-2]))
Running an encoded sentence through this yields the following error message:
AttributeError Traceback (most recent call last)
<ipython-input-38-640702475573> in <module>
----> 1 ans2=BERT2(torch.tensor([e1]))
2 print (ans2)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\container.py in forward(self, input)
90 def forward(self, input):
91 for module in self._modules.values():
---> 92 input = module(input)
93 return input
94
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\linear.py in forward(self, input)
85
86 def forward(self, input):
---> 87 return F.linear(input, self.weight, self.bias)
88
89 def extra_repr(self):
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\functional.py in linear(input, weight, bias)
1366 - Output: :math:`(N, *, out\_features)`
1367 """
-> 1368 if input.dim() == 2 and bias is not None:
1369 # fused op is marginally faster
1370 ret = torch.addmm(bias, input, weight.t())
AttributeError: 'tuple' object has no attribute 'dim'
Meanwhile deleting the classifier entirely (all three layers)
BERT3 = torch.nn.Sequential(*(list(BERT.children())[:-3]))
Yields the expected tensor (within a size 1 tuple) with the expected shape ([sentence_num,token_num,768]).
Why does the removal of two (but not three) layers breaks the model?
And how can I access the pre_classifier results?
It is not accessible by setting config with output_hidden_states=True as this flag returns the hidden values of the BERT transformer stack, not those of the classifier layers downstream to it.
--
PS
The code used to initialize the BERT model:
def collect_data_for_FT():
from sklearn.datasets import fetch_20newsgroups
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))
x_train = train_data.data
y_train = train_data.target
x_test = test_data.data
y_test = test_data.target
return(x_train,y_train,x_test,y_test)
bert_name = 'distilbert-base-uncased'
from transformers import DistilBertForSequenceClassification,AutoConfig,AutoTokenizer
import os
dir_path = os.getcwd()
dir_path=os.path.join(dir_path,'models')
config = AutoConfig.from_pretrained(bert_name,num_labels=20) # change model configuration to access hidden values.
try:
BERT = DistilBertForSequenceClassification.from_pretrained(dir_path,config=config)
print ("Finetuned predictor loaded")
except:
import tensorflow.keras as keras
print ("No finetuned predictor found.\nTraining.")
(x_train,y_train,x_test,y_test)=collect_data_for_FT()
####
# prework:
import ktrain
from ktrain import text
t = text.Transformer(bert_name, maxlen=500, classes=train_b.target_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
pre_trained_model = t.get_classifier()
learner = ktrain.get_learner(pre_trained_model, train_data=trn, val_data=val, batch_size=6)
####
####
# Find best learning rate
learner.lr_find()
learner.lr_plot()
####
learner.fit_onecycle(2e-4, 4) # choosen based on the learning rate/loss plot.
####
# prepare and save:
predictor = ktrain.get_predictor(learner.model, preproc=t)
predictor.save('my_distilbertbase_predictor')
predictor.model.save_pretrained(dir_path)
####
BERT = DistilBertForSequenceClassification.from_pretrained(os.path.join(dir_path), from_tf=True,config=config) # re-load tensorflow to pytorch
BERT.save_pretrained(dir_path) # save as a "full blooded" pytorch model
BERT = DistilBertForSequenceClassification.from_pretrained(dir_path,config=config) # re-load
from tensorflow.keras import backend as K
K.clear_session() # loading from tensorflow takes up space and the GPU. This releases it/

Gradient disappearing after first epoch in manual linear regression

I'm new to Pytorch and I've been working through the tutorials and playing around with toy examples. I wanted to just make a super simple model to get a better handle on autograd, but I'm running into issues.
I'm trying to train a linear regression model but I keep running into the following error,
----------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-80-ba5ca34a3a54> in <module>()
9 loss = torch.dot(delta, delta)
10
---> 11 loss.backward()
12 with torch.no_grad():
13 w, b = w - learning_rate*w.grad.data, b - learning_rate*b.grad.data
/usr/local/lib/python3.6/dist-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
91 products. Defaults to ``False``.
92 """
---> 93 torch.autograd.backward(self, gradient, retain_graph, create_graph)
94
95 def register_hook(self, hook):
/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
87 Variable._execution_engine.run_backward(
88 tensors, grad_tensors, retain_graph, create_graph,
---> 89 allow_unreachable=True) # allow_unreachable flag
90
91
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
And for reference, the code is here,
# dataset for training
X = torch.randn(100, 3)
y = -3*X[:,0] + 2.2*X[:,1] + 0.002*X[:,2] + 1
w = torch.randn(3, requires_grad=True, dtype=torch.float) # model weights
b = torch.randn(1, requires_grad=True, dtype=torch.float) # model bias
num_epochs = 10
learning_rate = 1e-4
for i in range(num_epochs):
y_hat = torch.mv(X, w) + b
delta = y_hat - y
loss = torch.dot(delta, delta)
loss.backward()
with torch.no_grad():
w, b = w - learning_rate*w.grad, b - learning_rate*b.grad
The issue seems to be that after the first epoch the gradient attribute is set to None, but I'm a little confused why this would be the case.
If I try to zero the gradient after updating the weights, then I get a similar error.
The answer lies in locally disabling gradient computation. As you can see in the first example, computations carried out with the torch.no_grad() context manager result in tensors for which requires_grad == False. Since you create "fresh" w and b instead of updating them in place, these tensors lose the requires_grad property after the first iteration and you get the error on 2nd iteration. A simple fix is to reenable gradients
with torch.no_grad():
w, b = w - learning_rate*w.grad, b - learning_rate*b.grad
w.requires_grad_(True)
b.requires_grad_(True)
If you look up the source of optimizers in pytorch optim module, such as SGD, you will see that they use the in-place operators such as add_. You can rewrite your loop in this manner
with torch.no_grad():
w.sub_(learning_rate*w.grad)
b.sub_(learning_rate*b.grad)
which will not touch the requires_grad flag, since the tensors keep their "identity" - just change values. In this case, you will need to remember to call w.zero_grad() and b.zero_grad() in each iteration or the gradient values will keep additively growing.

PyTorch RuntimeError : Gradients are not CUDA tensors

I am getting the following error while doing seq to seq on characters and feeding to LSTM, and decoding to words using attention. The forward propagation is fine but while computing loss.backward() I am getting the following error.
RuntimeError: Gradients aren't CUDA tensors
My train() function is as followed.
def train(input_batch, input_batch_length, target_batch, target_batch_length, batch_size):
# Zero gradients of both optimizers
encoderchar_optimizer.zero_grad()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
encoder_input = Variable(torch.FloatTensor(len(input_batch), batch_size, 500))
for ix , w in enumerate(input_batch):
w = w.contiguous().view(15, batch_size)
reshaped_input_length = [x[ix] for x in input_batch_length] # [15 ,.. 30 times] * 128
if USE_CUDA:
w = w.cuda()
#reshaped_input_length = Variable(torch.LongTensor(reshaped_input_length)).cuda()
hidden_all , output = encoderchar(w, reshaped_input_length)
encoder_input[ix] = output.transpose(0,1).contiguous().view(batch_size, -1)
if USE_CUDA:
encoder_input = encoder_input.cuda()
temporary_target_batch_length = [15] * batch_size
encoder_hidden_all, encoder_output = encoder(encoder_input, target_batch_length)
decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
decoder_hidden = encoder_output
max_target_length = max(temporary_target_batch_length)
all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))
# Move new Variables to CUDA
if USE_CUDA:
decoder_input = decoder_input.cuda()
all_decoder_outputs = all_decoder_outputs.cuda()
target_batch = target_batch.cuda()
# Run through decoder one time step at a time
for t in range(max_target_length):
decoder_output, decoder_hidden, decoder_attn = decoder(
decoder_input, decoder_hidden, encoder_hidden_all
)
all_decoder_outputs[t] = decoder_output
decoder_input = target_batch[t] # Next input is current target
if USE_CUDA:
decoder_input = decoder_input.cuda()
# Loss calculation and backpropagation
loss = masked_cross_entropy(
all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
target_batch.transpose(0, 1).contiguous(), # -> batch x seq
target_batch_length
)
loss.backward()
# Clip gradient norms
ecc = torch.nn.utils.clip_grad_norm(encoderchar.parameters(), clip)
ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
# Update parameters with optimizers
encoderchar_optimizer.step()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.data[0], ec, dc
Full Stack Trace is here.
RuntimeError Traceback (most recent call last)
<ipython-input-10-9778e12ded02> in <module>()
11 data_target_batch_index= Variable(torch.LongTensor(data_target_batch_index)).transpose(0,1)
12 # Send the data for training
---> 13 loss, ar1, ar2 = train(data_input_batch_index, data_input_batch_length, data_target_batch_index, data_target_batch_length, batch_size)
14
15 # Keep track of loss
<ipython-input-8-9c71c385f8cd> in train(input_batch, input_batch_length, target_batch, target_batch_length, batch_size)
54 target_batch_length
55 )
---> 56 loss.backward()
57
58 # Clip gradient norms
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_variables)
144 'or with gradient w.r.t. the variable')
145 gradient = self.data.new().resize_as_(self.data).fill_(1)
--> 146 self._execution_engine.run_backward((self,), (gradient,), retain_variables)
147
148 def register_hook(self, hook):
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/function.py in _do_backward(self, gradients, retain_variables)
207 def _do_backward(self, gradients, retain_variables):
208 self.retain_variables = retain_variables
--> 209 result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
210 if not retain_variables:
211 del self._nested_output
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/function.py in backward(self, *gradients)
215 def backward(self, *gradients):
216 nested_gradients = _unflatten(gradients, self._nested_output)
--> 217 result = self.backward_extended(*nested_gradients)
218 return tuple(_iter_None_tensors(result))
219
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in backward_extended(self, grad_output, grad_hy)
314 grad_hy,
315 grad_input,
--> 316 grad_hx)
317
318 if any(self.needs_input_grad[1:]):
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx)
371 hidden_size, dcy.size()))
372 if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
--> 373 raise RuntimeError('Gradients aren\'t CUDA tensors')
374
375 check_error(cudnn.lib.cudnnRNNBackwardData(
RuntimeError: Gradients aren't CUDA tensors
any suggestions about why I am doing wrong?
Make sure that all the objects that inherit nn.Module also call their .cuda(). Make sure to call before you pass any tensor to them. (essentially before training)
For example, (and I am guessing your encoder and decoder are such objects), do this right before you call train().
encoder = encoder.cuda()
decoder = decoder.cuda()
This ensures that all of the model's parameters are initialized in cuda memory.
Edit
In general, whenever you have this kind of error,
RuntimeError: Gradients aren't CUDA tensors
somewhere, (from your model creation, to defining inputs, to finally supplying the outputs to the loss function) you missed specifying a Variable object to be in GPU memory. You will have go through every step in your model, verifying all Variable objects to be in GPU memory.
Additionally, you dont have to call .cuda() on the outputs. Given that the inputs are in gpu's memory, all operations also takes place in gpu's memory, and so are your outputs.

Resources