pytorch cpu & cuda issue using LBFGS optimiser - pytorch

I am getting errors trying to change a youtube tutorial into a CUDA version. I moved both data and model to 'cuda' but I get an error that I do not understand.
Data:
x = np.empty((100, 1000), 'int64')
x[:] = np.array(range(1000)) + np.random.randint(-80, 80, 100).reshape(100, 1)
data = np.sin(x / 1.0 / 20).astype('float64')
input = (torch.from_numpy(data[3:, :-1])).to('cuda')
target = (torch.from_numpy(data[3:, 1:])).to('cuda')
test_input = (torch.from_numpy(data[:3, :-1])).to('cuda')
test_target = (torch.from_numpy(data[:3, 1:])).to('cuda')
Model:
class Sequence(nn.Module):
def __init__(self):
super(Sequence, self).__init__()
self.lstm1 = nn.LSTMCell(1, 51)
self.lstm2 = nn.LSTMCell(51, 51)
self.linear = nn.Linear(51, 1)
def forward(self, input, future = 0):
outputs = []
h_t = torch.zeros(input.size(0), 51, dtype=torch.double)
c_t = torch.zeros(input.size(0), 51, dtype=torch.double)
h_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)
c_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)
for input_t in input.split(1, dim=1):
h_t, c_t = self.lstm1(input_t, (h_t, c_t))
h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
output = self.linear(h_t2)
outputs += [output]
for i in range(future):# if we should predict the future
h_t, c_t = self.lstm1(output, (h_t, c_t))
h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
output = self.linear(h_t2)
outputs += [output]
outputs = torch.cat(outputs, dim=1)
return outputs
Initializing the model and training:
seq = Sequence()
seq.double()
seq.to('cuda')
criterion = nn.MSELoss()
optimizer = optim.LBFGS(seq.parameters(), lr=0.8)
#begin to train
for i in range(10):
print('STEP: ', i)
def closure():
optimizer.zero_grad()
out = seq(input)
loss = criterion(out, target)
print(out.is_cuda,target.is_cuda)
print('loss:', loss.item().to('cpu').numpy())
loss.backward()
return loss
optimizer.step(closure)
# begin to predict, no need to track gradient here
with torch.no_grad():
future = 1000
pred = seq(test_input, future=future)
loss = criterion(pred[:, :-future], test_target)
print('test loss:', loss.item())
y = pred.detach().numpy()
The error I get is:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_mm).
the program works fine in cpu and I have tried moving and removing things to 'cuda' to see if it works to no success.

Related

Exploding gradient Problem with LSTM model build with LSTMCell (PyTorch Implementation)

I'm tried to solve timeseries prediction. Where my input is multivariate. My input has 4 variable, and my target is another variable.
I've processed the data as following.
4 variables and 60 timesteps input sequence.
So, each input shape is (1, 240).
I'll try to predict the next n-steps future output. During training, it will be 60 steps.
So, the target shape is (1,60)
Here is my LSTMPredictor class.
class LSTMPredictor(nn.Module):
def __init__(self,n_feature, n_hidden=51):
super(LSTMPredictor, self).__init__()
self.n_hidden = n_hidden
# lstm1, lstm2, linear
self.lstm1 = nn.LSTMCell(n_feature, self.n_hidden)
self.lstm2 = nn.LSTMCell(self.n_hidden, self.n_hidden)
self.lstm3 = nn.LSTMCell(self.n_hidden, self.n_hidden)
self.linear = nn.Linear(self.n_hidden, 1)
def forward(self, x, future=0):
outputs = []
# lstm1
h_t = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
c_t = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
# lstm2
h_t2 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
c_t2 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
# lstm3
h_t3 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
c_t3 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
h_t, c_t = self.lstm1(x, (h_t, c_t))
h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
output = None
for i in range(future):
if i == 0:
# first prediction
output = self.linear(h_t3) # h_t3?
outputs.append(output)
continue
h_t3, c_t3 = self.lstm3(h_t3, (h_t3, c_t3))
output = self.linear(h_t3)
outputs.append(output)
output = torch.cat(outputs, dim=1)
return output
Here, lstm1 and lstm2 receives the input with shape (1, 240), and then lstm3 is used to generate prediction to the future n steps successively. During training it is 60 steps.
However, my model is facing exploding gradient in the first step.
Model Initialization is shown bellow:
n_hidden = 512
n_feature = 240
model = LSTMPredictor(n_feature, n_hidden).to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.LBFGS(model.parameters(), lr=0.8)
Training Loop:
n_steps = 1
losses = []
print("--- Training Start ---")
for i in tqdm(range(n_steps)):
print("Step", i)
for i, sample_i in enumerate(train_input):
def closure():
optimizer.zero_grad()
out = model(sample_i.cuda(),future=60)
loss = criterion(out[0], train_target[i].cuda())
losses.append(loss.item())
loss.backward()
return loss
optimizer.step(closure)
print("loss", losses[-1])
Is there anything wrong in my implementation?

"IndexError: tensors used as indices must be long, byte or bool tensors" Pytorch

The dataset is a custom torch.geometric dataset
inv_mask = ~mask
--> 224 loop_attr[edge_index[0][inv_mask]] = edge_attr[inv_mask]
225
226 edge_attr = torch.cat([edge_attr[mask], loop_attr], dim=0)
IndexError: tensors used as indices must be long, byte or bool tensors
Code:-
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
def __init__(self, hidden_channels):
super().__init__()
torch.manual_seed(13213)
self.conv1 = GCNConv(dataset.num_features, hidden_channels)
self.conv2 = GCNConv(hidden_channels, num_classes)
def forward(self,x, edge_index):
x = self.conv1(x, edge_index)
x = x.relu()
x = F.dropout(x, p=0.5, training = self.training)
x = self.conv2(x, edge_index)
return x
model = GCN(hidden_channels = 16)
dataset.train_mask = torch.tensor([range(0,14000)]).type(torch.bool)
dataset.test_mask = torch.tensor([range(14000, 22470)]).type(torch.bool)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01, weight_decay = 5e-4)
criterion = torch.nn.CrossEntropyLoss()
dataset.y = dataset.y.float()
dataset.x = dataset.x.float()
dataset.edge_index = dataset.edge_index.float()
def train():
model.train()
optimizer.zero_grad()
out = model(dataset.x, dataset.edge_index)
loss = criterion(out[dataset.train_mask], dataset.y[dataset.train_mask])
loss.backward()
optimizer.step()
return loss
def test():
model.eval()
out = model(dataset.x, dataset.edge_index)
# pred = out.argmax(dim=1)
test_correct = out[dataset.test_mask] == dataset.y[dataset.test_mask]
test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
return test_acc
for e in range(1,101):
loss = train()
print(f'Epoch: {epoch:02d}, Loss: {loss:.3f}')
The error points to optimizer.zero_grad()
could anyone please explain how to debug code in pytorch, since i used tensorflow for almost every deep learning task I did but when it came to GNN I felt torch geometric would be a viable option.
Please help me get ahead of this error and also suggest ways for me to improve the code ....

Inference on Multiple Label Sentiment analysis using pytorch_lightning

I have trained an LSTM model for sentiment analysis using pytorch_lightning but I've been having difficulties incorporating the inference.
This is my model:
class LSTM(pl.LightningModule):
def __init__(self,n_vocab,n_embed,
n_hidden,n_output,n_layers,learning_rate,embedding_matrix=None):
super().__init__()
self.n_vocab = n_vocab
self.n_layer = n_layers
self.n_hidden = n_hidden
self.embedding = nn.Embedding(n_vocab, n_embed, padding_idx = 0)
if embedding_matrix is not None:
self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
self.embedding.weight.requires_grad = False
self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, bidirectional = True)
self.fc = nn.Linear(2 * n_hidden, n_output)
self.dropout = nn.Dropout(0.2)
self.sigmoid = nn.Sigmoid()
self.batch_size = batch_size
self.learning_rate = learning_rate
def forward(self,input_words):
embedded_words = self.embedding(input_words)
lstm_out, _ = self.lstm(embedded_words)
lstm_out_f=lstm_out[:,-1 , :300 ]
lstm_out_b=lstm_out[:, 0 , 300: ]
lstm_out_final = torch.cat([lstm_out_f,lstm_out_b], dim=-1)
lstm_out_final = self.dropout(lstm_out_final)
fc_out = self.fc(lstm_out_final)
return fc_out
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
return optimizer
def training_step(self, batch, batch_nb):
x , y= batch
y_hat = self(x)
loss = F.cross_entropy(y_hat, y)
self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
return loss
def validation_step(self, batch, batch_nb):
x, y = batch
y_hat = self(x)
loss = F.cross_entropy(y_hat, y).to(device = 'cuda')
f1 = torchmetrics.F1(num_classes=5).to(device = 'cuda')
f1_score = f1(y_hat, y)
accuracy = Accuracy().to(device = 'cuda')
accur = accuracy(y_hat, y)
self.log("val_loss", loss)
self.log("f1_score", f1_score)
self.log("accuracy", accur)
def test_step(self, batch, batch_idx):
x, y = batch
logits = self(x)
print(logits)
logitsies = softmax(logits)
choice = argmax(logitsies)
loss = F.nll_loss(logits, y)
self.log("test_loss", loss)
return choice
def predict_step(self, batch, batch_idx, dataloader_idx):
x, y = batch
x = x.view(x.size(0), -1)
y_hat = self(x)
logitsies = softmax(logits)
choice = argmax(logitsies)
loss = F.nll_loss(logits, y)
self.log("predict_loss", loss)
return choice
I called the model as such:
model = LSTM(
n_vocab=size_of_emb_matrix,
n_embed=embed_vector_len,
n_hidden=150,
n_output=5,
n_layers=1,
learning_rate=1e-4,
embedding_matrix=embedding_matrix
)
Now I am trying to write a function that would allow me to do inference. I have managed to succesfuly tokenize the input sentence and encode it through an already pre-defined functions, yet I have been getting several errors no matter what I try. I have found myself stuck and don't know how to continue. This is my function so far.
def get_sentiment(text):
x = encode_sentence(text,vocab2index)
x_bar = x[0]
y_hat = torch.tensor(x_bar)
trainer.predict(model,y_hat)
The encode_sentence function is as follows:
def encode_sentence(text, vocab2index, N=70):
tokenized = tokenize(text)
encoded = np.zeros(N, dtype=int)
enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
length = min(N, len(enc1))
encoded[:length] = enc1[:length]
return encoded, length
As I call the get_sentiment function, I am using the trainer.predict function which allows me to predict, hence doing inference.
But I have been getting the following Issue:
AttributeError Traceback (most recent call
last)
<ipython-input-195-825c536cbbb2> in <module>()
----> 1 get_sentiment("love that dress")
13 frames
/usr/local/lib/python3.7/dist-
packages/pytorch_lightning/loops/epoch/prediction_epoch_loop.py in
_store_batch_indices(self, dataloader_idx)
162 def _store_batch_indices(self, dataloader_idx: int) -> None:
163 """Stores the batch indices if the predictions should be
stored"""
--> 164 batch_sampler =
self.trainer.predict_dataloaders[dataloader_idx].batch_sampler
165 if isinstance(batch_sampler, IndexBatchSamplerWrapper):
166 self.current_batch_indices = batch_sampler.batch_indices
AttributeError: 'Tensor' object has no attribute 'batch_sampler'

pytorch runs slow when data are pre-transported to GPU

I have a model written in pytorch. Since my dataset is small, I can directly load all of the data to GPU. However, I found the forward speed becomes slow if I do so. The following is a runnable example. Specifically, I have the model:
import numpy as np
from time import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
def knn(x, k):
inner = -2*torch.matmul(x.transpose(2, 1), x)
xx = torch.sum(x**2, dim=1, keepdim=True)
pairwise_distance = -xx - inner - xx.transpose(2, 1)
idx = pairwise_distance.topk(k=k, dim=-1)[1] # (batch_size, num_points, k)
return idx
def get_graph_feature(x, k=20, idx=None):
batch_size = x.size(0)
num_points = x.size(2)
x = x.view(batch_size, -1, num_points)
if idx is None:
idx = knn(x, k=k) # (batch_size, num_points, k)
idx_base = torch.arange(0, batch_size, device=x.device).view(-1, 1, 1)*num_points
idx = idx + idx_base
idx = idx.view(-1)
_, num_dims, _ = x.size()
x = x.transpose(2, 1).contiguous() # (batch_size, num_points, num_dims) -> (batch_size*num_points, num_dims) # batch_size * num_points * k + range(0, batch_size*num_points)
feature = x.view(batch_size*num_points, -1)[idx, :]
feature = feature.view(batch_size, num_points, k, num_dims)
x = x.view(batch_size, num_points, 1, num_dims).repeat(1, 1, k, 1)
feature = torch.cat((feature-x, x), dim=3).permute(0, 3, 1, 2).contiguous()
return feature
class DGCNN(nn.Module):
def __init__(self, k=25, output_channels=10):
super(DGCNN, self).__init__()
self.k = k
self.bn1 = nn.BatchNorm2d(64)
self.bn2 = nn.BatchNorm2d(64)
self.bn3 = nn.BatchNorm2d(128)
self.bn4 = nn.BatchNorm2d(256)
self.bn5 = nn.BatchNorm1d(1024)
self.conv1 = nn.Sequential(nn.Conv2d(6, 64, kernel_size=1, bias=False),
self.bn1,
nn.LeakyReLU(negative_slope=0.2))
self.conv2 = nn.Sequential(nn.Conv2d(64*2, 64, kernel_size=1, bias=False),
self.bn2,
nn.LeakyReLU(negative_slope=0.2))
self.conv3 = nn.Sequential(nn.Conv2d(64*2, 128, kernel_size=1, bias=False),
self.bn3,
nn.LeakyReLU(negative_slope=0.2))
self.conv4 = nn.Sequential(nn.Conv2d(128*2, 256, kernel_size=1, bias=False),
self.bn4,
nn.LeakyReLU(negative_slope=0.2))
self.conv5 = nn.Sequential(nn.Conv1d(512, 1024, kernel_size=1, bias=False),
self.bn5,
nn.LeakyReLU(negative_slope=0.2))
self.linear1 = nn.Linear(1024*2, 512, bias=False)
self.bn6 = nn.BatchNorm1d(512)
self.dp1 = nn.Dropout()
self.linear2 = nn.Linear(512, 256)
self.bn7 = nn.BatchNorm1d(256)
self.dp2 = nn.Dropout()
self.linear3 = nn.Linear(256, output_channels)
def forward(self, x):
x = x.transpose(2, 1)
batch_size = x.size(0)
x = get_graph_feature(x, k=self.k)
x = self.conv1(x)
x1 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x1, k=self.k)
x = self.conv2(x)
x2 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x2, k=self.k)
x = self.conv3(x)
x3 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x3, k=self.k)
x = self.conv4(x)
x4 = x.max(dim=-1, keepdim=False)[0]
x = torch.cat((x1, x2, x3, x4), dim=1)
x = self.conv5(x)
x1 = F.adaptive_max_pool1d(x, 1).view(batch_size, -1)
x2 = F.adaptive_avg_pool1d(x, 1).view(batch_size, -1)
x = torch.cat((x1, x2), 1)
x = F.leaky_relu(self.bn6(self.linear1(x)), negative_slope=0.2)
x = self.dp1(x)
x = F.leaky_relu(self.bn7(self.linear2(x)), negative_slope=0.2)
x = self.dp2(x)
x = self.linear3(x)
return x
Here is what the dataloader and test function looks like:
class my_loader(Dataset):
def __init__(self, device):
self.data = torch.rand(256, 2048, 3).to(device).float()
self.labels = torch.rand(256).to(device).long()
def __getitem__(self, ind):
return self.data[ind], self.labels[ind]
def __len__(self):
return len(self.data)
def test():
device = torch.device('cuda:2')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
for inputs, labels in test_loader:
tic = time()
pred = model(inputs)
print('time1 {}'.format(time() - tic))
print('------------------')
#---------- this one is 0.004s --------------#
for inputs, labels in test_loader:
inputs = inputs.detach().cpu().to(device)
tic = time()
pred = model(inputs)
print('time2 {}'.format(time() - tic))
print('------------------')
#---------- this one is 0.12s --------------#
for inputs, labels in test_loader:
tic = time()
inputs = inputs.detach().cpu().to(device)
pred = model(inputs)
print('time3 {}'.format(time() - tic))
print('------------------')
Basically, it seems that if there is no explicit call of gpu to cpu transportation either before or after the forward propagation, the forward propagation would cost more time. It just seems like that the forward propagation is implicitly doing gpu->cpu transportation.
I played around with the code a little bit, and I think the problem is that you are measuring times for both cases in the same run. Here is my boiled down version of your code since your model crushed my GPU memory:
class DGCNN(nn.Module):
def __init__(self, num_layers):
super(DGCNN, self).__init__()
self.layers = nn.ModuleList([nn.Linear(256, 256) for _ in range(1200)])
def forward(self, x):
x = x.view(-1, 256)
for layer in self.layers:
x = layer(x)
return x
class my_loader(Dataset):
def __init__(self, device):
self.data = torch.rand(256, 2048, 3).to(device).float()
self.labels = torch.rand(256).to(device).long()
def __getitem__(self, ind):
return self.data[ind], self.labels[ind]
def __len__(self):
return len(self.data)
Now, here I demonstrate different versions of test().
Version #1:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
>>> # First case -> Full forward pass: 3.105103, # Second case -> Full forward pass: 2.831652
Now I switched the order of timing calculations for the cases. Version #2:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
>>> # Second case -> Full forward pass: 3.288522, # First case -> Full forward pass: 2.583231
Apparently, the first timing you calculate seems to end up slower. So, I calculated these timings separately in different runs with fresh kernels. Version #3:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
>>> # First case -> Full forward pass: 3.091592
Version #4:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
>>> # Second case -> Full forward pass: 3.190248
So, by testing one at a time, it seems like pred = model(inputs) runs slightly faster than pred = model(inputs.detach().cpu().to(device)), which is the obvious expected result.

PyTorch network produces constant output

I am trying to train a simple MLP to approximate y=f(a,b,c).
My code is as below.
import torch
import torch.nn as nn
from torch.autograd import Variable
# hyper parameters
input_size = 3
output_size = 1
num_epochs = 50
learning_rate = 0.001
# Network definition
class FeedForwardNet(nn.Module):
def __init__(self, l1_size, l2_size):
super(FeedForwardNet, self).__init__()
self.fc1 = nn.Linear(input_size, l1_size)
self.relu1 = nn.ReLU()
self.fc2 = nn.Linear(l1_size, l2_size)
self.relu2 = nn.ReLU()
self.fc3 = nn.Linear(l2_size, output_size)
def forward(self, x):
out = self.fc1(x)
out = self.relu1(out)
out = self.fc2(out)
out = self.relu2(out)
out = self.fc3(out)
return out
model = FeedForwardNet(5 , 3)
# sgd optimizer
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)
for epoch in range(11):
print ('Epoch ', epoch)
for i in range(trainX_light.shape[0]):
X = Variable( torch.from_numpy(trainX_light[i]).view(-1, 3) )
Y = Variable( torch.from_numpy(trainY_light[i]).view(-1, 1) )
# forward
optimizer.zero_grad()
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
loss.backward()
optimizer.step()
totalnorm = 0
for p in model.parameters():
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = math.sqrt(totalnorm)
print (totalnorm)
# validation code
if (epoch + 1) % 5 == 0:
print (' test points',testX_light.shape[0])
total_loss = 0
for t in range(testX_light.shape[0]):
X = Variable( torch.from_numpy(testX_light[t]).view(-1, 3) )
Y = Variable( torch.from_numpy(testY_light[t]).view(-1, 1) )
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
total_loss += loss
print ('epoch ', epoch, 'avg_loss ', total_loss.data[0] / testX_light.shape[0])
print ('Done')
The problem that I have now is, the validation code
output = model(X)
is always producing an exact same output value (I guess this value is some sort of garbage). I am not sure what mistake I am doing in this part. Could some help me figure out the mistake in my code?
The reason that network produced random values (and inf later) was the exploding gradient problem. Clipping the gradient (torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)) helped.

Resources