Inference on Multiple Label Sentiment analysis using pytorch_lightning - nlp

I have trained an LSTM model for sentiment analysis with pytorch_lightning, but I've been having difficulty getting inference to work.
This is my model:
class LSTM(pl.LightningModule):
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers,
                 learning_rate, embedding_matrix=None):
        super().__init__()
        self.n_vocab = n_vocab
        self.n_layer = n_layers
        self.n_hidden = n_hidden
        self.embedding = nn.Embedding(n_vocab, n_embed, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * n_hidden, n_output)
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()
        self.learning_rate = learning_rate

    def forward(self, input_words):
        embedded_words = self.embedding(input_words)
        lstm_out, _ = self.lstm(embedded_words)
        # last step of the forward direction, first step of the backward direction
        lstm_out_f = lstm_out[:, -1, :self.n_hidden]
        lstm_out_b = lstm_out[:, 0, self.n_hidden:]
        lstm_out_final = torch.cat([lstm_out_f, lstm_out_b], dim=-1)
        lstm_out_final = self.dropout(lstm_out_final)
        fc_out = self.fc(lstm_out_final)
        return fc_out

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def training_step(self, batch, batch_nb):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_nb):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        f1 = torchmetrics.F1(num_classes=5).to(self.device)
        f1_score = f1(y_hat, y)
        accuracy = Accuracy().to(self.device)
        accur = accuracy(y_hat, y)
        self.log("val_loss", loss)
        self.log("f1_score", f1_score)
        self.log("accuracy", accur)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        print(logits)
        logitsies = F.softmax(logits, dim=1)
        choice = torch.argmax(logitsies, dim=1)
        # cross_entropy works on raw logits; nll_loss would expect log-probabilities
        loss = F.cross_entropy(logits, y)
        self.log("test_loss", loss)
        return choice

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        logitsies = F.softmax(y_hat, dim=1)
        # just return the predicted class; loss/logging is not needed during prediction
        choice = torch.argmax(logitsies, dim=1)
        return choice
I instantiate the model like this:
model = LSTM(
    n_vocab=size_of_emb_matrix,
    n_embed=embed_vector_len,
    n_hidden=150,
    n_output=5,
    n_layers=1,
    learning_rate=1e-4,
    embedding_matrix=embedding_matrix,
)
Now I am trying to write a function for inference. I have managed to successfully tokenize the input sentence and encode it through already pre-defined functions, yet I keep getting errors no matter what I try. I am stuck and don't know how to continue. This is my function so far:
def get_sentiment(text):
    x = encode_sentence(text, vocab2index)
    x_bar = x[0]
    y_hat = torch.tensor(x_bar)
    trainer.predict(model, y_hat)
The encode_sentence function is as follows:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length
Inside get_sentiment I call trainer.predict, which is what should run the inference. But I keep getting the following error:
AttributeError                            Traceback (most recent call last)
<ipython-input-195-825c536cbbb2> in <module>()
----> 1 get_sentiment("love that dress")

13 frames
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/prediction_epoch_loop.py in _store_batch_indices(self, dataloader_idx)
    162     def _store_batch_indices(self, dataloader_idx: int) -> None:
    163         """Stores the batch indices if the predictions should be stored"""
--> 164         batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler
    165         if isinstance(batch_sampler, IndexBatchSamplerWrapper):
    166             self.current_batch_indices = batch_sampler.batch_indices

AttributeError: 'Tensor' object has no attribute 'batch_sampler'
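The second argument of trainer.predict is interpreted as a dataloader (or datamodule), so passing the encoded sentence as a raw tensor is what triggers the 'Tensor' object has no attribute 'batch_sampler' error. A minimal sketch of wrapping the encoded sentence in a DataLoader first (the added batch dimension and the dummy label are my assumptions about the model's expected input, not part of the original code):

from torch.utils.data import DataLoader, TensorDataset

def get_sentiment(text):
    encoded, length = encode_sentence(text, vocab2index)
    x = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # shape (1, N): a batch with one sentence
    y = torch.zeros(1, dtype=torch.long)                      # dummy label so predict_step can unpack (x, y)
    loader = DataLoader(TensorDataset(x, y), batch_size=1)
    predictions = trainer.predict(model, dataloaders=loader)  # list with one tensor of predicted class indices
    return predictions[0]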

Related

Bert NER pytorch lightning

I'm trying to use pytorch-lightning for a token-classification model. I have already built a model for token classification without Lightning, and I'm confused about what changes need to be made to the existing code to integrate pytorch-lightning.
Following is my pytorch code:
model = BertForTokenClassification.from_pretrained(
    'bert-large-cased',
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False
)

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    # ========================================
    #               Validation
    # ========================================
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                  for l_i in l if tag_values[l_i] != "PAD"]
    f1 = f1_score([valid_tags], [pred_tags])
Following is the code I tried with pytorch lightning.
class LightningModule(pl.LightningModule):
    def __init__(self, lr, lr_backbone, weight_decay, batch_size):
        super().__init__()
        self.model = BertForTokenClassification.from_pretrained(
            "bert-large-cased",
            num_labels=len(tag2idx),
            output_attentions=False,
            output_hidden_states=False)
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay
        self.batch_size = batch_size

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.model(
            input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels
        )
        loss = outputs[0]
        logits = outputs[1]
        return loss, logits

    def training_step(self, batch, batch_idx):
        b_input_ids, b_input_mask, b_labels = batch
        outputs = self.model(b_input_ids, token_type_ids=None,
                             attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        b_input_ids, b_input_mask, b_labels = batch
        outputs = self.model(b_input_ids, token_type_ids=None,
                             attention_mask=b_input_mask, labels=b_labels)
        eval_loss = outputs[0]
        self.log("val_loss", eval_loss)
        return eval_loss

    def validation_end(self, outputs):
        eval_loss = np.mean([x["val_loss"] for x in outputs])
        self.log("val_loss", eval_loss)
        pred_tags = [tag_values[p_i] for p, l in zip(self.predictions, self.true_labels)
                     for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
        valid_tags = [tag_values[l_i] for l in self.true_labels
                      for l_i in l if tag_values[l_i] != "PAD"]
        f1 = f1_score([valid_tags], [pred_tags])
        self.log("val_f1", f1)

    def configure_optimizers(self):
        # optimizer = torch.optim.NAdam(optimizer_grouped_parameters, lr=4e-6, eps=1e-8)
        # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        return torch.optim.NAdam(optimizer_grouped_parameters, lr=4e-6, eps=1e-8)

    def train_dataloader(self):
        return train_dataloader  # return your dataloader

    def val_dataloader(self):
        return valid_dataloader  # return your validation dataloader

model = LightningModule(lr=1e-6, lr_backbone=1e-5, weight_decay=1e-4, batch_size=32)
trainer = pl.Trainer(accelerator='gpu', gradient_clip_val=0.1, max_epochs=epochs,
                     auto_scale_batch_size=None, default_root_dir="lightning_output/",
                     enable_checkpointing=False)
trainer.fit(model)
But when I run inference, I get the following error:
TypeError: forward() missing 2 required positional arguments: 'attention_mask' and 'labels'
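The error indicates that the model is being called at inference time with only input_ids, while the forward defined above requires attention_mask and labels as positional arguments. A sketch of a more tolerant forward for the LightningModule above (the exact inference call is not shown in the question, so this is an assumption about how it is invoked):

def forward(self, input_ids, attention_mask=None, labels=None):
    outputs = self.model(
        input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels
    )
    if labels is None:
        # no labels at inference time: BertForTokenClassification then returns logits first
        return outputs[0]
    return outputs[0], outputs[1]  # (loss, logits) during training/validation

With optional arguments, both trainer.fit and a plain model(input_ids, attention_mask=mask) call go through the same forward.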

LSTM multi-sequence input to one sequence output

I am new to neural networks and am currently trying to make an LSTM model that predicts an output sequence based on multiple parameters. Excuse my ignorance in advance.
I have training and validation datasets that look roughly as follows: for every ID, four rows are recorded, with columns holding certain parameters and the corresponding Y output. In practice there are ~122,000 / 4 = ~30,500 samples (122,000 is the number of rows, not the number of IDs). Since the parameter values and the corresponding Y values follow temporal patterns, I am interested in whether a model such as an LSTM improves the prediction.
I want to predict the Y in my validation dataset (~73,000 / 4 = ~18,000 samples) based on the temporal patterns of the parameters. But is this possible? Most tutorials I followed use a single sequence, where an LSTM is used to extend a similar input sequence. I therefore want an LSTM with 'multi-sequence' input that outputs one sequence. How do I go about this?
I'm using PyTorch as the framework. Here is a simple LSTM model I created from a tutorial, which does not yet incorporate the parameters:
training_y = traindf.reset_index()['Y']
validation_y = traindf.reset_index()['Y']
Then create a dataset for this:
class YDataset(Dataset):
    def __init__(self, data, seq_len=100):
        self.data = torch.from_numpy(data).float().view(-1)
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data) - self.seq_len - 1

    def __getitem__(self, index):
        return self.data[index : index + self.seq_len], self.data[index + self.seq_len]

train_y = YDataset(training_y_df)
vali_y = YDataset(validation_y_df)
batch_size = 64
train_dataloader = DataLoader(train_y, batch_size, drop_last=True)
vali_dataloader = DataLoader(vali_y, batch_size, drop_last=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
Then create the model:
class Lstm_model(nn.Module):
    def __init__(self, input_dim, hidden_size, num_layers):
        super(Lstm_model, self).__init__()
        self.num_layers = num_layers
        self.input_size = input_dim
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, hn, cn):
        out, (hn, cn) = self.lstm(x, (hn, cn))
        final_out = self.fc(out[-1])
        return final_out, hn, cn

    def predict(self, x):
        hn, cn = self.init()
        out, (hn, cn) = self.lstm(x, (hn, cn))  # the LSTM call was missing here
        final_out = self.fc(out[-1])
        return final_out

    def init(self):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        return h0, c0

input_dim = 1
hidden_size = 50
num_layers = 3
model = Lstm_model(input_dim, hidden_size, num_layers).to(device)
Loss function and training loop (more or less same as for validation):
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train(dataloader):
    hn, cn = model.init()
    model.train()
    for batch, item in enumerate(dataloader):
        x, y = item
        x = x.to(device)
        y = y.to(device)
        out, hn, cn = model(x.reshape(100, batch_size, 1), hn, cn)
        loss = loss_fn(out.reshape(batch_size), y)
        hn = hn.detach()
        cn = cn.detach()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch == len(dataloader) - 1:
            loss = loss.item()  # .item() (with parentheses) to get the Python float
            print(f"train loss: {loss:>7f} ")
Epochs and loss metrics:
epochs = 200  # Takes really long for me
for epoch in range(epochs):
    print(f"epoch {epoch} ")
    train(train_dataloader)
    test(vali_dataloader)
Final metrics:
import math
from sklearn.metrics import mean_squared_error
import numpy as np

def calculate_metrics(data_loader):
    pred_arr = []
    y_arr = []
    with torch.no_grad():
        hn, cn = model.init()
        for batch, item in enumerate(data_loader):
            x, y = item
            x, y = x.to(device), y.to(device)
            x = x.view(100, 64, 1)
            pred = model(x, hn, cn)[0]
            pred = scalar.inverse_transform(pred.detach().cpu().numpy().reshape(-1))
            y = scalar.inverse_transform(y.detach().cpu().numpy().reshape(1, -1)).reshape(-1)
            pred_arr = pred_arr + list(pred)
            y_arr = y_arr + list(y)
    return math.sqrt(mean_squared_error(y_arr, pred_arr))
I used this code more as an example of how an LSTM would work. Nevertheless, I don't know if this is the right track for me. Does someone know what I should do, or a tutorial that works for my example? Thanks in advance!
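As a rough sketch of the "multiple parameters per time step" idea (the shapes, feature count, and class name below are assumptions for illustration, not taken from the actual data): treat each ID as one sample with seq_len=4 time steps and num_features parameter columns, feed batches shaped (batch, seq_len, num_features) into an LSTM with batch_first=True, and predict a Y for every row of the ID.

import torch
import torch.nn as nn

class MultiFeatureLSTM(nn.Module):
    def __init__(self, num_features, hidden_size=50, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size=num_features, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # one Y value per time step

    def forward(self, x):                    # x: (batch, seq_len, num_features)
        out, _ = self.lstm(x)                # out: (batch, seq_len, hidden_size)
        return self.fc(out).squeeze(-1)      # (batch, seq_len): one Y per input row

# example: a batch of 32 IDs, 4 rows per ID, 6 parameter columns (all assumed)
x = torch.randn(32, 4, 6)
model = MultiFeatureLSTM(num_features=6)
y_hat = model(x)  # shape (32, 4)

The Dataset would then return the (4, num_features) block of parameter rows for an ID together with its Y values, instead of a window of past Y values, and nn.MSELoss() can be applied to the (batch, seq_len) output directly.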

pytorch cpu & cuda issue using LBFGS optimiser

I am getting errors while trying to change a YouTube tutorial into a CUDA version. I moved both the data and the model to 'cuda', but I get an error that I do not understand.
Data:
x = np.empty((100, 1000), 'int64')
x[:] = np.array(range(1000)) + np.random.randint(-80, 80, 100).reshape(100, 1)
data = np.sin(x / 1.0 / 20).astype('float64')
input = (torch.from_numpy(data[3:, :-1])).to('cuda')
target = (torch.from_numpy(data[3:, 1:])).to('cuda')
test_input = (torch.from_numpy(data[:3, :-1])).to('cuda')
test_target = (torch.from_numpy(data[:3, 1:])).to('cuda')
Model:
class Sequence(nn.Module):
    def __init__(self):
        super(Sequence, self).__init__()
        self.lstm1 = nn.LSTMCell(1, 51)
        self.lstm2 = nn.LSTMCell(51, 51)
        self.linear = nn.Linear(51, 1)

    def forward(self, input, future=0):
        outputs = []
        h_t = torch.zeros(input.size(0), 51, dtype=torch.double)
        c_t = torch.zeros(input.size(0), 51, dtype=torch.double)
        h_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)
        c_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)
        for input_t in input.split(1, dim=1):
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs += [output]
        for i in range(future):  # if we should predict the future
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs += [output]
        outputs = torch.cat(outputs, dim=1)
        return outputs
Initializing the model and training:
seq = Sequence()
seq.double()
seq.to('cuda')
criterion = nn.MSELoss()
optimizer = optim.LBFGS(seq.parameters(), lr=0.8)

# begin to train
for i in range(10):
    print('STEP: ', i)

    def closure():
        optimizer.zero_grad()
        out = seq(input)
        loss = criterion(out, target)
        print(out.is_cuda, target.is_cuda)
        print('loss:', loss.item().to('cpu').numpy())
        loss.backward()
        return loss

    optimizer.step(closure)

    # begin to predict, no need to track gradient here
    with torch.no_grad():
        future = 1000
        pred = seq(test_input, future=future)
        loss = criterion(pred[:, :-future], test_target)
        print('test loss:', loss.item())
        y = pred.detach().numpy()
The error I get is:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_mm).
The program works fine on the CPU, and I have tried moving things to and from 'cuda' to see if it works, with no success.
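One likely source of the cpu/cuda mismatch (a guess based on the code above, not a confirmed diagnosis): the hidden and cell states in forward are created with torch.zeros(...) and therefore live on the CPU even when the model and inputs are on the GPU. A sketch of the same Sequence model with the states created on the input's device:

import torch
import torch.nn as nn

class Sequence(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTMCell(1, 51)
        self.lstm2 = nn.LSTMCell(51, 51)
        self.linear = nn.Linear(51, 1)

    def forward(self, input, future=0):
        outputs = []
        # create the hidden/cell states with the same dtype and device as the input
        h_t = torch.zeros(input.size(0), 51, dtype=input.dtype, device=input.device)
        c_t = torch.zeros(input.size(0), 51, dtype=input.dtype, device=input.device)
        h_t2 = torch.zeros(input.size(0), 51, dtype=input.dtype, device=input.device)
        c_t2 = torch.zeros(input.size(0), 51, dtype=input.dtype, device=input.device)
        for input_t in input.split(1, dim=1):
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)
        for _ in range(future):  # optionally keep predicting past the end of the input
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)
        return torch.cat(outputs, dim=1)

Separately, loss.item() already returns a Python float, so loss.item().to('cpu').numpy() in the closure would also fail, and pred.detach().numpy() needs a .cpu() first when pred lives on the GPU.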

"IndexError: tensors used as indices must be long, byte or bool tensors" Pytorch

The dataset is a custom torch_geometric dataset. This is the error I get:
            inv_mask = ~mask
--> 224     loop_attr[edge_index[0][inv_mask]] = edge_attr[inv_mask]
    225
    226     edge_attr = torch.cat([edge_attr[mask], loop_attr], dim=0)

IndexError: tensors used as indices must be long, byte or bool tensors
Code:
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(13213)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

model = GCN(hidden_channels=16)
dataset.train_mask = torch.tensor([range(0, 14000)]).type(torch.bool)
dataset.test_mask = torch.tensor([range(14000, 22470)]).type(torch.bool)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
dataset.y = dataset.y.float()
dataset.x = dataset.x.float()
dataset.edge_index = dataset.edge_index.float()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(dataset.x, dataset.edge_index)
    loss = criterion(out[dataset.train_mask], dataset.y[dataset.train_mask])
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(dataset.x, dataset.edge_index)
    # pred = out.argmax(dim=1)
    test_correct = out[dataset.test_mask] == dataset.y[dataset.test_mask]
    test_acc = int(test_correct.sum()) / int(dataset.test_mask.sum())
    return test_acc

for e in range(1, 101):
    loss = train()
    print(f'Epoch: {e:02d}, Loss: {loss:.3f}')
The error points to optimizer.zero_grad(). Could anyone please explain how to debug code in PyTorch? I have used TensorFlow for almost every deep learning task so far, but for GNNs torch_geometric seemed like a viable option. Please help me get past this error, and also suggest ways to improve the code.
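The message usually means an index tensor has the wrong dtype. The most likely culprit here (an inference from the code, not a verified diagnosis) is dataset.edge_index = dataset.edge_index.float(): GCNConv uses edge_index to index node tensors, so it must stay torch.long. The boolean masks probably also need one entry per node rather than a tensor built from range(...). A sketch of dtypes and masks that typically work, assuming 22,470 nodes as implied by range(14000, 22470):

import torch

# features float, labels long (for CrossEntropyLoss), edge_index long (it is used as an index)
dataset.x = dataset.x.float()
dataset.y = dataset.y.long()
dataset.edge_index = dataset.edge_index.long()

num_nodes = 22470  # assumed from the ranges in the question
dataset.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
dataset.train_mask[:14000] = True   # first 14,000 nodes for training
dataset.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
dataset.test_mask[14000:] = True    # remaining nodes for testing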

pytorch runs slow when data are pre-transported to GPU

I have a model written in pytorch. Since my dataset is small, I can load all of the data onto the GPU directly. However, I found that the forward pass becomes slow if I do so. The following is a runnable example. Specifically, this is the model:
import numpy as np
from time import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

def knn(x, k):
    inner = -2 * torch.matmul(x.transpose(2, 1), x)
    xx = torch.sum(x ** 2, dim=1, keepdim=True)
    pairwise_distance = -xx - inner - xx.transpose(2, 1)
    idx = pairwise_distance.topk(k=k, dim=-1)[1]  # (batch_size, num_points, k)
    return idx

def get_graph_feature(x, k=20, idx=None):
    batch_size = x.size(0)
    num_points = x.size(2)
    x = x.view(batch_size, -1, num_points)
    if idx is None:
        idx = knn(x, k=k)  # (batch_size, num_points, k)
    idx_base = torch.arange(0, batch_size, device=x.device).view(-1, 1, 1) * num_points
    idx = idx + idx_base
    idx = idx.view(-1)
    _, num_dims, _ = x.size()
    x = x.transpose(2, 1).contiguous()  # (batch_size, num_points, num_dims) -> (batch_size*num_points, num_dims)
    feature = x.view(batch_size * num_points, -1)[idx, :]
    feature = feature.view(batch_size, num_points, k, num_dims)
    x = x.view(batch_size, num_points, 1, num_dims).repeat(1, 1, k, 1)
    feature = torch.cat((feature - x, x), dim=3).permute(0, 3, 1, 2).contiguous()
    return feature

class DGCNN(nn.Module):
    def __init__(self, k=25, output_channels=10):
        super(DGCNN, self).__init__()
        self.k = k
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.bn4 = nn.BatchNorm2d(256)
        self.bn5 = nn.BatchNorm1d(1024)
        self.conv1 = nn.Sequential(nn.Conv2d(6, 64, kernel_size=1, bias=False),
                                   self.bn1,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv2 = nn.Sequential(nn.Conv2d(64 * 2, 64, kernel_size=1, bias=False),
                                   self.bn2,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv3 = nn.Sequential(nn.Conv2d(64 * 2, 128, kernel_size=1, bias=False),
                                   self.bn3,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv4 = nn.Sequential(nn.Conv2d(128 * 2, 256, kernel_size=1, bias=False),
                                   self.bn4,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.conv5 = nn.Sequential(nn.Conv1d(512, 1024, kernel_size=1, bias=False),
                                   self.bn5,
                                   nn.LeakyReLU(negative_slope=0.2))
        self.linear1 = nn.Linear(1024 * 2, 512, bias=False)
        self.bn6 = nn.BatchNorm1d(512)
        self.dp1 = nn.Dropout()
        self.linear2 = nn.Linear(512, 256)
        self.bn7 = nn.BatchNorm1d(256)
        self.dp2 = nn.Dropout()
        self.linear3 = nn.Linear(256, output_channels)

    def forward(self, x):
        x = x.transpose(2, 1)
        batch_size = x.size(0)
        x = get_graph_feature(x, k=self.k)
        x = self.conv1(x)
        x1 = x.max(dim=-1, keepdim=False)[0]
        x = get_graph_feature(x1, k=self.k)
        x = self.conv2(x)
        x2 = x.max(dim=-1, keepdim=False)[0]
        x = get_graph_feature(x2, k=self.k)
        x = self.conv3(x)
        x3 = x.max(dim=-1, keepdim=False)[0]
        x = get_graph_feature(x3, k=self.k)
        x = self.conv4(x)
        x4 = x.max(dim=-1, keepdim=False)[0]
        x = torch.cat((x1, x2, x3, x4), dim=1)
        x = self.conv5(x)
        x1 = F.adaptive_max_pool1d(x, 1).view(batch_size, -1)
        x2 = F.adaptive_avg_pool1d(x, 1).view(batch_size, -1)
        x = torch.cat((x1, x2), 1)
        x = F.leaky_relu(self.bn6(self.linear1(x)), negative_slope=0.2)
        x = self.dp1(x)
        x = F.leaky_relu(self.bn7(self.linear2(x)), negative_slope=0.2)
        x = self.dp2(x)
        x = self.linear3(x)
        return x
Here is what the dataloader and test function look like:
class my_loader(Dataset):
    def __init__(self, device):
        self.data = torch.rand(256, 2048, 3).to(device).float()
        self.labels = torch.rand(256).to(device).long()

    def __getitem__(self, ind):
        return self.data[ind], self.labels[ind]

    def __len__(self):
        return len(self.data)

def test():
    device = torch.device('cuda:2')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.12s --------------#
    for inputs, labels in test_loader:
        tic = time()
        pred = model(inputs)
        print('time1 {}'.format(time() - tic))
    print('------------------')

    #---------- this one is 0.004s --------------#
    for inputs, labels in test_loader:
        inputs = inputs.detach().cpu().to(device)
        tic = time()
        pred = model(inputs)
        print('time2 {}'.format(time() - tic))
    print('------------------')

    #---------- this one is 0.12s --------------#
    for inputs, labels in test_loader:
        tic = time()
        inputs = inputs.detach().cpu().to(device)
        pred = model(inputs)
        print('time3 {}'.format(time() - tic))
    print('------------------')
Basically, it seems that if there is no explicit GPU-to-CPU transfer either before or after the forward pass, the forward pass costs more time, as if the forward pass were implicitly doing a GPU-to-CPU transfer.
I played around with the code a little bit, and I think the problem is that you are measuring the times for both cases in the same run. Here is my boiled-down version of your code, since your model blew up my GPU memory:
class DGCNN(nn.Module):
    def __init__(self, num_layers=1200):
        super(DGCNN, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(256, 256) for _ in range(num_layers)])

    def forward(self, x):
        x = x.view(-1, 256)
        for layer in self.layers:
            x = layer(x)
        return x

class my_loader(Dataset):
    def __init__(self, device):
        self.data = torch.rand(256, 2048, 3).to(device).float()
        self.labels = torch.rand(256).to(device).long()

    def __getitem__(self, ind):
        return self.data[ind], self.labels[ind]

    def __len__(self):
        return len(self.data)
Now, here I demonstrate different versions of test().
Version #1:
def test():
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.12s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs)
    tac = time()
    print(f'# First case -> Full forward pass: {tac - tic:.6f}')

    #---------- this one is 0.004s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs.detach().cpu().to(device))
    tac = time()
    print(f'# Second case -> Full forward pass: {tac - tic:.6f}')

>>> # First case -> Full forward pass: 3.105103, # Second case -> Full forward pass: 2.831652
Now I switched the order of timing calculations for the cases. Version #2:
def test():
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.004s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs.detach().cpu().to(device))
    tac = time()
    print(f'# Second case -> Full forward pass: {tac - tic:.6f}')

    #---------- this one is 0.12s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs)
    tac = time()
    print(f'# First case -> Full forward pass: {tac - tic:.6f}')

>>> # Second case -> Full forward pass: 3.288522, # First case -> Full forward pass: 2.583231
Apparently, the first timing you calculate seems to end up slower. So, I calculated these timings separately in different runs with fresh kernels. Version #3:
def test():
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.12s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs)
    tac = time()
    print(f'# First case -> Full forward pass: {tac - tic:.6f}')

>>> # First case -> Full forward pass: 3.091592
Version #4:
def test():
    device = torch.device('cuda:0')
    test_set = my_loader(device)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
    model = DGCNN().to(device)
    model.eval()

    #---------- this one is 0.004s --------------#
    tic = time()
    for inputs, labels in test_loader:
        pred = model(inputs.detach().cpu().to(device))
    tac = time()
    print(f'# Second case -> Full forward pass: {tac - tic:.6f}')

>>> # Second case -> Full forward pass: 3.190248
So, by testing one at a time, it seems like pred = model(inputs) runs slightly faster than pred = model(inputs.detach().cpu().to(device)), which is the obvious expected result.
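One caveat about these numbers (general CUDA behaviour, not something from the original post): GPU kernel launches are asynchronous, so wrapping time() around a forward pass can misattribute work that is still queued. Synchronizing before and after the timed region gives more trustworthy per-batch timings. A small sketch:

import torch
from time import time

def timed_forward(model, inputs):
    torch.cuda.synchronize()   # wait for previously queued GPU work before starting the clock
    tic = time()
    with torch.no_grad():
        pred = model(inputs)
    torch.cuda.synchronize()   # wait for this forward pass to actually finish
    return pred, time() - tic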
