pytorch runs slow when data are pre-transported to GPU - pytorch

I have a model written in pytorch. Since my dataset is small, I can directly load all of the data to GPU. However, I found the forward speed becomes slow if I do so. The following is a runnable example. Specifically, I have the model:
import numpy as np
from time import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
def knn(x, k):
inner = -2*torch.matmul(x.transpose(2, 1), x)
xx = torch.sum(x**2, dim=1, keepdim=True)
pairwise_distance = -xx - inner - xx.transpose(2, 1)
idx = pairwise_distance.topk(k=k, dim=-1)[1] # (batch_size, num_points, k)
return idx
def get_graph_feature(x, k=20, idx=None):
batch_size = x.size(0)
num_points = x.size(2)
x = x.view(batch_size, -1, num_points)
if idx is None:
idx = knn(x, k=k) # (batch_size, num_points, k)
idx_base = torch.arange(0, batch_size, device=x.device).view(-1, 1, 1)*num_points
idx = idx + idx_base
idx = idx.view(-1)
_, num_dims, _ = x.size()
x = x.transpose(2, 1).contiguous() # (batch_size, num_points, num_dims) -> (batch_size*num_points, num_dims) # batch_size * num_points * k + range(0, batch_size*num_points)
feature = x.view(batch_size*num_points, -1)[idx, :]
feature = feature.view(batch_size, num_points, k, num_dims)
x = x.view(batch_size, num_points, 1, num_dims).repeat(1, 1, k, 1)
feature = torch.cat((feature-x, x), dim=3).permute(0, 3, 1, 2).contiguous()
return feature
class DGCNN(nn.Module):
def __init__(self, k=25, output_channels=10):
super(DGCNN, self).__init__()
self.k = k
self.bn1 = nn.BatchNorm2d(64)
self.bn2 = nn.BatchNorm2d(64)
self.bn3 = nn.BatchNorm2d(128)
self.bn4 = nn.BatchNorm2d(256)
self.bn5 = nn.BatchNorm1d(1024)
self.conv1 = nn.Sequential(nn.Conv2d(6, 64, kernel_size=1, bias=False),
self.bn1,
nn.LeakyReLU(negative_slope=0.2))
self.conv2 = nn.Sequential(nn.Conv2d(64*2, 64, kernel_size=1, bias=False),
self.bn2,
nn.LeakyReLU(negative_slope=0.2))
self.conv3 = nn.Sequential(nn.Conv2d(64*2, 128, kernel_size=1, bias=False),
self.bn3,
nn.LeakyReLU(negative_slope=0.2))
self.conv4 = nn.Sequential(nn.Conv2d(128*2, 256, kernel_size=1, bias=False),
self.bn4,
nn.LeakyReLU(negative_slope=0.2))
self.conv5 = nn.Sequential(nn.Conv1d(512, 1024, kernel_size=1, bias=False),
self.bn5,
nn.LeakyReLU(negative_slope=0.2))
self.linear1 = nn.Linear(1024*2, 512, bias=False)
self.bn6 = nn.BatchNorm1d(512)
self.dp1 = nn.Dropout()
self.linear2 = nn.Linear(512, 256)
self.bn7 = nn.BatchNorm1d(256)
self.dp2 = nn.Dropout()
self.linear3 = nn.Linear(256, output_channels)
def forward(self, x):
x = x.transpose(2, 1)
batch_size = x.size(0)
x = get_graph_feature(x, k=self.k)
x = self.conv1(x)
x1 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x1, k=self.k)
x = self.conv2(x)
x2 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x2, k=self.k)
x = self.conv3(x)
x3 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x3, k=self.k)
x = self.conv4(x)
x4 = x.max(dim=-1, keepdim=False)[0]
x = torch.cat((x1, x2, x3, x4), dim=1)
x = self.conv5(x)
x1 = F.adaptive_max_pool1d(x, 1).view(batch_size, -1)
x2 = F.adaptive_avg_pool1d(x, 1).view(batch_size, -1)
x = torch.cat((x1, x2), 1)
x = F.leaky_relu(self.bn6(self.linear1(x)), negative_slope=0.2)
x = self.dp1(x)
x = F.leaky_relu(self.bn7(self.linear2(x)), negative_slope=0.2)
x = self.dp2(x)
x = self.linear3(x)
return x
Here is what the dataloader and test function looks like:
class my_loader(Dataset):
def __init__(self, device):
self.data = torch.rand(256, 2048, 3).to(device).float()
self.labels = torch.rand(256).to(device).long()
def __getitem__(self, ind):
return self.data[ind], self.labels[ind]
def __len__(self):
return len(self.data)
def test():
device = torch.device('cuda:2')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
for inputs, labels in test_loader:
tic = time()
pred = model(inputs)
print('time1 {}'.format(time() - tic))
print('------------------')
#---------- this one is 0.004s --------------#
for inputs, labels in test_loader:
inputs = inputs.detach().cpu().to(device)
tic = time()
pred = model(inputs)
print('time2 {}'.format(time() - tic))
print('------------------')
#---------- this one is 0.12s --------------#
for inputs, labels in test_loader:
tic = time()
inputs = inputs.detach().cpu().to(device)
pred = model(inputs)
print('time3 {}'.format(time() - tic))
print('------------------')
Basically, it seems that if there is no explicit call of gpu to cpu transportation either before or after the forward propagation, the forward propagation would cost more time. It just seems like that the forward propagation is implicitly doing gpu->cpu transportation.

I played around with the code a little bit, and I think the problem is that you are measuring times for both cases in the same run. Here is my boiled down version of your code since your model crushed my GPU memory:
class DGCNN(nn.Module):
def __init__(self, num_layers):
super(DGCNN, self).__init__()
self.layers = nn.ModuleList([nn.Linear(256, 256) for _ in range(1200)])
def forward(self, x):
x = x.view(-1, 256)
for layer in self.layers:
x = layer(x)
return x
class my_loader(Dataset):
def __init__(self, device):
self.data = torch.rand(256, 2048, 3).to(device).float()
self.labels = torch.rand(256).to(device).long()
def __getitem__(self, ind):
return self.data[ind], self.labels[ind]
def __len__(self):
return len(self.data)
Now, here I demonstrate different versions of test().
Version #1:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
>>> # First case -> Full forward pass: 3.105103, # Second case -> Full forward pass: 2.831652
Now I switched the order of timing calculations for the cases. Version #2:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
>>> # Second case -> Full forward pass: 3.288522, # First case -> Full forward pass: 2.583231
Apparently, the first timing you calculate seems to end up slower. So, I calculated these timings separately in different runs with fresh kernels. Version #3:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
>>> # First case -> Full forward pass: 3.091592
Version #4:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
>>> # Second case -> Full forward pass: 3.190248
So, by testing one at a time, it seems like pred = model(inputs) runs slightly faster than pred = model(inputs.detach().cpu().to(device)), which is the obvious expected result.

Related

Inference on Multiple Label Sentiment analysis using pytorch_lightning

I have trained an LSTM model for sentiment analysis using pytorch_lightning but I've been having difficulties incorporating the inference.
This is my model:
class LSTM(pl.LightningModule):
def __init__(self,n_vocab,n_embed,
n_hidden,n_output,n_layers,learning_rate,embedding_matrix=None):
super().__init__()
self.n_vocab = n_vocab
self.n_layer = n_layers
self.n_hidden = n_hidden
self.embedding = nn.Embedding(n_vocab, n_embed, padding_idx = 0)
if embedding_matrix is not None:
self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
self.embedding.weight.requires_grad = False
self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, bidirectional = True)
self.fc = nn.Linear(2 * n_hidden, n_output)
self.dropout = nn.Dropout(0.2)
self.sigmoid = nn.Sigmoid()
self.batch_size = batch_size
self.learning_rate = learning_rate
def forward(self,input_words):
embedded_words = self.embedding(input_words)
lstm_out, _ = self.lstm(embedded_words)
lstm_out_f=lstm_out[:,-1 , :300 ]
lstm_out_b=lstm_out[:, 0 , 300: ]
lstm_out_final = torch.cat([lstm_out_f,lstm_out_b], dim=-1)
lstm_out_final = self.dropout(lstm_out_final)
fc_out = self.fc(lstm_out_final)
return fc_out
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
return optimizer
def training_step(self, batch, batch_nb):
x , y= batch
y_hat = self(x)
loss = F.cross_entropy(y_hat, y)
self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
return loss
def validation_step(self, batch, batch_nb):
x, y = batch
y_hat = self(x)
loss = F.cross_entropy(y_hat, y).to(device = 'cuda')
f1 = torchmetrics.F1(num_classes=5).to(device = 'cuda')
f1_score = f1(y_hat, y)
accuracy = Accuracy().to(device = 'cuda')
accur = accuracy(y_hat, y)
self.log("val_loss", loss)
self.log("f1_score", f1_score)
self.log("accuracy", accur)
def test_step(self, batch, batch_idx):
x, y = batch
logits = self(x)
print(logits)
logitsies = softmax(logits)
choice = argmax(logitsies)
loss = F.nll_loss(logits, y)
self.log("test_loss", loss)
return choice
def predict_step(self, batch, batch_idx, dataloader_idx):
x, y = batch
x = x.view(x.size(0), -1)
y_hat = self(x)
logitsies = softmax(logits)
choice = argmax(logitsies)
loss = F.nll_loss(logits, y)
self.log("predict_loss", loss)
return choice
I called the model as such:
model = LSTM(
n_vocab=size_of_emb_matrix,
n_embed=embed_vector_len,
n_hidden=150,
n_output=5,
n_layers=1,
learning_rate=1e-4,
embedding_matrix=embedding_matrix
)
Now I am trying to write a function that would allow me to do inference. I have managed to succesfuly tokenize the input sentence and encode it through an already pre-defined functions, yet I have been getting several errors no matter what I try. I have found myself stuck and don't know how to continue. This is my function so far.
def get_sentiment(text):
x = encode_sentence(text,vocab2index)
x_bar = x[0]
y_hat = torch.tensor(x_bar)
trainer.predict(model,y_hat)
The encode_sentence function is as follows:
def encode_sentence(text, vocab2index, N=70):
tokenized = tokenize(text)
encoded = np.zeros(N, dtype=int)
enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
length = min(N, len(enc1))
encoded[:length] = enc1[:length]
return encoded, length
As I call the get_sentiment function, I am using the trainer.predict function which allows me to predict, hence doing inference.
But I have been getting the following Issue:
AttributeError Traceback (most recent call
last)
<ipython-input-195-825c536cbbb2> in <module>()
----> 1 get_sentiment("love that dress")
13 frames
/usr/local/lib/python3.7/dist-
packages/pytorch_lightning/loops/epoch/prediction_epoch_loop.py in
_store_batch_indices(self, dataloader_idx)
162 def _store_batch_indices(self, dataloader_idx: int) -> None:
163 """Stores the batch indices if the predictions should be
stored"""
--> 164 batch_sampler =
self.trainer.predict_dataloaders[dataloader_idx].batch_sampler
165 if isinstance(batch_sampler, IndexBatchSamplerWrapper):
166 self.current_batch_indices = batch_sampler.batch_indices
AttributeError: 'Tensor' object has no attribute 'batch_sampler'

Why do I always get the same value as the result in a CNN in pytorch?

Here is my code
dataset = pd.read_csv('augmented_data.csv')
dataset = dataset.sample(frac=1)
class ConvNet(nn.Module):
def __init__(self):
super(ConvNet, self).__init__()
self.conv1 = nn.Conv2d(3,6,5)
self.pool = nn.MaxPool2d(2,2)
self.conv2 = nn.Conv2d(6,16,5)
self.fc1 = nn.Linear(1024144, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84,1)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 1024144)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
print(x)
x = self.fc3(x)
return x
files_read = 0
preprocess = transforms.Compose([
transforms.Resize(1024),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5),(0.5, 0.5, 0.5))])
# device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device = torch.device('cpu')
# model = ConvNet().to(device)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=0.001)
results = []
for index, row in dataset.iterrows():
try:
image = load_img('padded_images/' + row['image_name'] +'.jpg')
except:
image = load_img('augmented_images/' + row['image_name'] +'.jpeg')
files_read += 1
input_tensor = preprocess(image)
input_batch = input_tensor.unsqueeze(0).to(device)
if files_read <= 80 * len(dataset) // 100:
output = model(input_batch)
optimizer.zero_grad()
y = torch.tensor([[float(row['target'])]]).to(device)
loss = criterion(output, y)
loss.backward()
optimizer.step()
else:
model.eval()
output = model(input_batch)
results.append([1.0 if output[0][0].double() > 0.5 else 0, float(row['target'])])
So i am using pytorch CNN to classify 60k images in 2 classes. When i print the output after the model has trained, whatever the image as input, the ouput is always "tensor([[0.6384]], grad_fn=)". Always the same value. So it predicts only 1 (because it's greater than 0.5). The thing is, when i print the ouput while training, the results vary (16, 1 , 0, 4 ,0.6 etc) but when i print the output (with the same model but not trained) the results don't vary that much (0.5, 0.51, 0.49 0.52, 0.55). So I think it's safe to say that it is converging to a single value. I just don't know why. what could i do differently?

How can I use binary_cross_entropy in binary classification in Chainer

I have train dataset of 8000 images and labels. Validation set consists of 1957 images and labels. The test set contains 2487 images. Each image contains White Blood Cell images. WBC is divided innto 4 categories: Eosinophil, Neutrophil, Monocyte and Lymphocyte. Eosinophil and Neutrophil are Polynuclear while the remaining two are Mononuclear. The cells need to be classified between the two classes : Polynuclear and Mononuclear.
# import libraries
def get_data(folder):
X = []
y = []
for wbc_type in os.listdir(folder):
if not wbc_type.startswith('.'):
if wbc_type in ['NEUTROPHIL', 'EOSINOPHIL']:
label = 'POLYNUCLEAR'
else:
label = 'MONONUCLEAR'
for image_filename in tqdm(os.listdir(folder + wbc_type)):
img_file = cv2.imread(folder + wbc_type + '/' + image_filename)
if img_file is not None:
# Downsample the image to 120, 160, 3
img_file = scipy.misc.imresize(arr=img_file, size=(120, 160, 3))
img_arr = np.asarray(img_file)
X.append(img_arr)
y.append(label)
X = np.asarray(X)
y = np.asarray(y)
return X,y
X_train, y_train = get_data('C:/Users/Neerajan/Desktop/blood-cells/dataset2-master/dataset2-master/images/TRAIN/')
X_test, y_test = get_data('C:/Users/Neerajan/Desktop/blood-cells/dataset2-master/dataset2-master/images/TEST/')
encoder = LabelEncoder()
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)
X_train=np.array((X_train), dtype = np.float32)
X_train=X_train/255.0
X_test=np.array((X_test), dtype = np.float32)
X_test=X_test/255.0
y_train = y_train.astype(int)
y_train = y_train.flatten()
from chainer.datasets import split_dataset_random
from chainer.dataset import DatasetMixin
class MyDataset(DatasetMixin):
def __init__(self, X, labels):
super(MyDataset, self).__init__()
self.X_ = X
self.labels_ = labels
self.size_ = X.shape[0]
def __len__(self):
return self.size_
def get_example(self, i):
return np.transpose(self.X_[i, ...], (2, 0, 1)), self.labels_[i]
batch_size = 32
dataset = MyDataset(X_train, y_train)
dataset_train, valid = split_dataset_random(dataset, 8000, seed=0)
train_iter = iterators.SerialIterator(dataset_train, batch_size)
valid_iter = iterators.SerialIterator(valid, batch_size, repeat=False, shuffle=False)
from chainer.dataset import concat_examples
batch_image, batch_label = concat_examples(next(train_iter))
print("batch_image.shape\n{}".format(batch_image.shape))
print("batch_label.shape\n{}".format(batch_label.shape))
batch_image.shape : (32,3,120,160) batch_label.shape : (32,)
class MyModel(chainer.Chain):
def __init__(self, n_out):
super(MyModel, self).__init__()
with self.init_scope():
self.conv1=L.Convolution2D(None, 32, 3, 3, 1)
self.conv2=L.Convolution2D(32, 64, 3, 3, 1)
self.conv3=L.Convolution2D(64, 128, 3, 3, 1)
self.fc4=L.Linear(None, 32)
self.fc5=L.Linear(32, n_out)
def __call__(self, x):
h = F.relu(self.conv1(x))
h = F.relu(self.conv2(h))
h = F.relu(self.conv3(h))
h = F.leaky_relu(self.fc4(h))
h = F.sigmoid(self.fc5(h))
return h
from chainer import training
def train(model_object, batchsize=32, gpu_id=-1, max_epoch=14):
model = L.Classifier(model_object)
if gpu_id >=0:
model.to_gpu(gpu_id)
#serializers.save_npz('kankata',model)
# 4. Optimizer
optimizer = optimizers.Adam()
optimizer.setup(model)
serializers.save_npz('my.state',optimizer)
# 5. Updater
updater = training.StandardUpdater(train_iter, optimizer, device=gpu_id)
# 6. Trainer
trainer = training.Trainer(updater, (max_epoch, 'epoch'), out='C:/Users/Neerajan/Desktop/ReportDump'.format(model_object.__class__.__name__))
# 7. Evaluator
class TestModeEvaluator(extensions.Evaluator):
def evaluate(self):
model = self.get_target('main')
ret = super(TestModeEvaluator, self).evaluate()
return ret
trainer.extend(extensions.LogReport())
trainer.extend(TestModeEvaluator(valid_iter, model, device=gpu_id))
trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 'main/accuracy', 'validation/main/loss', 'validation/main/accuracy', 'elapsed_time']))
trainer.extend(extensions.PlotReport(['main/loss', 'validation/main/loss'], x_key='epoch', file_name='loss.png'))
trainer.extend(extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'], x_key='epoch', file_name='accuracy.png'))
trainer.run()
del trainer
return model
gpu_id = -1 # Set to -1 if you don't have a GPU
model = train(MyModel(2), gpu_id=gpu_id)
It is recommended that for binary classification we use sigmoid activation function in the last layer of model and binary_cross_entropy in classifier.
How do I implement binary_cross_entropy as the loss_function in the classifier?
see this example for binary classification.
43 model = L.Classifier(
44 MLP(44, 1), lossfun=F.sigmoid_cross_entropy, accfun=F.binary_accuracy)
feeding lossfun=F.sigmoid_cross_entropy to L.Classifier is a good solution.

how to fix capsule training problem for a single class of MNIST dataset?

I am training a Capsule Network with both encoder and decoder part. It works perfectly fine with all the classes (10 classes) of the MNIST data set. But when I am extracting a single class say (class 0 or class 5) and then training the capsule network, the reconstruction of the image is very poor.
Where do I need to change the network setting, or do I have an error in my data preparation?
I tried:
I changed the total class from 10 (for ten digits to 1 for 1 digit and even for 2 for 2 digits).
When I am using the default MNIST dataset, I am getting no error or tensor size, but when I am extracting a particular class and then passing it into the network, I am facing issues like a) Dimensional Issues b) Float tensor warning.
I fixed these things but manually adding a dimension and converting the data to data.float().cuda() tensor. I did this for both the case i.e when I am using the 10 Digit Capsules and when I am using the 1 Digit Capsules for training a single class digit.
But after this, the network is running fine, but I am getting really blurred and poor reconstructions. While when I am training the whole MNIST dataset without extracting any class and passing it to the network, it doesn't throw any error and the reconstruction works really fine.
I would love to share the more detail and other parts of the code -
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam
from torchvision import datasets, transforms
USE_CUDA = True
### **Here we prepare the data for the complete 10 class digit training**###
class Mnist:
def __init__(self, batch_size):
dataset_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
train_dataset = datasets.MNIST('../data', train=True, download=True, transform=dataset_transform)
test_dataset = datasets.MNIST('../data', train=False, download=True, transform=dataset_transform)
self.train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
self.test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
## **Here is my code for extracting a single class digit extraction**##
class Mnist:
def __init__(self,batch_size):
dataset_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
train_mnist = datasets.MNIST("../data", train=True)
test_mnist = datasets.MNIST("../data", train= False)
train_image, train_label = train_mnist.train_data, train_mnist.train_labels
test_image, test_label = test_mnist.test_data, test_mnist.test_labels
train_0, test_0 = [train_image[key] for (key, label) in enumerate(train_label) if int(label) == 5],[test_image[key] for (key, label) in enumerate(test_label) if int(label) == 5]
train_label_0, test_label_0 = zero__train = [train_label[key] for (key, label) in enumerate(train_label) if int(label) == 5],[test_label[key] for (key, label) in enumerate(test_label) if int(label) == 5]
train_dataset = tuple(zip(train_0, train_label_0))
test_dataset = tuple(zip(test_0, test_label_0))
self.train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
self.test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
# Here is the main code for the capsule training.
''' The below code is used for training the 1 class but using the 10 Digit capsules
'''
class ConvLayer(nn.Module):
def __init__(self, in_channels=1, out_channels=256, kernel_size=9):
super(ConvLayer, self).__init__()
self.conv = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1
)
def forward(self, x):
return F.relu(self.conv(x))
class PrimaryCaps(nn.Module):
def __init__(self, num_capsules=8, in_channels=256, out_channels=32, kernel_size=9):
super(PrimaryCaps, self).__init__()
self.capsules = nn.ModuleList([
nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=2, padding=0)
for _ in range(num_capsules)])
def forward(self, x):
u = [capsule(x) for capsule in self.capsules]
u = torch.stack(u, dim=1)
u = u.view(x.size(0), 32 * 6 * 6, -1)
return self.squash(u)
def squash(self, input_tensor):
squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
output_tensor = squared_norm * input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
return output_tensor
class DigitCaps(nn.Module):
def __init__(self, num_capsules=10, num_routes=32 * 6 * 6, in_channels=8, out_channels=16):
super(DigitCaps, self).__init__()
self.in_channels = in_channels
self.num_routes = num_routes
self.num_capsules = num_capsules
self.W = nn.Parameter(torch.randn(1, num_routes, num_capsules, out_channels, in_channels))
def forward(self, x):
batch_size = x.size(0)
x = torch.stack([x] * self.num_capsules, dim=2).unsqueeze(4)
# print(f"x at epoch {epoch} is equal to : {x}")
W = torch.cat([self.W] * batch_size, dim=0)
# print(f"W at epoch {epoch} is equal to : {W}")
u_hat = torch.matmul(W, x)
# print(f"u_hatat epoch {epoch} is equal to : {u_hat}")
b_ij = Variable(torch.zeros(1, self.num_routes, self.num_capsules, 1))
if USE_CUDA:
b_ij = b_ij.cuda()
# print(f"b_ij at epoch {epoch} is equal to : {b_ij}")
num_iterations = 3
for iteration in range(num_iterations):
c_ij = F.softmax(b_ij, dim =1)
c_ij = torch.cat([c_ij] * batch_size, dim=0).unsqueeze(4)
s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
v_j = self.squash(s_j)
# print(f"b_ij at iteration {iteration} is equal to : {b_ij}")
if iteration < num_iterations - 1:
a_ij = torch.matmul(u_hat.transpose(3, 4), torch.cat([v_j] * self.num_routes, dim=1))
b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)
return v_j.squeeze(1)
def squash(self, input_tensor):
squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
output_tensor = squared_norm * input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
return output_tensor
class Decoder(nn.Module):
def __init__(self):
super(Decoder, self).__init__()
self.reconstraction_layers = nn.Sequential(
nn.Linear(16 * 10, 512),
nn.ReLU(inplace=True),
nn.Linear(512, 1024),
nn.ReLU(inplace=True),
nn.Linear(1024, 784),
nn.Sigmoid()
)
def forward(self, x, data):
classes = torch.sqrt((x ** 2).sum(2))
classes = F.softmax(classes, dim =1)
_, max_length_indices = classes.max(dim=1)
masked = Variable(torch.sparse.torch.eye(10))
if USE_CUDA:
masked = masked.cuda()
masked = masked.index_select(dim=0, index=max_length_indices.squeeze(1).data)
reconstructions = self.reconstraction_layers((x * masked[:, :, None, None]).view(x.size(0), -1))
reconstructions = reconstructions.view(-1, 1, 28, 28)
return reconstructions, masked
class CapsNet(nn.Module):
def __init__(self):
super(CapsNet, self).__init__()
self.conv_layer = ConvLayer()
self.primary_capsules = PrimaryCaps()
self.digit_capsules = DigitCaps()
self.decoder = Decoder()
self.mse_loss = nn.MSELoss()
def forward(self, data):
output = self.digit_capsules(self.primary_capsules(self.conv_layer(data)))
reconstructions, masked = self.decoder(output, data)
return output, reconstructions, masked
def loss(self, data, x, target, reconstructions):
return self.margin_loss(x, target) + self.reconstruction_loss(data, reconstructions)
# return self.reconstruction_loss(data, reconstructions)
def margin_loss(self, x, labels, size_average=True):
batch_size = x.size(0)
v_c = torch.sqrt((x**2).sum(dim=2, keepdim=True))
left = F.relu(0.9 - v_c).view(batch_size, -1)
right = F.relu(v_c - 0.1).view(batch_size, -1)
# print(f"shape of labels, left and right respectively - {labels.size(), left.size(), right.size()}")
loss = labels * left + 0.5 * (1.0 - labels) * right
loss = loss.sum(dim=1).mean()
return loss
def reconstruction_loss(self, data, reconstructions):
loss = self.mse_loss(reconstructions.view(reconstructions.size(0), -1), data.view(reconstructions.size(0), -1))
return loss*0.0005
capsule_net = CapsNet()
if USE_CUDA:
capsule_net = capsule_net.cuda()
optimizer = Adam(capsule_net.parameters())
capsule_net
##### Here is the problem while training####
batch_size = 100
mnist = Mnist(batch_size)
n_epochs = 5
for epoch in range(n_epochs):
capsule_net.train()
train_loss = 0
for batch_id, (data, target) in enumerate(mnist.train_loader):
target = torch.eye(10).index_select(dim=0, index=target)
data, target = Variable(data), Variable(target)
if USE_CUDA:
data, target = data.cuda(), target.cuda()
data, target = data.float().cuda(), target.float().cuda() # Here I changed the data to float and it's required only when I am using my extracted dataset for a single class
data = data[:,:,:] # Use this when 1st MNist data is used
# data = data[:,None,:,:] # Use this when I am using my extracted single class digits
optimizer.zero_grad()
output, reconstructions, masked = capsule_net(data)
loss = capsule_net.loss(data, output, target, reconstructions)
loss.backward()
optimizer.step()
train_loss += loss.item()
# if batch_id % 100 == 0:
# print ("train accuracy:", sum(np.argmax(masked.data.cpu().numpy(), 1) ==
# np.argmax(target.data.cpu().numpy(), 1)) / float(batch_size))
print (train_loss / len(mnist.train_loader))
I used this to see the main data as image and the reconstructed image
import matplotlib
import matplotlib.pyplot as plt
def plot_images_separately(images):
"Plot the six MNIST images separately."
fig = plt.figure()
for j in range(1, 10):
ax = fig.add_subplot(1, 10, j)
ax.matshow(images[j-1], cmap = matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
plt.show()
plot_images_separately(data[:10,0].data.cpu().numpy())
plot_images_separately(reconstructions[:10,0].data.cpu().numpy())
I checked the normal performing code and then the problematic one, I found that the dataset passed into the network was of not same nature. The problems were -
The MNIST data extracted for a single class was not transformed into tensor and no normalization was applied, although I tried passing it through the transformation.
This is what I did to fix it -
I created transformation objections and tensor objection and then passed by list comprehension elements to it. Below are the codes and the final output of my network -
Preparing class 0 dataset (dataset for the digit 5)
class Mnist:
trans = transforms.ToTensor()
normalize = transforms.Normalize((0.1307,), (0.3081,))
def init(self,batch_size):
dataset_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
trans = transforms.ToTensor()
normalize = transforms.Normalize((0.1307,), (0.3081,))
train_mnist = datasets.MNIST("../data", train=True, transform=dataset_transform)
test_mnist = datasets.MNIST("../data", train= False, transform=dataset_transform)
train_image, train_label = train_mnist.train_data, train_mnist.train_labels
test_image, test_label = test_mnist.test_data, test_mnist.test_labels
train_0, test_0 = [normalize(trans(train_image[key].unsqueeze(2).numpy())) for (key, label) in enumerate(train_label) if int(label) == 5],[test_image[key] for (key, label) in enumerate(test_label) if int(label) == 5]
train_label_0, test_label_0 = zero__train = [train_label[key] for (key, label) in enumerate(train_label) if int(label) == 5],[test_label[key] for (key, label) in enumerate(test_label) if int(label) == 5]
train_dataset = tuple(zip(train_0, train_label_0))
test_dataset = tuple(zip(test_0, test_label_0))
self.train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
self.test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
enter image description here

PyTorch network produces constant output

I am trying to train a simple MLP to approximate y=f(a,b,c).
My code is as below.
import torch
import torch.nn as nn
from torch.autograd import Variable
# hyper parameters
input_size = 3
output_size = 1
num_epochs = 50
learning_rate = 0.001
# Network definition
class FeedForwardNet(nn.Module):
def __init__(self, l1_size, l2_size):
super(FeedForwardNet, self).__init__()
self.fc1 = nn.Linear(input_size, l1_size)
self.relu1 = nn.ReLU()
self.fc2 = nn.Linear(l1_size, l2_size)
self.relu2 = nn.ReLU()
self.fc3 = nn.Linear(l2_size, output_size)
def forward(self, x):
out = self.fc1(x)
out = self.relu1(out)
out = self.fc2(out)
out = self.relu2(out)
out = self.fc3(out)
return out
model = FeedForwardNet(5 , 3)
# sgd optimizer
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)
for epoch in range(11):
print ('Epoch ', epoch)
for i in range(trainX_light.shape[0]):
X = Variable( torch.from_numpy(trainX_light[i]).view(-1, 3) )
Y = Variable( torch.from_numpy(trainY_light[i]).view(-1, 1) )
# forward
optimizer.zero_grad()
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
loss.backward()
optimizer.step()
totalnorm = 0
for p in model.parameters():
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = math.sqrt(totalnorm)
print (totalnorm)
# validation code
if (epoch + 1) % 5 == 0:
print (' test points',testX_light.shape[0])
total_loss = 0
for t in range(testX_light.shape[0]):
X = Variable( torch.from_numpy(testX_light[t]).view(-1, 3) )
Y = Variable( torch.from_numpy(testY_light[t]).view(-1, 1) )
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
total_loss += loss
print ('epoch ', epoch, 'avg_loss ', total_loss.data[0] / testX_light.shape[0])
print ('Done')
The problem that I have now is, the validation code
output = model(X)
is always producing an exact same output value (I guess this value is some sort of garbage). I am not sure what mistake I am doing in this part. Could some help me figure out the mistake in my code?
The reason that network produced random values (and inf later) was the exploding gradient problem. Clipping the gradient (torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)) helped.

Resources