I am trying to build a Graph Convolutional Network. I converted my dataframe to the
format PyTorch requires using the code below.
class S_Dataset(Dataset):
    """Map-style dataset that turns each dataframe row into one graph sample.

    Every item is a tuple ``(x, y, edge_index, weight)`` built from a single
    row: the feature vector, the ``Location`` label, the three edges from
    ``Location`` to its listed neighbours (shape ``(2, 3)``) and the three
    matching neighbour distances (shape ``(3,)``).

    NOTE(review): a stock ``torch.utils.data.DataLoader`` stacks the per-row
    ``edge_index`` tensors into ``(batch, 2, 3)``. That is presumably not the
    ``(2, num_edges)`` layout a graph convolution expects -- confirm how the
    samples are batched before feeding them to the model.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # NOTE(review): the original listing had one more leading feature whose
        # name was lost; add it back here so the feature width matches the
        # model's input size.
        x = torch.tensor(
            [row.s1, row.s2, row.s3, row.s4, row.temp, row.rh, row.Location, row.Node],
            dtype=torch.float,
        )
        y = torch.tensor([row.Location], dtype=torch.long)

        edges = []
        weights = []
        for k in (1, 2, 3):
            # Each edge is a (2, 1) column: source = Location, target = neighbour k.
            edge = torch.tensor([[row.Location, row[f"neighbor{k}_name"]]], dtype=torch.long).t()
            edges.append(edge)
            weights.append(torch.tensor([row[f"neighbor{k}_distance"]], dtype=torch.float))
        edge_index = torch.cat(edges, dim=1)
        weight = torch.cat(weights, dim=0)

        if self.transform:
            x, y, edge_index, weight = self.transform(x, y, edge_index, weight)
        return x, y, edge_index, weight
Process_Data = S_Dataset(df)
Next I divided data into train and test set:
# Hold out 20% of the samples for testing.
train_size = int(len(Process_Data) * 0.8)
test_size = len(Process_Data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(Process_Data, [train_size, test_size])
# Create dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)
I designed a simple model:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
# Create the model
class Net(nn.Module):
    """Two graph-convolution layers followed by a two-layer classifier head.

    The head emits one score per entry of ``location_to_id`` (defined
    elsewhere in the file).
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(9, 128)
        self.conv2 = GCNConv(128, 64)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, len(location_to_id))

    def forward(self, x, edge_index, weight):
        """Return per-node class scores; ``weight`` is passed as the edge weight."""
        x = self.conv1(x, edge_index, weight)
        x = torch.relu(x)
        x = self.conv2(x, edge_index, weight)
        x = torch.relu(x)
        # Collapse any leading dimensions so the linear head sees (N, 64).
        x = x.view(-1, 64)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x
Finally to train the model:
model = Net()
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
for epoch in range(100):
    total_loss = 0
    for batch in train_loader:
        x, y, edge_index, weight = batch
        # Gradients accumulate across backward() calls, so reset them per batch.
        optimizer.zero_grad()
        y_pred = model(x, edge_index, weight)
        loss = criterion(y_pred, y)
        # Without these two calls the weights never change and the loss stays flat.
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('Epoch: {} Loss: {:.4f}'.format(epoch, total_loss / len(train_loader)))
I am facing following error:
IndexError: The shape of the mask [2, 3] at index 0 does not match the shape of the indexed tensor [32, 3] at index 0
x, y, edge_index, weight = batch
This line is causing error.
The batch size is set to 32, but the final batch may contain fewer than 32 samples.
I assume this error occurs after the code has run for some time; I would appreciate more context on the problem.
PyTorch simple ConvNet diverge so easly

I'm studying PyTorch, coming from a TensorFlow background.
I'm trying to replicate a simple convnet, which I developed successfully in TensorFlow, to classify cat vs. dog images.
In PyTorch I see some strange behaviors:
Using a learning rate of 0.001 makes the CNet predict only 0 after the first batch (might be exploding gradients?)
Using a learning rate of 0.0005 gives a smooth learning curve and the CNet converges
Can anyone help me understand what I'm doing wrong? Here is the code:
import pathlib

import numpy as np
import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CNet(torch.nn.Module):
    """Five-convolution binary classifier for 3-channel 180x180 images.

    Each 3x3 convolution (no padding) shrinks the map by 2 and each of the
    four 2x2 poolings halves it: 180 -> 178 -> 89 -> 87 -> 43 -> 41 -> 20 ->
    18 -> 9 -> 7, so the flattened feature vector has 7*7*256 entries.
    ``forward`` returns a sigmoid probability of shape ``(batch, 1)``.
    """

    def __init__(self):
        super(CNet, self).__init__()  # input is 180x180 image
        self.conv1 = torch.nn.Conv2d(3, 32, 3)  # out -> 178x178x32
        self.conv2 = torch.nn.Conv2d(32, 64, 3)
        self.conv3 = torch.nn.Conv2d(64, 128, 3)
        self.conv4 = torch.nn.Conv2d(128, 256, 3)
        self.conv5 = torch.nn.Conv2d(256, 256, 3)
        self.flatten = torch.nn.Flatten()
        # torch.nn.LazyLinear(1) would infer the input width instead of
        # hard-coding it for 180x180 inputs.
        self.fc = torch.nn.Linear(7 * 7 * 256, 1)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv4(x)), (2, 2))
        x = F.relu(self.conv5(x))
        x = self.flatten(x)
        o = torch.sigmoid(self.fc(x))
        return o
def train(model: torch.nn.Module, train_data: DataLoader, criterion,
          optimizer: torch.optim.Optimizer, epochs=10,
          validation_data: DataLoader = None):
    """Train ``model`` for ``epochs`` passes and return the mean loss per epoch.

    Args:
        model: network to optimise; batches are moved to the device its
            parameters live on.
        train_data: loader yielding ``(images, labels)`` batches.
        criterion: loss taking ``(output, labels)`` where labels are float
            tensors of shape ``(batch, 1)``.
        optimizer: optimizer bound to ``model``'s parameters.
        epochs: number of passes over ``train_data``.
        validation_data: accepted for interface compatibility; not used.

    Returns:
        A list with one entry per epoch: the average of the per-batch losses.
    """
    # Use the model that was passed in (not a module-level global) and follow
    # its device so the function works for both CPU and GPU models.
    device = next(model.parameters()).device
    model.train()
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0.0
        for imgs, labels in train_data:
            imgs = imgs.to(device)
            labels = labels.to(device, dtype=torch.float).unsqueeze(-1)
            # zero out accumulated grads
            optimizer.zero_grad()
            output = model(imgs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        # The summed values are per-batch means, so average over the number of
        # batches; dividing by the sample count shrinks the value by roughly
        # the batch size.
        losses.append(epoch_loss / len(train_data))
        print(f'[{epoch+1}, {epochs:5d}] loss: {losses[-1]:.3f}')
    return losses
if __name__ == "__main__":
    # NOTE(review): the listing lost the end of this Compose; ToTensor is
    # assumed because the model consumes tensors -- confirm.
    transforms = torchvision.transforms.Compose([
        torchvision.transforms.Resize((180, 180)),
        torchvision.transforms.ToTensor(),
    ])
    # Raw string: same path as before, without relying on invalid escapes
    # such as "\D" or "\C" being passed through.
    dataset_dir = pathlib.Path(r"E:\Datasets\torch\Cat_Dog\cats_vs_dogs_small")
    train_data = torchvision.datasets.ImageFolder(dataset_dir / "train", transform=transforms)
    validation_data = torchvision.datasets.ImageFolder(dataset_dir / "validation", transform=transforms)
    test_data = torchvision.datasets.ImageFolder(dataset_dir / "test", transform=transforms)
    train_data_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2, persistent_workers=True, pin_memory=True)
    validation_data_loader = DataLoader(validation_data, batch_size=32, num_workers=2, shuffle=True, pin_memory=True)
    test_data_loader = DataLoader(test_data, batch_size=32, shuffle=True, pin_memory=True, num_workers=2)
    import matplotlib.pyplot as plt

    net = CNet()
    net = net.to(device)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.RMSprop(net.parameters(), 0.001)
    # TODO save best model
    losses = train(net, train_data_loader, criterion, optimizer, epochs=30)
    epochs = range(1, len(losses) + 1)
    plt.plot(epochs, losses, 'bo', label='Training Loss')
    print('Training Finished')

    # Accuracy on the held-out test split, thresholding the sigmoid at 0.5.
    correct_count, all_count = 0, 0
    net.eval()
    for images, labels in test_data_loader:
        images, labels = images.to(device), labels.to(device, dtype=torch.float)
        with torch.no_grad():
            ps = net(images)
        pred_label = (ps > 0.5).to(torch.float)
        true_label = labels.unsqueeze(1)
        correct_count += (pred_label == true_label).sum().item()
        all_count += len(labels)
    print("Number Of Images Tested =", all_count)
    print("\nModel Accuracy =", (correct_count/all_count))
and here some screenshot of the loss for each point:
LR=0.001 (not converging in PyTorch, converging in TensorFlow)
LR=0.0005 (converging in 30 epochs) [I know that the validation loss is not 0; accuracy is ~70%, but that is expected]
As you can see, the losses in the two experiments are very different in scale. What might cause such weird behavior? I call it 'weird' because I have never seen that happen in TensorFlow.
Function AddmmBackward returned an invalid gradient

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
class NeuralNetwork(nn.Module):
    """Small LeNet-style CNN mapping 1-channel images to 3 class scores.

    ``fc1`` is sized for a 16x5x5 feature map, which is what two
    (5x5 conv, 2x2 pool) stages produce from a 32x32 input
    (32 -> 28 -> 14 -> 10 -> 5). Other input sizes yield a different
    flattened width and fail at ``fc1``; e.g. 224x224 inputs give 16x53x53.
    """

    def __init__(self):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
net = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
def UploadData(path, train):
    """Build shuffled train/validation/test loaders from image folders.

    Images are read from ``path + '/train'``, ``'/validation'`` and
    ``'/test'``, converted to single-channel, resized to 255 and
    centre-cropped to 224. The train and validation pipelines additionally
    apply a random rotation and horizontal flip.

    Args:
        path: dataset root directory.
        train: accepted for interface compatibility; not used.

    Returns:
        ``(trainloader, validloader, testloader)``, each with batch size 32.
    """
    base = [transforms.Grayscale(num_output_channels=1), transforms.Resize(255), transforms.CenterCrop(224)]
    augment = [transforms.RandomRotation(30), transforms.RandomHorizontalFlip()]

    # set up transforms for train and test datasets
    train_transforms = transforms.Compose(base + augment + [transforms.ToTensor()])
    valid_transforms = transforms.Compose(base + augment + [transforms.ToTensor()])
    test_transforms = transforms.Compose(base + [transforms.ToTensor()])

    # set up datasets from Image Folders
    train_dataset = datasets.ImageFolder(path + '/train', transform=train_transforms)
    valid_dataset = datasets.ImageFolder(path + '/validation', transform=valid_transforms)
    test_dataset = datasets.ImageFolder(path + '/test', transform=test_transforms)

    # set up dataloaders with batch size of 32
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=True)
    testloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)
    return trainloader, validloader, testloader
trainloader, validloader, testloader = UploadData("/home/lns/research/dataset", True)
epochs = 5
min_valid_loss = np.inf
# Batches follow the model's device; moving only the data to the GPU while the
# model stays on the CPU raises a device-mismatch error.
device = next(net.parameters()).device
for e in range(epochs):
    train_loss = 0.0
    net.train()
    for data, labels in trainloader:
        # Transfer Data to GPU if available
        if device.type == "cuda":
            print("using GPU for data")
        data, labels = data.to(device), labels.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        target = net(data)
        # Find the Loss
        loss = criterion(target, labels)
        # Calculate gradients
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate Loss
        train_loss += loss.item()
    valid_loss = 0.0
    net.eval()  # Optional when not using Model Specific layer
    with torch.no_grad():  # no gradients are needed for validation
        for data, labels in validloader:
            # Transfer Data to GPU if available
            if device.type == "cuda":
                print("using GPU for data")
            data, labels = data.to(device), labels.to(device)
            # Forward Pass
            target = net(data)
            # Find the Loss
            loss = criterion(target, labels)
            # Calculate Loss
            valid_loss += loss.item()
    print('Epoch ',e+1, '\t\t Training Loss: ',train_loss / len(trainloader),' \t\t Validation Loss: ',valid_loss / len(validloader))
    if min_valid_loss > valid_loss:
        print("Validation Loss Decreased(",min_valid_loss,"--->",valid_loss,") \t Saving The Model")
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(net.state_dict(), '/home/lns/research/MODEL.pth')
After searching a lot, I am asking for help. Can someone help me
understand why this error occurs during backward propagation?
I followed the PyTorch CNN tutorial and a GeeksforGeeks tutorial.
The dataset consists of X-ray images converted to grayscale and resized to 255.
Is my neural network wrong, or is the data not processed correctly?
This is a size mismatch between the output of your CNN and the number of neurons in your first fully-connected layer. Because of the missing padding, the number of elements when flattened is 16*4*4, i.e. 256 (and not 16*5*5):
self.fc1 = nn.Linear(256, 120)
Once modified, the model will run correctly:
>>> model = NeuralNetwork()
>>> model(torch.rand(1, 1, 28, 28)).shape
torch.Size([1, 3])
Alternatively, you can use an nn.LazyLinear which will deduce the in_feature argument during the very first inference based on its input shape.
RuntimeError: input.size(-1) must be equal to input_size. Expected 28, got 0

Here is my code using PySyft:
class Arguments:
    """Plain container for the training hyper-parameters."""

    def __init__(self):
        # self.cuda = False
        self.no_cuda = True
        self.seed = 1
        self.batch_size = 50
        self.test_batch_size = 1000
        self.epochs = 10
        # The listing had fused this onto the previous line as "10 = 0.01";
        # the attribute name is presumably ``lr`` -- confirm against its users.
        self.lr = 0.01
        self.momentum = 0.5
        self.log_interval = 10
hook = sy.TorchHook(torch)
bob = sy.VirtualWorker(hook, id="bob")
alice = sy.VirtualWorker(hook, id="alice")
Here is my LSTM model. It runs successfully when I use only PyTorch, but it does not run with PySyft.
class Model(torch.nn.Module):
    """Recurrent classifier that reads a 28x28 image row by row.

    ``forward`` takes ``(batch, 1, 28, 28)`` input and returns raw scores of
    shape ``(batch, 10)``; no softmax is applied here.
    """

    def __init__(self):
        super(Model, self).__init__()
        # NOTE(review): the constructor call was truncated after
        # ``input_size=28``. ``hidden_size=32`` follows from the 32-input
        # linear layer below and ``batch_first=True`` from the ``x[:, -1, :]``
        # indexing in ``forward`` -- confirm any further arguments.
        self.rnn = torch.nn.RNN(input_size=28, hidden_size=32, batch_first=True)
        self.fc = torch.nn.Linear(32, 10)

    def forward(self, x):
        # Drop only the channel axis; a bare squeeze() would also remove the
        # batch axis when the batch holds a single sample.
        x = x.squeeze(1)
        x, _ = self.rnn(x)
        # Classify from the output at the last time step.
        x = self.fc(x[:, -1, :])
        return x.view(-1, 10)
def train(args, model, device, federated_train_loader, optimizer, epoch):
    """Run one epoch of federated training, logging every ``args.log_interval`` batches.

    For each batch the model is sent to the worker holding the data, updated
    there, and fetched back. ``F.nll_loss`` expects log-probabilities, so
    ``model`` should end in a ``log_softmax``.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(federated_train_loader):
        model.send(data.location)  # <-- NEW: send the model to the right location
        data, target = data.to(device), target.to(device)
        # data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        model.get()  # <-- NEW: get the model back
        if batch_idx % args.log_interval == 0:
            loss = loss.get()  # <-- NEW: get the loss back
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * args.batch_size, len(federated_train_loader) * args.batch_size,
                100. * batch_idx / len(federated_train_loader), loss.item()))
When I use PySyft to run my LSTM model, I get an error. But if I run my model without PySyft, it runs successfully. I don't know how to resolve it.
import torch
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
import torch.nn.functional as F
import time
import numpy as np
import syft as sy
class Arguments:
    """Plain container for the training hyper-parameters."""

    def __init__(self):
        self.cuda = False
        self.no_cuda = True
        self.seed = 1
        self.batch_size = 50
        self.test_batch_size = 1000
        self.epochs = 10
        # The listing had fused this onto the previous line as "10 = 0.01";
        # the attribute name is presumably ``lr`` -- confirm against its users.
        self.lr = 0.01
        self.momentum = 0.5
        self.log_interval = 10
hook = sy.TorchHook(torch) # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob") # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice") # <-- NEW: and alice
class Model(torch.nn.Module):
    """Recurrent classifier that reads a 28x28 image row by row.

    ``forward`` takes ``(batch, 1, 28, 28)`` input and returns raw scores of
    shape ``(batch, 10)``; no softmax is applied here.
    """

    def __init__(self):
        super(Model, self).__init__()
        # NOTE(review): the constructor call was truncated after
        # ``input_size=28``. ``hidden_size=32`` follows from the 32-input
        # linear layer below and ``batch_first=True`` from the ``x[:, -1, :]``
        # indexing in ``forward`` -- confirm any further arguments.
        self.rnn = torch.nn.RNN(input_size=28, hidden_size=32, batch_first=True)
        self.fc = torch.nn.Linear(32, 10)

    def forward(self, x):
        # Drop only the channel axis; a bare squeeze() would also remove the
        # batch axis when the batch holds a single sample.
        x = x.squeeze(1)
        x, _ = self.rnn(x)
        # Classify from the output at the last time step.
        x = self.fc(x[:, -1, :])
        return x.view(-1, 10)
def train(args, model, device, federated_train_loader, optimizer, epoch):
    """Run one epoch of federated training, logging every ``args.log_interval`` batches.

    For each batch the model is sent to the worker holding the data, updated
    there, and fetched back. ``F.nll_loss`` expects log-probabilities, so
    ``model`` should end in a ``log_softmax``.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(federated_train_loader):  # <-- now it is a distributed dataset
        model.send(data.location)  # <-- NEW: send the model to the right location
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        model.get()  # <-- NEW: get the model back
        if batch_idx % args.log_interval == 0:
            loss = loss.get()  # <-- NEW: get the loss back
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * args.batch_size, len(federated_train_loader) * args.batch_size,
                100. * batch_idx / len(federated_train_loader), loss.item()))
if __name__ == '__main__':
    args = Arguments()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    losses = []

    # NOTE(review): the transform pipelines and the test-loader constructor
    # were truncated in the listing; ToTensor followed by the visible
    # Normalize, and torch.utils.data.DataLoader, are assumed -- confirm.
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    federated_train_loader = sy.FederatedDataLoader(
        datasets.MNIST('../data', train=True, download=True, transform=mnist_transform)
        .federate((bob, alice)),  # <-- NEW: we distribute the dataset across all the workers, it's now a FederatedDataset
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=mnist_transform),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Model().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    t = time.time()
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, federated_train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
    total_time = time.time() - t
Here are the whole codes
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import syft as sy
hook = sy.TorchHook(torch)
bob = sy.VirtualWorker(hook, id="bob")
alice = sy.VirtualWorker(hook, id="alice")
class Arguments():
    """Plain container for the training hyper-parameters."""

    def __init__(self):
        self.batch_size = 64
        self.test_batch_size = 1000
        self.epochs = 10
        # The listing had fused this onto the previous line as "10 = 0.01";
        # the attribute name is presumably ``lr`` -- confirm against its users.
        self.lr = 0.01
        self.momentum = 0.5
        self.no_cuda = False
        self.seed = 1
        self.log_interval = 10
        self.save_model = False
args = Arguments()
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

# NOTE(review): the transform pipelines and the test-loader constructor were
# truncated in the listing; ToTensor followed by the visible Normalize, and
# torch.utils.data.DataLoader, are assumed -- confirm.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
federated_train_loader = sy.FederatedDataLoader(  # <-- this is now a FederatedDataLoader
    datasets.MNIST('../data', train=True, download=True, transform=mnist_transform)
    .federate((bob, alice)),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=mnist_transform),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
    """Two-convolution MNIST classifier returning log-probabilities.

    For a ``(batch, 1, 28, 28)`` input the feature map shrinks
    28 -> 24 -> 12 -> 8 -> 4, giving the 4*4*50 features ``fc1`` expects.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
model = Net()
model = model.to(device)  # pushing the model into available device.
optimizer = optim.SGD(model.parameters(), lr=0.01)
for epoch in range(1, args.epochs + 1):
    # Train the model
    model.train()
    for batch_idx, (data, target) in enumerate(federated_train_loader):  # iterate through each worker's dataset
        model.send(data.location)  # send the model to the right location ; data.location returns the worker name in which the data is present
        data, target = data.to(device), target.to(device)  # pushing both the data and target labels onto the available device.
        optimizer.zero_grad()  # 1) erase previous gradients (if they exist)
        output = model(data)  # 2) make a prediction
        loss = F.nll_loss(output, target)  # 3) calculate how much we missed
        loss.backward()  # 4) figure out which weights caused us to miss
        optimizer.step()  # 5) change those weights
        model.get()  # get the model back (with gradients)
        if batch_idx % args.log_interval == 0:
            loss = loss.get()  # get the loss back
            print('Epoch: {} [Training: {:.0f}%]\tLoss: {:.6f}'.format(epoch, 100. * batch_idx / len(federated_train_loader), loss.item()))
    # Test the model
    # NOTE(review): the flattened listing does not show whether evaluation ran
    # once per epoch or once at the end; per-epoch is assumed here.
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)  # Getting a prediction
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # updating test loss
            pred = output.argmax(1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()  # correct pred in the current test set.
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))
# NOTE(review): the listing ended with a dangling `, "")` fragment, apparently
# the tail of a call whose head was lost; it is omitted here.
I have tested the above code with torch 1.x and PySyft 0.2.5, and it works (but with a CNN model).
Building Autoencoder with Softmax classifier - Input mismatch error

I am trying to train an auto-encoder with a softmax classifier to replicate the results in this paper Intriguing properties of neural networks.
My implementation is the following:
n_embedded = 400
class AE400_10(nn.Module):
    """Autoencoder with a 10-way classifier on top of the reconstruction.

    ``forward`` flattens the input to ``(batch, 784)`` and returns
    ``(decoded, log_probs)`` with shapes ``(batch, 784)`` and ``(batch, 10)``.
    """

    def __init__(self):
        super(AE400_10, self).__init__()
        self.encoder = nn.Sequential(nn.Linear(28*28, n_embedded), nn.Sigmoid())
        self.decoder = nn.Sequential(nn.Linear(n_embedded, 28*28))
        self.classifier = nn.Sequential(nn.Linear(28*28, 10))

    def forward(self, x):
        x = x.view(-1, 28*28)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        out = self.classifier(decoded)  ##NEW UPDATED
        # Normalise over the class axis explicitly; the implicit-dim form is
        # deprecated.
        return decoded, F.log_softmax(out, dim=1)
For the training I have the following:
model = AE400_10().to(device)
criterion1 = nn.MSELoss()
# The model returns log-probabilities and the targets are class indices, which
# is the input NLLLoss expects; BCELoss needs probabilities in [0, 1] and
# same-shaped targets.
criterion2 = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
for epoch in range(epochs):
    total_batch = len(train_set) // batch_size_train
    for batch_idx, (data, target) in enumerate(MNSIT_train):
        X = data.to(device)
        Y = target.to(device)
        optimizer.zero_grad()
        decoded, out = model(X)
        # The reconstruction is flat (batch, 784), so flatten the input the
        # same way before comparing.
        loss1 = criterion1(decoded, X.view(-1, 28*28))
        loss2 = criterion2(out, Y)
        loss = loss1 + loss2
        loss.backward()
        optimizer.step()
        if (batch_idx+1) % 100 == 0:
            print('Epoch [%d/%d], lter [%d/%d], Loss: %.4f'%(epoch+1, epochs, batch_idx+1, total_batch, loss.item()))
But I am getting the following error:
RuntimeError: size mismatch, m1: [128 x 400], m2: [784 x 10] at
I understand this is an error in the dimension but I am not sure why it is happening.
I fixed the input to the classifier based on the comments below and now I am getting the following error:
RuntimeError: The size of tensor a (784) must match the size of tensor
b (28) at non-singleton dimension 3
I don't use nn.Sequential, so I'm not sure exactly why this happens, but if you replace
self.classifier = nn.Sequential(nn.Linear(28*28, 10))
with
self.classifier = nn.Linear(28*28, 10)
your code works.
import torch
import torch.nn as nn
import torch.nn.functional as F
n_embedded = 400
class AE400_10(nn.Module):
    """Autoencoder with a 10-way classifier on top of the reconstruction.

    ``forward`` flattens the input to ``(batch, 784)`` and returns
    ``(decoded, log_probs)`` with shapes ``(batch, 784)`` and ``(batch, 10)``.
    """

    def __init__(self):
        super(AE400_10, self).__init__()
        self.encoder = nn.Sequential(nn.Linear(28*28, n_embedded), nn.Sigmoid())
        self.decoder = nn.Sequential(nn.Linear(n_embedded, 28*28))
        # Registered but not used in forward().
        self.test = nn.Linear(28*28, 10)
        self.classifier = nn.Sequential(nn.Linear(28*28, 10))

    def forward(self, x):
        x = x.view(-1, 28*28)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        out = self.classifier(decoded)
        # Normalise over the class axis explicitly; the implicit-dim form is
        # deprecated.
        return decoded, F.log_softmax(out, dim=1)
x = torch.ones(128,28,28)
model = AE400_10()
Instead of passing encoded, i.e. out = self.classifier(encoded),
pass decoded as the input of the classifier:
out = self.classifier(decoded)
I think this is why you are getting the mismatch: the classifier expects a tensor of size 28*28 as input, as defined in your code.
Then, when calling the criteria:
loss1 = criterion1(decoded, X)
how to fix capsule training problem for a single class of MNIST dataset?

I am training a Capsule Network with both encoder and decoder part. It works perfectly fine with all the classes (10 classes) of the MNIST data set. But when I am extracting a single class say (class 0 or class 5) and then training the capsule network, the reconstruction of the image is very poor.
Where do I need to change the network setting, or do I have an error in my data preparation?
I tried:
I changed the total class from 10 (for ten digits to 1 for 1 digit and even for 2 for 2 digits).
When I am using the default MNIST dataset, I am getting no error or tensor size, but when I am extracting a particular class and then passing it into the network, I am facing issues like a) Dimensional Issues b) Float tensor warning.
I fixed these things but manually adding a dimension and converting the data to data.float().cuda() tensor. I did this for both the case i.e when I am using the 10 Digit Capsules and when I am using the 1 Digit Capsules for training a single class digit.
But after this, the network is running fine, but I am getting really blurred and poor reconstructions. While when I am training the whole MNIST dataset without extracting any class and passing it to the network, it doesn't throw any error and the reconstruction works really fine.
I would love to share the more detail and other parts of the code -
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam
from torchvision import datasets, transforms
### **Here we prepare the data for the complete 10 class digit training**###
class Mnist:
    """Full ten-class MNIST exposed as shuffled ``train_loader`` / ``test_loader``."""

    def __init__(self, batch_size):
        # NOTE(review): the listing lost the first Compose entry and the
        # closing brackets; ToTensor is assumed because Normalize operates on
        # tensors -- confirm.
        dataset_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])
        train_dataset = datasets.MNIST('../data', train=True, download=True, transform=dataset_transform)
        test_dataset = datasets.MNIST('../data', train=False, download=True, transform=dataset_transform)
        self.train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
## **Here is my code for extracting a single class digit extraction**##
class Mnist:
    """MNIST restricted to a single digit, exposed as shuffled loaders.

    Reading the raw ``data`` tensor bypasses any dataset ``transform``, so the
    scaling to [0, 1] and the mean/std normalisation are applied here. Batches
    therefore come out as float ``(batch, 1, 28, 28)`` images with integer
    labels.
    """

    DIGIT = 5
    MEAN, STD = 0.1307, 0.3081

    def __init__(self, batch_size):
        train_mnist = datasets.MNIST("../data", train=True)
        test_mnist = datasets.MNIST("../data", train=False)
        self.train_loader = self._single_digit_loader(train_mnist, batch_size)
        self.test_loader = self._single_digit_loader(test_mnist, batch_size)

    def _single_digit_loader(self, mnist, batch_size):
        """Return a shuffled loader over the normalised images of ``DIGIT``."""
        keep = mnist.targets == self.DIGIT
        # uint8 (N, 28, 28) -> float (N, 1, 28, 28) in [0, 1], then normalise.
        images = mnist.data[keep].unsqueeze(1).float().div(255.0)
        images = (images - self.MEAN) / self.STD
        labels = mnist.targets[keep]
        dataset = torch.utils.data.TensorDataset(images, labels)
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Here is the main code for the capsule training.
# The code below is used for training a single class while still using the 10 digit capsules.
class ConvLayer(nn.Module):
    """Single convolution followed by ReLU."""

    def __init__(self, in_channels=1, out_channels=256, kernel_size=9):
        super(ConvLayer, self).__init__()
        # NOTE(review): the call was truncated after ``in_channels``; the
        # remaining arguments are taken from the constructor parameters, with
        # stride 1 assumed -- confirm.
        self.conv = nn.Conv2d(in_channels=in_channels,
                              out_channels=out_channels,
                              kernel_size=kernel_size,
                              stride=1)

    def forward(self, x):
        return F.relu(self.conv(x))
class PrimaryCaps(nn.Module):
    """Primary capsule layer: parallel strided convolutions regrouped into vectors.

    ``forward`` stacks the outputs of the ``num_capsules`` convolutions,
    reshapes them to ``(batch, 32 * 6 * 6, -1)`` and squashes each vector.
    """

    def __init__(self, num_capsules=8, in_channels=256, out_channels=32, kernel_size=9):
        super(PrimaryCaps, self).__init__()
        self.capsules = nn.ModuleList([
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=2, padding=0)
            for _ in range(num_capsules)])

    def forward(self, x):
        u = [capsule(x) for capsule in self.capsules]
        u = torch.stack(u, dim=1)
        u = u.view(x.size(0), 32 * 6 * 6, -1)
        return self.squash(u)

    def squash(self, input_tensor, eps=1e-8):
        """Scale each last-axis vector to a length of ``|v|^2 / (1 + |v|^2)``.

        ``eps`` keeps the square root away from zero so an all-zero vector
        maps to zero instead of NaN.
        """
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        scale = squared_norm / ((1. + squared_norm) * torch.sqrt(squared_norm + eps))
        return scale * input_tensor
class DigitCaps(nn.Module):
    """Output capsule layer with three iterations of routing.

    ``forward`` maps ``(batch, num_routes, in_channels)`` to
    ``(batch, num_capsules, out_channels, 1)``.
    """

    def __init__(self, num_capsules=10, num_routes=32 * 6 * 6, in_channels=8, out_channels=16):
        super(DigitCaps, self).__init__()
        self.in_channels = in_channels
        self.num_routes = num_routes
        self.num_capsules = num_capsules
        self.W = nn.Parameter(torch.randn(1, num_routes, num_capsules, out_channels, in_channels))

    def forward(self, x):
        # (B, routes, in) -> (B, routes, 1, in, 1). Broadcasting against W
        # gives the same product as materialising one copy of W per sample and
        # one copy of x per capsule, without the extra memory.
        x = x.unsqueeze(2).unsqueeze(4)
        u_hat = torch.matmul(self.W, x)  # (B, routes, caps, out, 1)

        # Routing logits are created on the input's device instead of being
        # forced onto the GPU, so the layer also runs on CPU.
        b_ij = torch.zeros(1, self.num_routes, self.num_capsules, 1,
                           device=u_hat.device, dtype=u_hat.dtype)

        num_iterations = 3
        for iteration in range(num_iterations):
            # NOTE(review): softmax is taken over dim=1 as in the original
            # listing -- confirm this is the intended axis.
            c_ij = F.softmax(b_ij, dim=1).unsqueeze(4)
            s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
            v_j = self.squash(s_j)
            if iteration < num_iterations - 1:
                # Agreement between each prediction and the current output,
                # averaged over the batch.
                a_ij = torch.matmul(u_hat.transpose(3, 4), v_j)
                b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)
        return v_j.squeeze(1)

    def squash(self, input_tensor, eps=1e-8):
        """Apply the squashing non-linearity along the last axis.

        ``eps`` keeps the square root away from zero so zero inputs map to
        zero instead of NaN.
        """
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        scale = squared_norm / ((1. + squared_norm) * torch.sqrt(squared_norm + eps))
        return scale * input_tensor
class Decoder(nn.Module):
    """Reconstruct a 28x28 image from the longest output capsule.

    ``forward`` returns ``(reconstructions, masked)`` where ``reconstructions``
    is ``(batch, 1, 28, 28)`` and ``masked`` is the ``(batch, 10)`` one-hot
    selection of the longest capsule.
    """

    def __init__(self):
        super(Decoder, self).__init__()
        # NOTE(review): only the three Linear layers survived in the listing;
        # the activations between them and the closing bracket were lost.
        # ReLU between layers and a final Sigmoid are assumed -- confirm.
        self.reconstraction_layers = nn.Sequential(
            nn.Linear(16 * 10, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 784),
            nn.Sigmoid(),
        )

    def forward(self, x, data):
        # Capsule lengths decide which capsule is kept.
        classes = torch.sqrt((x ** 2).sum(2))
        classes = F.softmax(classes, dim=1)
        _, max_length_indices = classes.max(dim=1)
        # One-hot rows are built on the input's device rather than forced onto
        # the GPU, so the decoder also runs on CPU.
        masked = torch.eye(10, device=x.device, dtype=x.dtype)
        masked = masked.index_select(dim=0, index=max_length_indices.squeeze(1))
        reconstructions = self.reconstraction_layers((x * masked[:, :, None, None]).view(x.size(0), -1))
        reconstructions = reconstructions.view(-1, 1, 28, 28)
        return reconstructions, masked
class CapsNet(nn.Module):
    """Capsule network: conv layer, primary capsules, digit capsules, decoder."""

    def __init__(self):
        super(CapsNet, self).__init__()
        self.conv_layer = ConvLayer()
        self.primary_capsules = PrimaryCaps()
        self.digit_capsules = DigitCaps()
        self.decoder = Decoder()
        self.mse_loss = nn.MSELoss()

    def forward(self, data):
        """Return ``(output, reconstructions, masked)`` for a batch of images."""
        output = self.digit_capsules(self.primary_capsules(self.conv_layer(data)))
        reconstructions, masked = self.decoder(output, data)
        return output, reconstructions, masked

    def loss(self, data, x, target, reconstructions):
        """Margin loss on the capsule lengths plus the scaled reconstruction loss."""
        return self.margin_loss(x, target) + self.reconstruction_loss(data, reconstructions)

    def margin_loss(self, x, labels, size_average=True):
        """Margin loss with margins 0.9 / 0.1 and a 0.5 weight on absent classes.

        ``size_average`` is accepted but not used.
        """
        batch_size = x.size(0)
        v_c = torch.sqrt((x**2).sum(dim=2, keepdim=True))
        left = F.relu(0.9 - v_c).view(batch_size, -1)
        right = F.relu(v_c - 0.1).view(batch_size, -1)
        loss = labels * left + 0.5 * (1.0 - labels) * right
        loss = loss.sum(dim=1).mean()
        return loss

    def reconstruction_loss(self, data, reconstructions):
        """Mean squared error between flattened images, scaled by 0.0005."""
        loss = self.mse_loss(reconstructions.view(reconstructions.size(0), -1), data.view(reconstructions.size(0), -1))
        return loss*0.0005
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
capsule_net = CapsNet()
capsule_net = capsule_net.to(device)
optimizer = Adam(capsule_net.parameters())
##### Here is the problem while training####
batch_size = 100
mnist = Mnist(batch_size)
n_epochs = 5
for epoch in range(n_epochs):
    capsule_net.train()
    train_loss = 0
    for batch_id, (data, target) in enumerate(mnist.train_loader):
        # One-hot encode the labels for the margin loss.
        target = torch.eye(10).index_select(dim=0, index=target)
        # The float conversion is only required for the extracted single-class
        # dataset, whose images arrive as integers.
        data, target = data.float().to(device), target.float().to(device)
        # Images extracted straight from the raw tensor have no channel axis;
        # add it so both dataset variants reach the network as (B, 1, 28, 28).
        if data.dim() == 3:
            data = data[:, None, :, :]
        optimizer.zero_grad()
        output, reconstructions, masked = capsule_net(data)
        loss = capsule_net.loss(data, output, target, reconstructions)
        # Without backward() and step() the weights never change.
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print (train_loss / len(mnist.train_loader))
I used this to see the main data as image and the reconstructed image
import matplotlib
import matplotlib.pyplot as plt
def plot_images_separately(images):
    "Plot the first nine images side by side in one figure."
    fig = plt.figure()
    for j in range(1, 10):
        ax = fig.add_subplot(1, 10, j)
        # NOTE(review): the colormap argument was truncated in the listing;
        # matplotlib's binary colormap is assumed -- confirm.
        ax.matshow(images[j-1], cmap=matplotlib.cm.binary)
I compared the working code with the problematic one and found that the datasets passed into the network were not of the same nature. The problem was:
The MNIST data extracted for a single class was not converted to tensors and no normalization was applied, although I tried passing it through the transformation.
This is what I did to fix it:
I created the transformation and tensor-conversion objects and then applied them to each element in a list comprehension. Below are the code and the final output of my network.
Preparing the single-class dataset (the dataset for the digit 5)
class Mnist:
    """MNIST restricted to the digit 5, normalised, exposed as shuffled loaders.

    Both splits get the same scaling to [0, 1] and mean/std normalisation, so
    batches come out as float ``(batch, 1, 28, 28)`` images with integer
    labels.
    """

    trans = transforms.ToTensor()
    normalize = transforms.Normalize((0.1307,), (0.3081,))
    DIGIT = 5
    MEAN, STD = 0.1307, 0.3081

    # The constructor must be named __init__; a method called ``init`` is
    # never run on instantiation, leaving the loaders undefined.
    def __init__(self, batch_size):
        train_mnist = datasets.MNIST("../data", train=True)
        test_mnist = datasets.MNIST("../data", train=False)
        self.train_loader = self._single_digit_loader(train_mnist, batch_size)
        self.test_loader = self._single_digit_loader(test_mnist, batch_size)

    def _single_digit_loader(self, mnist, batch_size):
        """Return a shuffled loader over the normalised images of ``DIGIT``."""
        keep = mnist.targets == self.DIGIT
        # uint8 (N, 28, 28) -> float (N, 1, 28, 28) in [0, 1], then normalise.
        images = mnist.data[keep].unsqueeze(1).float().div(255.0)
        images = (images - self.MEAN) / self.STD
        labels = mnist.targets[keep]
        dataset = torch.utils.data.TensorDataset(images, labels)
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
