Why do I always get the same value as the result in a CNN in pytorch? - pytorch

Here is my code
dataset = pd.read_csv('augmented_data.csv')
dataset = dataset.sample(frac=1)
class ConvNet(nn.Module):
def __init__(self):
super(ConvNet, self).__init__()
self.conv1 = nn.Conv2d(3,6,5)
self.pool = nn.MaxPool2d(2,2)
self.conv2 = nn.Conv2d(6,16,5)
self.fc1 = nn.Linear(1024144, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84,1)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 1024144)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
print(x)
x = self.fc3(x)
return x
files_read = 0
preprocess = transforms.Compose([
transforms.Resize(1024),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5),(0.5, 0.5, 0.5))])
# device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device = torch.device('cpu')
# model = ConvNet().to(device)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=0.001)
results = []
for index, row in dataset.iterrows():
try:
image = load_img('padded_images/' + row['image_name'] +'.jpg')
except:
image = load_img('augmented_images/' + row['image_name'] +'.jpeg')
files_read += 1
input_tensor = preprocess(image)
input_batch = input_tensor.unsqueeze(0).to(device)
if files_read <= 80 * len(dataset) // 100:
output = model(input_batch)
optimizer.zero_grad()
y = torch.tensor([[float(row['target'])]]).to(device)
loss = criterion(output, y)
loss.backward()
optimizer.step()
else:
model.eval()
output = model(input_batch)
results.append([1.0 if output[0][0].double() > 0.5 else 0, float(row['target'])])
So i am using pytorch CNN to classify 60k images in 2 classes. When i print the output after the model has trained, whatever the image as input, the ouput is always "tensor([[0.6384]], grad_fn=)". Always the same value. So it predicts only 1 (because it's greater than 0.5). The thing is, when i print the ouput while training, the results vary (16, 1 , 0, 4 ,0.6 etc) but when i print the output (with the same model but not trained) the results don't vary that much (0.5, 0.51, 0.49 0.52, 0.55). So I think it's safe to say that it is converging to a single value. I just don't know why. what could i do differently?

Related

Reshape data to be usable for training GCN in PyTorch

I am trying to build Graph Convolutional Network. I converted my dataframe to PyTorch
required format using below code.
class S_Dataset(Dataset):
def __init__(self, df, transform=None):
self.df = df
self.transform = transform
def __len__(self):
return len(self.df)
def __getitem__(self, idx):
row = self.df.iloc[idx]
x = torch.tensor([row.date.to_pydatetime().timestamp(), row.s1, row.s2, row.s3, row.s4, row.temp ,row.rh, row.Location, row.Node ], dtype=torch.float)
y = torch.tensor([row.Location], dtype=torch.long)
weight1 = torch.tensor([row.neighbor1_distance], dtype=torch.float)
weight2 = torch.tensor([row.neighbor2_distance], dtype=torch.float)
weight3 = torch.tensor([row.neighbor3_distance], dtype=torch.float)
edge_index1 = torch.tensor([[row.Location, row.neighbor1_name]], dtype=torch.long).t()
edge_index2 = torch.tensor([[row.Location, row.neighbor2_name]], dtype=torch.long).t()
edge_index3 = torch.tensor([[row.Location, row.neighbor3_name]], dtype=torch.long).t()
edge_index = torch.cat([edge_index1, edge_index2, edge_index3 ], dim=1)
weight = torch.cat([weight1, weight2, weight3], dim=0)
if self.transform:
x, y, edge_index, weight = self.transform(x, y, edge_index, weight)
return x, y, edge_index, weight
Process_Data = S_Dataset(df)
Next I divided data into train and test set:
train_size = int(len(Process_Data) * 0.8)
test_size = len(Process_Data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(Process_Data, [train_size, test_size])
# Create dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True )
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True )
I designed a simple model:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
# Create the model
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = GCNConv(9, 128)
self.conv2 = GCNConv(128, 64)
self.fc1 = nn.Linear(64, 32)
self.fc2 = nn.Linear(32, len(location_to_id))
def forward(self, x, edge_index, weight):
x = self.conv1(x, edge_index, weight)
x = torch.relu(x)
x = self.conv2(x, edge_index, weight)
x = torch.relu(x)
x = x.view(-1, 64)
x = self.fc1(x)
x = torch.relu(x)
x = self.fc2(x)
return x
Finally to train the model:
model = Net()
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
for epoch in range(100):
total_loss = 0
for batch in train_loader:
optimizer.zero_grad()
x, y, edge_index, weight = batch
y_pred = model(x, edge_index, weight)
loss = criterion(y_pred, y)
loss.backward()
optimizer.step()
total_loss += loss.item()
print('Epoch: {} Loss: {:.4f}'.format(epoch, total_loss / len(train_loader)))
I am facing following error:
IndexError: The shape of the mask [2, 3] at index 0 does not match the shape of the indexed tensor [32, 3] at index 0
x, y, edge_index, weight = batch
This line is causing error.
How can I resphae my data so I can train my model?
The batch size is set at 32, but there might not be enough samples to fit in the batch size of 32.
I am assuming, this error occurs after the code runs for some time, I would appreciate more context on the problem
A general solution could be decreasing the size of batch to something smaller and trying the code again. Making sure all samples are covered in the epoch.

RuntimeError: shape '[64, 3, 32, 32]' is invalid for input of size 49152

I'm trying to train a CNN model with Cifar10 dataset and I get this error:
8 optimizer.zero_grad()
9 input, target = batch
---> 10 input = input.view(batch_size, n_channel, 32, 32)
11 output = model(input)
12 loss = loss_fn(output, target)
RuntimeError: shape '[64, 3, 32, 32]' is invalid for input of size 49152
Can someone help me? I cant solve this error.
49512=64x3x16x16 but ı dont understand where does this number come from? (I'm beginner with pytorch)
My code:
...
class CNNModel(nn.Module):
def __init__(self) -> None:
super().__init__()
self.cnn1 = nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1)
self.relu1 = nn.ReLU()
self.maxpool1 = nn.MaxPool2d(2)
self.cnn2 = nn.Conv2d(6, 16, kernel_size=3, stride=1, padding=1)
self.relu2 = nn.ReLU()
self.maxpool2 = nn.MaxPool2d(2)
self.fc1 = nn.Linear(8*8*16, 256)
self.fc2 = nn.Linear(256, 10)
def forward(self, x):
x = self.cnn1(x)
x = self.relu1(x)
x = self.maxpool1(x)
x = self.cnn2(x)
x = self.relu2(x)
x = self.maxpool2(x)
x = x.view(x.size(0), -1)
x = self.fc1(x)
out = self.fc2(x)
return out
model = CNNModel()
model
def train_model(model, train_loader, test_loader, loss_fn, optimizer, epochs = epochs):
for epoch in range(1, epochs + 1):
training_loss = .0
validation_loss = .0
model.train()
for batch in train_loader:
optimizer.zero_grad()
input, target = batch
input = input.view(batch_size, n_channel, 32, 32)
output = model(input)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
training_loss += loss.data
model.eval()
num_correct = 0
num_examples = len(test_loader.dataset)
for batch in test_loader:
input, target = batch
input = input.view(batch_size, n_channel, 32, 32)
output = model(input)
loss = loss_fn(output, target)
validation_loss += loss.data
predicted = torch.max(output.data, 1)[1]
num_correct += (predicted == target).sum()
accuracy = 100 * num_correct / num_examples
print("Epoch: {}".format(epoch), "\n",
"Training loss: {:.2f}".format(training_loss), "\n",
"Accuracy: {:.2f}".format(accuracy), "\n",
"Validation loss: {:.2f}".format(validation_loss)
)
I need to write some more to get Stackoverflow to accept my question. ignore this sentence :)
Think view() as a rearrangement. For example, let's say we have input like
input = torch.randn(1,3,32,32)
with declaring the input we say input should have 1 batch size, 3 channels, and 32x32 width and height.
With view, we can rearrange these dimensions like
input=input.view(1,3*2*2,16,16)
So what we've seen from here is that without changing the total number (in our case this is equal to 133232 = 13221616) you can change your shape.
For the solution,
print(input.shape)
and then look at your dimension and then change accordingly with keeping in mind the total number must not be changed.

KL Divergence goes NaN on Bayesian Convolutional Neural Network

I'm trying to implement a Bayesian Convolutional Neural Network using Pytorch on Python 3.7. I mainly orient myself on Shridhar's implementation. When running my CNN with normalized and MNIST data, the KL Divergence is NaN after a couple of iterations. I already implemented linear layers the same way and they worked perfectly fine.
I normalized the data as follows:
train_loader = torch.utils.data.DataLoader(datasets.MNIST('./mnist', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])), batch_size=BATCH_SIZE, shuffle=True, **LOADER_KWARGS)
eval_loader = torch.utils.data.DataLoader(datasets.MNIST('./mnist', train=False, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])), batch_size=EVAL_BATCH_SIZE, shuffle=False, **LOADER_KWARGS)
My implementation of the Conv-Layer looks as follows:
class BayesianConv2d(nn.Module):
def __init__(self, in_channels, out_channels, prior_sigma, kernel_size, stride=1, padding=0, dilation=1, groups=1):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.normal = torch.distributions.Normal(0,1)
# conv-parameters
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = groups
# Weight parameters
self.weight_mu = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size).uniform_(0, 0.1))
self.weight_rho = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size).uniform_(-3,0.1))
self.weight_sigma = 0
self.weight = 0
# Bias parameters
self.bias_mu = nn.Parameter(torch.Tensor(out_channels).uniform_(0, 0.1))
self.bias_rho = nn.Parameter(torch.Tensor(out_channels).uniform_(-3,0.1))
self.bias_sigma = 0
self.bias = 0
# prior
self.prior_sigma = prior_sigma
def forward(self, input, sample=False, calculate_log_probs=False):
# compute sigma out of rho: sigma = log(1+e^rho)
self.weight_sigma = torch.log1p(torch.exp(self.weight_rho))
self.bias_sigma = torch.log1p(torch.exp(self.bias_rho))
# sampling process -> use local reparameterization trick
activations_mu = F.conv2d(input.to(DEVICE), self.weight_mu, self.bias_mu, self.stride, self.padding, self.dilation, self.groups)
activations_sigma = torch.sqrt(1e-16 + F.conv2d((input**2).to(DEVICE), self.weight_sigma**2, self.bias_sigma**2, self.stride, self.padding, self.dilation, self.groups))
activation_epsilon = Variable(self.weight_mu.data.new(activations_sigma.size()).normal_(mean=0, std=1))
outputs = activations_mu + activations_sigma * activation_epsilon
if self.training or calculate_log_probs:
self.kl_div = 0.5 * ((2 * torch.log(self.prior_sigma / self.weight_sigma) - 1 + (self.weight_sigma / self.prior_sigma).pow(2) + ((0 - self.weight_mu) / self.prior_sigma).pow(2)).sum() \
+ (2 * torch.log(0.1 / self.bias_sigma) - 1 + (self.bias_sigma / 0.1).pow(2) + ((0 - self.bias_mu) / 0.1).pow(2)).sum())
return outputs
The implementation of the corresponding Conv-Net looks as follows:
class BayesianConvNetwork(nn.Module):
# Set up network by definining layers
def __init__(self):
super().__init__()
self.conv1 = layers.BayesianConv2d(1, 24, prior_sigma=0.1, kernel_size = (5,5), padding=2)
self.pool1 = nn.MaxPool2d(kernel_size=3,stride=2, padding=1)
self.conv2 = layers.BayesianConv2d(24, 48, prior_sigma=0.1, kernel_size = (5,5), padding=2)
self.pool2 = nn.MaxPool2d(kernel_size=3,stride=2, padding=1)
self.conv3 = layers.BayesianConv2d(48, 64, prior_sigma=0.1, kernel_size = (5,5), padding=2)
self.pool3 = nn.MaxPool2d(kernel_size=3,stride=2, padding=1)
self.fcl1 = layers.BayesianLinearWithLocalReparamTrick(4*4*64, 256, prior_sigma=0.1)
self.fcl2 = layers.BayesianLinearWithLocalReparamTrick(256, 10, prior_sigma=0.1)
# define forward function by assigning corresponding activation functions to layers
def forward(self, x, sample=False):
x = F.relu(self.conv1(x, sample))
x = self.pool1(x)
x = F.relu(self.conv2(x, sample))
x = self.pool2(x)
x = F.relu(self.conv3(x, sample))
x = self.pool3(x)
x = x.view(-1, 4*4*64)
x = F.relu(self.fcl1(x, sample))
x = F.log_softmax(self.fcl2(x, sample), dim=1)
return x
# summing up KL-divergences to obtain overall KL-divergence-value
def total_kl_div(self):
return (self.conv1.kl_div + self.conv2.kl_div + self.conv3.kl_div + self.fcl1.kl_div + self.fcl2.kl_div)
# sampling prediction: perform prediction for each of the "different networks" that result from the weight distributions
def sample_elbo(self, input, target, batch_idx, nmbr_batches, samples=SAMPLES):
outputs = torch.zeros(samples, target.shape[0], CLASSES).to(DEVICE)
kl_divs = torch.zeros(samples).to(DEVICE)
for i in range(samples): # sample through networks
outputs[i] = self(input, sample=True) # perform prediction
kl_divs[i] = self.total_kl_div() # calculate total kl_div of the network
kl_div = kl_divs.mean() # compute mean kl_div from all samples
negative_log_likelihood = F.nll_loss(outputs.mean(0), target, size_average=False)
loss = kl_weighting * kl_div + negative_log_likelihood
return loss
Has anyone faced the same issue or knows how to solve it?
Many thanks in advance!
I figured out that it appears to be an issue with the SGD-optimizer. Using Adam as optimizer solved the problem though I don't know the reason for that. If anyone has an answer on why it works with Adam but not with SGD, feel free to comment.

pytorch runs slow when data are pre-transported to GPU

I have a model written in pytorch. Since my dataset is small, I can directly load all of the data to GPU. However, I found the forward speed becomes slow if I do so. The following is a runnable example. Specifically, I have the model:
import numpy as np
from time import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
def knn(x, k):
inner = -2*torch.matmul(x.transpose(2, 1), x)
xx = torch.sum(x**2, dim=1, keepdim=True)
pairwise_distance = -xx - inner - xx.transpose(2, 1)
idx = pairwise_distance.topk(k=k, dim=-1)[1] # (batch_size, num_points, k)
return idx
def get_graph_feature(x, k=20, idx=None):
batch_size = x.size(0)
num_points = x.size(2)
x = x.view(batch_size, -1, num_points)
if idx is None:
idx = knn(x, k=k) # (batch_size, num_points, k)
idx_base = torch.arange(0, batch_size, device=x.device).view(-1, 1, 1)*num_points
idx = idx + idx_base
idx = idx.view(-1)
_, num_dims, _ = x.size()
x = x.transpose(2, 1).contiguous() # (batch_size, num_points, num_dims) -> (batch_size*num_points, num_dims) # batch_size * num_points * k + range(0, batch_size*num_points)
feature = x.view(batch_size*num_points, -1)[idx, :]
feature = feature.view(batch_size, num_points, k, num_dims)
x = x.view(batch_size, num_points, 1, num_dims).repeat(1, 1, k, 1)
feature = torch.cat((feature-x, x), dim=3).permute(0, 3, 1, 2).contiguous()
return feature
class DGCNN(nn.Module):
def __init__(self, k=25, output_channels=10):
super(DGCNN, self).__init__()
self.k = k
self.bn1 = nn.BatchNorm2d(64)
self.bn2 = nn.BatchNorm2d(64)
self.bn3 = nn.BatchNorm2d(128)
self.bn4 = nn.BatchNorm2d(256)
self.bn5 = nn.BatchNorm1d(1024)
self.conv1 = nn.Sequential(nn.Conv2d(6, 64, kernel_size=1, bias=False),
self.bn1,
nn.LeakyReLU(negative_slope=0.2))
self.conv2 = nn.Sequential(nn.Conv2d(64*2, 64, kernel_size=1, bias=False),
self.bn2,
nn.LeakyReLU(negative_slope=0.2))
self.conv3 = nn.Sequential(nn.Conv2d(64*2, 128, kernel_size=1, bias=False),
self.bn3,
nn.LeakyReLU(negative_slope=0.2))
self.conv4 = nn.Sequential(nn.Conv2d(128*2, 256, kernel_size=1, bias=False),
self.bn4,
nn.LeakyReLU(negative_slope=0.2))
self.conv5 = nn.Sequential(nn.Conv1d(512, 1024, kernel_size=1, bias=False),
self.bn5,
nn.LeakyReLU(negative_slope=0.2))
self.linear1 = nn.Linear(1024*2, 512, bias=False)
self.bn6 = nn.BatchNorm1d(512)
self.dp1 = nn.Dropout()
self.linear2 = nn.Linear(512, 256)
self.bn7 = nn.BatchNorm1d(256)
self.dp2 = nn.Dropout()
self.linear3 = nn.Linear(256, output_channels)
def forward(self, x):
x = x.transpose(2, 1)
batch_size = x.size(0)
x = get_graph_feature(x, k=self.k)
x = self.conv1(x)
x1 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x1, k=self.k)
x = self.conv2(x)
x2 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x2, k=self.k)
x = self.conv3(x)
x3 = x.max(dim=-1, keepdim=False)[0]
x = get_graph_feature(x3, k=self.k)
x = self.conv4(x)
x4 = x.max(dim=-1, keepdim=False)[0]
x = torch.cat((x1, x2, x3, x4), dim=1)
x = self.conv5(x)
x1 = F.adaptive_max_pool1d(x, 1).view(batch_size, -1)
x2 = F.adaptive_avg_pool1d(x, 1).view(batch_size, -1)
x = torch.cat((x1, x2), 1)
x = F.leaky_relu(self.bn6(self.linear1(x)), negative_slope=0.2)
x = self.dp1(x)
x = F.leaky_relu(self.bn7(self.linear2(x)), negative_slope=0.2)
x = self.dp2(x)
x = self.linear3(x)
return x
Here is what the dataloader and test function looks like:
class my_loader(Dataset):
def __init__(self, device):
self.data = torch.rand(256, 2048, 3).to(device).float()
self.labels = torch.rand(256).to(device).long()
def __getitem__(self, ind):
return self.data[ind], self.labels[ind]
def __len__(self):
return len(self.data)
def test():
device = torch.device('cuda:2')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
for inputs, labels in test_loader:
tic = time()
pred = model(inputs)
print('time1 {}'.format(time() - tic))
print('------------------')
#---------- this one is 0.004s --------------#
for inputs, labels in test_loader:
inputs = inputs.detach().cpu().to(device)
tic = time()
pred = model(inputs)
print('time2 {}'.format(time() - tic))
print('------------------')
#---------- this one is 0.12s --------------#
for inputs, labels in test_loader:
tic = time()
inputs = inputs.detach().cpu().to(device)
pred = model(inputs)
print('time3 {}'.format(time() - tic))
print('------------------')
Basically, it seems that if there is no explicit call of gpu to cpu transportation either before or after the forward propagation, the forward propagation would cost more time. It just seems like that the forward propagation is implicitly doing gpu->cpu transportation.
I played around with the code a little bit, and I think the problem is that you are measuring times for both cases in the same run. Here is my boiled down version of your code since your model crushed my GPU memory:
class DGCNN(nn.Module):
def __init__(self, num_layers):
super(DGCNN, self).__init__()
self.layers = nn.ModuleList([nn.Linear(256, 256) for _ in range(1200)])
def forward(self, x):
x = x.view(-1, 256)
for layer in self.layers:
x = layer(x)
return x
class my_loader(Dataset):
def __init__(self, device):
self.data = torch.rand(256, 2048, 3).to(device).float()
self.labels = torch.rand(256).to(device).long()
def __getitem__(self, ind):
return self.data[ind], self.labels[ind]
def __len__(self):
return len(self.data)
Now, here I demonstrate different versions of test().
Version #1:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
>>> # First case -> Full forward pass: 3.105103, # Second case -> Full forward pass: 2.831652
Now I switched the order of timing calculations for the cases. Version #2:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
>>> # Second case -> Full forward pass: 3.288522, # First case -> Full forward pass: 2.583231
Apparently, the first timing you calculate seems to end up slower. So, I calculated these timings separately in different runs with fresh kernels. Version #3:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.12s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs)
tac = time()
print(f'# First case -> Full forward pass: {tac - tic:.6f}')
>>> # First case -> Full forward pass: 3.091592
Version #4:
def test():
device = torch.device('cuda:0')
test_set = my_loader(device)
test_loader = DataLoader(test_set, batch_size=16, shuffle=True, num_workers=0)
model = DGCNN().to(device)
model.eval()
#---------- this one is 0.004s --------------#
tic = time()
for inputs, labels in test_loader:
pred = model(inputs.detach().cpu().to(device))
tac = time()
print(f'# Second case -> Full forward pass: {tac - tic:.6f}')
>>> # Second case -> Full forward pass: 3.190248
So, by testing one at a time, it seems like pred = model(inputs) runs slightly faster than pred = model(inputs.detach().cpu().to(device)), which is the obvious expected result.

PyTorch network produces constant output

I am trying to train a simple MLP to approximate y=f(a,b,c).
My code is as below.
import torch
import torch.nn as nn
from torch.autograd import Variable
# hyper parameters
input_size = 3
output_size = 1
num_epochs = 50
learning_rate = 0.001
# Network definition
class FeedForwardNet(nn.Module):
def __init__(self, l1_size, l2_size):
super(FeedForwardNet, self).__init__()
self.fc1 = nn.Linear(input_size, l1_size)
self.relu1 = nn.ReLU()
self.fc2 = nn.Linear(l1_size, l2_size)
self.relu2 = nn.ReLU()
self.fc3 = nn.Linear(l2_size, output_size)
def forward(self, x):
out = self.fc1(x)
out = self.relu1(out)
out = self.fc2(out)
out = self.relu2(out)
out = self.fc3(out)
return out
model = FeedForwardNet(5 , 3)
# sgd optimizer
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)
for epoch in range(11):
print ('Epoch ', epoch)
for i in range(trainX_light.shape[0]):
X = Variable( torch.from_numpy(trainX_light[i]).view(-1, 3) )
Y = Variable( torch.from_numpy(trainY_light[i]).view(-1, 1) )
# forward
optimizer.zero_grad()
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
loss.backward()
optimizer.step()
totalnorm = 0
for p in model.parameters():
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = math.sqrt(totalnorm)
print (totalnorm)
# validation code
if (epoch + 1) % 5 == 0:
print (' test points',testX_light.shape[0])
total_loss = 0
for t in range(testX_light.shape[0]):
X = Variable( torch.from_numpy(testX_light[t]).view(-1, 3) )
Y = Variable( torch.from_numpy(testY_light[t]).view(-1, 1) )
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
total_loss += loss
print ('epoch ', epoch, 'avg_loss ', total_loss.data[0] / testX_light.shape[0])
print ('Done')
The problem that I have now is, the validation code
output = model(X)
is always producing an exact same output value (I guess this value is some sort of garbage). I am not sure what mistake I am doing in this part. Could some help me figure out the mistake in my code?
The reason that network produced random values (and inf later) was the exploding gradient problem. Clipping the gradient (torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)) helped.

Resources