PyTorch simple ConvNet diverges so easily

So I'm studying PyTorch coming from a TensorFlow background.
I'm trying to replicate a simple ConvNet that I developed successfully in TensorFlow to classify cat vs. dog images.
In PyTorch I see some strange behavior:
Using a learning rate of 0.001 makes the CNet predict only 0 after the first batch (might this be exploding gradients?)
Using a learning rate of 0.0005 gives a smooth learning curve and the CNet converges.
Can anyone help me understand what I'm doing wrong? Here is the code:
import pathlib
import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data.dataloader import DataLoader
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CNet(torch.nn.Module):
    def __init__(self):
        super(CNet, self).__init__()  # input is a 180x180 image
        self.conv1 = torch.nn.Conv2d(3, 32, 3)   # out -> 178x178x32
        self.conv2 = torch.nn.Conv2d(32, 64, 3)
        self.conv3 = torch.nn.Conv2d(64, 128, 3)
        self.conv4 = torch.nn.Conv2d(128, 256, 3)
        self.conv5 = torch.nn.Conv2d(256, 256, 3)
        self.flatten = torch.nn.Flatten()
        #self.fc = torch.nn.LazyLinear(1)
        self.fc = torch.nn.Linear(7*7*256, 1)  # spatial size 180 -> 89 -> 43 -> 20 -> 9 -> 7 through the conv/pool stack

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv4(x)), (2, 2))
        x = F.relu(self.conv5(x))
        x = self.flatten(x)
        o = torch.sigmoid(self.fc(x))
        return o
def train(model: CNet, train_data: DataLoader, criterion, optimizer: torch.optim.Optimizer, epochs=10, validation_data: DataLoader = None):
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0.0
        running_loss = 0.0
        for i, data in enumerate(train_data, 0):
            imgs, labels = data
            imgs, labels = imgs.to(device), labels.to(device, dtype=torch.float)
            labels = labels.unsqueeze(-1)
            # forward pass
            output = model(imgs)
            loss = criterion(output, labels)
            # zero out accumulated grads, backprop, and update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            epoch_loss += loss.item()
            #if i % 50 == 49:
            #    print(f'[{epoch+1}, {i:5d}] loss: {running_loss / 50.0:.3f}')
            #    running_loss = 0.0
        losses.append(epoch_loss / len(train_data.dataset))
        print(f'[{epoch+1}, {epochs:5d}] loss: {losses[-1]:.3f}')
    return losses
if __name__ == "__main__":
    transforms = torchvision.transforms.Compose([
        torchvision.transforms.Resize((180, 180)),
        torchvision.transforms.ToTensor(),
    ])
    dataset_dir = pathlib.Path(r"E:\Datasets\torch\Cat_Dog\cats_vs_dogs_small")
    train_data = torchvision.datasets.ImageFolder(dataset_dir / "train", transform=transforms)
    validation_data = torchvision.datasets.ImageFolder(dataset_dir / "validation", transform=transforms)
    test_data = torchvision.datasets.ImageFolder(dataset_dir / "test", transform=transforms)
    train_data_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2, persistent_workers=True, pin_memory=True)
    validation_data_loader = DataLoader(validation_data, batch_size=32, num_workers=2, shuffle=True, pin_memory=True)
    test_data_loader = DataLoader(test_data, batch_size=32, shuffle=True, pin_memory=True, num_workers=2)
    import matplotlib.pyplot as plt
    #plt.figure()
    #for i in range(1, 10):
    #    plt.subplot(3, 3, i)
    #    plt.axis('off')
    #    rand_idx = np.random.randint(0, len(train_data))
    #    plt.imshow(np.moveaxis(test_data[rand_idx][0].numpy(), 0, 2))
    #plt.show()
    net = CNet()
    net = net.to(device)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.RMSprop(net.parameters(), 0.001)
    net.train()
    # TODO save best model
    losses = train(net, train_data_loader, criterion, optimizer, epochs=30)
    epochs = range(1, len(losses) + 1)
    plt.plot(epochs, losses, 'bo', label='Training Loss')
    plt.show()
    print('Training Finished')
    correct_count, all_count = 0, 0
    for images, labels in test_data_loader:
        images, labels = images.to(device), labels.to(device, dtype=torch.float)
        with torch.no_grad():
            ps = net(images)
        pred_label = (ps > 0.5).to(torch.float)
        true_label = labels.unsqueeze(1)
        correct_count += (pred_label == true_label).sum().item()
        all_count += len(labels)
    print("Number Of Images Tested =", all_count)
    print("\nModel Accuracy =", (correct_count / all_count))
And here are some screenshots of the loss for each run:
LR=0.001 (not converging in PyTorch, converging in TensorFlow)
LR=0.0005 (converging in 30 epochs) [I know the validation loss is not 0 and accuracy is ~70%, but that is expected]
As you can see, the losses in the two experiments are very different in scale. What might cause such weird behavior? I call it 'weird' because I have never seen it happen in TensorFlow.
Is such different behavior typical between these two frameworks, or am I missing something?
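[Editor's note] A minimal sketch, not part of the original post, for testing the exploding-gradient hypothesis: log the global gradient norm each step, and if it spikes at LR=0.001, try dropping the final torch.sigmoid in forward() in favor of the numerically more stable BCEWithLogitsLoss on raw logits.

import torch

def global_grad_norm(model: torch.nn.Module) -> float:
    # Square root of the sum of squared L2 norms of all parameter gradients.
    total = 0.0
    for p in model.parameters():
        if p.grad is not None:
            total += p.grad.detach().norm(2).item() ** 2
    return total ** 0.5

# Inside the training loop, after loss.backward():
#     print(f"grad norm: {global_grad_norm(net):.2f}")
# Alternative setup (assumption, not from the post): remove torch.sigmoid from
# forward() and use criterion = torch.nn.BCEWithLogitsLoss() instead of BCELoss.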

Related

Why is my autoencoder not learning the FMNIST dataset?

I am using a simple autoencoder to learn images from the FashionMNIST dataset. I have preprocessed the dataset by grayscaling and normalizing it. I did not make the network too deep, to prevent it from creating a direct mapping.
Here's my PyTorch code -
import torch
import torchvision as tv
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch import nn
import os
from torchviz import make_dot

transforms = tv.transforms.Compose([tv.transforms.Grayscale(num_output_channels=1)])
trainset = tv.datasets.FashionMNIST(root='./data', train=True,
                                    download=True, transform=transforms)
PATH = './ae.pth'

data = trainset.data.float()
data = data / 255
# print(trainset.data.shape)
plt.imshow(trainset.data[0], cmap='gray')
plt.show()

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.encode = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 30),
            nn.ReLU()
        )
        self.decode = nn.Sequential(
            nn.Linear(30, 512),
            nn.ReLU(),
            nn.Linear(512, 28*28),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.flatten(x)
        encoded = self.encode(x)
        decoded = self.decode(encoded)
        return decoded

if os.path.exists(PATH):
    print("Loading data on cpu")
    device = torch.device('cpu')
    model = NeuralNetwork()
    model.load_state_dict(torch.load(PATH, map_location=device))
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    data = data.to(device)
    print(f"Using device = {device}")
    model = NeuralNetwork().to(device)
    # print(model)
    lossFn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    for epoch in range(1000):
        print("Epoch = ", epoch)
        optimizer.zero_grad()
        outputs = model(data)
        loss = lossFn(outputs, data.reshape(-1, 784))
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), PATH)

data = data.to("cpu")
model = model.to("cpu")
pred = model(data)
pred = pred.reshape(-1, 28, 28)
# print(pred.shape)
plt.imshow(pred.detach().numpy()[0], cmap='gray')
plt.show()
For testing, I am inputting the following image -
However, I get this as output -
I had an intuition that there was an issue with your loss function. When working with images, distance-based losses such as L1 or L2 work really well, since you are essentially measuring how far away your predictions are from the ground-truth images. That is what I observed here as well: the loss wasn't converging with BCE, it was oscillating instead.
I rewrote the entire thing and replaced the BCE loss with MSE loss; in just 50 epochs the loss has gone down considerably, and it is still going down.
Here is the prediction after just 50 epochs -
The ground-truth image is -
I believe that you can get the loss down much more if you train for longer.
Here is the full code. I used a dataloader for batchifying and processing the data.
I also changed the transformations so that the resulting data is a torch tensor.
import torch
import torchvision as tv
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

transforms = tv.transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor()
])
trainset = tv.datasets.FashionMNIST(root='./data', train=True,
                                    download=True, transform=transforms)
loader = DataLoader(trainset, batch_size=32, num_workers=1, shuffle=True)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.encode = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 30),
            nn.ReLU()
        )
        self.decode = nn.Sequential(
            nn.Linear(30, 512),
            nn.ReLU(),
            nn.Linear(512, 28*28),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.flatten(x)
        encoded = self.encode(x)
        decoded = self.decode(encoded)
        return decoded

model = NeuralNetwork().to(device)
lossFn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
epochs = 50

for epoch in range(epochs):
    for images, labels in loader:
        optimizer.zero_grad()
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = lossFn(outputs, images.reshape(-1, 28*28))
        loss.backward()
        optimizer.step()
    print(f'Loss : {loss.item()}')
    print(f'Epochs done : {epoch}')
Here is some inference code -
# infer on some test data
testset = tv.datasets.FashionMNIST(root='./data', train=False,
download=False, transform=transforms)
testloader = DataLoader(testset, shuffle=False, batch_size=32, num_workers=1)
test_images, test_labels = next(iter(testloader))
test_images = test_images.to(device)
predictions = model(test_images)
prediction = predictions[0]
prediction = prediction.view(1, 28, 28)
prediction = prediction.detach().cpu().numpy()
prediction = prediction.transpose(1, 2, 0)
# plot the prediction
plt.imshow(prediction, cmap = 'gray')
plt.show()
# plot the actual image
test_image = test_images[0]
test_image = test_image.detach().cpu().numpy()
test_image = test_image.transpose(1, 2, 0)
plt.imshow(test_image, cmap='gray')
plt.show()
This is the loss going down --
Epochs done : 39
Loss : 0.04641226679086685
Epochs done : 40
Loss : 0.04445071145892143
Epochs done : 41
Loss : 0.05033266171813011
Epochs done : 42
Loss : 0.04813298210501671
Epochs done : 43
Loss : 0.0474831722676754
Epochs done : 44
Loss : 0.044186390936374664
Epochs done : 45
Loss : 0.049083154648542404
Epochs done : 46
Loss : 0.04645842686295509
Epochs done : 47
Loss : 0.04586248844861984
Epochs done : 48
Loss : 0.0467853844165802
Epochs done : 49

Function AddmmBackward returned an invalid gradient

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

def UploadData(path, train):
    # set up transforms for train and test datasets
    train_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1), transforms.Resize(255), transforms.CenterCrop(224), transforms.RandomRotation(30),
                                           transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    valid_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1), transforms.Resize(255), transforms.CenterCrop(224), transforms.RandomRotation(30),
                                           transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    test_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1), transforms.Resize(255), transforms.CenterCrop(224), transforms.ToTensor()])
    # set up datasets from Image Folders
    train_dataset = datasets.ImageFolder(path + '/train', transform=train_transforms)
    valid_dataset = datasets.ImageFolder(path + '/validation', transform=valid_transforms)
    test_dataset = datasets.ImageFolder(path + '/test', transform=test_transforms)
    # set up dataloaders with batch size of 32
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=True)
    testloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)
    return trainloader, validloader, testloader

trainloader, validloader, testloader = UploadData("/home/lns/research/dataset", True)

epochs = 5
min_valid_loss = np.inf
for e in range(epochs):
    train_loss = 0.0
    for data, labels in trainloader:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            print("using GPU for data")
            data, labels = data.cuda(), labels.cuda()
        # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        target = net(data)
        # Find the Loss
        loss = criterion(target, labels)
        # Calculate gradients
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate Loss
        train_loss += loss.item()

    valid_loss = 0.0
    net.eval()  # Optional when not using Model Specific layer
    for data, labels in validloader:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            print("using GPU for data")
            data, labels = data.cuda(), labels.cuda()
        # Forward Pass
        target = net(data)
        # Find the Loss
        loss = criterion(target, labels)
        # Calculate Loss
        valid_loss += loss.item()

    print('Epoch ', e+1, '\t\t Training Loss: ', train_loss / len(trainloader), ' \t\t Validation Loss: ', valid_loss / len(validloader))
    if min_valid_loss > valid_loss:
        print("Validation Loss Decreased(", min_valid_loss, "--->", valid_loss, ") \t Saving The Model")
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(net.state_dict(), '/home/lns/research/MODEL.pth')
After searching a lot, I am asking for help. Can someone help me
understand why this error is occurring in backward propagation?
I followed the PyTorch CNN tutorial and a GeeksforGeeks tutorial.
The dataset is X-ray images converted to grayscale and resized to 255.
Is my neural network wrong, or is the data not processed correctly?
This is a size mismatch between the output of your CNN and the number of neurons in your first fully-connected layer. Because of the missing padding, the number of elements when flattened is 16*4*4, i.e. 256 (and not 16*5*5):
self.fc1 = nn.Linear(256, 120)
Once modified, the model will run correctly:
>>> model = NeuralNetwork()
>>> model(torch.rand(1, 1, 28, 28)).shape
torch.Size([1, 3])
Alternatively, you can use nn.LazyLinear, which will deduce the in_features argument during the very first inference based on its input shape.
self.fc1 = nn.LazyLinear(120)
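As a rough illustration (not part of the original answer, assuming the imports from the question), the lazy layer's in_features is only materialized on the first forward pass:
>>> fc1 = nn.LazyLinear(120)
>>> fc1(torch.rand(1, 16 * 4 * 4)).shape   # first call infers in_features=256
torch.Size([1, 120])
>>> fc1.weight.shape
torch.Size([120, 256])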

TF 2.0 Error: Gradients does not exist for variables during training using gradienttape

I tried to make a class using the BatchNormalization layer from TF 2.0; however, it gave me an error that gradients do not exist for variables. I tried to use BatchNormalization directly, but it gave me the same error as well. It seems like it is not training the variables related to the batch normalization step.
I tried to use model.trainable_variables instead of model.variables but it didn't work either.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import ndimage

learning_rate = 0.001
training_epochs = 15
batch_size = 100

tf.random.set_seed(777)

cur_dir = os.getcwd()
ckpt_dir_name = 'checkpoints'
model_dir_name = 'minst_cnn_best'

checkpoint_dir = os.path.join(cur_dir, ckpt_dir_name, model_dir_name)
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_prefix = os.path.join(checkpoint_dir, model_dir_name)

mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

train_images = train_images.astype(np.float32) / 255.
test_images = test_images.astype(np.float32) / 255.
print(train_images.shape, test_images.shape)

train_images = np.expand_dims(train_images, axis=-1)
test_images = np.expand_dims(test_images, axis=-1)
print(train_images.shape, test_images.shape)

train_labels = to_categorical(train_labels, 10)
test_labels = to_categorical(test_labels, 10)

train_dataset = tf.data.Dataset.from_tensor_slices((train_images,
                train_labels)).shuffle(buffer_size=100000).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((test_images,
               test_labels)).batch(batch_size)

class ConvBNRelu(tf.keras.Model):
    def __init__(self, filters, kernel_size=3, strides=1, padding='SAME'):
        super(ConvBNRelu, self).__init__()
        self.conv = keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides,
                                        padding=padding, kernel_initializer='glorot_normal')
        self.batchnorm = tf.keras.layers.BatchNormalization()

    def call(self, inputs, training=False):
        layer = self.conv(inputs)
        layer = self.batchnorm(layer)
        layer = tf.nn.relu(layer)
        return layer

class DenseBNRelu(tf.keras.Model):
    def __init__(self, units):
        super(DenseBNRelu, self).__init__()
        self.dense = keras.layers.Dense(units=units, kernel_initializer='glorot_normal')
        self.batchnorm = tf.keras.layers.BatchNormalization()

    def call(self, inputs, training=False):
        layer = self.dense(inputs)
        layer = self.batchnorm(layer)
        layer = tf.nn.relu(layer)
        return layer

class MNISTModel(tf.keras.Model):
    def __init__(self):
        super(MNISTModel, self).__init__()
        self.conv1 = ConvBNRelu(filters=32, kernel_size=[3, 3], padding='SAME')
        self.pool1 = keras.layers.MaxPool2D(padding='SAME')
        self.conv2 = ConvBNRelu(filters=64, kernel_size=[3, 3], padding='SAME')
        self.pool2 = keras.layers.MaxPool2D(padding='SAME')
        self.conv3 = ConvBNRelu(filters=128, kernel_size=[3, 3], padding='SAME')
        self.pool3 = keras.layers.MaxPool2D(padding='SAME')
        self.pool3_flat = keras.layers.Flatten()
        self.dense4 = DenseBNRelu(units=256)
        self.drop4 = keras.layers.Dropout(rate=0.4)
        self.dense5 = keras.layers.Dense(units=10, kernel_initializer='glorot_normal')

    def call(self, inputs, training=False):
        net = self.conv1(inputs)
        net = self.pool1(net)
        net = self.conv2(net)
        net = self.pool2(net)
        net = self.conv3(net)
        net = self.pool3(net)
        net = self.pool3_flat(net)
        net = self.dense4(net)
        net = self.drop4(net)
        net = self.dense5(net)
        return net

models = []
num_models = 5
for m in range(num_models):
    models.append(MNISTModel())

def loss_fn(model, images, labels):
    logits = model(images, training=True)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=labels))
    return loss

def grad(model, images, labels):
    with tf.GradientTape() as tape:
        loss = loss_fn(model, images, labels)
    return tape.gradient(loss, model.variables)

def evaluate(models, images, labels):
    predictions = np.zeros_like(labels)
    for model in models:
        logits = model(images, training=False)
        predictions += logits
    correct_prediction = tf.equal(tf.argmax(predictions, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

checkpoints = []
for m in range(num_models):
    checkpoints.append(tf.train.Checkpoint(cnn=models[m]))

for epoch in range(training_epochs):
    avg_loss = 0.
    avg_train_acc = 0.
    avg_test_acc = 0.
    train_step = 0
    test_step = 0

    for images, labels in train_dataset:
        for model in models:
            grads = grad(model, images, labels)
            optimizer.apply_gradients(zip(grads, model.variables))
            loss = loss_fn(model, images, labels)
            avg_loss += loss / num_models
        acc = evaluate(models, images, labels)
        avg_train_acc += acc
        train_step += 1
    avg_loss = avg_loss / train_step
    avg_train_acc = avg_train_acc / train_step

    for images, labels in test_dataset:
        acc = evaluate(models, images, labels)
        avg_test_acc += acc
        test_step += 1
    avg_test_acc = avg_test_acc / test_step

    print('Epoch:', '{}'.format(epoch + 1), 'loss =', '{:.8f}'.format(avg_loss),
          'train accuracy = ', '{:.4f}'.format(avg_train_acc),
          'test accuracy = ', '{:.4f}'.format(avg_test_acc))

    for idx, checkpoint in enumerate(checkpoints):
        checkpoint.save(file_prefix=checkpoint_prefix + '-{}'.format(idx))

print('Learning Finished!')
W0727 20:27:05.344142 140332288718656 optimizer_v2.py:982] Gradients does not exist for variables ['mnist_model/conv_bn_relu/batch_normalization/moving_mean:0', 'mnist_model/conv_bn_relu/batch_normalization/moving_variance:0', 'mnist_model/conv_bn_relu_1/batch_normalization_1/moving_mean:0', 'mnist_model/conv_bn_relu_1/batch_normalization_1/moving_variance:0', 'mnist_model/conv_bn_relu_2/batch_normalization_2/moving_mean:0', 'mnist_model/conv_bn_relu_2/batch_normalization_2/moving_variance:0', 'mnist_model/dense_bn_relu/batch_normalization_3/moving_mean:0', 'mnist_model/dense_bn_relu/batch_normalization_3/moving_variance:0'] when minimizing the loss.
W0727 20:27:05.407717 140332288718656 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:460: BaseResourceVariable.constraint (from tensorflow.python.ops.resource_variable_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Apply a constraint manually following the optimizer update step.
W0727 20:27:05.499249 140332288718656 optimizer_v2.py:982] Gradients does not exist for variables ['mnist_model_1/conv_bn_relu_3/batch_normalization_4/moving_mean:0', 'mnist_model_1/conv_bn_relu_3/batch_normalization_4/moving_variance:0', 'mnist_model_1/conv_bn_relu_4/batch_normalization_5/moving_mean:0', 'mnist_model_1/conv_bn_relu_4/batch_normalization_5/moving_variance:0', 'mnist_model_1/conv_bn_relu_5/batch_normalization_6/moving_mean:0', 'mnist_model_1/conv_bn_relu_5/batch_normalization_6/moving_variance:0', 'mnist_model_1/dense_bn_relu_1/batch_normalization_7/moving_mean:0', 'mnist_model_1/dense_bn_relu_1/batch_normalization_7/moving_variance:0'] when minimizing the loss.
...
You're computing the gradient of the loss with respect to model.variables: this collection contains not only the trainable variables (the model weights) but also non-trainable variables like the moving mean and variance computed by the batch normalization layer.
You have to compute the gradient with respect to the trainable_variables. In short, change the lines
return tape.gradient(loss, model.variables)
and
optimizer.apply_gradients(zip(grads, model.variables))
to
return tape.gradient(loss, model.trainable_variables)
and
optimizer.apply_gradients(zip(grads, model.trainable_variables))
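As a quick sanity check (a sketch, not part of the original answer), you can see the split on a single BatchNormalization layer: gamma and beta are trainable, while the moving statistics are not and therefore have no gradient.

import tensorflow as tf

bn = tf.keras.layers.BatchNormalization()
bn.build(input_shape=(None, 4))  # create the layer's variables

print([v.name for v in bn.trainable_variables])
# e.g. ['batch_normalization/gamma:0', 'batch_normalization/beta:0']
print([v.name for v in bn.non_trainable_variables])
# e.g. ['batch_normalization/moving_mean:0', 'batch_normalization/moving_variance:0']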

PyTorch: Add validation error in training

I am using PyTorch to train a CNN model. Here is my network architecture:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as I

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1_bn = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.conv2_drop = nn.Dropout2d()
        self.conv2_bn = nn.BatchNorm2d(64)
        self.fc1 = torch.nn.Linear(53*53*64, 256)
        self.fc2 = nn.Linear(256, 136)

    def forward(self, x):
        x = F.relu(self.conv1_bn(self.pool(self.conv1(x))))
        x = F.relu(self.conv2_bn(self.pool(self.conv2_drop(self.conv2(x)))))
        x = x.view(-1, 53*53*64)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return x
Then I train the model like below:
# prepare the net for training
net.train()

for epoch in range(n_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    # train on batches of data, assumes you already have train_loader
    for batch_i, data in enumerate(train_loader):
        # get the input images and their corresponding labels
        images = data['image']
        key_pts = data['keypoints']
        # flatten pts
        key_pts = key_pts.view(key_pts.size(0), -1)
        # wrap them in a torch Variable
        images, key_pts = Variable(images), Variable(key_pts)
        # convert variables to floats for regression loss
        key_pts = key_pts.type(torch.FloatTensor)
        images = images.type(torch.FloatTensor)
        # forward pass to get outputs
        output_pts = net(images)
        # calculate the loss between predicted and target keypoints
        loss = criterion(output_pts, key_pts)
        # zero the parameter (weight) gradients
        optimizer.zero_grad()
        # backward pass to calculate the weight gradients
        loss.backward()
        # update the weights
        optimizer.step()
        # print loss statistics
        running_loss += loss.data[0]
I am wondering whether it is possible to compute the validation error during training, something like this (validation_split) in Keras:
myModel.fit(trainX, trainY, epochs=50, batch_size=1, verbose=2, validation_split = 0.1)
Here is an example of how to split your dataset for training and validation, and then switch between the two phases every epoch:
import numpy as np
import torch
from torchvision import datasets
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

# Example:
my_dataset = datasets.MNIST(root="/home/benjamin/datasets/mnist", train=True, download=True)
validation_split = 0.1

dataset_len = len(my_dataset)
indices = list(range(dataset_len))

# Randomly splitting indices:
val_len = int(np.floor(validation_split * dataset_len))
validation_idx = np.random.choice(indices, size=val_len, replace=False)
train_idx = list(set(indices) - set(validation_idx))

# Contiguous split
# train_idx, validation_idx = indices[split:], indices[:split]

## Defining the samplers for each phase based on the random indices:
train_sampler = SubsetRandomSampler(train_idx)
validation_sampler = SubsetRandomSampler(validation_idx)

train_loader = torch.utils.data.DataLoader(my_dataset, sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(my_dataset, sampler=validation_sampler)
data_loaders = {"train": train_loader, "val": validation_loader}
data_lengths = {"train": len(train_idx), "val": val_len}

# Training with validation (your code + code from the PyTorch tutorial:
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)
n_epochs = 40
net = ...

for epoch in range(n_epochs):
    print('Epoch {}/{}'.format(epoch, n_epochs - 1))
    print('-' * 10)

    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            optimizer = scheduler(optimizer, epoch)
            net.train(True)   # Set model to training mode
        else:
            net.train(False)  # Set model to evaluate mode

        running_loss = 0.0

        # Iterate over data.
        for data in data_loaders[phase]:
            # get the input images and their corresponding labels
            images = data['image']
            key_pts = data['keypoints']
            # flatten pts
            key_pts = key_pts.view(key_pts.size(0), -1)
            # wrap them in a torch Variable
            images, key_pts = Variable(images), Variable(key_pts)
            # convert variables to floats for regression loss
            key_pts = key_pts.type(torch.FloatTensor)
            images = images.type(torch.FloatTensor)
            # forward pass to get outputs
            output_pts = net(images)
            # calculate the loss between predicted and target keypoints
            loss = criterion(output_pts, key_pts)
            # zero the parameter (weight) gradients
            optimizer.zero_grad()
            # backward + optimize only if in training phase
            if phase == 'train':
                loss.backward()
                # update the weights
                optimizer.step()
            # print loss statistics
            running_loss += loss.data[0]

        epoch_loss = running_loss / data_lengths[phase]
        print('{} Loss: {:.4f}'.format(phase, epoch_loss))
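On more recent PyTorch versions, a shorter alternative (my sketch, not part of the original answer) is torch.utils.data.random_split, which avoids managing index lists and samplers by hand:

from torch.utils.data import DataLoader, random_split

# Split off 10% of the dataset for validation.
val_len = int(0.1 * len(my_dataset))
train_set, val_set = random_split(my_dataset, [len(my_dataset) - val_len, val_len])

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
validation_loader = DataLoader(val_set, batch_size=32, shuffle=False)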

PyTorch network produces constant output

I am trying to train a simple MLP to approximate y=f(a,b,c).
My code is as follows.
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

# hyper parameters
input_size = 3
output_size = 1
num_epochs = 50
learning_rate = 0.001

# Network definition
class FeedForwardNet(nn.Module):
    def __init__(self, l1_size, l2_size):
        super(FeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(input_size, l1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(l1_size, l2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(l2_size, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

model = FeedForwardNet(5, 3)

# sgd optimizer
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)

for epoch in range(11):
    print('Epoch ', epoch)
    for i in range(trainX_light.shape[0]):
        X = Variable(torch.from_numpy(trainX_light[i]).view(-1, 3))
        Y = Variable(torch.from_numpy(trainY_light[i]).view(-1, 1))
        # forward
        optimizer.zero_grad()
        output = model(X)
        loss = (Y - output).pow(2).sum()
        print(output.data[0, 0])
        loss.backward()
        optimizer.step()
        totalnorm = 0
        for p in model.parameters():
            modulenorm = p.grad.data.norm()
            totalnorm += modulenorm ** 2
        totalnorm = math.sqrt(totalnorm)
        print(totalnorm)
    # validation code
    if (epoch + 1) % 5 == 0:
        print(' test points', testX_light.shape[0])
        total_loss = 0
        for t in range(testX_light.shape[0]):
            X = Variable(torch.from_numpy(testX_light[t]).view(-1, 3))
            Y = Variable(torch.from_numpy(testY_light[t]).view(-1, 1))
            output = model(X)
            loss = (Y - output).pow(2).sum()
            print(output.data[0, 0])
            total_loss += loss
        print('epoch ', epoch, 'avg_loss ', total_loss.data[0] / testX_light.shape[0])

print('Done')
The problem I have now is that the validation code
output = model(X)
always produces the exact same output value (I guess this value is some sort of garbage). I am not sure what mistake I am making in this part. Could someone help me figure out the mistake in my code?
The reason the network produced random values (and inf later) was the exploding gradient problem. Clipping the gradients with torch.nn.utils.clip_grad_norm(model.parameters(), 0.1) helped.
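For reference, a minimal sketch (my addition, assuming the training loop from the question) of where the clipping call goes: between the backward pass and the optimizer step.

optimizer.zero_grad()
output = model(X)
loss = (Y - output).pow(2).sum()
loss.backward()
# clip the global gradient norm to 0.1 before updating the weights
# (newer PyTorch versions name this torch.nn.utils.clip_grad_norm_)
torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)
optimizer.step()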
