Why is the output of a convolutional network so exponentially large? - pytorch

I am trying to reproduce a unet result on Carvana dataset using Ternausnet in PyTorch using Lightning.
I am using DiceLoss for that with sigmoid activation function. I think I am running into an issue of a vanishing gradient, because all gradients of weights are 0, and I see the output of the network with min value of order 10^8.
What could be the issue here? How can I address the vanishing gradient? Also, if I use a different criterion, I see a problem of loss going into negative values without stopping (for BCE with logits, for instance).
Here is the code for my Dice loss:
class DiceLoss(nn.Module):
def __init__(self):
super().__init__()
def forward(self, logits, targets, eps=0, threshold=None):
# comment out if your model contains a sigmoid or
# equivalent activation layer
proba = torch.sigmoid(logits)
proba = proba.view(proba.shape[0], 1, -1)
targets = targets.view(targets.shape[0], 1, -1)
if threshold:
proba = (proba > threshold).float()
# flatten label and prediction tensors
intersection = torch.sum(proba * targets, dim=1)
summation = torch.sum(proba, dim=1) + torch.sum(targets, dim=1)
dice = (2.0 * intersection + eps) / (summation + eps)
# print(intersection, summation, dice)
return (1 - dice).mean()

Related

Measuring uncertainty using MC Dropout on pytorch

I am trying to implement Bayesian CNN using Mc Dropout on Pytorch,
the main idea is that by applying dropout at test time and running over many forward passes , you get predictions from a variety of different models.
I’ve found an application of the Mc Dropout and I really did not get how they applied this method and how exactly they did choose the correct prediction from the list of predictions
here is the code
def mcdropout_test(model):
model.train()
test_loss = 0
correct = 0
T = 100
for data, target in test_loader:
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data, volatile=True), Variable(target)
output_list = []
for i in xrange(T):
output_list.append(torch.unsqueeze(model(data), 0))
output_mean = torch.cat(output_list, 0).mean(0)
test_loss += F.nll_loss(F.log_softmax(output_mean), target, size_average=False).data[0] # sum up batch loss
pred = output_mean.data.max(1, keepdim=True)[1] # get the index of the max log-probability
correct += pred.eq(target.data.view_as(pred)).cpu().sum()
test_loss /= len(test_loader.dataset)
print('\nMC Dropout Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
train()
mcdropout_test()
I have replaced
data, target = Variable(data, volatile=True), Variable(target)
by adding
with torch.no_grad(): at the beginning
And this is how I have defined my CNN
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 192, 5, padding=2)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(192, 192, 5, padding=2)
self.fc1 = nn.Linear(192 * 8 * 8, 1024)
self.fc2 = nn.Linear(1024, 256)
self.fc3 = nn.Linear(256, 10)
self.dropout = nn.Dropout(p=0.3)
nn.init.xavier_uniform_(self.conv1.weight)
nn.init.constant_(self.conv1.bias, 0.0)
nn.init.xavier_uniform_(self.conv2.weight)
nn.init.constant_(self.conv2.bias, 0.0)
nn.init.xavier_uniform_(self.fc1.weight)
nn.init.constant_(self.fc1.bias, 0.0)
nn.init.xavier_uniform_(self.fc2.weight)
nn.init.constant_(self.fc2.bias, 0.0)
nn.init.xavier_uniform_(self.fc3.weight)
nn.init.constant_(self.fc3.bias, 0.0)
def forward(self, x):
x = self.pool(F.relu(self.dropout(self.conv1(x)))) # recommended to add the relu
x = self.pool(F.relu(self.dropout(self.conv2(x)))) # recommended to add the relu
x = x.view(-1, 192 * 8 * 8)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(self.dropout(x)))
x = self.fc3(self.dropout(x)) # no activation function needed for the last layer
return x
Can anyone help me to get the right implementation of the Monte Carlo Dropout method on CNN?
Implementing MC Dropout in Pytorch is easy. All that is needed to be done is to set the dropout layers of your model to train mode. This allows for different dropout masks to be used during the different various forward passes. Below is an implementation of MC Dropout in Pytorch illustrating how multiple predictions from the various forward passes are stacked together and used for computing different uncertainty metrics.
import sys
import numpy as np
import torch
import torch.nn as nn
def enable_dropout(model):
""" Function to enable the dropout layers during test-time """
for m in model.modules():
if m.__class__.__name__.startswith('Dropout'):
m.train()
def get_monte_carlo_predictions(data_loader,
forward_passes,
model,
n_classes,
n_samples):
""" Function to get the monte-carlo samples and uncertainty estimates
through multiple forward passes
Parameters
----------
data_loader : object
data loader object from the data loader module
forward_passes : int
number of monte-carlo samples/forward passes
model : object
keras model
n_classes : int
number of classes in the dataset
n_samples : int
number of samples in the test set
"""
dropout_predictions = np.empty((0, n_samples, n_classes))
softmax = nn.Softmax(dim=1)
for i in range(forward_passes):
predictions = np.empty((0, n_classes))
model.eval()
enable_dropout(model)
for i, (image, label) in enumerate(data_loader):
image = image.to(torch.device('cuda'))
with torch.no_grad():
output = model(image)
output = softmax(output) # shape (n_samples, n_classes)
predictions = np.vstack((predictions, output.cpu().numpy()))
dropout_predictions = np.vstack((dropout_predictions,
predictions[np.newaxis, :, :]))
# dropout predictions - shape (forward_passes, n_samples, n_classes)
# Calculating mean across multiple MCD forward passes
mean = np.mean(dropout_predictions, axis=0) # shape (n_samples, n_classes)
# Calculating variance across multiple MCD forward passes
variance = np.var(dropout_predictions, axis=0) # shape (n_samples, n_classes)
epsilon = sys.float_info.min
# Calculating entropy across multiple MCD forward passes
entropy = -np.sum(mean*np.log(mean + epsilon), axis=-1) # shape (n_samples,)
# Calculating mutual information across multiple MCD forward passes
mutual_info = entropy - np.mean(np.sum(-dropout_predictions*np.log(dropout_predictions + epsilon),
axis=-1), axis=0) # shape (n_samples,)
Moving on to the implementation which is posted in the question above, multiple predictions from T different forward passes are obtained by first setting the model to train mode (model.train()). Note that this is not desirable because unwanted stochasticity will be introduced in the predictions if there are layers other than dropout such as batch-norm in the model. Hence the best way is to just set the dropout layers to train mode as shown in the snippet above.

How to write a custom f1 loss function with weighted average for keras?

I am trying to do a multiclass classification in keras. Till now I am using categorical_crossentropy
as the loss function. But since the metric required is weighted-f1, I am not sure if categorical_crossentropy is the best loss choice. I was trying to implement a weighted-f1 score in keras using sklearn.metrics.f1_score, but due to the problems in conversion between a tensor and a scalar, I am running into errors.
Something like this:
def f1_loss(y_true, y_pred):
return 1 - f1_score(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), average='weighted')
Followed by
model.compile(loss=f1_loss, optimizer=opt)
How do I write this loss function in keras?
Edit:
Shape for y_true and y_pred is (n_samples, n_classes) in my case it is (n_samples, 4)
y_true and y_pred both are tensors so sklearn's f1_score cannot work directly on them. I need a function that calculates weighted f1 on tensors.
The variables are self explained:
def f1_weighted(true, pred): #shapes (batch, 4)
#for metrics include these two lines, for loss, don't include them
#these are meant to round 'pred' to exactly zeros and ones
#predLabels = K.argmax(pred, axis=-1)
#pred = K.one_hot(predLabels, 4)
ground_positives = K.sum(true, axis=0) + K.epsilon() # = TP + FN
pred_positives = K.sum(pred, axis=0) + K.epsilon() # = TP + FP
true_positives = K.sum(true * pred, axis=0) + K.epsilon() # = TP
#all with shape (4,)
precision = true_positives / pred_positives
recall = true_positives / ground_positives
#both = 1 if ground_positives == 0 or pred_positives == 0
#shape (4,)
f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
#still with shape (4,)
weighted_f1 = f1 * ground_positives / K.sum(ground_positives)
weighted_f1 = K.sum(weighted_f1)
return 1 - weighted_f1 #for metrics, return only 'weighted_f1'
Important notes:
This loss will work batchwise (as any Keras loss).
So if you are working with small batch sizes, the results will be unstable between each batch, and you may get a bad result. Use big batch sizes, enough to include a significant number of samples for all classes.
Since this loss collapses the batch size, you will not be able to use some Keras features that depend on the batch size, such as sample weights, for instance.

Pytorch customize weight

I have a network
class Net(nn.Module)
and two different weights w0 and w1 (concatenate weights of all layers into a vector). Now I want to optimize the network on the line connecting w0 and w1, which means that the weight will have the form theta * w0 + (1-theta) * w1. So now the parameter I want to optimize is no longer the weight itself, but the theta.
How can I implement this? In Pytorch, how can I define the parameter to be theta, and set the weight to be form I want. To be specific, if I create a new class
NetOnLine(nn.Module)
how should I write the forward(self, X) function?
You can define the parameter theta in your net as an nn.Parameter. You'd define the forward function the same way as normal - pass the data through the layers or operations you want and then return it.
Here's a minimal example, where I train a "network" to learn to multiply a Tensor by 2:
import numpy as np
import torch
class SampleNet(torch.nn.Module):
def __init__(self):
super(SampleNet, self).__init__()
self.theta = torch.nn.Parameter(torch.rand(1))
def forward(self, x):
x = x * self.theta.expand_as(x) # expand_as() to match sizes
return x
train_data = np.random.rand(1000, 10)
train_data[:, 5:] = 2 * train_data[:, :5]
train_data = torch.Tensor(train_data)
sample_net = SampleNet()
optimizer = torch.optim.Adam(params=sample_net.parameters())
mse_loss = torch.nn.MSELoss()
for epoch in range(5):
for data in train_data:
x = data[:5]
y = data[5:]
optimizer.zero_grad()
prediction = sample_net(x)
loss = mse_loss(y, prediction)
loss.backward()
optimizer.step()
print(f"Epoch {epoch}, Loss {loss.data.item()}")
print(f"Learned theta: {sample_net.theta.data.item()}")
which prints out
Epoch 0, Loss 0.03369491919875145
Epoch 1, Loss 0.0018534092232584953
Epoch 2, Loss 1.2343853995844256e-05
Epoch 3, Loss 2.2044337466553543e-09
Epoch 4, Loss 4.0527581290916714e-12
Learned theta: 1.999994158744812

weighted_masked_objective in keras

According to Keras Code, Keras computes the loss value considering optional weights and masks.
for i in range(len(self.outputs)):
if i in skip_target_indices:
continue
y_true = self.targets[i]
y_pred = self.outputs[i]
weighted_loss = weighted_losses[i]
sample_weight = sample_weights[i]
mask = masks[i]
loss_weight = loss_weights_list[i]
with K.name_scope(self.output_names[i] + '_loss'):
output_loss = weighted_loss(y_true, y_pred,
sample_weight, mask)
if len(self.outputs) > 1:
self.metrics_tensors.append(output_loss)
self.metrics_names.append(self.output_names[i] + '_loss')
if total_loss is None:
total_loss = loss_weight * output_loss
else:
total_loss += loss_weight * output_loss
On the other hand, in Keras documentation I see the basic loss function is introduced in compile function, and then sample or class weights can be introduced in fit command.
I am not sure how to relate 'weights and masks' in the first code to 'sample and class weights' in the second document. Can anybody give me more explanation?
My application is actually a Convolutional LSTM network, where I input a series of images and want the network to produce an output map (with the same size of input maps) of pixel classes, but some pixels don't have valid labels during training. Should I use weight or mask, sample or class?

Keras, binary segmentation, add weight to loss function

I'm solving a binary segmentation problem with Keras (w. tf backend). How can I add more weight to the center of each area of mask?
I've tried dice coef with added cv2.erode(), but it doesn't work
def dice_coef_eroded(y_true, y_pred):
kernel = (3, 3)
y_true = cv2.erode(y_true.eval(), kernel, iterations=1)
y_pred = cv2.erode(y_pred.eval(), kernel, iterations=1)
y_true_f = K.flatten(y_true)
y_pred_f = K.flatten(y_pred)
intersection = K.sum(y_true_f * y_pred_f)
return (2. * intersection + 1) / (K.sum(y_true_f) + K.sum(y_pred_f) + 1)
Keras 2.1.3, tensorflow 1.4
All right, the solution I found is following:
1) Create in your Iterator a method to retrieve weights' matrix (with shape = mask shape). The output must contain [image, mask, weights]
2) Create a Lambda layer containing loss function
3) Create an Identity loss function
Example:
def weighted_binary_loss(X):
import keras.backend as K
import keras.layers.merge as merge
y_pred, weights, y_true = X
loss = K.binary_crossentropy(y_pred, y_true)
loss = merge([loss, weights], mode='mul')
return loss
def identity_loss(y_true, y_pred):
return y_pred
def get_unet_w_lambda_loss(input_shape=(1024, 1024, 3), mask_shape=(1024, 1024, 1)):
images = Input(input_shape)
mask_weights = Input(mask_shape)
true_masks = Input(mask_shape)
...
y_pred = Conv2D(1, (1, 1), activation='sigmoid')(up1) #output of original unet
loss = Lambda(weighted_binary_loss, output_shape=(1024, 1024, 1))([y_pred, mask_weights, true_masks])
model = Model(inputs=[images, mask_weights, true_masks], outputs=loss)
I'm implementing this solution but I wonder what should be the ground truth that we must give to the network. That is, now the output is the loss, and we want the loss to be 0, so should we train the network as follows?
model = get_unet_w_lambda_loss()
model.fit([inputs, weights, masks], zero_images)

Resources