How to check accuracy on BCELoss Pytorch? - pytorch

I'm trying to use Pytorch to take a HeartDisease.csv and predict whether the patient has heart disease or not... the .csv provides 13 inputs and 1 target
I'm using BCELoss and I'm having trouble understanding how to write an accuracy check function.
My num_samples is correct but not my num_correct. I think this is a result of not understanding the predictions tensor. Right now my num_correct is usually over 8000 while my num_samples is 303...
Any insight on how to write this check accuracy function is much appreciated
I wrote this on a google co lab
#imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
#create fully connected network
class NN(nn.Module):
def __init__(self, input_size, num_classes):
super(NN, self).__init__()
self.outputs = nn.Linear(input_size, 1)
def forward(self, x):
x = self.outputs(x)
return torch.sigmoid(x)
#set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#hyperparameters
input_size = 13 # 13 inputs
num_classes = 1 # heartdisease or not
learning_rate = 0.001
batch_size = 64
num_epochs = 1
#load data
class MyDataset(Dataset):
def __init__(self, root, n_inp):
self.df = pd.read_csv(root)
self.data = self.df.to_numpy()
self.x , self.y = (torch.from_numpy(self.data[:,:n_inp]),
torch.from_numpy(self.data[:,n_inp:]))
def __getitem__(self, idx):
return self.x[idx, :], self.y[idx,:]
def __len__(self):
return len(self.data)
train_dataset = MyDataset("heart.csv", input_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle =True)
test_dataset = MyDataset("heart.csv", input_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle =True)
#initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
#loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#train network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
#get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
#forward
scores = model(data.float())
targets = targets.float()
loss = criterion(scores, targets)
#backward
optimizer.zero_grad()
loss.backward()
#grad descent or adam step
optimizer.step()
#check accuracy of model
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x.float())
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print("Got {} / {} with accuracy {}".format(num_correct, num_samples, float(num_correct)/float(num_samples)*100))
model.train()
print("checking accuracy on training data")
check_accuracy(train_loader, model)
print("checking accuracy on test data")
check_accuracy(test_loader, model)

Note: Don't fool yourself. A single linear layer + a sigmoid + BCE loss = logistic regression. This is a linear model, so just take note of that when referring to it as a "neural network", which is a term usually reserved for similar networks but with at least one hidden layer and nonlinear activations.
The sigmoid layer at the end of your model's forward() function returns an (N,1)-sized tensor, where N is the batch size. In other words, it returns a scalar for every data point. Each scalar is a value between 0 and 1 (this is the range of the sigmoid function).
The idea is to interpret those scalars as probabilities corresponding to the positive class. Suppose 1 corresponds to heart disease, and 0 corresponds to no heart disease; heart disease is the positive class, and no heart disease is the negative class. Now suppose a score is 0.6. This might be interpreted as a 60% chance that the associated label is heart disease, and a 40% chance that the associated label is no heart disease. This interpretation of the sigmoid output is what motivates the BCE loss to begin with (it's ultimately just a negative log likelihood).
So what you might do is check if your scores are greater than 0.5. If so, predict heart disease. If not, predict no heart disease.
Right now, you're computing maximums from the scores across dimension 1, which does nothing because dimension 1 is already of size 1; taking the maximum of a single value simply gives you that value.
Try something like this:
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x.float())
// Create a Boolean tensor (True for scores > 0.5, False for others)
// and then cast it to a long tensor (Trues -> 1, Falses -> 0)
predictions = (scores > 0.5).long()
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print("Got {} / {} with accuracy {}".format(num_correct, num_samples, float(num_correct)/float(num_samples)*100))
model.train()
You may also want to squeeze your prediction and target tensors to size (N) instead of (N,1), though I'm not sure it's necessary in your case.

Related

Strange loss curve while training EfficientNetV2 with Pytorch

I'm new to Pytorch. And I use the architecture that a pre-trained EfficientNetV2 model to connect to a single fully connected layer with one neuron using the ReLU activation function in regression task. However, both losses on training and validation set suddenly increase after first epoch and keep at about the same value during 50 epochs, then suddenly decrease to about same value as first epoch. Can anyone help me figure out what's happening?
Some codes for model and training process:
# hyper-parameter
image_size = 256
learning_rate = 1e-3
batch_size = 32
epochs = 60
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
self.net = models.efficientnet_v2_m(pretrained=True,weights='DEFAULT')
self.net.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
self.net.classifier = nn.Sequential(self.net.classifier,nn.ReLU())
def forward(self, input):
output = self.net(input)
return output
model = Model()
# Define the loss function with Classification Cross-Entropy loss and an optimizer with Adam optimizer
loss_fn = nn.L1Loss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
# Function to test the model with the test dataset and print the accuracy for the test images
def testAccuracy():
model.eval()
loss = 0.0
total = 0.0
with torch.no_grad():
for data in validation_loader:
images, labels = data
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print("The model test will be running on", device, "device")
# get the inputs
images = Variable(images.to(device))
labels = Variable(labels.to(device))
# run the model on the test set to predict labels
outputs = model(images)
# the label with the highest energy will be our prediction
# print('outputs: ',outputs)
# print('labels: ',labels)
temp = loss_fn(outputs, labels.unsqueeze(1))
loss += loss_fn(outputs, labels.unsqueeze(1)).item()
total += 1
# compute the accuracy over all test images
mae = loss/total
return(mae)
# Training function. We simply have to loop over our data iterator and feed the inputs to the network and optimize.
def train(num_epochs):
best_accuracy = 0.0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
train_loss_all = []
val_loss_all = []
for epoch in range(num_epochs): # loop over the dataset multiple times
running_loss = 0.0
total = 0
for i, (images, labels) in tqdm(enumerate(train_loader, 0),total=len(train_loader)):
# get the inputs
images = Variable(images.to(device))
labels = Variable(labels.to(device))
# zero the parameter gradients
optimizer.zero_grad()
# predict classes using images from the training set
outputs = model(images)
# compute the loss based on model output and real labels
loss = loss_fn(outputs, labels.unsqueeze(1))
# backpropagate the loss
loss.backward()
# adjust parameters based on the calculated gradients
optimizer.step()
# Let's print statistics for every one batch
running_loss += loss.item() # extract the loss value
total += 1
train_loss = running_loss/total
train_loss_all.append(train_loss)
accuracy = testAccuracy()
val_loss_all.append(accuracy)
if accuracy > best_accuracy:
saveModel()
best_accuracy = accuracy
history = {'train_loss':train_loss_all,'val_loss':val_loss_all}
return(history)
Loss curve:
loss curve

What should I think about when writing a custom loss function?

I'm trying to get my toy network to learn a sine wave.
I output (via tanh) a number between -1 and 1, and I want the network to minimise the following loss, where self(x) are the predictions.
loss = -torch.mean(self(x)*y)
This should be equivalent to trading a stock with a sinusoidal price, where self(x) is our desired position, and y are the returns of the next time step.
The issue I'm having is that the network doesn't learn anything. It does work if I change the loss function to be torch.mean((self(x)-y)**2) (MSE), but this isn't what I want. I'm trying to focus the network on 'making a profit', not making a prediction.
I think the issue may be related to the convexity of the loss function, but I'm not sure, and I'm not certain how to proceed. I've experimented with differing learning rates, but alas nothing works.
What should I be thinking about?
Actual code:
%load_ext tensorboard
import matplotlib.pyplot as plt; plt.rcParams["figure.figsize"] = (30,8)
import torch;from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F;import pytorch_lightning as pl
from torch import nn, tensor
def piecewise(x): return 2*(x>0)-1
class TsDs(torch.utils.data.Dataset):
def __init__(self, s, l=5): super().__init__();self.l,self.s=l,s
def __len__(self): return self.s.shape[0] - 1 - self.l
def __getitem__(self, i): return self.s[i:i+self.l], torch.log(self.s[i+self.l+1]/self.s[i+self.l])
def plt(self): plt.plot(self.s)
class TsDm(pl.LightningDataModule):
def __init__(self, length=5000, batch_size=1000): super().__init__();self.batch_size=batch_size;self.s = torch.sin(torch.arange(length)*0.2) + 5 + 0*torch.rand(length)
def train_dataloader(self): return DataLoader(TsDs(self.s[:3999]), batch_size=self.batch_size, shuffle=True)
def val_dataloader(self): return DataLoader(TsDs(self.s[4000:]), batch_size=self.batch_size)
dm = TsDm()
class MyModel(pl.LightningModule):
def __init__(self, learning_rate=0.01):
super().__init__();self.learning_rate = learning_rate
super().__init__();self.learning_rate = learning_rate
self.conv1 = nn.Conv1d(1,5,2)
self.lin1 = nn.Linear(20,3);self.lin2 = nn.Linear(3,1)
# self.network = nn.Sequential(nn.Conv1d(1,5,2),nn.ReLU(),nn.Linear(20,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
# self.network = nn.Sequential(nn.Linear(5,5),nn.ReLU(),nn.Linear(5,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
def forward(self, x):
out = x.unsqueeze(1)
out = self.conv1(out)
out = out.reshape(-1,20)
out = nn.ReLU()(out)
out = self.lin1(out)
out = nn.ReLU()(out)
out = self.lin2(out)
return nn.Tanh()(out)
def step(self, batch, batch_idx, stage):
x, y = batch
loss = -torch.mean(self(x)*y)
# loss = torch.mean((self(x)-y)**2)
print(loss)
self.log("loss", loss, prog_bar=True)
return loss
def training_step(self, batch, batch_idx): return self.step(batch, batch_idx, "train")
def validation_step(self, batch, batch_idx): return self.step(batch, batch_idx, "val")
def configure_optimizers(self): return torch.optim.SGD(self.parameters(), lr=self.learning_rate)
#logger = pl.loggers.TensorBoardLogger(save_dir="/content/")
mm = MyModel(0.1);trainer = pl.Trainer(max_epochs=10)
# trainer.tune(mm, dm)
trainer.fit(mm, datamodule=dm)
#
If I understand you correctly, I think that you were trying to maximize the unnormalized correlation between the network's prediction, self(x), and the target value y.
As you mention, the problem is the convexity of the loss wrt the model weights. One way to see the problem is to consider that the model is a simple linear predictor w'*x, where w is the model weights, w' it's transpose, and x the input feature vector (assume a scalar prediction for now). Then, if you look at the derivative of the loss wrt the weight vector (i.e., the gradient), you'll find that it no longer depends on w!
One way to fix this is change the loss to,
loss = -torch.mean(torch.square(self(x)*y))
or
loss = -torch.mean(torch.abs(self(x)*y))
You will have another big problem, however: these loss functions encourage unbound growth of the model weights. In the linear case, one solves this by a Lagrangian relaxation of a hard constraint on, for example, the norm of the model weight vector. I'm not sure how this would be done with neural networks as each layer would need it's own Lagrangian parameter...

Pytorch BCE loss not decreasing for word sense disambiguation task

I am performing word sense disambiguation and have created my own vocabulary of the top 300k most common English words. My model is very simple where each word in the sentences (their respective index value) is passed through an embedding layer which embeds the word and average the resulting embedding. The averaged embedding is then sent through a linear layer, as shown in the model below.
class TestingClassifier(nn.Module):
def __init__(self, vocabSize, features, embeddingDim):
super(TestingClassifier, self).__init__()
self.embeddings = nn.Embedding(vocabSize, embeddingDim)
self.linear = nn.Linear(features, 2)
self.sigmoid = nn.Sigmoid()
def forward(self, inputs):
embeds = self.embeddings(inputs)
avged = torch.mean(embeds, dim=-1)
output = self.linear(avged)
output = self.sigmoid(output)
return output
I am running BCELoss as loss function and SGD as optimizer. My problem is that my loss barely decreases as training goes on, almost as if it converges with a very high loss. I have tried different learning rates (0.0001, 0.001, 0.01 and 0.1) but I get the same issue.
My training function is as follows:
def train_model(model,
optimizer,
lossFunction,
batchSize,
epochs,
isRnnModel,
trainDataLoader,
validDataLoader,
earlyStop = False,
maxPatience = 1
):
validationAcc = []
patienceCounter = 0
stopTraining = False
model.train()
# Train network
for epoch in range(epochs):
losses = []
if(stopTraining):
break
for inputs, labels in tqdm(trainDataLoader, position=0, leave=True):
optimizer.zero_grad()
# Predict and calculate loss
prediction = model(inputs)
loss = lossFunction(prediction, labels)
losses.append(loss)
# Backward propagation
loss.backward()
# Readjust weights
optimizer.step()
print(sum(losses) / len(losses))
curValidAcc = check_accuracy(validDataLoader, model, isRnnModel) # Check accuracy on validation set
curTrainAcc = check_accuracy(trainDataLoader, model, isRnnModel)
print("Epoch", epoch + 1, "Training accuracy", curTrainAcc, "Validation accuracy:", curValidAcc)
# Control early stopping
if(earlyStop):
if(patienceCounter == 0):
if(len(validationAcc) > 0 and curValidAcc < validationAcc[-1]):
benchmark = validationAcc[-1]
patienceCounter += 1
print("Patience counter", patienceCounter)
elif(patienceCounter == maxPatience):
print("EARLY STOP. Patience level:", patienceCounter)
stopTraining = True
else:
if(curValidAcc < benchmark):
patienceCounter += 1
print("Patience counter", patienceCounter)
else:
benchmark = curValidAcc
patienceCounter = 0
validationAcc.append(curValidAcc)
Batch size is 32 (training set contains 8000 rows), vocabulary size is 300k, embedding dimension is 24. I have tried adding more linear layers to the network, but it makes no difference. The prediction accuracy on the training and validation sets stays at around 50% (which is horrible) even after many epochs of training. Any help is much appreciated!

I get something wrong when use model.train() and model.eval() on pytorch

I have prepare features and their labels as blow; I want to build a model which is constructed by transformers' encoder and then add a linear layer to predict a value. but I got some error when I use the model to predict after its training.
At first I run below code:
import torch
from torch import nn
features = torch.rand(bach_size, channels, lenght)
labels = torch.rand(batch_size)
class TransformerModel(nn.Module):
def __init__(self):
super(TransformerModel, self).__init__()
encoder_layer = nn.TransformerEncoderLayer(d_model=8, nhead=8, dropout=0.5)
self.transformer_encoder = nn.TransformerEncoder(encoder_layer, 6)
self.decoder = nn.Linear(40, 1)
def forward(self, src):
encoded = self.transformer_encoder(src.transpose(1, 0)).transpose(1, 0)
pred = self.decoder(encoded.reshape(encoded.shape[0], -1))
return pred
model = TransformerModel()
criterion = nn.MSELoss()
lr = 0.3 # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
def train():
model.train() # Turn on the train mode
optimizer.zero_grad()
output = model(features)
loss = criterion(output.view(-1, 1), labels.view(-1, 1))
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
return loss.item()
for _ in range(100):
train()
After that, I predict features by the below codes:
model.eval()
output = model(features)
I get all values of 'output' are the same, and if use 'model.train()', the 'output' seems Ok; so what is the problem? or the model was built wrong?

Failing to train SkipGram word embedding in Pytorch

I am training the skipgram word embeddings using the famous model described in https://arxiv.org/abs/1310.4546. I want to train it in PyTorch but I am getting errors and I can't figure out where they are coming from. Below I have provided my model class, training loop, and batching method. Does anyone have any insight into whats going on?
I am getting an error on the output = loss(data, target) line. It is having a problem with <class 'torch.LongTensor'> which is weird because CrossEntropyLoss takes a long tensor. The output shape might be wrong which is: torch.Size([1000, 100, 1000]) after the feedforward.
I have my model defined as:
import torch
import torch.nn as nn
torch.manual_seed(1)
class SkipGram(nn.Module):
def __init__(self, vocab_size, embedding_dim):
super(SkipGram, self).__init__()
self.embeddings = nn.Embedding(vocab_size, embedding_dim)
self.hidden_layer = nn.Linear(embedding_dim, vocab_size)
# Loss needs to be input: (minibatch (N), C) target: (minibatch, 1), each label is a class
# Calculate loss in training
def forward(self, x):
embeds = self.embeddings(x)
x = self.hidden_layer(embeds)
return x
My training is defined as:
import torch.optim as optim
from torch.autograd import Variable
net = SkipGram(1000, 300)
optimizer = optim.SGD(net.parameters(), lr=0.01)
batch_size = 100
size = len(train_ints)
batches = batch_index_gen(batch_size, size)
inputs, targets = build_tensor_from_batch_index(batches[0], train_ints)
for i in range(100):
running_loss = 0.0
for batch_idx, batch in enumerate(batches):
data, target = build_tensor_from_batch_index(batch, train_ints)
# if (torch.cuda.is_available()):
# data, target = data.cuda(), target.cuda()
# net = net.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = net.forward(data)
loss = nn.CrossEntropyLoss()
output = loss(data, target)
output.backward()
optimizer.step()
running_loss += loss.data[0]
optimizer.step()
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
i, batch_idx * len(batch_size), len(size),
100. * (batch_idx * len(batch_size)) / len(size), loss.data[0]))
If useful my batching is:
def build_tensor_from_batch_index(index, train_ints):
minibatch = []
for i in range(index[0], index[1]):
input_arr = np.zeros( (1000,1), dtype=np.int )
target_arr = np.zeros( (1000,1), dtype=np.int )
input_index, target_index = train_ints[i]
input_arr[input_index] = 1
target_arr[input_index] = 1
input_tensor = torch.from_numpy(input_arr)
target_tensor = torch.from_numpy(target_arr)
minibatch.append( (input_tensor, target_tensor) )
# Concatenate all tensors into a minibatch
#x = [tensor[0] for tensor in minibatch]
#print(x)
input_minibatch = torch.cat([tensor[0] for tensor in minibatch], 1)
target_minibatch = torch.cat([tensor[1] for tensor in minibatch], 1)
#target_minibatch = minibatch[0][1]
return input_minibatch, target_minibatch
I'm not sure about that since I did not read the paper, but seems weird that you are computing the loss with the original data and the targets:
output = loss(data, target)
Considering that the output of the network is output = net.forward(data) I think you should compute your loss as:
error = loss(output, target)
If this doesn't help, briefly point me out what the paper says about the loss function.

Resources