I am kinda new to the pytorch, now struggling with a classification problem - pytorch

I built a very simple structure
class classifier (nn.Module):
def __init__(self):
super().__init__()
self.classify = nn.Sequential(
nn.Linear(166,80),
nn.Tanh(),
nn.Linear(80,40),
nn.Tanh(),
nn.Linear(40,1),
nn.Softmax()
)
def forward (self, x):
pred = self.classify(x)
return pred
model = classifier()
The loss function and optimizer are defined as
criteria = nn.BCEWithLogitsLoss()
iteration = 1000
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
and here is the training and evaluation section
for epoch in range (iteration):
model.train()
y_pred = model(x_train)
loss = criteria(y_pred,y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
model.eval()
with torch.inference_mode():
test_pred = model(x_test)
test_loss = criteria(test_pred, y_test)
if epoch % 100 == 0:
print(loss)
print(test_loss)
I received the same loss values, and by debugging, I found that the weights were not being updated.

The problem is in the network architecture: you are using a Softmax layer on a single valued output at the end. As per the definition of the softmax function, for a output vector x, we have, for index i:
softmax(x_i) = e^{x_i} / sum_j (e^{x_j})
Here, you only have a single valued output. Due to this, the output of your neural network is always 1, irrespective of the inputs or the weights. To fix this, remove the Softmax layer at the end. An activation function like Sigmoid might be more appropriate, and in fact you are already applying this when using the BCEWithLogitsLoss.

The problem lies here
y_pred = model(x_train)
loss = criteria(y_pred,y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
after loss is calculated, you are clearing the gradients by doing optimizer.zero_grad()
the ideal case should be:
optimizer.zero_grad()
y_pred = model(x_train)
loss = criteria(y_pred,y_train)
loss.backward()
optimizer.step()

Related

Strange loss curve while training EfficientNetV2 with Pytorch

I'm new to Pytorch. And I use the architecture that a pre-trained EfficientNetV2 model to connect to a single fully connected layer with one neuron using the ReLU activation function in regression task. However, both losses on training and validation set suddenly increase after first epoch and keep at about the same value during 50 epochs, then suddenly decrease to about same value as first epoch. Can anyone help me figure out what's happening?
Some codes for model and training process:
# hyper-parameter
image_size = 256
learning_rate = 1e-3
batch_size = 32
epochs = 60
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
self.net = models.efficientnet_v2_m(pretrained=True,weights='DEFAULT')
self.net.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
self.net.classifier = nn.Sequential(self.net.classifier,nn.ReLU())
def forward(self, input):
output = self.net(input)
return output
model = Model()
# Define the loss function with Classification Cross-Entropy loss and an optimizer with Adam optimizer
loss_fn = nn.L1Loss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
# Function to test the model with the test dataset and print the accuracy for the test images
def testAccuracy():
model.eval()
loss = 0.0
total = 0.0
with torch.no_grad():
for data in validation_loader:
images, labels = data
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print("The model test will be running on", device, "device")
# get the inputs
images = Variable(images.to(device))
labels = Variable(labels.to(device))
# run the model on the test set to predict labels
outputs = model(images)
# the label with the highest energy will be our prediction
# print('outputs: ',outputs)
# print('labels: ',labels)
temp = loss_fn(outputs, labels.unsqueeze(1))
loss += loss_fn(outputs, labels.unsqueeze(1)).item()
total += 1
# compute the accuracy over all test images
mae = loss/total
return(mae)
# Training function. We simply have to loop over our data iterator and feed the inputs to the network and optimize.
def train(num_epochs):
best_accuracy = 0.0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
train_loss_all = []
val_loss_all = []
for epoch in range(num_epochs): # loop over the dataset multiple times
running_loss = 0.0
total = 0
for i, (images, labels) in tqdm(enumerate(train_loader, 0),total=len(train_loader)):
# get the inputs
images = Variable(images.to(device))
labels = Variable(labels.to(device))
# zero the parameter gradients
optimizer.zero_grad()
# predict classes using images from the training set
outputs = model(images)
# compute the loss based on model output and real labels
loss = loss_fn(outputs, labels.unsqueeze(1))
# backpropagate the loss
loss.backward()
# adjust parameters based on the calculated gradients
optimizer.step()
# Let's print statistics for every one batch
running_loss += loss.item() # extract the loss value
total += 1
train_loss = running_loss/total
train_loss_all.append(train_loss)
accuracy = testAccuracy()
val_loss_all.append(accuracy)
if accuracy > best_accuracy:
saveModel()
best_accuracy = accuracy
history = {'train_loss':train_loss_all,'val_loss':val_loss_all}
return(history)
Loss curve:
loss curve

Pytorch quickstart calls model.eval() but not model.train()

In Pytorch quickstart tutorial the code uses model.eval() during evaluation/test but it does not call model.train() during training.
According to this and source, some modules like BatchNorm and Dropout need to know if the model is in train or evaluation mode. The model in the tutorial does not use any such module so it runs to convergence. Am I missing something or Pytorch's very first tutorial actually has a logical bug?
Training:
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
X, y = X.to(device), y.to(device)
# Compute prediction error
pred = model(X)
loss = loss_fn(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
You can see there is no model.train() in the above code.
Testing:
def test(dataloader, model):
size = len(dataloader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to(device), y.to(device)
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= size
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
At the second line, there is a model.eval().
Training loop:
epochs = 5
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
test(test_dataloader, model)
print("Done!")
This loop calls train() and test() methods without any call to model.train(). So after the first call of test(), the model is always in "evaluation" mode. If we add a BatchNorm to the model we'll be on our way to encounter a hard-to-find bug.
Main question:
Is it good practice to always call model.train() during training and model.eval() during evaluation/test?

How do you test a custom dataset in Pytorch?

I've been following tutorials in Pytorch that use datasets from Pytorch that allow you to enable whether you'd like to train using the data or not... But now I'm using a .csv and a custom dataset.
class MyDataset(Dataset):
def __init__(self, root, n_inp):
self.df = pd.read_csv(root)
self.data = self.df.to_numpy()
self.x , self.y = (torch.from_numpy(self.data[:,:n_inp]),
torch.from_numpy(self.data[:,n_inp:]))
def __getitem__(self, idx):
return self.x[idx, :], self.y[idx,:]
def __len__(self):
return len(self.data)
How can I tell Pytorch not to train my test_dataset so I can use it as a reference of how accurate my model is?
train_dataset = MyDataset("heart.csv", input_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle =True)
test_dataset = MyDataset("heart.csv", input_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle =True)
In pytorch, a custom dataset inherits the class Dataset. Mainly it contains two methods __len__() is to specify the length of your dataset object to iterate over and __getitem__() to return a batch of data at a time.
Once the dataloader objects are initialized (train_loader and test_loader as specified in your code), you need to write a train loop and a test loop.
def train(model, optimizer, loss_fn, dataloader):
model.train()
for i, (input, gt) in enumerate(dataloader):
if params.use_gpu: #(If training using GPU)
input, gt = input.cuda(non_blocking = True), gt.cuda(non_blocking = True)
predicted = model(input)
loss = loss_fn(predicted, gt)
optimizer.zero_grad()
loss.backward()
optimizer.step()
and your test loop should be:
def test(model,loss_fn, dataloader):
model.eval()
for i, (input, gt) in enumerate(dataloader):
if params.use_gpu: #(If training using GPU)
input, gt = input.cuda(non_blocking = True), gt.cuda(non_blocking = True)
predicted = model(input)
loss = loss_fn(predicted, gt)
In additional you can use metrics dictionary to log your predicted, loss, epochs etc,. The main difference between training and test loop is that we exclude back propagation (zero_grad(), backward(), step()) in inference stage.
Finally,
for epoch in range(1, epochs + 1):
train(model, optimizer, loss_fn, train_loader)
test(model, loss_fn, test_loader)
There are a couple of things to note when you're testing in pytorch:
Put your model into evaluation mode so that things like dropout and batch normalization aren't in training mode: model.eval()
Put a wrapper around your testing code to avoid the computation of gradients (saving memory and time): with torch.no_grad():
Normalise or standardise your data according to your training set only. This is important for min/max normalisation or z-score standardisation so that the model accurately reflects test performance.
Other than that, what you've written looks pretty fine to me, as you're not applying any transforms to your data (for example, image flipping or gaussian noise injections). To show what code should look like in test mode, see below:
for e in range(num_epochs):
for B, (dat, label) in enumerate(train_loader):
#transforms here
opt.zero_grad()
out = model(dat.to(device))
loss = criterion(out)
loss.backward()
opt.step()
with torch.no_grad():
model.eval()
global_corr = 0
for B, (dat,label) in enumerate(test_loader):
out = model(dat.to(device))
# get batch eval metrics here!

TensorFlow 2.0 'build' function

I was reading about creating neural networks using TensorFlow 2.0 in conjunction with 'GradientTape' API and came across the following code:
model = tf.keras.Sequential((
tf.keras.layers.Reshape(target_shape=(28 * 28,), input_shape=(28, 28)),
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(10)))
model.build()
optimizer = tf.keras.optimizers.Adam()
In this code, what's the use/function of 'model.build()'? Is it compiling the designed neural network?
The rest of the code is:
compute_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
compute_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
def train_one_step(model, optimizer, x, y):
with tf.GradientTape() as tape:
logits = model(x)
loss = compute_loss(y, logits)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
compute_accuracy(y, logits)
return loss
#tf.function
def train(model, optimizer):
train_ds = mnist_dataset()
step = 0
loss = 0.0
accuracy = 0.0
for x, y in train_ds:
step += 1
loss = train_one_step(model, optimizer, x, y)
if step % 10 == 0:
tf.print('Step', step, ': loss', loss, '; accuracy', compute_accuracy.result())
return step, loss, accuracy
step, loss, accuracy = train(model, optimizer)
print('Final step', step, ': loss', loss, '; accuracy', compute_accuracy.result())
They refer to this as the "delayed-build pattern", where you can actually create a model without defining what its input shape is.
For example
model = Sequential()
model.add(Dense(32))
model.add(Dense(32))
model.build((None, 500))
is equivalent to
model = Sequential()
model.add(Dense(32, input_shape=(500,)))
model.add(Dense(32))
In the second case you need to know the input shape before defining the model's architecture. model.build() allows you to actually define a model (i.e. its architecture) and actually build it (i.e. initialize parameters, etc.) later.
Example taken from here.

how to add BatchNormalization with SWA:stochastic weights average?

I am a beginner in Deepleaning and Pytorch.
I don't understand how to use BatchNormalization in using SWA.
pytorch.org says in https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/:
Note that the SWA averages of the weights are never used to make
predictions during training, and so the batch normalization layers do
not have the activation statistics computed after you reset the
weights of your model with opt.swap_swa_sgd()
This means it's suitable for adding BatchNormalization layer after using SWA?
# it means, in my idea
#for example
opt = torchcontrib.optim.SWA(base_opt)
for i in range(100):
opt.zero_grad()
loss_fn(model(input), target).backward()
opt.step()
if i > 10 and i % 5 == 0:
opt.update_swa()
opt.swap_swa_sgd()
#save model once
torch.save(model,"swa_model.pt")
#model_load
saved_model=torch.load("swa_model.pt")
#it means adding BatchNormalization layer??
model2=saved_model
model2.add_module("Batch1",nn.BatchNorm1d(10))
# decay learning_rate more
learning_rate=0.005
optimizer = torch.optim.SGD(model2.parameters(), lr=learning_rate)
# train model again
for epoch in range(num_epochs):
loss = train(train_loader)
val_loss, val_acc = valid(test_loader)
I appreciate your replying.
following your advise,
I try to make example model adding optimizer.bn_update()
# add optimizer.bn_update() to model
criterion = nn.CrossEntropyLoss()
learning_rate=0.01
base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
def train(train_loader):
#mode:train
model.train()
running_loss = 0
for batch_idx, (images, labels) in enumerate(train_loader):
optimizer.zero_grad()
outputs = model(images)
#loss
loss = criterion(outputs, labels)
running_loss += loss.item()
loss.backward()
optimizer.step()
optimizer.swap_swa_sgd()
train_loss = running_loss / len(train_loader)
return train_loss
def valid(test_loader):
model.eval()
running_loss = 0
correct = 0
total = 0
#torch.no_grad
with torch.no_grad():
for batch_idx, (images, labels) in enumerate(test_loader):
outputs = model(images)
loss = criterion(outputs, labels)
running_loss += loss.item()
_, predicted = torch.max(outputs, 1)
correct += (predicted == labels).sum().item()
total += labels.size(0)
val_loss = running_loss / len(test_loader)
val_acc = float(correct) / total
return val_loss, val_acc
num_epochs=30
loss_list = []
val_loss_list = []
val_acc_list = []
for epoch in range(num_epochs):
loss = train(train_loader)
val_loss, val_acc = valid(test_loader)
optimizer.bn_update(train_loader, model)
print('epoch %d, loss: %.4f val_loss: %.4f val_acc: %.4f'
% (epoch, loss, val_loss, val_acc))
# logging
loss_list.append(loss)
val_loss_list.append(val_loss)
val_acc_list.append(val_acc)
# optimizer.bn_updata()
optimizer.bn_update(train_loader, model)
# go on evaluating model,,,
What the documentation is telling you is that since SWA computes averages of weights but those weights aren't used for prediction during training the batch normalization layers won't see those weights. This means they haven't computed the respective statistics for them (as they were never able to) which is important because the weights are used during actual prediction (i.e. not during training).
This means, they assume you have batch normalization layers in your model and want to train it using SWA. This is (more or less) not straight-forward due to the reasons above.
One approach is given as follows:
To compute the activation statistics you can just make a forward pass on your training data using the SWA model once the training is finished.
Or you can use their helper class:
In the SWA class we provide a helper function opt.bn_update(train_loader, model). It updates the activation statistics for every batch normalization layer in the model by making a forward pass on the train_loader data loader. You only need to call this function once in the end of training.
In case you are using Pytorch's DataLoader class you can simply supply the model (after training) and the training loader to the bn_update function which updates all batch normalization statistics for you. You only need to call this function once in the end of training.
Steps to proceed:
Train your model that includes batch normalization layers using SWA
After your model has finished training, call opt.bn_update(train_loader, model) using your training data and providing your trained model
I tried to compare before and after using optimizer.bn_update() in Mnist Data.
as follows:
# using Mnist Data
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
# to compare Test Data Accuracy
X_train_=X_train[0:50000]
y_train_=y_train[0:50000]
# to validate for test data
X_train_ToCompare=X_train[50000:60000]
y_train_ToCompare=y_train[50000:60000]
print(X_train_.shape)
print(y_train_.shape)
print(X_train_ToCompare.shape)
print(y_train_ToCompare.shape)
#(50000, 784)
#(50000,)
#(10000, 784)
#(10000,)
# like keras,simple MLP model
from torch import nn
model = nn.Sequential()
model.add_module('fc1', nn.Linear(784, 1000))
model.add_module('relu1', nn.ReLU())
model.add_module('fc2', nn.Linear(1000, 1000))
model.add_module('relu2', nn.ReLU())
model.add_module('fc3', nn.Linear(1000, 10))
print(model)
# using GPU
model.cuda()
criterion = nn.CrossEntropyLoss()
learning_rate=0.01
base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
def train(train_loader):
model.train()
running_loss = 0
for batch_idx, (images, labels) in enumerate(train_loader):
optimizer.zero_grad()
outputs = model(images)
loss = criterion(outputs, labels)
running_loss += loss.item()
loss.backward()
optimizer.step()
optimizer.swap_swa_sgd()
train_loss = running_loss / len(train_loader)
return train_loss
def valid(test_loader):
model.eval()
running_loss = 0
correct = 0
total = 0
with torch.no_grad():
for batch_idx, (images, labels) in enumerate(test_loader):
outputs = model(images)
loss = criterion(outputs, labels)
running_loss += loss.item()
_, predicted = torch.max(outputs, 1)
correct += (predicted == labels).sum().item()
total += labels.size(0)
val_loss = running_loss / len(test_loader)
val_acc = float(correct) / total
return val_loss, val_acc
num_epochs=30
loss_list = []
val_loss_list = []
val_acc_list = []
for epoch in range(num_epochs):
loss = train(train_loader)
val_loss, val_acc = valid(test_loader)
optimizer.bn_update(train_loader, model)
print('epoch %d, loss: %.4f val_loss: %.4f val_acc: %.4f'
% (epoch, loss, val_loss, val_acc))
# logging
loss_list.append(loss)
val_loss_list.append(val_loss)
val_acc_list.append(val_acc)
# output:
# epoch 0, loss: 0.7832 val_loss: 0.5381 val_acc: 0.8866
# ...
# epoch 29, loss: 0.0502 val_loss: 0.0758 val_acc: 0.9772
#evaluate model
# attempt to evaluate model before optimizer.bn_update()
# using X_train_toCompare for test data
model.eval()
predicted_list=[]
for i in range(len(X_train_ToCompare)):
temp_predicted=model(torch.cuda.FloatTensor(X_train_ToCompare[i]))
_,y_predicted=torch.max(temp_predicte,0)
predicted_list.append(int(y_predicted))
sum(predicted_list==y_train_ToCompare)
# test accuracy 9757/10000
#after using:optimizer.bn_update
model.train()
optimizer.bn_update(train_loader, model)
# evaluate model
model.eval()
predicted_list_afterBatchNorm=[]
for i in range(len(X_train_ToCompare)):
temp_predicted=model(torch.cuda.FloatTensor(X_train_ToCompare[i]))
_,y_predicted=torch.max(temp_predicted,0)
predicted_list_afterBatchNorm.append(int(y_predicted))
sum(predicted_list_withNorm==y_train_ToCompare)
# test accuracy 9778/10000
I don't know if this way is correct to validate...
Using optimizer.bn_update() method, I confirm test accuracy is improved ofen.
but some test accuracy is descended:I think this is because of
simple MLP model and learning epochs are not enough.
there is need to try test more.
thank you for reply.

Resources