I'd like to know the actual learning rate during training; here is my code:
learning_rate = 0.001
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1, 2], gamma=0.1)

def train(epoch):
    train_loss = 0
    for batch_idx, (input, target) in enumerate(train_loader):
        predict_label = net(input)
        loss = criterion(predict_label, target)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(optimizer.param_groups[0]['lr'])
    scheduler.step()
    print(scheduler.state_dict()['_last_lr'])
    print(optimizer.param_groups[0]['lr'])
The output is 0.001, 0.0001, 0.0001. So what is the actual learning rate used during optimizer.step(): 0.001 or 0.0001? Thanks.
The important part is here:
for batch_idx, (input, target) in enumerate(train_loader):
    ...
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(optimizer.param_groups[0]['lr'])     #### CURRENT LEARNING RATE
scheduler.step()                           # step through the learning rate schedule
print(scheduler.state_dict()['_last_lr'])  #### NEW LEARNING RATE
print(optimizer.param_groups[0]['lr'])     #### NEW LEARNING RATE
Because you step your scheduler after the epoch, the first epoch trains with the initial value, which is set to 0.001; the scheduler.step() call then anneals the learning rate to 0.0001, and that is what optimizer.step() uses in the next epoch. If you run for multiple epochs, the learning rate will continue to be annealed at each milestone.
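As a minimal sketch (assuming the same Adam/MultiStepLR setup from the question, with a tiny placeholder model in place of net), you can log the learning rate that optimizer.step() actually uses in each epoch and only then step the scheduler:

import torch

net = torch.nn.Linear(10, 2)  # placeholder model, just to have parameters
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1, 2], gamma=0.1)

for epoch in range(4):
    lr_in_use = optimizer.param_groups[0]['lr']  # the value optimizer.step() applies during this epoch
    # ... run the training loop for this epoch here ...
    scheduler.step()  # anneal the learning rate for the next epoch
    print(f"epoch {epoch}: trained with lr={lr_in_use}, next lr={scheduler.get_last_lr()[0]}")

In the question's run, epoch 0 trains with 0.001, and the 0.0001 printed after scheduler.step() is only used from epoch 1 onwards.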
I built a very simple structure
class classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.classify = nn.Sequential(
            nn.Linear(166, 80),
            nn.Tanh(),
            nn.Linear(80, 40),
            nn.Tanh(),
            nn.Linear(40, 1),
            nn.Softmax()
        )

    def forward(self, x):
        pred = self.classify(x)
        return pred

model = classifier()
The loss function and optimizer are defined as
criteria = nn.BCEWithLogitsLoss()
iteration = 1000
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
and here is the training and evaluation section
for epoch in range(iteration):
    model.train()
    y_pred = model(x_train)
    loss = criteria(y_pred, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.inference_mode():
        test_pred = model(x_test)
        test_loss = criteria(test_pred, y_test)

    if epoch % 100 == 0:
        print(loss)
        print(test_loss)
I received the same loss values, and by debugging, I found that the weights were not being updated.
The problem is in the network architecture: you are using a Softmax layer on a single-valued output at the end. By the definition of the softmax function, for an output vector x we have, for index i:

softmax(x)_i = exp(x_i) / Σ_j exp(x_j)

Here you only have a single output value, so the sum in the denominator contains just that one term and the output of your network is always 1, irrespective of the inputs or the weights. To fix this, remove the Softmax layer at the end. A Sigmoid activation would be more appropriate for a single-output binary classifier, and in fact BCEWithLogitsLoss already applies it internally, so no final activation layer is needed.
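For illustration, a minimal sketch of the corrected model (mirroring the layer sizes and BCEWithLogitsLoss from the question; the random input at the end is just a placeholder to show inference):

import torch
from torch import nn

class classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.classify = nn.Sequential(
            nn.Linear(166, 80),
            nn.Tanh(),
            nn.Linear(80, 40),
            nn.Tanh(),
            nn.Linear(40, 1),   # raw logit output; no Softmax/Sigmoid here
        )

    def forward(self, x):
        return self.classify(x)

model = classifier()
criteria = nn.BCEWithLogitsLoss()   # applies the sigmoid internally during training

# at inference time, apply the sigmoid explicitly to turn logits into probabilities
with torch.inference_mode():
    probs = torch.sigmoid(model(torch.randn(4, 166)))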
The problem lies here
y_pred = model(x_train)
loss = criteria(y_pred,y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
After the loss is calculated, you clear the gradients by calling optimizer.zero_grad().
The ideal ordering would be:
optimizer.zero_grad()
y_pred = model(x_train)
loss = criteria(y_pred,y_train)
loss.backward()
optimizer.step()
First of all, thanks for visiting my question.
In a multi-label classification problem, I wonder if I am measuring accuracy correctly.
The label data are one-hot encoded with shape (1000), e.g. (0, 1, 0, 0, ..., 0, 1).
I used ResNet-50 (on 3 GPUs) for training, as implemented in torchvision.models.
However, the accuracy of the model is higher than expected, and even the early epochs already show a high value.
Am I measuring the accuracy of the model correctly?
Code below:
def train(log_interval, model, device, train_loader, criterion, optimizer, epoch):
    model.train()
    running_loss = 0
    running_correct = 0
    _iter = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        model.zero_grad()
        # output = F.softmax(model(data), dim=1)
        output = model(data)
        loss = criterion(output, target.type(torch.cuda.LongTensor))
        loss.backward()
        optimizer.step()

        pred = torch.sigmoid(output)
        pred[pred >= 0.5] = 1
        pred[pred < 0.5] = 0

        running_loss += loss.item() * data.size(0)
        running_correct += (pred == target).sum() / (target.size(0) * target.size(1))
        _iter += 1

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_correct / _iter
    print('Epochs : {}, train loss : {:4f}, train acc : {:4f}'.format(epoch, epoch_loss, epoch_acc))
I'm using a pre-trained model from PyTorch (ResNet 18/34/50) in order to classify images. During training, a weird periodicity appears in the loss curve, as you can see in the image below. Has somebody already had a similar issue? To deal with overfitting, I'm using data augmentation in the preprocessing.
When using SGD as an optimizer with the following parameters, we obtain this sort of graph:
criterion: NLLLoss()
learning rate: 0.0001
epochs: 40
print every 40 iterations
We also tried Adam and AdaBound as optimizers, but the same periodicity was observed.
Thanks in advance for your answer!
Here is the code:
def train_classifier():
    start = 0
    stop = 0
    start = timeit.default_timer()
    epochs = 40
    steps = 0
    print_every = 40
    model.to('cuda')
    epo = []
    train = []
    valid = []
    acc_valid = []
    for e in range(epochs):
        print('Currently running epoch', e, ':')
        model.train()
        running_loss = 0
        for images, labels in iter(train_loader):
            steps += 1
            images, labels = images.to('cuda'), labels.to('cuda')
            optimizer.zero_grad()
            output = model.forward(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            if steps % print_every == 0:
                model.eval()
                # Turn off gradients for validation, saves memory and computations
                with torch.no_grad():
                    validation_loss, accuracy = validation(model, val_loader, criterion)
                print("Epoch: {}/{}.. ".format(e + 1, epochs),
                      "Training Loss: {:.3f}.. ".format(running_loss / print_every),
                      "Validation Loss: {:.3f}.. ".format(validation_loss / len(val_loader)),
                      "Validation Accuracy: {:.3f}".format(accuracy / len(val_loader)))
                stop = timeit.default_timer()
                print('Time: ', stop - start)
                acc_valid.append(accuracy / len(val_loader))
                train.append(running_loss / print_every)
                valid.append(validation_loss / len(val_loader))
                epo.append(e + 1)
                running_loss = 0
                model.train()
    return train, epo, valid, acc_valid
I came across this code online and I was wondering if I interpreted it correctly. Below is part of a gradient-descent training loop; the full code is available at https://jovian.ml/aakashns/03-logistic-regression. My question is as follows: during the training step, I think the author is trying to minimize the loss for each batch by updating the parameters. However, how can we be sure that the total loss over all training samples is minimized if loss.backward() is only applied to the batch loss?
def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        # Training Phase
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase
        result = evaluate(model, val_loader)
        model.epoch_end(epoch, result)
        history.append(result)
    return history
I am a beginner in deep learning and PyTorch.
I don't understand how to use BatchNormalization when using SWA.
pytorch.org says in https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/:
Note that the SWA averages of the weights are never used to make
predictions during training, and so the batch normalization layers do
not have the activation statistics computed after you reset the
weights of your model with opt.swap_swa_sgd()
Does this mean it is appropriate to add a BatchNormalization layer after using SWA?
# this is what I mean, for example:
opt = torchcontrib.optim.SWA(base_opt)
for i in range(100):
    opt.zero_grad()
    loss_fn(model(input), target).backward()
    opt.step()
    if i > 10 and i % 5 == 0:
        opt.update_swa()
opt.swap_swa_sgd()

# save the model once
torch.save(model, "swa_model.pt")

# load the model
saved_model = torch.load("swa_model.pt")

# does this mean adding a BatchNormalization layer??
model2 = saved_model
model2.add_module("Batch1", nn.BatchNorm1d(10))

# decay learning_rate more
learning_rate = 0.005
optimizer = torch.optim.SGD(model2.parameters(), lr=learning_rate)

# train the model again
for epoch in range(num_epochs):
    loss = train(train_loader)
    val_loss, val_acc = valid(test_loader)
I appreciate your reply.
Following your advice, I tried to make an example model that adds optimizer.bn_update().
# add optimizer.bn_update() to the training script
criterion = nn.CrossEntropyLoss()
learning_rate = 0.01
base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

def train(train_loader):
    # mode: train
    model.train()
    running_loss = 0
    for batch_idx, (images, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(images)
        # loss
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    optimizer.swap_swa_sgd()
    train_loss = running_loss / len(train_loader)
    return train_loss
def valid(test_loader):
    model.eval()
    running_loss = 0
    correct = 0
    total = 0
    # torch.no_grad
    with torch.no_grad():
        for batch_idx, (images, labels) in enumerate(test_loader):
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    val_loss = running_loss / len(test_loader)
    val_acc = float(correct) / total
    return val_loss, val_acc
num_epochs = 30
loss_list = []
val_loss_list = []
val_acc_list = []

for epoch in range(num_epochs):
    loss = train(train_loader)
    val_loss, val_acc = valid(test_loader)
    optimizer.bn_update(train_loader, model)
    print('epoch %d, loss: %.4f val_loss: %.4f val_acc: %.4f'
          % (epoch, loss, val_loss, val_acc))
    # logging
    loss_list.append(loss)
    val_loss_list.append(val_loss)
    val_acc_list.append(val_acc)

# optimizer.bn_update()
optimizer.bn_update(train_loader, model)

# go on evaluating the model...
What the documentation is telling you is that, since SWA computes averages of the weights but those averaged weights aren't used for prediction during training, the batch normalization layers never see them. That means the BN layers haven't computed the respective activation statistics for the averaged weights (they never had the chance to), which matters because those weights are what you use for actual prediction (i.e. after training).
In other words, they assume you have batch normalization layers in your model and want to train it using SWA, which is (more or less) not straightforward for the reason above.
One approach is given as follows:
To compute the activation statistics you can just make a forward pass on your training data using the SWA model once the training is finished.
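As a rough hand-rolled sketch of that idea (not the library helper; recompute_bn_stats is a hypothetical name), you would reset the BatchNorm running statistics after swapping in the SWA weights and forward the training data once in train mode:

import torch

def recompute_bn_stats(model, loader):
    # reset the running mean/var of every BatchNorm layer
    for m in model.modules():
        if isinstance(m, torch.nn.modules.batchnorm._BatchNorm):
            m.reset_running_stats()
    model.train()  # BN layers only update running statistics in train mode
    with torch.no_grad():
        for inputs, _ in loader:
            model(inputs)
    model.eval()   # switch back for prediction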
Or you can use their helper class:
In the SWA class we provide a helper function opt.bn_update(train_loader, model). It updates the activation statistics for every batch normalization layer in the model by making a forward pass on the train_loader data loader. You only need to call this function once in the end of training.
In case you are using PyTorch's DataLoader class, you can simply supply the model (after training) and the training loader to the bn_update function, and it updates all batch normalization statistics for you.
Steps to proceed (see the sketch below):
Train your model, including its batch normalization layers, using SWA.
After the model has finished training, call opt.bn_update(train_loader, model) with your training data loader and the trained model.
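A minimal sketch of those two steps with the torchcontrib SWA wrapper used in the question (placeholder data and model; shapes and hyperparameters are arbitrary):

import torch
import torchcontrib
from torch.utils.data import DataLoader, TensorDataset

# placeholder training data and a model that contains a BatchNorm layer
train_loader = DataLoader(
    TensorDataset(torch.randn(512, 784), torch.randint(0, 10, (512,))),
    batch_size=64, shuffle=True)
model = torch.nn.Sequential(
    torch.nn.Linear(784, 256),
    torch.nn.BatchNorm1d(256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 10),
)
loss_fn = torch.nn.CrossEntropyLoss()

base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = torchcontrib.optim.SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

# 1) train as usual; in this automatic mode SWA starts averaging after swa_start steps
for epoch in range(30):
    for inputs, targets in train_loader:
        opt.zero_grad()
        loss_fn(model(inputs), targets).backward()
        opt.step()

# 2) once training is finished: swap in the averaged weights, then recompute BN statistics
opt.swap_swa_sgd()
opt.bn_update(train_loader, model)

Note that torchcontrib is archived; newer PyTorch releases provide the same workflow through torch.optim.swa_utils (AveragedModel together with update_bn).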
I tried to compare the results before and after using optimizer.bn_update() on the MNIST data, as follows:
# using MNIST data
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255

# to compare test data accuracy
X_train_ = X_train[0:50000]
y_train_ = y_train[0:50000]

# to validate for test data
X_train_ToCompare = X_train[50000:60000]
y_train_ToCompare = y_train[50000:60000]

print(X_train_.shape)
print(y_train_.shape)
print(X_train_ToCompare.shape)
print(y_train_ToCompare.shape)
# (50000, 784)
# (50000,)
# (10000, 784)
# (10000,)
# like keras,simple MLP model
from torch import nn
model = nn.Sequential()
model.add_module('fc1', nn.Linear(784, 1000))
model.add_module('relu1', nn.ReLU())
model.add_module('fc2', nn.Linear(1000, 1000))
model.add_module('relu2', nn.ReLU())
model.add_module('fc3', nn.Linear(1000, 10))
print(model)
# using GPU
model.cuda()
criterion = nn.CrossEntropyLoss()
learning_rate=0.01
base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
def train(train_loader):
    model.train()
    running_loss = 0
    for batch_idx, (images, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    optimizer.swap_swa_sgd()
    train_loss = running_loss / len(train_loader)
    return train_loss
def valid(test_loader):
    model.eval()
    running_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (images, labels) in enumerate(test_loader):
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    val_loss = running_loss / len(test_loader)
    val_acc = float(correct) / total
    return val_loss, val_acc
num_epochs = 30
loss_list = []
val_loss_list = []
val_acc_list = []

for epoch in range(num_epochs):
    loss = train(train_loader)
    val_loss, val_acc = valid(test_loader)
    optimizer.bn_update(train_loader, model)
    print('epoch %d, loss: %.4f val_loss: %.4f val_acc: %.4f'
          % (epoch, loss, val_loss, val_acc))
    # logging
    loss_list.append(loss)
    val_loss_list.append(val_loss)
    val_acc_list.append(val_acc)
# output:
# epoch 0, loss: 0.7832 val_loss: 0.5381 val_acc: 0.8866
# ...
# epoch 29, loss: 0.0502 val_loss: 0.0758 val_acc: 0.9772
# evaluate the model
# attempt to evaluate the model before optimizer.bn_update()
# using X_train_ToCompare as test data
model.eval()
predicted_list = []
for i in range(len(X_train_ToCompare)):
    temp_predicted = model(torch.cuda.FloatTensor(X_train_ToCompare[i]))
    _, y_predicted = torch.max(temp_predicted, 0)
    predicted_list.append(int(y_predicted))

sum(predicted_list == y_train_ToCompare)
# test accuracy 9757/10000
# after using optimizer.bn_update()
model.train()
optimizer.bn_update(train_loader, model)

# evaluate the model again
model.eval()
predicted_list_afterBatchNorm = []
for i in range(len(X_train_ToCompare)):
    temp_predicted = model(torch.cuda.FloatTensor(X_train_ToCompare[i]))
    _, y_predicted = torch.max(temp_predicted, 0)
    predicted_list_afterBatchNorm.append(int(y_predicted))

sum(predicted_list_afterBatchNorm == y_train_ToCompare)
# test accuracy 9778/10000
I don't know if this is the correct way to validate...
Using the optimizer.bn_update() method, I can confirm that the test accuracy is often improved,
but sometimes the test accuracy goes down. I think this is because the MLP model is very simple and the number of training epochs is not enough,
so I need to run more tests.
Thank you for your reply.