I was following a tutorial on object localization, and I think its accuracy computation is wrong.
This is the accuracy output I was getting:
Epoch 26 Accuracy 298.75 loss: 27.6987382248044 time: 0.9697500228881836 mins
Is the computation wrong or right?
def train(model):
    # Defining the optimizer
    optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
    num_of_epochs = 30
    epochs = []
    losses = []
    # Creating a directory for storing models
    os.mkdir('modelssss')
    for epoch in range(num_of_epochs):
        tot_loss = 0
        tot_correct = 0
        train_start = time.time()
        model.train()
        for batch, (x, y, z) in enumerate(dataloader):
            # Converting data from cpu to GPU if available to improve speed
            x, y, z = x.to(device), y.to(device), z.to(device)
            # Sets the gradients of all optimized tensors to zero
            optimizer.zero_grad()
            [y_pred, z_pred] = model(x)
            # Compute loss (here CrossEntropyLoss)
            class_loss = F.cross_entropy(y_pred, y)
            box_loss = F.mse_loss(z_pred, z)
            (box_loss + class_loss).backward()
            # class_loss.backward()
            optimizer.step()
            print("Train batch:", batch+1, " epoch: ", epoch, " ",
                  (time.time()-train_start)/60, end='\r')
        model.eval()
        for batch, (x, y, z) in enumerate(valdataloader):
            # Converting data from cpu to GPU if available to improve speed
            x, y, z = x.to(device), y.to(device), z.to(device)
            # Sets the gradients of all optimized tensors to zero
            optimizer.zero_grad()
            with torch.no_grad():
                [y_pred, z_pred] = model(x)
                # Compute loss (here CrossEntropyLoss)
                class_loss = F.cross_entropy(y_pred, y)
                box_loss = F.mse_loss(z_pred, z)
            tot_loss += (class_loss.item() + box_loss.item())
            tot_correct += get_num_correct(y_pred, y)
            print("Test batch:", batch+1, " epoch: ", epoch, " ",
                  (time.time()-train_start)/60, end='\r')
        epochs.append(epoch)
        losses.append(tot_loss)
        print("Epoch", epoch, "Accuracy", (tot_correct)/2.4, "loss:",
              tot_loss, " time: ", (time.time()-train_start)/60, " mins")
        torch.save(model.state_dict(), "model_ep"+str(epoch+1)+".pth")
This is the code from the tutorial.
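For comparison, here is how I would have expected the accuracy to be computed, normalizing by the number of validation samples instead of dividing by the hard-coded 2.4 (just a sketch of my assumption; it assumes valdataloader wraps a Dataset that reports its length):

# Sketch only: normalize correct predictions by the validation set size,
# so the reported accuracy stays between 0 and 100 percent.
num_val_samples = len(valdataloader.dataset)
accuracy = 100.0 * tot_correct / num_val_samples
print("Epoch", epoch, "Accuracy", accuracy, "%")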
I am trying to use the Hugging Face AraBERT model, but this error appears:
RuntimeError                              Traceback (most recent call last)
<ipython-input> in <module>
     28 lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)
     29
---> 30 train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

3 frames
/usr/local/lib/python3.8/dist-packages/torch/serialization.py in __init__(self, name)
    285 class _open_zipfile_writer_file(_opener):
    286     def __init__(self, name) -> None:
--> 287         super(_open_zipfile_writer_file, self).__init__(torch._C.PyTorchFileWriter(str(name)))
    288
    289     def __exit__(self, *args) -> None:

RuntimeError: Parent directory models/aubmindlab does not exist.
I copied the model name and I pip-installed farasapy, but the error still occurs. How can I get this to work?
Here is the calling block:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []
    scaler = GradScaler()

    for ep in range(epochs):
        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)
                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss. Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))
                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model = 'models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()
And here is the block that raises the error; it occurs on the last line:
# Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0  # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)
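One thing I suspect: the traceback comes from torch.save, and torch.save does not create missing parent directories, so if bert_model contains a slash (which the "models/aubmindlab" in the error message suggests), path_to_model points into a directory that does not exist yet. A minimal sketch of what I think would be needed before saving (assuming path_to_model is built as in train_bert above):

import os

# Sketch: make sure the nested directory exists before torch.save is called.
os.makedirs(os.path.dirname(path_to_model), exist_ok=True)
torch.save(net_copy.state_dict(), path_to_model)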
As far as I know, when training and validating a model on a GPU, GPU memory is mainly used for loading data and for the forward and backward passes. From what I understand, GPU memory usage should therefore be the same (1) before training, (2) after training, (3) before validation, and (4) after validation. But in my case, the GPU memory used in the validation phase is still occupied during the training phase and vice versa. It is not increasing per epoch, so I'm sure it is not a common mistake like forgetting to call loss.item().
Here is a summary of my questions:
Shouldn't the GPU memory used in one phase be cleaned up before the other phase starts (except for the model weights)?
If it should, are there any beginner mistakes I'm making here?
Thank you for your help.
Here is the code for the training loop:
eval_result = evaluate(model, val_loader, True, True)
print(eval_result)
print('start training')
for epoch in range(num_epoch):
    model.train()
    time_ = datetime.datetime.now()
    for iter_, data in enumerate(tr_loader):
        x, y = data
        x = x.to(device).view(x.shape[0], 1, *(x.shape[1:]))
        y = y.to(device).long()

        pred = model.forward(x)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print
        print_iter = 16
        if (iter_+1) % print_iter == 0:
            elapsed = datetime.datetime.now() - time_
            expected = elapsed * (num_batches / print_iter)
            _epoch = epoch + ((iter_ + 1) / num_batches)
            print('\rTRAIN [{:.3f}/{}] loss({}) '
                  'elapsed {} expected per epoch {}'.format(
                      _epoch, num_epoch, loss.item(), elapsed, expected),
                  end="\t\t\t")
            time_ = datetime.datetime.now()

    print()
    eval_result = evaluate(model, val_loader, True, True)
    print(eval_result)
    scheduler.step(eval_result[0])
    if (epoch+1) % 1 == 0:
        save_model(model, optimizer, scheduler)
I've read that wrapping the validation phase in a function helps, since Python uses function scoping.
So evaluate() is:
def evaluate(model, val_loader, get_acc=True, get_IOU=True):
    """
    pred:  Tensor of shape B C D H W
    label: Tensor of shape B D H W
    """
    val_loss = 0
    val_acc = 0
    val_IOU = 0
    with torch.no_grad():
        model.eval()
        for data in tqdm(val_loader):
            x, y = data
            x = x.to(device).view(x.shape[0], 1, *(x.shape[1:]))
            y = y.to(device).long()

            pred = model.forward(x)
            loss = loss_fn(pred, y)
            val_loss += loss.item()

            pred = torch.argmax(pred, dim=1)
            if get_acc:
                total = np.prod(y.shape)
                total = total if total != 0 else 1
                val_acc += torch.sum((pred == y)).cpu().item() / total
            if get_IOU:
                iou = 0
                for class_num in range(1, 8):
                    iou += torch.sum((pred == class_num) & (y == class_num)).cpu().item() \
                           / torch.sum((pred == class_num) | (y == class_num)).cpu().item()
                val_IOU += iou / 7

    val_loss /= len(val_loader)
    val_acc /= len(val_loader)
    val_IOU /= len(val_loader)
    return (val_loss, val_acc, val_IOU)
And here is the GPU usage in Colab: point 1 is where evaluate() is first called, and point 2 is where training started.
Allocating GPU memory is slow. PyTorch caches the GPU memory it allocates and keeps it reserved even after the tensors that used it are gone. You can call torch.cuda.empty_cache() to release cached memory that is no longer occupied by any tensor.
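To make that concrete, here is a minimal sketch of how the allocator can be inspected before and after clearing the cache (assuming a CUDA device is available): memory_allocated() reports bytes occupied by live tensors, while memory_reserved() also includes the cache PyTorch keeps around for reuse.

import torch

def report(tag):
    # memory_allocated(): bytes currently occupied by live tensors
    # memory_reserved(): bytes held by the caching allocator, including unused cache
    alloc = torch.cuda.memory_allocated() / 1024 ** 2
    reserved = torch.cuda.memory_reserved() / 1024 ** 2
    print("{}: allocated = {:.1f} MiB, reserved = {:.1f} MiB".format(tag, alloc, reserved))

report("before empty_cache")
torch.cuda.empty_cache()   # hands unused cached blocks back to the driver
report("after empty_cache")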
First of all, thanks for looking at my question.
In a multi-label classification problem, I wonder if I am measuring accuracy correctly.
The labels are one-hot style (multi-hot) encoded vectors of shape (1000), e.g. (0, 1, 0, 0, ..., 0, 1).
I used ResNet-50 (on 3 GPUs) for training, as implemented in torchvision.models.
However, the accuracy of the model is higher than expected, and even the early epochs already output a high value.
Am I measuring the accuracy of the model correctly?
Code below:
def train(log_interval, model, device, train_loader, criterion, optimizer, epoch):
    model.train()
    running_loss = 0
    running_correct = 0
    _iter = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        model.zero_grad()
        # output = F.softmax(model(data), dim=1)
        output = model(data)

        loss = criterion(output, target.type(torch.cuda.LongTensor))
        loss.backward()
        optimizer.step()

        pred = torch.sigmoid(output)
        pred[pred >= 0.5] = 1
        pred[pred < 0.5] = 0

        running_loss += loss.item() * data.size(0)
        running_correct += (pred == target).sum() / (target.size(0) * target.size(1))
        _iter += 1

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_correct / _iter
    print('Epochs : {}, train loss : {:4f}, train acc : {:4f}'.format(epoch, epoch_loss, epoch_acc))
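One thing I have been wondering: with sparse multi-hot targets, the per-label accuracy above also counts all the correctly predicted zeros, which could explain why the value is high from the first epochs. For comparison, a stricter exact-match (subset) accuracy could look like this (my own sketch, assuming pred and target are 0/1 tensors of shape (batch_size, num_labels)):

import torch

def subset_accuracy(pred, target):
    # Sketch: a sample counts as correct only if every one of its labels matches.
    all_correct = (pred == target).all(dim=1)   # shape: (batch_size,)
    return all_correct.float().mean().item()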
In the PyTorch quickstart tutorial, the code uses model.eval() during evaluation/test, but it never calls model.train() during training.
According to the documentation and other sources, some modules like BatchNorm and Dropout behave differently depending on whether the model is in training or evaluation mode. The model in the tutorial does not use any such module, so it still runs to convergence. Am I missing something, or does PyTorch's very first tutorial actually have a logical bug?
Training:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
You can see there is no model.train() in the above code.
Testing:
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
Near the top of test(), there is a model.eval() call.
Training loop:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)
print("Done!")
This loop calls the train() and test() functions without any call to model.train(). So after the first call to test(), the model stays in "evaluation" mode for good. If we add a BatchNorm layer to the model, we'll be on our way to a hard-to-find bug.
Main question:
Is it good practice to always call model.train() during training and model.eval() during evaluation/test?
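For context, this is the pattern I would expect, with the mode set explicitly in the training function (a sketch based on the tutorial code above, not the tutorial itself):

def train(dataloader, model, loss_fn, optimizer):
    model.train()   # put layers like BatchNorm/Dropout back into training mode
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

test() would keep its model.eval() call as in the tutorial; the only change is that the training function restores training mode explicitly at the start.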
I am new to PyTorch, and I am trying to train my model (a CNN) using the following code.
The program runs fine, but it never displays this Epoch/Step/Loss/Accuracy part:
print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
as if (i+1) % 100 == 0 never evaluates to True.
Training part:
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(dataloaders['train']):
        images = Variable(images)
        labels = Variable(labels)

        # Clear the gradients
        optimizer.zero_grad()
        # Forward propagation
        outputs = model(images)
        # Calculating loss with softmax to obtain cross entropy loss
        loss = criterion(outputs, labels)
        # Backward prop
        loss.backward()
        # Updating gradients
        optimizer.step()

        iter += 1

        # Total number of labels
        total = labels.size(0)
        # Obtaining predictions from max value
        _, predicted = torch.max(outputs.data, 1)
        # Calculate the number of correct answers
        correct = (predicted == labels).sum().item()

        # Print loss and accuracy
        if (i+1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
                  .format(epoch + 1, num_epochs, i + 1, len(dataloaders['train']), loss.item(),
                          (correct / total) * 100))
Full Code:
https://pastebin.com/dshNmhRL
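Since it looks like (i+1) % 100 == 0 is never true, I was thinking of checking how many batches the loader actually produces per epoch (a small sketch, using the same dataloaders dict as in the full code):

# Sketch: if there are fewer than 100 batches per epoch, the condition never fires.
num_batches = len(dataloaders['train'])
print("Batches per epoch:", num_batches)

# Possible adjustment: scale the print interval to the loader length,
# e.g. print roughly five times per epoch.
print_iter = max(1, num_batches // 5)

and then use if (i+1) % print_iter == 0: in place of the fixed 100.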