I'm training a VGG11 from scratch, and I keep getting an "out of memory" error on my GPU even though I clear the GPU memory entirely.
self.criterion = nn.CrossEntropyLoss()
lr_dict = {100: 0.01, 150: 0.001}
self.optimizer = optim.SGD(self.model.parameters(), lr=0.1, momentum=0.9)
result_per_epoc = list()
train_acc = 0.0
train_loss = 0.0
for epoch in range(epochs):
    debug_memory(f"Train epoc {epoch} START", clear_mem=True)
    if epoch in lr_dict.keys():
        for gopt in self.optimizer.param_groups:
            gopt['lr'] = lr_dict[epoch]
    epoch_time_start = datetime.now()
    for batch_idx, (images, labels) in tqdm(enumerate(iter(traindata_loader)), total=len(traindata_loader)):
        images = images.to(self.device)
        labels = labels.to(self.device)
        self.optimizer.zero_grad()
        try:
            outputs = self.model(images)
        except Exception as e:
            debug_memory(f"On crash", clear_mem=True)
            print(f"Batch ID: {batch_idx}")
            raise e
        loss = self.criterion(outputs, labels)
        loss.backward()
        self.optimizer.step()
        train_accuracy = accuracy(outputs, labels)
        train_acc += float(train_accuracy)
        train_loss += float(loss)
        del images, labels, loss, outputs
        clear_memory()
    debug_memory(f"Train epoc {epoch} TRAIN", clear_mem=True)
    train_acc /= (batch_idx + 1)
    train_loss /= (batch_idx + 1)
Up to line "debug_memory(f"Train epoc {epoch} START", clear_mem=True)" I used 3,499/11,019 (31%) of my memory. But after a few batchs my code crashes on memory even though i delete everything i've added to the GPU and in "clear_memory" I did this:
torch.cuda.empty_cache()
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()
    gc.collect()
The model I'm using is vgg11_bn from torchvision.models.
What else am I missing?
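For reference, this is roughly the kind of memory-reporting helper I mean by debug_memory (a simplified sketch, not my exact code; torch.cuda.memory_allocated / memory_reserved and the device's total_memory are the counters it prints):

import gc
import torch

def debug_memory(tag, clear_mem=False):
    # Simplified sketch: optionally clear caches, then report CUDA memory usage for device 0.
    if clear_mem:
        torch.cuda.empty_cache()
        gc.collect()
    allocated = torch.cuda.memory_allocated() / 1024**2                 # MiB currently held by tensors
    reserved = torch.cuda.memory_reserved() / 1024**2                   # MiB held by the caching allocator
    total = torch.cuda.get_device_properties(0).total_memory / 1024**2  # total MiB on the device
    print(f"{tag}: allocated {allocated:,.0f} MiB | reserved {reserved:,.0f} MiB | total {total:,.0f} MiB")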
I am trying to record the training/validation accuracy and loss of my model for each epoch as I train it, so that I can find the best epoch to use from now on. I appreciate that there is a lot of information on this already, but the topic is very new to me and I find it difficult to work out the right answer for my situation.
I assume that I need to add one or two bits to the train_one_epoch() and evaluate() functions in order to do this?
My model setup is:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.02, momentum=0.9, weight_decay=0.0001)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20,40], gamma=0.1)
And my training function is:
epochs = 50
for epoch in range(epochs):
    train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
    lr_scheduler.step()
    evaluate(model, val_data_loader, device=device)
    print("\n\n")
torch.save(model, f'./Models/trained_{ds}_model_Epoch{epochs}_LR0_02.pt')
I am using coco-like annotations, for example:
{'boxes': tensor([[316.9700, 242.5500, 464.1000, 442.1700], [ 39.2200, 172.6700, 169.8400, 430.9600]]), 'labels': tensor([2, 2]), 'image_id': tensor(1416), 'area': tensor([29370.1094, 33738.3789]), 'iscrowd': tensor([0, 0])}
The train_one_epoch and evaluate functions are from 'engine.py' from Torchvision.
It seems like TensorBoard would be a good tool for this, but I don't really know how to use it.
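For what it's worth, the basic TensorBoard pattern is just a SummaryWriter plus add_scalar calls (the scalar tags and the two per-epoch values below are placeholders of mine, not variables from the code above):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/fasterrcnn")  # hypothetical log directory

# Inside the epoch loop, once per-epoch values are available as plain floats:
writer.add_scalar("train/loss", epoch_train_loss, epoch)  # epoch_train_loss: placeholder float
writer.add_scalar("val/mAP", epoch_val_map, epoch)        # epoch_val_map: placeholder float

writer.close()

The logged curves can then be inspected with tensorboard --logdir runs.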
The engine.py is:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = f"Epoch: [{epoch}]"

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
The evaluate function is:
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter=" ")
    header = "Test:"

    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = list(img.to(device) for img in images)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(images)

        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
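Since train_one_epoch returns its MetricLogger and evaluate returns the CocoEvaluator, one way to collect per-epoch numbers is to read them back from those objects. This is only a sketch under the assumption that the reference utilities behave as shown above (meters["loss"].global_avg being the averaged training loss and coco_eval["bbox"].stats[0] the COCO mAP):

history = {"train_loss": [], "val_map": []}

for epoch in range(epochs):
    metric_logger = train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
    lr_scheduler.step()
    coco_evaluator = evaluate(model, val_data_loader, device=device)

    # Averaged training loss over the epoch, taken from the logger's smoothed meters.
    history["train_loss"].append(metric_logger.meters["loss"].global_avg)
    # COCO mAP @ IoU=0.50:0.95 on the validation set, taken from the evaluator's summary.
    history["val_map"].append(coco_evaluator.coco_eval["bbox"].stats[0])

best_epoch = max(range(len(history["val_map"])), key=lambda e: history["val_map"][e])
print("Best epoch by validation mAP:", best_epoch)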
I'm trying to use the Hugging Face AraBERT model, but this error appears:
RuntimeError                              Traceback (most recent call last)
<ipython-input-…> in <module>
     28 lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)
     29
---> 30 train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

3 frames
/usr/local/lib/python3.8/dist-packages/torch/serialization.py in __init__(self, name)
    285 class _open_zipfile_writer_file(_opener):
    286     def __init__(self, name) -> None:
--> 287         super(_open_zipfile_writer_file, self).__init__(torch._C.PyTorchFileWriter(str(name)))
    288
    289     def __exit__(self, *args) -> None:

RuntimeError: Parent directory models/aubmindlab does not exist.
I copied the model name and pip-installed farasapy, but the error still occurs. How can I get this working?
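For context, bert_model here is a Hugging Face checkpoint name, and the AraBERT checkpoints are published under the aubmindlab namespace, so the string contains a slash. A minimal loading sketch (the exact checkpoint name is my assumption):

from transformers import AutoTokenizer, AutoModel

# Assumed checkpoint name; AraBERT models live under the "aubmindlab/" namespace on the Hub.
bert_model = "aubmindlab/bert-base-arabertv02"

tokenizer = AutoTokenizer.from_pretrained(bert_model)
model = AutoModel.from_pretrained(bert_model)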
Here is the train_bert function being called:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):
        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss. Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))
                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model = 'models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()
And here is the block where the error occurs (on its last line):
# Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0  # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation

lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)
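For reference, the save path in train_bert is built from bert_model, and the error complains that the parent directory models/aubmindlab does not exist, i.e. the slash in the checkpoint name becomes a subdirectory under models/. A minimal sketch of creating that directory before saving (os.makedirs is standard library; whether this is the whole fix for my setup is an assumption):

import os

# Sketch: make sure the parent directory of the save path exists before torch.save is called.
path_to_model = 'models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
os.makedirs(os.path.dirname(path_to_model), exist_ok=True)
torch.save(net_copy.state_dict(), path_to_model)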
My code works fine for epoch number 1, but when the epoch changes it stops working, apparently because of the different shaping.
Could you please help me solve this problem?
I really appreciate your time.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
trainTransform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(), torchvision.transforms.Normalize((0.1307,), (0.3081,))])
trainset = torchvision.datasets.FashionMNIST(root='{}/./data'.format(path_prefix), train = True, download = True, transform = transform)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=False, num_workers=4)
valset = torchvision.datasets.FashionMNIST(root='{}/./data'.format(path_prefix), train=False, download=True, transform=transform)
val_loader = torch.utils.data.DataLoader(valset, batch_size=32, shuffle=False, num_workers=4)
def train(self, epoch):
    # Note that you need to modify both trainer and loss_function for the VAE model
    self.model.train()
    train_loss = 0
    for batch_idx, (data, _) in tqdm(enumerate(self.train_loader), total=len(self.train_loader)):
        data = data.view(data.shape[0], -1)
        data = data.to(self.device)
        #print(data.shape)
        #print(data)
        self.optimizer.zero_grad()
        recon_batch = self.model(data)
        loss = self.loss_function(recon_batch, data)
        loss.backward()
        train_loss += loss.item()
        self.optimizer.step()
    train_loss /= len(self.train_loader.dataset)/32  # 32 is the batch size
    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, train_loss))
Please try deleting the num_workers=4 argument in the train and val loaders. I had the same kind of error and this was the cause. Also, tqdm can complicate things, so if you still get the error, try running your code without it.
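For example, the loaders above without the num_workers argument (just to illustrate the suggestion; data then loads in the main process):

train_loader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=False)
val_loader = torch.utils.data.DataLoader(valset, batch_size=32, shuffle=False)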
I encounter the following problem.
I perform an increasing cross-validation: I have 20 subjects in my dataset and try to classify images. I start with 3 subjects and perform a cross-validation with k=3; that is, I train 3 different models and validate each on the subject left out. I do the same for 4, 5, ..., 20 subjects, so I end up with a lot of trained models.
Now I wanted to check the performance of all models on another dataset, but for some reason the accuracy is the same for all models, which must be a bug somewhere.
I already use copy.deepcopy(), so I must have an error somewhere else.
I'm open to any hints!
Here is the code for the training function:
def train_model(model, num_classes, dirname, trainloader, valloader, trainset_size, valset_size, criterion, optimizer, scheduler, patience, min_delta, num_epochs, fold):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    train_loss, train_acc, val_loss, val_acc = [], [], [], []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    early_stopping = False
    counter = 0
    last_train_epoch = 0

    for epoch in range(num_epochs):
        if early_stopping:
            print('\nEarly Stopping')
            break

        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                dataloader = trainloader
                dataset_size = trainset_size
            else:
                model.eval()   # Set model to evaluate mode
                dataloader = valloader
                dataset_size = valset_size

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloader:
                model = model.to(device)
                inputs = inputs.to(device)
                #labels = labels.long().to(device)
                labels = labels.to(device)  #test_tensor.type(torch.FloatTensor)

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # zero the parameter gradients
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_size
            epoch_acc = running_corrects.double() / dataset_size

            if phase == 'train':
                train_loss.append(epoch_loss)
                train_acc.append(epoch_acc)
            else:
                val_loss.append(epoch_loss)
                val_acc.append(epoch_acc)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # early stopping
            if phase == 'val':
                if counter == patience:
                    early_stopping = True
                    break
                if epoch == 0:
                    best_loss = epoch_loss
                else:
                    if best_loss >= epoch_loss + min_delta:
                        print('Validation loss decreased ({:.4f} --> {:.4f}). Saving model ...'.format(best_loss, epoch_loss))
                        best_model_wts = copy.deepcopy(model.state_dict())
                        torch.save(model.state_dict(), '{}/weights/model_fold_{}.pth'.format(dirname, fold))
                        last_train_epoch = epoch + 1
                        best_loss = epoch_loss
                        counter = 0
                    else:
                        counter += 1

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    # save best model
    return model, train_acc, train_loss, val_acc, val_loss, last_train_epoch
Here is how I call the function:
model = net
model = model.to(device)

# train
[model, train_acc, train_loss, val_acc, val_loss, last_train_epoch] = train_model(model, num_classes, dirname,
                                                                                  trainloader, valloader,
                                                                                  trainset_size, valset_size,
                                                                                  criterion, optimizer,
                                                                                  exp_lr_scheduler, patience,
                                                                                  min_delta, num_epochs, fold=val_index)

# test model
[preds_val, labels_val, idx_false_val, pred_time_val_fold] = test(model, valloader)
[preds_tr, labels_tr, idx_false_train, pred_time_train_fold] = test(model, trainloader)
[preds_all, labels_all, idx_false_all, pred_time_all_fold] = test(model, allloader)

print('Accuracy on all data: ', accuracy_score(labels_all, preds_all))
and for the sake of completeness, this is what the test() function looks like:
def test(model, dataloader):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")
    pred_labels, gt_labels, idx_false, pred_time = [], [], [], []
    was_training = model.training
    model.eval()

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            start_pred = time.clock()
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            end_pred = time.clock()
            pred_time.append(end_pred - start_pred)

            for j in range(inputs.size()[0]):
                pred_labels.append(preds[j].item())
                gt_labels.append(labels[j].item())

    for i in range(len(pred_labels)):
        if pred_labels[i] != gt_labels[i]:
            idx_false.append(i)

    model.train(mode=was_training)
    return pred_labels, gt_labels, idx_false, pred_time
Edit: It looks as if the same model is saved every time, even though I try to make sure that only the weights of the best model are saved.
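To narrow this down, one small diagnostic is to compare two of the saved fold checkpoints parameter by parameter (a sketch, assuming the file-naming pattern from train_model above; the fold indices 0 and 1 are just examples):

import torch

# Sketch: check whether two saved fold checkpoints actually contain different weights.
sd_a = torch.load('{}/weights/model_fold_{}.pth'.format(dirname, 0), map_location='cpu')
sd_b = torch.load('{}/weights/model_fold_{}.pth'.format(dirname, 1), map_location='cpu')

identical = all(torch.equal(sd_a[k], sd_b[k]) for k in sd_a)
print('Checkpoints identical:', identical)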
I'm new to PyTorch and I'm using a ResNet50 model for training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True)

for param in model.parameters():
    param.requires_grad = False

model.fc = nn.Sequential(nn.Linear(2048, 512), nn.ReLU(), nn.Dropout(0.2), nn.Linear(512, 10), nn.LogSoftmax(dim=1))
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.003)
model.to(device)

epochs = 1
steps = 0
running_loss = 0
print_every = 10
train_losses, test_losses = [], []

for epoch in range(epochs):
    for inputs, labels in trainloader:
        steps += 1
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        logps = model.forward(inputs)
        loss = criterion(logps, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if steps % print_every == 0:
            test_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for inputs, labels in testloader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    logps = model.forward(inputs)
                    batch_loss = criterion(logps, labels)
                    test_loss += batch_loss.item()

                    ps = torch.exp(logps)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

            train_losses.append(running_loss/len(trainloader))
            test_losses.append(test_loss/len(testloader))
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/print_every:.3f}.. "
                  f"Test loss: {test_loss/len(testloader):.3f}.. "
                  f"Test accuracy: {accuracy/len(testloader):.3f}")
            running_loss = 0
            model.train()
This code works fine in Google Colaboratory, but on my local machine (CPU), the execution of model.forward(inputs) gives an "Illegal instruction (core dumped)" error. I've tried updating my PyTorch version, but the problem persists.
Please try checking whether your code is actually running on the CPU or on a GPU (e.g. GPU 0). Example:
import torch
import pycuda.driver as cuda

cuda.init()

torch.cuda.is_available()
# True

## Get Id of default device
torch.cuda.current_device()
# 0

cuda.Device(0).name()  # '0' is the id of your GPU
# Tesla K80
Source: full reference at https://medium.com/p/speed-up-your-algorithms-part-1-pytorch-56d8a4ae7051
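A torch-only equivalent (no pycuda required; torch.cuda.get_device_name is the standard call) would be roughly:

import torch

print(torch.cuda.is_available())           # True if a CUDA device is visible
if torch.cuda.is_available():
    print(torch.cuda.current_device())     # id of the default device, e.g. 0
    print(torch.cuda.get_device_name(0))   # e.g. 'Tesla K80'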