Need help regarding Transfer Learning a Faster RCNN ResNet50FPN in PyTorch

I am new to PyTorch. I'm trying to use a pre-trained Faster RCNN torchvision.models.detection.fasterrcnn_resnet50_fpn() for an object detection project. I have created a CustomDataset(Dataset) class to handle the custom dataset.
Here is the custom class implementation
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""
    def __call__(self, sample):
        image, landmarks = sample['image'], sample['meta_data']
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C x H x W
        image = image.transpose((2, 0, 1))
        return {'image': torch.from_numpy(image),
                'meta_data': landmarks}
class CustomDataset(Dataset):
    """Custom Landmarks dataset."""
    def __init__(self, data_dir, root_dir, transform=None):
        """
        Args:
            data_dir (string): Directory with all the labels (json).
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_dir = data_dir
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(os.listdir(self.data_dir))

    def __getitem__(self, idx):
        img_name = sorted(os.listdir(self.root_dir))[idx]
        image = io.imread(self.root_dir + '/' + img_name, plugin='matplotlib')
        json_file = sorted(os.listdir(self.data_dir))[idx]
        with open(self.data_dir + '/' + json_file) as f:
            meta_data = json.load(f)
        meta_data = meta_data['annotation']['object']
        sample = {'image': image, 'meta_data': meta_data}
        to_tensor = ToTensor()
        transformed_sample = to_tensor(sample)
        if self.transform:
            sample = self.transform(sample)
        return transformed_sample
Here is the train_model function
def train_model(model, criterion, optimizer, lr_scheduler, num_epochs=25):
    since = time.time()
    best_model = model
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                optimizer = lr_scheduler(optimizer, epoch)
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            for data in dset_loaders[phase]:
                # get the inputs
                inputs, labels = data['image'], data['meta_data']
                inputs = inputs.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                outputs = model(inputs, labels)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)
                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels).item()
            epoch_loss = running_loss / dset_sizes[phase]
            epoch_acc = running_corrects / dset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model = copy.deepcopy(model)
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    return best_model
While performing model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=25) I am getting "RuntimeError: _thnn_upsample_bilinear2d_forward not supported on CUDAType for Byte"

It appears your data points are byte tensors, i.e. of type uint8. Try casting your data to float32:
# Replace this
inputs = inputs.to(device)
# With this
inputs = inputs.float().to(device)
Note that the torchvision models expect data to be normalized in a specific way. Check here for the procedure, which basically entails using
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
for normalizing your data.
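Putting the two fixes together, a minimal sketch of the preprocessing, assuming io.imread returns a uint8 H x W x C array as in the question's dataset:
import torch
from torchvision import transforms

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

def preprocess(image):
    # uint8 [0, 255] -> float32 [0.0, 1.0], and H x W x C -> C x H x W
    tensor = torch.from_numpy(image.transpose((2, 0, 1))).float() / 255.0
    return normalize(tensor)
Scaling by 255 first matters, since the mean/std constants above assume inputs in the [0, 1] range. Note also that in training mode the torchvision detection models expect targets as a list of dicts with 'boxes' (FloatTensor[N, 4]) and 'labels' (Int64Tensor[N]) keys and return a dict of losses rather than logits, so the criterion/torch.max lines in train_model will need adjusting as well.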

Related

Find Training/Validation Accuracy & Loss of Faster-RCNN PyTorch model

I am trying to find the training/validation accuracy and loss of my model for each epoch as I train it, so I can pick the best epoch to use from now on. I appreciate that there is a lot of information on this now, but this topic is very new to me and I find it very difficult to find the right answer for my situation.
I assume that I need to add in one or two bits to the train_one_epoch() and evaluate() functions in order to do this?
My model setup is:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.02, momentum=0.9, weight_decay=0.0001)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20,40], gamma=0.1)
And my training function is:
epochs = 50
for epoch in range(epochs):
    train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
    lr_scheduler.step()
    evaluate(model, val_data_loader, device=device)
    print("\n\n")
torch.save(model, f'./Models/trained_{ds}_model_Epoch{epochs}_LR0_02.pt')
I am using COCO-like annotations, for example:
{'boxes': tensor([[316.9700, 242.5500, 464.1000, 442.1700], [ 39.2200, 172.6700, 169.8400, 430.9600]]), 'labels': tensor([2, 2]), 'image_id': tensor(1416), 'area': tensor([29370.1094, 33738.3789]), 'iscrowd': tensor([0, 0])}
The train_one_epoch and evaluate functions are from torchvision's detection reference script engine.py.
It seems like TensorBoard would be a good tool to use here, but I don't really know how to use it.
The train_one_epoch function from engine.py is:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = f"Epoch: [{epoch}]"
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()
        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)
        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    return metric_logger
The evaluate function is:
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = "Test:"
    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)
    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = list(img.to(device) for img in images)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(images)
        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time
        res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()
    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
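A hedged way to get per-epoch numbers without rewriting these functions: train_one_epoch already returns its MetricLogger, whose "loss" meter holds the epoch's average training loss, and evaluate returns the CocoEvaluator, whose COCO stats give mAP (detection has no single "accuracy", so mAP is the usual stand-in). A sketch, assuming the engine.py shown above:
train_losses, val_maps = [], []
for epoch in range(epochs):
    metric_logger = train_one_epoch(model, optimizer, train_data_loader,
                                    device, epoch, print_freq=20)
    # "loss" matches the meter name passed to metric_logger.update(...)
    train_losses.append(metric_logger.meters["loss"].global_avg)
    lr_scheduler.step()
    coco_evaluator = evaluate(model, val_data_loader, device=device)
    # stats[0] is AP @ IoU=0.50:0.95 for the "bbox" IoU type
    val_maps.append(coco_evaluator.coco_eval["bbox"].stats[0])

best_epoch = max(range(len(val_maps)), key=val_maps.__getitem__)
print(f"best epoch by validation mAP: {best_epoch}")
If you also want a validation loss, note that the detection models only return the loss dict in train mode, so you would need an extra pass over the validation loader with the model in train mode wrapped in torch.no_grad().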

How to calculate the f1-score?

I have PyTorch code to train a model that should be able to detect placeholder images among product images. I didn't write the code myself, as I am very inexperienced with CNNs and machine learning.
My boss told me to calculate the F1 score for that model, and I found out that the formula for that is 2 * ((precision * recall) / (precision + recall)), but I don't know how to get precision and recall. Is someone able to tell me how I can get those two parameters from the following code?
(Sorry for the long piece of code, but I didn't really know what is necessary and what isn't.)
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)
data_dir = "data"
# Models to choose from [resnet, alexnet, vgg, squeezenet, densenet, inception]
model_name = "resnet"
# Number of classes in the dataset [we have four classes A-Balik-Duz-Princess]
num_classes = 2
# Batch size for training (change depending on how much memory you have)
batch_size = 25
# Number of epochs to train for (This will need to be calculated in order to address under and over fitting issue)
num_epochs = 20
# Flag for feature extracting. When False, we fine tune the whole model,
# when True we only update the reshaped layer params
feature_extract = True
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    since = time.time()
    print("model is : ", model)
    val_acc_history = []
    val_loss_history = []
    train_acc_history = []
    train_loss_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients (This can be changed to the Adam and other optimizers)
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    # mode we calculate the loss by summing the final output and the auxiliary output
                    # but in testing we only consider the final output.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                val_loss_history.append(epoch_loss)
            if phase == 'train':
                train_acc_history.append(epoch_acc)
                train_loss_history.append(epoch_loss)
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history, train_acc_history, val_loss_history, train_loss_history
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False
###############################################
### Initialize and Reshape the Networks
###############################################
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    # Initialize these variables which will be set in this if statement. Each of these
    # variables is model specific.
    model_ft = None
    input_size = 0
    if model_name == "resnet":
        """ Resnet
        """
        model_ft = models.resnet152(pretrained=use_pretrained)
        # we can select any possible variation of ResNet such as resnet18, resnet34, resnet50, resnet101, and resnet152
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224
    elif model_name == "alexnet":
        """ Alexnet
        """
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224
    elif model_name == "vgg":
        """ VGG11_bn
        """
        model_ft = models.vgg11_bn(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224
    elif model_name == "squeezenet":
        """ Squeezenet
        """
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1, 1), stride=(1, 1))
        model_ft.num_classes = num_classes
        input_size = 224
    elif model_name == "densenet":
        """ Densenet
        """
        model_ft = models.densenet121(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
        input_size = 224
    elif model_name == "inception":
        """ Inception v3
        Be careful, expects (299,299) sized images and has auxiliary output
        """
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxiliary net
        num_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
        # Handle the primary net
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 299
    else:
        print("Invalid model name, exiting...")
        exit()
    return model_ft, input_size
# Initialize the model for this run
model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)
# Print the model we just instantiated
#print(model_ft)
########################
### LOAD DATA
########################
# Data augmentation and normalization for training
# there are multiple approaches for data augmentation which can be added in the future
# Just normalization for validation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        #transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
print("Initializing Datasets and Dataloaders...")
# Create training and validation datasets
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}
# Create training and validation dataloaders
dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=4) for x in ['train', 'val']}
# Detect if we have a GPU available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#############################
### Create the Optimizer
#############################
# Send the model to GPU
model_ft = model_ft.to(device)
# Gather the parameters to be optimized/updated in this run. If we are
# fine tuning we will be updating all parameters. However, if we are
# doing feature extract method, we will only update the parameters
# that we have just initialized, i.e. the parameters with requires_grad
# is True.
params_to_update = model_ft.parameters()
print("Params to learn:")
if feature_extract:
    params_to_update = []
    for name, param in model_ft.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t", name)
else:
    for name, param in model_ft.named_parameters():
        if param.requires_grad == True:
            print("\t", name)
# Observe that all parameters are being optimized; we can add leaky ReLU and much more
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
###########################
### Run Training and Validation Step
###########################
%time
# Setup the loss fxn
criterion = nn.CrossEntropyLoss()
# Train and evaluate
model_ft, hist, loss_t,vloss_acc, tloss_acc = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs, is_inception=(model_name=="inception"))
...
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
# Add these lines to obtain the f1 score for the current batch
# (move the tensors to the CPU first, and don't shadow the imported function)
from sklearn.metrics import f1_score
batch_f1 = f1_score(labels.cpu().numpy(), preds.cpu().numpy())
...
You can use sklearn to calculate f1_score
from sklearn.metrics import f1_score
X, y = get_data(...)
y_pred = model.predict(X)
f1_score(y, y_pred)
I think it's better to compute the F1 score with macro/micro averaging. Note that sklearn's f1_score expects the ground-truth labels first and hard class predictions (not the raw model outputs) second:
from sklearn.metrics import f1_score
print('F1-Score macro: ', f1_score(labels.cpu(), preds.cpu(), average='macro'))
print('F1-Score micro: ', f1_score(labels.cpu(), preds.cpu(), average='micro'))
The key difference between the micro and macro F1 scores is their behavior on imbalanced datasets: the micro F1 score often doesn't give an objective measure of model performance when the classes are imbalanced, whilst the macro F1 score does.
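The snippets above score a single batch; a per-epoch number is usually more meaningful, which means accumulating predictions over the whole validation loader first. A minimal sketch, assuming the model_ft, dataloaders_dict, and device defined in the question:
from sklearn.metrics import precision_score, recall_score, f1_score

model_ft.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for inputs, labels in dataloaders_dict['val']:
        outputs = model_ft(inputs.to(device))
        _, preds = torch.max(outputs, 1)        # hard class predictions
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

# precision and recall are the two ingredients of the F1 score
precision = precision_score(all_labels, all_preds, average='macro')
recall = recall_score(all_labels, all_preds, average='macro')
f1 = f1_score(all_labels, all_preds, average='macro')
print(f'precision={precision:.4f} recall={recall:.4f} f1={f1:.4f}')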

Model not saved after training in PyTorch

I have encountered the following problem.
I perform an increasing cross-validation: I have 20 subjects in my dataset and try to classify images. I start with 3 subjects and perform a cross-validation with k=3; that is, I train 3 different models and validate on the subject left out. I do the same for 4, 5, ..., 20 subjects, so I end up with a lot of trained models.
Now I wanted to check the performance of all models on another dataset, but for some reason the accuracy is the same for all models, which must be a bug somewhere.
I already use copy.deepcopy(), so I must have an error somewhere else.
I'm open to any hints!
Here is the code for the training function:
def train_model(model, num_classes, dirname, trainloader, valloader, trainset_size, valset_size, criterion, optimizer, scheduler, patience, min_delta, num_epochs, fold):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    train_loss, train_acc, val_loss, val_acc = [], [], [], []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    early_stopping = False
    counter = 0
    last_train_epoch = 0
    for epoch in range(num_epochs):
        if early_stopping:
            print('\nEarly Stopping')
            break
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                dataloader = trainloader
                dataset_size = trainset_size
            else:
                model.eval()   # Set model to evaluate mode
                dataloader = valloader
                dataset_size = valset_size
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloader:
                model = model.to(device)
                inputs = inputs.to(device)
                #labels = labels.long().to(device)
                labels = labels.to(device)  # test_tensor.type(torch.FloatTensor)
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # zero the parameter gradients
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()
            epoch_loss = running_loss / dataset_size
            epoch_acc = running_corrects.double() / dataset_size
            if phase == 'train':
                train_loss.append(epoch_loss)
                train_acc.append(epoch_acc)
            else:
                val_loss.append(epoch_loss)
                val_acc.append(epoch_acc)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            # early stopping
            if phase == 'val':
                if counter == patience:
                    early_stopping = True
                    break
                if epoch == 0:
                    best_loss = epoch_loss
                else:
                    if best_loss >= epoch_loss + min_delta:
                        print('Validation loss decreased ({:.4f} --> {:.4f}). Saving model ...'.format(best_loss, epoch_loss))
                        best_model_wts = copy.deepcopy(model.state_dict())
                        torch.save(model.state_dict(), '{}/weights/model_fold_{}.pth'.format(dirname, fold))
                        last_train_epoch = epoch + 1
                        best_loss = epoch_loss
                        counter = 0
                    else:
                        counter += 1
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    # save best model
    return model, train_acc, train_loss, val_acc, val_loss, last_train_epoch
Here is how I call the function:
model = net
model = model.to(device)
# train
[model, train_acc, train_loss, val_acc, val_loss, last_train_epoch] = train_model(model, num_classes, dirname,
                                                                                  trainloader, valloader,
                                                                                  trainset_size, valset_size,
                                                                                  criterion, optimizer,
                                                                                  exp_lr_scheduler, patience,
                                                                                  min_delta, num_epochs, fold=val_index)
# test model
[preds_val, labels_val, idx_false_val, pred_time_val_fold] = test(model, valloader)
[preds_tr, labels_tr, idx_false_train, pred_time_train_fold] = test(model, trainloader)
[preds_all, labels_all, idx_false_all, pred_time_all_fold] = test(model, allloader)
print('Accuracy on all data: ', accuracy_score(labels_all, preds_all))
and for the sake of completeness, this is what the test() function looks like:
def test(model, dataloader):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")
    pred_labels, gt_labels, idx_false, pred_time = [], [], [], []
    was_training = model.training
    model.eval()
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            start_pred = time.perf_counter()  # time.clock() was removed in Python 3.8
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            end_pred = time.perf_counter()
            pred_time.append(end_pred - start_pred)
            for j in range(inputs.size()[0]):
                pred_labels.append(preds[j].item())
                gt_labels.append(labels[j].item())
    for i in range(len(pred_labels)):
        if pred_labels[i] != gt_labels[i]:
            idx_false.append(i)
    model.train(mode=was_training)
    return pred_labels, gt_labels, idx_false, pred_time
Edit: It looks as if it always saves the same models even though I try to make sure that only the updated weights of the best model are saved.
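One thing worth checking, offered as an assumption since the fold loop itself isn't shown: model = net points at a single network object, so if every fold reuses it, each call to train_model fine-tunes the weights left over from the previous fold, and the saved checkpoints drift toward the same solution. A sketch of building a fresh model (and a fresh optimizer/scheduler, since they hold references to the old parameters) inside each fold; make_model and fold_indices are hypothetical names:
for val_index in fold_indices:                  # hypothetical fold indices
    model = make_model(num_classes).to(device)  # fresh weights every fold
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    model, train_acc, train_loss, val_acc, val_loss, last_train_epoch = train_model(
        model, num_classes, dirname, trainloader, valloader,
        trainset_size, valset_size, criterion, optimizer,
        exp_lr_scheduler, patience, min_delta, num_epochs, fold=val_index)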

CPU usage 99%+: Stuck when running two different PyTorch programs on two different GPUs

I have a very simple script which is supposed to run on one GPU. However, whenever I run one program on one GPU and trigger another run on a second GPU, both programs either get stuck or slow down significantly.
from __future__ import print_function
from __future__ import division
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import time
import os
import copy
#print("PyTorch Version: ",torch.__version__)
#print("Torchvision Version: ",torchvision.__version__)
# Number of classes in the dataset
num_classes = 2 # for now
# Batch size for training (change depending on how much memory you have)
batch_size = 32
# Number of epochs to train for
num_epochs = 50
# Flag for feature extracting. When False, we finetune the whole model,
# when True we only update the reshaped layer params
feature_extract = True
unfreeze_layer = 0
# Implement early stop
early_stop_tol = 5
parser = argparse.ArgumentParser()
parser.add_argument("gpu", type=str, help="gpu number (e.g. \"0\")")
parser.add_argument("unfreeze_layer", type=int)
parser.add_argument("path", type=str, help="path to input dir")
parser.add_argument("base_path", type=str, help="path to this python dir")
args = parser.parse_args()
data_dir = args.path
def unfreeze(model, min_layer):
    print("params to learn:")
    ct = 0
    for child in model.children():
        ct += 1
        if ct >= min_layer:
            # unfreeze only the parameters of this child module
            for name, param in child.named_parameters():
                param.requires_grad = True
                print("\t", name)
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, init_acc=0.0):
    since = time.time()
    val_acc_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = init_acc
    tol = 0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            if phase == 'train':
                writer.add_scalar("Loss/train", epoch_loss, epoch)
                writer.add_scalar("Acc/train", epoch_acc, epoch)
            if phase == 'val':
                writer.add_scalar("Loss/val", epoch_loss, epoch)
                writer.add_scalar("Acc/val", epoch_acc, epoch)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # save model
            # torch.save(model, os.path.join(model_output_path, "model_" + str(epoch)))
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                tol += 1
                if tol >= early_stop_tol:
                    print("reached max tol")
                    break
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

def initialize_model(num_classes, feature_extract, use_pretrained=True):
    """
    Initialize input model.
    """
    model_ft = models.resnet18(pretrained=use_pretrained)
    set_parameter_requires_grad(model_ft, feature_extract)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, num_classes)
    input_size = 224
    return model_ft, input_size
for learning_rate in np.geomspace(0.000001, 0.1, 100):
    for finetune_learning_rate in np.geomspace(0.0000001, 0.01, 80):
        # Initialize the model for this run
        model_ft, input_size = initialize_model(num_classes, feature_extract, use_pretrained=True)
        # Data augmentation and normalization for training
        # Just normalization for validation
        data_transforms = {
            'train': transforms.Compose([
                transforms.Resize(input_size),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
            'val': transforms.Compose([
                transforms.Resize(input_size),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
        }
        # Create training and validation datasets
        image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}
        # Create training and validation dataloaders
        dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=0) for x in ['train', 'val']}
        images, labels = next(iter(dataloaders_dict['train']))
        # Detect if we have a GPU available
        device = torch.device(args.gpu if torch.cuda.is_available() else "cpu")
        # Send the model to GPU
        model_ft = model_ft.to(device)
        # Gather the parameters to be optimized/updated in this run. If we are
        # finetuning we will be updating all parameters. However, if we are
        # doing feature extract method, we will only update the parameters
        # that we have just initialized, i.e. the parameters with requires_grad
        # is True.
        params_to_update = model_ft.parameters()
        print("Params to learn:")
        if feature_extract:
            params_to_update = []
            for name, param in model_ft.named_parameters():
                if param.requires_grad == True:
                    params_to_update.append(param)
                    print("\t", name)
        else:
            for name, param in model_ft.named_parameters():
                if param.requires_grad == True:
                    print("\t", name)
        writer = SummaryWriter(os.path.join(os.path.join(args.base_path, 'runs'), "batch_size" + str(learning_rate) + "_lr_" + str(finetune_learning_rate)))
        # Observe that all parameters are being optimized
        optimizer_ft = optim.SGD(params_to_update, lr=learning_rate, momentum=0.9)
        # Setup the loss fxn
        criterion = nn.CrossEntropyLoss()
        # Train and evaluate
        model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs)
        print("start finetuning")
        unfreeze(model_ft, args.unfreeze_layer)
        # Observe that all parameters are being optimized
        finetune_optimizer_ft = optim.SGD(params_to_update, lr=finetune_learning_rate, momentum=0.9)
        # Setup the loss fxn
        finetune_criterion = nn.CrossEntropyLoss()
        model_ft, hist = train_model(model_ft, dataloaders_dict, finetune_criterion, finetune_optimizer_ft, num_epochs=num_epochs, init_acc=hist[-1])
        writer.flush()
        writer.close()
        # Save model
        torch.save(model_ft, os.path.join(os.path.join(args.base_path, 'model') + "batch_size" + str(learning_rate) + "_lr_" + str(finetune_learning_rate)))
        del model_ft, criterion, optimizer_ft
        torch.cuda.empty_cache()
I pass in "0" as the gpu argument for the first program and "1" for the second program.
[screenshot: nvidia-smi status after running the two separate programs]
EDIT: It seems like our CPU usage reaches 99.9% when we run the above code. Are there any suggestions on how to optimize this? Thanks!
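One mitigation worth trying, offered as an assumption since the root cause isn't shown: by default each PyTorch process spawns intra-op threads for every CPU core, so two training processes on one machine can oversubscribe the CPU and stall each other. A sketch that caps the thread count per process (the counts are arbitrary examples; set them before any heavy work runs):
import os
import torch

os.environ.setdefault("OMP_NUM_THREADS", "4")  # cap OpenMP threads for this process
torch.set_num_threads(4)                       # cap PyTorch intra-op threads
torch.set_num_interop_threads(2)               # cap inter-op threads (call before any parallel work)
Pinning each process to a disjoint set of cores (e.g. with taskset on Linux) and raising the DataLoader's num_workers within that budget are other common knobs.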

PyTorch ResNet with zero loss

I am training a PyTorch ResNet model for image segmentation. I have two classes, and I am training with RGB images and corresponding binary masks. While my accuracy values are very high (~0.99), my loss values for both training and validation are zero throughout all epochs. How can I fix this issue?
# we do the spatial transformations first, and afterwards do any color augmentations
img_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(size=(patch_size, patch_size), pad_if_needed=True),  # these need to be in a reproducible order, first affine transforms and then color
    transforms.RandomResizedCrop(size=patch_size),
    transforms.RandomRotation(180),
    transforms.ColorJitter(brightness=0, contrast=0, saturation=0, hue=.5),
    transforms.RandomGrayscale(),
    transforms.ToTensor()
])
mask_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(size=(patch_size, patch_size), pad_if_needed=True),
    transforms.RandomResizedCrop(size=patch_size, interpolation=PIL.Image.NEAREST),
    transforms.RandomRotation(180),
])
dataset = {}
dataLoader = {}
for phase in phases:  # now for each of the phases, we're creating the dataloader
    # interestingly, given the batch size, i've not seen any improvements from using a num_workers > 0
    dataset[phase] = Dataset(f"/content/{dataname}_{phase}.pytable", img_transform=img_transform, mask_transform=mask_transform, edge_weight=edge_weight)
    dataLoader[phase] = DataLoader(dataset[phase], batch_size=batch_size,
                                   shuffle=True, num_workers=0, pin_memory=True)
optim = torch.optim.Adam(model.parameters())
nclasses = dataset["train"].numpixels.shape[1]
class_weight = dataset["train"].numpixels[1, 0:2]  # don't take the ignored class into account here
class_weight = torch.from_numpy(1 - class_weight / class_weight.sum()).type('torch.FloatTensor').to(device)
print(class_weight)  # show final used weights, make sure that they're reasonable before continuing
criterion = nn.CrossEntropyLoss(weight=class_weight, ignore_index=ignore_index, reduce=False)  # reduce=False makes sure we get a 2D output instead of a 1D "summary" value
for epoch in range(num_epochs):
    # zero out epoch based performance variables
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)
    all_acc = {key: 0 for key in phases}
    all_loss = {key: torch.zeros(0).to(device) for key in phases}
    cmatrix = {key: np.zeros((2, 2)) for key in phases}
    for phase in phases:  # iterate through both training and validation states
        if phase == 'train':
            model.train()  # Set model to training mode
        else:  # when in eval mode, we don't want parameters to be updated
            model.eval()   # Set model to evaluate mode
        running_loss = 0.0
        running_corrects = 0
        for ii, (X, y, y_weight) in enumerate(dataLoader[phase]):  # for each of the batches
            optim.zero_grad()
            X = X.to(device)  # [Nbatch, 3, H, W]
            y_weight = y_weight.type('torch.FloatTensor').to(device)
            y = y.type('torch.LongTensor').to(device)  # [Nbatch, H, W] with class indices (0, 1)
            with torch.set_grad_enabled(phase == 'train'):  # dynamically set gradient computation; in case of validation, this isn't needed
                # disabling is good practice and improves inference time
                prediction = model_ft(X)  # [N, Nclass]
                y = y[:, 0, 0]
                loss = criterion(prediction, y)
                print(loss)
                _, preds = torch.max(X, 1)
                preds = preds[:, 0, 0]
                if phase == "train":  # in case we're in train mode, need to do back propagation
                    loss.mean().backward()
                    optim.step()
                running_loss += loss.data[0]
                running_corrects += torch.sum(preds == y)
        epoch_loss = running_loss / len(dataLoader[phase].dataset)
        epoch_acc = running_corrects.double() / len(dataLoader[phase].dataset)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
        # deep copy the model
        if phase == 'val' and epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())
        if phase == 'val':
            val_acc_history.append(epoch_acc)
    print()
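Two details in the loop above are worth flagging, as assumptions since the full model isn't shown: preds is taken from torch.max(X, 1), i.e. the input rather than prediction, and both y and preds are sliced down to a single pixel with [:, 0, 0], so neither the loss nor the accuracy reflects the whole mask. A sketch of whole-mask scoring for segmentation, keeping the question's reduce=False criterion:
with torch.set_grad_enabled(phase == 'train'):
    prediction = model(X)                 # [Nbatch, Nclass, H, W]
    loss = criterion(prediction, y)       # per-pixel loss map, since reduce=False
    _, preds = torch.max(prediction, 1)   # [Nbatch, H, W] predicted class per pixel
    if phase == 'train':
        loss.mean().backward()
        optim.step()
running_loss += loss.mean().item() * X.size(0)
running_corrects += (preds == y).float().mean().item() * X.size(0)  # mean pixel accuracy, weighted by batch size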
