Trained model for image classification always returns the 0th index in PyTorch

I've used transfer learning to classify images (ants vs. bees). To be honest, I just copy-pasted the exact code available here, saved the model from the PyCharm IDE, then later loaded it and passed a single image to predict the output. But instead of the actual prediction, I keep getting the same zeroth-index value defined inside my `classes` list.
This is the code for the whole model:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import pickle


def main():
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    data_dir = 'hymenoptera_data'
    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                              data_transforms[x])
                      for x in ['train', 'val']}
    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                                  shuffle=True, num_workers=4)
                   for x in ['train', 'val']}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def imshow(inp, title=None):
        """Imshow for Tensor."""
        inp = inp.numpy().transpose((1, 2, 0))
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        inp = std * inp + mean
        inp = np.clip(inp, 0, 1)
        plt.imshow(inp)
        if title is not None:
            plt.title(title)
        plt.pause(0.001)  # pause a bit so that plots are updated

    # Get a batch of training data
    inputs, classes = next(iter(dataloaders['train']))

    # Make a grid from batch
    out = torchvision.utils.make_grid(inputs)

    imshow(out, title=[class_names[x] for x in classes])

    def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
        since = time.time()

        best_model_wts = copy.deepcopy(model.state_dict())
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history only if in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # deep copy the model
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(best_model_wts)
        return model

    def visualize_model(model, num_images=6):
        was_training = model.training
        model.eval()
        images_so_far = 0
        fig = plt.figure()

        with torch.no_grad():
            for i, (inputs, labels) in enumerate(dataloaders['val']):
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)

                for j in range(inputs.size()[0]):
                    images_so_far += 1
                    ax = plt.subplot(num_images // 2, 2, images_so_far)
                    ax.axis('off')
                    ax.set_title(f'predicted: {class_names[preds[j]]}')
                    imshow(inputs.cpu().data[j])

                    if images_so_far == num_images:
                        model.train(mode=was_training)
                        return
            model.train(mode=was_training)

    model_ft = models.resnet18(pretrained=True)
    num_ftrs = model_ft.fc.in_features
    # Here the size of each output sample is set to 2.
    # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
    model_ft.fc = nn.Linear(num_ftrs, 2)

    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    ###
    # save using pickle
    # pickle.dump(model_ft, open('model.pkl', 'wb'))
    ###
    # save using torch
    # def save_model(model, best_acc):
    #     state = {
    #         'model': model_ft,
    #         'acc': best_acc,
    #     }
    torch.save(model_ft, './best_model.pth')


if __name__ == '__main__':
    main()
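(As an aside, not part of the original question: torch.save(model_ft, ...) pickles the whole module object, which ties the checkpoint to the exact class and file layout at save time. The more robust PyTorch convention is to save the state_dict and rebuild the model before loading; a minimal sketch, reusing the names from the script above:)

# Save only the weights.
torch.save(model_ft.state_dict(), './best_model_state.pth')

# Later: rebuild the same architecture, then load the weights into it.
model = models.resnet18()
model.fc = nn.Linear(model.fc.in_features, 2)
model.load_state_dict(torch.load('./best_model_state.pth'))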
Now, loading the saved model and passing an image to it:
from __future__ import print_function, division

import torch
import numpy as np
from torchvision import transforms
import PIL.Image as Image

classes = [
    "ants",
    "bees",
]

# loading model
model = torch.load('best_model.pth')

# transform the image
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(torch.Tensor(mean), torch.Tensor(std))
])


def classify(model, image_transforms, image_path, classes):
    model = model.eval()
    image = Image.open(image_path)
    image = image_transforms(image).float()
    image = image.unsqueeze(0)

    output = model(image)
    _, predicted = torch.max(output.data, 1)

    print(classes[predicted.item()])


classify(model, image_transforms, "beeimage.jpg", classes)
The output should be bees, but instead it is ants. The console output is:
C:\Users\prasa\Desktop\DL\venv\Scripts\python.exe C:\Users\prasa\Desktop\DL\callmod1.py
ants
Process finished with exit code 0

Well, I realized I made a silly mistake; I should have debugged it before posting here.
My train_model function was never called in the training script. As a result, the model was never fine-tuned, and the saved network was just the pretrained backbone with a freshly initialized (untrained) final layer.
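For reference, the fix is simply to invoke the training loop before saving; a minimal sketch of the end of main(), reusing the variable names from the code above:

# Fine-tune the network first, then save the best weights.
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=25)
torch.save(model_ft, './best_model.pth')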

Related

Function AddmmBackward returned an invalid gradient

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim


class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split


def UploadData(path, train):
    # set up transforms for train and test datasets
    train_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1),
                                           transforms.Resize(255), transforms.CenterCrop(224),
                                           transforms.RandomRotation(30),
                                           transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    valid_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1),
                                           transforms.Resize(255), transforms.CenterCrop(224),
                                           transforms.RandomRotation(30),
                                           transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    test_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1),
                                          transforms.Resize(255), transforms.CenterCrop(224),
                                          transforms.ToTensor()])

    # set up datasets from Image Folders
    train_dataset = datasets.ImageFolder(path + '/train', transform=train_transforms)
    valid_dataset = datasets.ImageFolder(path + '/validation', transform=valid_transforms)
    test_dataset = datasets.ImageFolder(path + '/test', transform=test_transforms)

    # set up dataloaders with batch size of 32
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=True)
    testloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

    return trainloader, validloader, testloader


trainloader, validloader, testloader = UploadData("/home/lns/research/dataset", True)

epochs = 5
min_valid_loss = np.inf

for e in range(epochs):
    train_loss = 0.0
    for data, labels in trainloader:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            print("using GPU for data")
            data, labels = data.cuda(), labels.cuda()
        # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        target = net(data)
        # Find the Loss
        loss = criterion(target, labels)
        # Calculate gradients
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate Loss
        train_loss += loss.item()

    valid_loss = 0.0
    net.eval()  # Optional when not using model-specific layers
    for data, labels in validloader:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            print("using GPU for data")
            data, labels = data.cuda(), labels.cuda()
        # Forward Pass
        target = net(data)
        # Find the Loss
        loss = criterion(target, labels)
        # Calculate Loss
        valid_loss += loss.item()

    print('Epoch ', e + 1, '\t\t Training Loss: ', train_loss / len(trainloader),
          ' \t\t Validation Loss: ', valid_loss / len(validloader))

    if min_valid_loss > valid_loss:
        print("Validation Loss Decreased(", min_valid_loss, "--->", valid_loss, ") \t Saving The Model")
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(net.state_dict(), '/home/lns/research/MODEL.pth')
After searching a lot, I am asking for help. Can someone help me understand why this error occurs in backward propagation? I followed the PyTorch CNN tutorial and a GeeksforGeeks tutorial. The dataset is X-ray images converted to grayscale and resized to 255. Is my neural network wrong, or is the data not processed correctly?
This is a size mismatch between the output of your CNN and the number of neurons in your first fully-connected layer. Because of the missing padding, the number of elements when flattened is 16*4*4, i.e. 256 (and not 16*5*5):
self.fc1 = nn.Linear(256, 120)
Once modified, the model will run correctly:
>>> model = NeuralNetwork()
>>> model(torch.rand(1, 1, 28, 28)).shape
torch.Size([1, 3])
Alternatively, you can use an nn.LazyLinear, which will deduce the in_features argument during the very first inference based on its input shape:
self.fc1 = nn.LazyLinear(120)
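If you would rather not hand-derive the flattened size, another common trick (my sketch, not part of the original answer) is to push a dummy tensor through the convolutional stack once and read the size off:

import torch
import torch.nn as nn
import torch.nn.functional as F

conv1 = nn.Conv2d(1, 6, 5)
conv2 = nn.Conv2d(6, 16, 5)
pool = nn.MaxPool2d(2, 2)

with torch.no_grad():
    # Dummy input with the spatial size the network will actually see.
    x = torch.zeros(1, 1, 28, 28)
    x = pool(F.relu(conv1(x)))
    x = pool(F.relu(conv2(x)))
    n_features = x.flatten(1).shape[1]  # 16 * 4 * 4 = 256

fc1 = nn.Linear(n_features, 120)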

How to calculate the f1-score?

I have PyTorch code to train a model that should be able to detect placeholder images among product images. I didn't write the code myself, as I am very inexperienced with CNNs and machine learning.
My boss told me to calculate the F1 score for that model, and I found out that the formula is 2 * (precision * recall) / (precision + recall), but I don't know how I get precision and recall. Is someone able to tell me how I can get those two values from the following code?
(Sorry for the long piece of code, but I didn't really know what is necessary and what isn't.)
from __future__ import print_function
from __future__ import division

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

print("PyTorch Version: ", torch.__version__)
print("Torchvision Version: ", torchvision.__version__)

data_dir = "data"
# Models to choose from [resnet, alexnet, vgg, squeezenet, densenet, inception]
model_name = "resnet"
# Number of classes in the dataset [we have four classes A-Balik-Duz-Princess]
num_classes = 2
# Batch size for training (change depending on how much memory you have)
batch_size = 25
# Number of epochs to train for (this will need to be tuned to address under- and over-fitting)
num_epochs = 20
# Flag for feature extracting. When False, we fine-tune the whole model;
# when True we only update the reshaped layer params
feature_extract = True


def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    since = time.time()
    print("model is : ", model)
    val_acc_history = []
    val_loss_history = []
    train_acc_history = []
    train_loss_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients (the optimizer can be changed to Adam and others)
                optimizer.zero_grad()

                # forward
                # track history only if in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    # mode we calculate the loss by summing the final output and the auxiliary output
                    # but in testing we only consider the final output.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                val_loss_history.append(epoch_loss)
            if phase == 'train':
                train_acc_history.append(epoch_acc)
                train_loss_history.append(epoch_loss)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history, train_acc_history, val_loss_history, train_loss_history


def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False


###############################################
### Initialize and Reshape the Networks
###############################################
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    # Initialize these variables which will be set in this if statement. Each of these
    # variables is model specific.
    model_ft = None
    input_size = 0

    if model_name == "resnet":
        """ Resnet18
        """
        model_ft = models.resnet152(pretrained=use_pretrained)
        # we can select any variation of ResNet, such as resnet18, resnet34, resnet50, resnet101, or resnet152
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "alexnet":
        """ Alexnet
        """
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "vgg":
        """ VGG11_bn
        """
        model_ft = models.vgg11_bn(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "squeezenet":
        """ Squeezenet
        """
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1, 1), stride=(1, 1))
        model_ft.num_classes = num_classes
        input_size = 224

    elif model_name == "densenet":
        """ Densenet
        """
        model_ft = models.densenet121(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "inception":
        """ Inception v3
        Be careful, expects (299,299) sized images and has auxiliary output
        """
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxiliary net
        num_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
        # Handle the primary net
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 299

    else:
        print("Invalid model name, exiting...")
        exit()

    return model_ft, input_size


# Initialize the model for this run
model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)

# Print the model we just instantiated
# print(model_ft)

########################
### LOAD DATA
########################
# Data augmentation and normalization for training;
# there are multiple approaches for data augmentation which can be added in the future.
# Just normalization for validation.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        # transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

print("Initializing Datasets and Dataloaders...")

# Create training and validation datasets
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}
# Create training and validation dataloaders
dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=4) for x in ['train', 'val']}

# Detect if we have a GPU available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#############################
### Create the Optimizer
#############################
# Send the model to GPU
model_ft = model_ft.to(device)

# Gather the parameters to be optimized/updated in this run. If we are
# fine-tuning we will be updating all parameters. However, if we are
# doing the feature-extract method, we will only update the parameters
# that we have just initialized, i.e. the parameters with requires_grad
# set to True.
params_to_update = model_ft.parameters()
print("Params to learn:")
if feature_extract:
    params_to_update = []
    for name, param in model_ft.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t", name)
else:
    for name, param in model_ft.named_parameters():
        if param.requires_grad == True:
            print("\t", name)

# Observe that all parameters are being optimized; we can add leaky ReLU and much more
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

###########################
### Run Training and Validation Step
###########################
%time
# Setup the loss fxn
criterion = nn.CrossEntropyLoss()

# Train and evaluate
model_ft, hist, loss_t, vloss_acc, tloss_acc = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs, is_inception=(model_name == "inception"))
...
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)

# Add these lines to obtain the F1 score for the current batch
# (move the tensors to the CPU first, and don't shadow the sklearn name):
from sklearn.metrics import f1_score
batch_f1 = f1_score(labels.cpu().data, preds.cpu())
...
You can use sklearn to calculate the F1 score:
from sklearn.metrics import f1_score
X, y = get_data(...)
y_pred = model.predict(X)
f1_score(y, y_pred)
I think it's better to compute the F1 score with macro/micro averaging. Note that sklearn expects (y_true, y_pred) as CPU arrays of class labels, not raw model outputs:
from sklearn.metrics import f1_score

print('F1-Score macro: ', f1_score(labels.cpu(), preds.cpu(), average='macro'))
print('F1-Score micro: ', f1_score(labels.cpu(), preds.cpu(), average='micro'))
The key difference between the micro and macro F1 scores is their behavior on imbalanced datasets: the micro F1 score often doesn't return an objective measure of model performance when the classes are imbalanced, whilst the macro F1 score is able to do so.
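Putting the pieces together, here is a minimal sketch (assuming the model_ft, dataloaders_dict, and device names from the question) that accumulates predictions over the whole validation set before calling sklearn, which is more meaningful than a per-batch score:

import torch
from sklearn.metrics import f1_score

def epoch_f1(model, loader, device, average='macro'):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs.to(device))
            all_preds.append(outputs.argmax(dim=1).cpu())
            all_labels.append(labels)
    y_pred = torch.cat(all_preds).numpy()
    y_true = torch.cat(all_labels).numpy()
    return f1_score(y_true, y_pred, average=average)

print('val F1:', epoch_f1(model_ft, dataloaders_dict['val'], device))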

How can I run this on the GPU?

How can I make this classifier run on the GPU? I tried adding resnet18.cuda(), but it gives me an error when I run train(epochs=20), and also when I run show_preds() before train(epochs=20). The error displayed is:
RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same
Can you help me do it?
import os
import shutil
import random
import torch
import torchvision
import numpy as np
from google.colab import files
import time
import copy
import urllib
import PIL
import json
import torch.nn.functional as F
from PIL import Image
from matplotlib import pyplot as plt

torch.manual_seed(0)

class_names = ['normal', 'viral', 'covid']
root_dir = 'COVID-19 Radiography Database'
source_dirs = ['NORMAL', 'Viral Pneumonia', 'COVID-19']

if os.path.isdir(os.path.join(root_dir, source_dirs[1])):
    os.mkdir(os.path.join(root_dir, 'test'))

    for i, d in enumerate(source_dirs):
        os.rename(os.path.join(root_dir, d), os.path.join(root_dir, class_names[i]))

    for c in class_names:
        os.mkdir(os.path.join(root_dir, 'test', c))

    for c in class_names:
        images = [x for x in os.listdir(os.path.join(root_dir, c)) if x.lower().endswith('png')]
        selected_images = random.sample(images, 30)
        for image in selected_images:
            source_path = os.path.join(root_dir, c, image)
            target_path = os.path.join(root_dir, 'test', c, image)
            shutil.move(source_path, target_path)


class ChestXRayDataset(torch.utils.data.Dataset):
    def __init__(self, image_dirs, transform):
        def get_images(class_name):
            images = [x for x in os.listdir(image_dirs[class_name]) if x[-3:].lower().endswith('png')]
            print(f'Found {len(images)} {class_name} examples')
            return images

        self.images = {}
        self.class_names = ['normal', 'viral', 'covid']

        for class_name in self.class_names:
            self.images[class_name] = get_images(class_name)

        self.image_dirs = image_dirs
        self.transform = transform

    def __len__(self):
        return sum([len(self.images[class_name]) for class_name in self.class_names])

    def __getitem__(self, index):
        class_name = random.choice(self.class_names)
        index = index % len(self.images[class_name])
        image_name = self.images[class_name][index]
        image_path = os.path.join(self.image_dirs[class_name], image_name)
        image = Image.open(image_path).convert('RGB')
        return self.transform(image), self.class_names.index(class_name)


train_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(224, 224)),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(224, 224)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

train_dirs = {
    'normal': '/content/gdrive/MyDrive/covidcalssifierdataset/COVID-19 Radiography Database/normal',
    'viral': '/content/gdrive/MyDrive/covidcalssifierdataset/COVID-19 Radiography Database/viral',
    'covid': '/content/gdrive/MyDrive/covidcalssifierdataset/COVID-19 Radiography Database/covid'
}

train_dataset = ChestXRayDataset(train_dirs, train_transform)

test_dirs = {
    'normal': '/content/gdrive/MyDrive/covidcalssifierdataset/COVID-19 Radiography Database/test/normal',
    'viral': '/content/gdrive/MyDrive/covidcalssifierdataset/COVID-19 Radiography Database/test/normal',
    'covid': '/content/gdrive/MyDrive/covidcalssifierdataset/COVID-19 Radiography Database/test/covid'
}

test_dataset = ChestXRayDataset(test_dirs, test_transform)

batch_size = 6

dl_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dl_test = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

print('Number of training batches', len(dl_train))
print('Number of test batches', len(dl_test))

class_names = train_dataset.class_names


def show_images(images, labels, preds):
    plt.figure(figsize=(8, 4))
    for i, image in enumerate(images):
        plt.subplot(1, 6, i + 1, xticks=[], yticks=[])
        image = image.numpy().transpose((1, 2, 0))
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        image = image * std + mean
        image = np.clip(image, 0., 1.)
        plt.imshow(image)
        col = 'green'
        if preds[i] != labels[i]:
            col = 'red'
        plt.xlabel(f'{class_names[int(labels[i].numpy())]}')
        plt.ylabel(f'{class_names[int(preds[i].numpy())]}', color=col)
    plt.tight_layout()
    plt.show()


images, labels = next(iter(dl_train))
show_images(images, labels, labels)

images, labels = next(iter(dl_test))
show_images(images, labels, labels)

resnet18 = torchvision.models.resnet18(pretrained=True)
resnet18.fc = torch.nn.Linear(in_features=512, out_features=3)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet18.parameters(), lr=3e-5)


def show_preds():
    resnet18.eval()
    images, labels = next(iter(dl_test))
    outputs = resnet18(images)
    _, preds = torch.max(outputs, 1)
    show_images(images, labels, preds)


def train(epochs):
    print('Starting training..')
    for e in range(0, epochs):
        print('=' * 20)
        print(f'Starting epoch {e + 1}/{epochs}')
        print('=' * 20)

        train_loss = 0.
        val_loss = 0.

        resnet18.train()  # set model to training phase

        for train_step, (images, labels) in enumerate(dl_train):
            optimizer.zero_grad()
            outputs = resnet18(images)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            if train_step % 20 == 0:
                print('Evaluating at step', train_step)
                accuracy = 0
                resnet18.eval()  # set model to eval phase

                for val_step, (images, labels) in enumerate(dl_test):
                    outputs = resnet18(images)
                    loss = loss_fn(outputs, labels)
                    val_loss += loss.item()

                    _, preds = torch.max(outputs, 1)
                    accuracy += sum((preds == labels).numpy())

                val_loss /= (val_step + 1)
                accuracy = accuracy / len(test_dataset)
                print(f'Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')

                show_preds()
                resnet18.train()

                if accuracy >= 0.95:
                    print('Performance condition satisfied, stopping..')
                    return

        train_loss /= (train_step + 1)
        print(f'Training Loss: {train_loss:.4f}')

    print('Training complete..')


train(epochs=20)
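For what it's worth, the usual fix for that RuntimeError (a sketch on my part, not from the original post) is to move both the model and every batch of data onto the same device; the names resnet18, dl_train, and loss_fn are taken from the code above:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
resnet18 = resnet18.to(device)

for images, labels in dl_train:
    # The error occurs when the model lives on the GPU but the inputs
    # are still CPU tensors, so each batch must be moved as well.
    images, labels = images.to(device), labels.to(device)
    outputs = resnet18(images)
    loss = loss_fn(outputs, labels)

Anything handed back to numpy or matplotlib afterwards, such as the tensors inside show_images or the (preds == labels).numpy() accuracy line, then needs a .cpu() call first.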

CPU usage 99%+ : Stuck when running two different pytorch programs with two different gpus

I have a very simple script that is supposed to run on one GPU. However, whenever I run one program on one GPU and trigger another run on a second GPU, both programs either get stuck or slow down significantly.
from __future__ import print_function
from __future__ import division

import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import time
import os
import copy

# print("PyTorch Version: ", torch.__version__)
# print("Torchvision Version: ", torchvision.__version__)

# Number of classes in the dataset
num_classes = 2  # for now
# Batch size for training (change depending on how much memory you have)
batch_size = 32
# Number of epochs to train for
num_epochs = 50
# Flag for feature extracting. When False, we finetune the whole model,
# when True we only update the reshaped layer params
feature_extract = True
unfreeze_layer = 0
# Implement early stop
early_stop_tol = 5

parser = argparse.ArgumentParser()
parser.add_argument("gpu", type=str, help="gpu number (e.g. \"0\" )")
parser.add_argument("unfreeze_layer", type=int)
parser.add_argument("path", type=str, help="path to input dir")
parser.add_argument("base_path", type=str, help="path to this python dir")
args = parser.parse_args()

data_dir = args.path


def unfreeze(model, min_layer):
    print("params to learn:")
    ct = 0
    for child in model.children():
        ct += 1
        if ct >= min_layer:
            for name, param in child.named_parameters():
                param.requires_grad = True
                print("\t", name)


def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, init_acc=0.0):
    since = time.time()
    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = init_acc
    tol = 0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history only if in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            if phase == 'train':
                writer.add_scalar("Loss/train", epoch_loss, epoch)
                writer.add_scalar("Acc/train", epoch_acc, epoch)
            if phase == 'val':
                writer.add_scalar("Loss/val", epoch_loss, epoch)
                writer.add_scalar("Acc/val", epoch_acc, epoch)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # save model
            # torch.save(model, os.path.join(model_output_path, "model_" + str(epoch)))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                tol += 1

        if tol >= early_stop_tol:
            print("reached max tol")
            break

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history


def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False


def initialize_model(num_classes, feature_extract, use_pretrained=True):
    """
    Initialize input model.
    """
    model_ft = models.resnet18(pretrained=use_pretrained)
    set_parameter_requires_grad(model_ft, feature_extract)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, num_classes)
    input_size = 224
    return model_ft, input_size


for learning_rate in np.geomspace(0.000001, 0.1, 100):
    for finetune_learning_rate in np.geomspace(0.0000001, 0.01, 80):
        # Initialize the model for this run
        model_ft, input_size = initialize_model(num_classes, feature_extract, use_pretrained=True)

        # Data augmentation and normalization for training
        # Just normalization for validation
        data_transforms = {
            'train': transforms.Compose([
                transforms.Resize(input_size),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
            'val': transforms.Compose([
                transforms.Resize(input_size),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
        }

        # Create training and validation datasets
        image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}
        # Create training and validation dataloaders
        dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=0) for x in ['train', 'val']}

        images, labels = next(iter(dataloaders_dict['train']))

        # Detect if we have a GPU available
        device = torch.device(args.gpu if torch.cuda.is_available() else "cpu")

        # Send the model to GPU
        model_ft = model_ft.to(device)

        # Gather the parameters to be optimized/updated in this run. If we are
        # finetuning we will be updating all parameters. However, if we are
        # doing the feature-extract method, we will only update the parameters
        # that we have just initialized, i.e. the parameters with requires_grad
        # set to True.
        params_to_update = model_ft.parameters()
        print("Params to learn:")
        if feature_extract:
            params_to_update = []
            for name, param in model_ft.named_parameters():
                if param.requires_grad == True:
                    params_to_update.append(param)
                    print("\t", name)
        else:
            for name, param in model_ft.named_parameters():
                if param.requires_grad == True:
                    print("\t", name)

        writer = SummaryWriter(os.path.join(os.path.join(args.base_path, 'runs'), "batch_size" + str(learning_rate) + "_lr_" + str(finetune_learning_rate)))

        # Observe that all parameters are being optimized
        optimizer_ft = optim.SGD(params_to_update, lr=learning_rate, momentum=0.9)

        # Setup the loss fxn
        criterion = nn.CrossEntropyLoss()

        # Train and evaluate
        model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs)

        print("start finetuning")
        unfreeze(model_ft, args.unfreeze_layer)

        # Observe that all parameters are being optimized
        finetune_optimizer_ft = optim.SGD(params_to_update, lr=finetune_learning_rate, momentum=0.9)
        # Setup the loss fxn
        finetune_criterion = nn.CrossEntropyLoss()

        model_ft, hist = train_model(model_ft, dataloaders_dict, finetune_criterion, finetune_optimizer_ft, num_epochs=num_epochs, init_acc=hist[-1])

        writer.flush()
        writer.close()

        # Save model
        torch.save(model_ft, os.path.join(os.path.join(args.base_path, 'model') + "batch_size" + str(learning_rate) + "_lr_" + str(finetune_learning_rate)))

        del model_ft, criterion, optimizer_ft
        torch.cuda.empty_cache()
I pass in "0" for gpu for the first program, and "1" for the gpu for the second program.
nvidia-smi status after running two separate programs
EDIT : It seems like our CPU usage reaches 99.9% if we run the above code. Would there be any suggestion how to optimize? Thanks!
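One likely culprit (an assumption on my part, not something established in the post): by default, each PyTorch process uses as many intra-op CPU threads as there are cores, so two concurrent training runs oversubscribe the CPU even though they use different GPUs. Pinning the thread count per process, and moving data loading into worker processes, often helps; a sketch:

import torch

# Limit intra-op parallelism so two concurrent training processes
# don't contend for every CPU core.
torch.set_num_threads(4)

# Setting OMP_NUM_THREADS / MKL_NUM_THREADS before launching has a similar
# effect, e.g. OMP_NUM_THREADS=4 python train.py 0 ...
# Also consider DataLoader(..., num_workers=2) instead of num_workers=0,
# so image preprocessing does not run on the main training thread.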

PyTorch: Add validation error in training

I am using PyTorch to train a CNN model. Here is my network architecture:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as I


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1_bn = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.conv2_drop = nn.Dropout2d()
        self.conv2_bn = nn.BatchNorm2d(64)
        self.fc1 = torch.nn.Linear(53 * 53 * 64, 256)
        self.fc2 = nn.Linear(256, 136)

    def forward(self, x):
        x = F.relu(self.conv1_bn(self.pool(self.conv1(x))))
        x = F.relu(self.conv2_bn(self.pool(self.conv2_drop(self.conv2(x)))))
        x = x.view(-1, 53 * 53 * 64)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return x
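As a quick sanity check of the 53*53*64 flatten size (my addition, assuming 224x224 grayscale inputs, which is the spatial size that makes the arithmetic work out: 224 -> 220 -> 110 -> 106 -> 53):

import torch

net = Net()
out = net(torch.rand(1, 1, 224, 224))
print(out.shape)  # torch.Size([1, 136])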
Then I train the model as follows:
# prepare the net for training
net.train()

for epoch in range(n_epochs):  # loop over the dataset multiple times
    running_loss = 0.0

    # train on batches of data, assumes you already have train_loader
    for batch_i, data in enumerate(train_loader):
        # get the input images and their corresponding labels
        images = data['image']
        key_pts = data['keypoints']

        # flatten pts
        key_pts = key_pts.view(key_pts.size(0), -1)

        # wrap them in a torch Variable
        images, key_pts = Variable(images), Variable(key_pts)

        # convert variables to floats for regression loss
        key_pts = key_pts.type(torch.FloatTensor)
        images = images.type(torch.FloatTensor)

        # forward pass to get outputs
        output_pts = net(images)

        # calculate the loss between predicted and target keypoints
        loss = criterion(output_pts, key_pts)

        # zero the parameter (weight) gradients
        optimizer.zero_grad()

        # backward pass to calculate the weight gradients
        loss.backward()

        # update the weights
        optimizer.step()

        # print loss statistics
        running_loss += loss.item()  # loss.data[0] in very old PyTorch versions
I am wondering if it is possible to add the validation error to the training. I mean something like this (validation split) in Keras:
myModel.fit(trainX, trainY, epochs=50, batch_size=1, verbose=2, validation_split = 0.1)
Here is an example of how to split your dataset for training and validation, then switch between the two phases every epoch:
import numpy as np
import torch
from torchvision import datasets
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

# Example:
my_dataset = datasets.MNIST(root="/home/benjamin/datasets/mnist", train=True, download=True)
validation_split = 0.1

dataset_len = len(my_dataset)
indices = list(range(dataset_len))

# Randomly splitting indices:
val_len = int(np.floor(validation_split * dataset_len))
validation_idx = np.random.choice(indices, size=val_len, replace=False)
train_idx = list(set(indices) - set(validation_idx))

# Contiguous split
# train_idx, validation_idx = indices[split:], indices[:split]

## Defining the samplers for each phase based on the random indices:
train_sampler = SubsetRandomSampler(train_idx)
validation_sampler = SubsetRandomSampler(validation_idx)

train_loader = torch.utils.data.DataLoader(my_dataset, sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(my_dataset, sampler=validation_sampler)
data_loaders = {"train": train_loader, "val": validation_loader}
data_lengths = {"train": len(train_idx), "val": val_len}

# Training with Validation (your code + code from the PyTorch tutorial:
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)
n_epochs = 40
net = ...

for epoch in range(n_epochs):
    print('Epoch {}/{}'.format(epoch, n_epochs - 1))
    print('-' * 10)

    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            optimizer = scheduler(optimizer, epoch)
            net.train(True)   # Set model to training mode
        else:
            net.train(False)  # Set model to evaluate mode

        running_loss = 0.0

        # Iterate over data.
        for data in data_loaders[phase]:
            # get the input images and their corresponding labels
            images = data['image']
            key_pts = data['keypoints']

            # flatten pts
            key_pts = key_pts.view(key_pts.size(0), -1)

            # wrap them in a torch Variable
            images, key_pts = Variable(images), Variable(key_pts)

            # convert variables to floats for regression loss
            key_pts = key_pts.type(torch.FloatTensor)
            images = images.type(torch.FloatTensor)

            # forward pass to get outputs
            output_pts = net(images)

            # calculate the loss between predicted and target keypoints
            loss = criterion(output_pts, key_pts)

            # zero the parameter (weight) gradients
            optimizer.zero_grad()

            # backward + optimize only if in training phase
            if phase == 'train':
                loss.backward()
                # update the weights
                optimizer.step()

            # print loss statistics
            running_loss += loss.item()

        epoch_loss = running_loss / data_lengths[phase]
        print('{} Loss: {:.4f}'.format(phase, epoch_loss))
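As an aside (my addition, not part of the original answer): newer PyTorch versions ship torch.utils.data.random_split, which achieves the same split with less code:

import torch
from torch.utils.data import DataLoader, random_split

val_len = int(0.1 * len(my_dataset))
train_set, val_set = random_split(my_dataset, [len(my_dataset) - val_len, val_len])

data_loaders = {
    "train": DataLoader(train_set, batch_size=32, shuffle=True),
    "val": DataLoader(val_set, batch_size=32),
}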
