I am trying to implement the following function to save the model_state checkpoints:
def train_epoch(self):
for epoch in tqdm.trange(self.epoch, self.max_epoch, desc='Train Epoch', ncols=100):
self.epoch = epoch # increments the epoch of Trainer
checkpoint = {} # fixme: here checkpoint!!!
# model_save_criteria = self.model_save_criteria
self.train()
if epoch % 1 == 0:
self.validate(checkpoint)
checkpoint_latest = {
'epoch': self.epoch,
'arch': self.model.__class__.__name__,
'model_state_dict': self.model.state_dict(),
'optimizer_state_dict': self.optim.state_dict(),
'model_save_criteria': self.model_save_criteria
}
checkpoint['checkpoint_latest'] = checkpoint_latest
torch.save(checkpoint, self.model_pth)
Previously I did the same by just running a for loop:
train_states = {}
for epoch in range(max_epochs):
running_loss = 0
time_batch_start = time.time()
model.train()
for bIdx, sample in enumerate(train_loader):
...
train...
validation...
train_states_latest = {
'epoch': epoch + 1,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'model_save_criteria': chosen_criteria}
train_states['train_states_latest'] = train_states_latest
torch.save(train_states, FILEPATH_MODEL_SAVE)
Are there ways to initiate the checkpoint={} and update it every loop? Or checkpoint={} in every epoch is fine since the model itself is holding the state_dict(). Just I am overwriting the checkpoint each time.
You can avoid overwriting the checkpoint by simply changing the FILEPATH_MODEL_SAVE path and have that path contain info on the epoch or iteration number. For example (taking your original code),
train_states = {}
for epoch in range(max_epochs):
running_loss = 0
time_batch_start = time.time()
model.train()
for bIdx, sample in enumerate(train_loader):
...
train...
validation...
train_states_latest = {
'epoch': epoch + 1,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'model_save_criteria': chosen_criteria}
train_states['train_states_latest'] = train_states_latest
# This is the code you can add
FILEPATH_MODEL_SAVE = "Epoch{}batch{}model_weights.pth".format(epoch, bIdx)
torch.save(train_states, FILEPATH_MODEL_SAVE)
With this new code above torch.save you avoid overwriting the checkpoint.
Sarthak
Related
I am training an LSTM and doing some hyperparameter tuning using Ray Tune that is based very much off of the PyTorch Tutorial here: https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html.
I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-161-e7423f64e4d4> in <module>()
108 best_trial = result.get_best_trial(metric = "loss", mode = "min")
109
--> 110 print("Best trial config: {}".format(best_trial.config))
111 print("Best trial final validation loss: {}".format(
112 best_trial.last_result["loss"]))
AttributeError: 'NoneType' object has no attribute 'config'
My code is:
checkpoint_dir = '/content/gdrive/MyDrive/Checkpoint_dir'
data_dir = './data'
epochs=5
def custom_train_part(config, checkpoint_dir=None, data_dir=None):
model = LSTM(len(True_IMF_df.T), config["Hidden"], config["Layers"], 1)
device = "cpu"
if torch.cuda.is_available():
device = "cuda:0"
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
criterion = nn.MSELoss()
if checkpoint_dir:
model_state, optimizer_state = torch.load(
os.path.join(checkpoint_dir, "checkpoint"))
model.load_state_dict(model_state)
optimizer.load_state_dict(optimizer_state)
for e in range(epochs):
running_loss = 0.0
epoch_steps = 0
model.train() # put model to training mode
x = x_hht_train.to(device)
y = y_hht_train.to(device)
scores = model(x)
loss = criterion(scores, y_hht_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
print(f"Running loss: {running_loss}")
epoch_steps += 1
if e % 5 == 0:
print(f'Epoch: {e}, loss = {loss.cpu().item()}')
# check_accuracy(loader_val, model)
print()
# Validation loss
val_loss = 0.0
val_steps = 0
total = 0
correct = 0
with torch.no_grad():
x = x_hht_val.to(device) # move to device, e.g. GPU
y = y_hht_val.to(device)
scores = model(x)
scores = scores.cpu()
y = y.cpu()
correct += (np.sign(scores) == np.sign(y)).sum().item()
print(f"Correct: {correct}")
loss = criterion(scores, y)
print(f"Val Loss: {loss.item()}")
val_loss += loss.cpu()
val_steps += 1
with tune.checkpoint_dir(e) as checkpoint_dir:
path = os.path.join(checkpoint_dir, "checkpoint")
torch.save((model.state_dict(), optimizer.state_dict()), path)
tune.report(loss=(val_loss / val_steps), accuracy = correct / len(y_hht_val))
print("Finished Training")
config = {
"lr": tune.loguniform(1e-6, 1e-1),
"Layers": tune.sample_from(lambda _: np.random.randint(1, 10)),
"Hidden" : tune.sample_from(lambda _: np.random.randint(2, 100))
}
scheduler = ASHAScheduler(
metric="loss",
mode="min",
max_t=10,
grace_period=1,
reduction_factor=2)
reporter = CLIReporter(
#parameter_columns=["lr", "Layers", "Hidden"],
metric_columns=["loss", "accuracy", "training_iteration"]
)
result = tune.run(
custom_train_part,
resources_per_trial={"cpu": 4, "gpu": 1},
config=config,
num_samples=1,
scheduler = scheduler,
progress_reporter=reporter
)
print(f"Results: {result.get_best_config(metric = 'loss', mode = 'min')}")
best_trial = result.get_best_trial(metric = "loss", mode = "min")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(
best_trial.last_result["accuracy"]))
##############################################################
# AFTER TUNNING #
##############################################################
best_trained_model = LSTM(len(True_IMF_df.T), best_trial.config["Hidden"], best_trial.config["Layers"], 1)
best_checkpoint_dir = best_trial.checkpoint.value
model_state, optimizer_state = torch.load(os.path.join(
best_checkpoint_dir, "checkpoint"))
best_trained_model.load_state_dict(model_state)
It is evident that the issue is with the tune.run() but I don't know why it returns a NoneType Object
I encounter the following problem.
I perform an increasing cross-validation; I have 20 subjects in my dataset and try to classify images. I start with 3 subjects and perform a cross-validation with k=3; that is I train 3 different models and validate on the subject left out. And this is what I do for 4, 5, ..., 20 drivers. Hence, I have a lot of models trained.
Now I wanted to check the performance of all models on another dataset, but for some reason the accuracy is the same for all models, which must be a bug somewhere.
I already use copy.deepcopy(), so I must have an error somewhere else.
I'm open for any hints!
Here is the code for the training function:
def train_model(model, num_classes, dirname, trainloader, valloader, trainset_size, valset_size, criterion, optimizer, scheduler, patience, min_delta, num_epochs, fold):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
since = time.time()
train_loss, train_acc, val_loss, val_acc = [], [], [], []
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
early_stopping = False
counter = 0
last_train_epoch = 0
for epoch in range(num_epochs):
if early_stopping:
print('\nEarly Stopping')
break
print('Epoch {}/{}'.format(epoch+1, num_epochs))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
model.train() # Set model to training mode
dataloader = trainloader
dataset_size = trainset_size
else:
model.eval() # Set model to evaluate mode
dataloader = valloader
dataset_size = valset_size
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloader:
model = model.to(device)
inputs = inputs.to(device)
#labels = labels.long().to(device)
labels = labels.to(device) #test_tensor.type(torch.FloatTensor)
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
# zero the parameter gradients
optimizer.zero_grad()
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
if phase == 'train':
scheduler.step()
epoch_loss = running_loss / dataset_size
epoch_acc = running_corrects.double() / dataset_size
if phase == 'train':
train_loss.append(epoch_loss)
train_acc.append(epoch_acc)
else:
val_loss.append(epoch_loss)
val_acc.append(epoch_acc)
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# early stopping
if phase == 'val':
if counter == patience:
early_stopping = True
break
if epoch == 0:
best_loss = epoch_loss
else:
if best_loss >= epoch_loss + min_delta:
print('Validation loss decreased ({:.4f} --> {:.4f}). Saving model ...'.format(best_loss,epoch_loss))
best_model_wts = copy.deepcopy(model.state_dict())
torch.save(model.state_dict(), '{}/weights/model_fold_{}.pth'.format(dirname,fold))
last_train_epoch = epoch + 1
best_loss = epoch_loss
counter = 0
else:
counter += 1
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
# save best model
return model, train_acc, train_loss, val_acc, val_loss, last_train_epoch
Here is how I call the function:
model = net
model = model.to(device)
# train
[model, train_acc, train_loss, val_acc, val_loss, last_train_epoch] = train_model(model, num_classes, dirname,\
trainloader, valloader,\
trainset_size, valset_size,\
criterion, optimizer, \
exp_lr_scheduler, patience, \
min_delta, num_epochs, fold=val_index)
# test model
[preds_val, labels_val, idx_false_val, pred_time_val_fold] = test(model, valloader)
[preds_tr, labels_tr, idx_false_train, pred_time_train_fold] = test(model, trainloader)
[preds_all, labels_all, idx_false_all, pred_time_all_fold] = test(model, allloader)
print('Accuracy on all data: ', accuracy_score(labels_all, preds_all))
and for the sake of completeness, this is what the test() function looks like:
def test(model, dataloader):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
pred_labels, gt_labels, idx_false, pred_time = [], [], [], []
was_training = model.training
model.eval()
with torch.no_grad():
for i, (inputs, labels) in enumerate(dataloader):
inputs = inputs.to(device)
labels = labels.to(device)
start_pred = time.clock()
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
end_pred = time.clock()
pred_time.append(end_pred-start_pred)
for j in range(inputs.size()[0]):
pred_labels.append(preds[j].item())
gt_labels.append(labels[j].item())
for i in range(len(pred_labels)):
if pred_labels[i] != gt_labels[i]:
idx_false.append(i)
model.train(mode=was_training)
return pred_labels, gt_labels, idx_false, pred_time
Edit: It looks as if it always saves the same models even though I try to make sure that only the updated weights of the best model are saved.
I tried to categorize the image of the skin. The following code is used to train the data, but I will encounter problems like the title when I train halfway.
use
python: 3.7
pytorch: 1.1
system: win10
start_epoch=0
for epoch in range(num_epochs):
print('Starting train epoch %d / %d' % (start_epoch + epoch + 1, num_epochs))
model = model.to(device)
print(device)
#model.train()
running_loss = 0
count = 0
epoch_loss = 0
#for i, (input, depth) in enumerate(train_loader):
for step,(input, depth) in enumerate(train_loader):
# input, depth = data
input = input.to(device)
depth = depth.to(device)
input_var=torch.tensor(input)
depth_var=torch.tensor(depth).squeeze(1)
#input_tensor = input_var.to(device)
#depth_tensor = depth_var.to(device)
output = model.forward(input_var)
#new_output = output.to(device)
#result = output.type(torch.FloatTensor)
loss = loss_fn(output, depth_var)
#loss = loss.type(torch.FloatTensor)
print('count: ',count,' loss:', loss.item())
count += 1
running_loss += loss.data.cpu().numpy()
optimizer.zero_grad()
loss.backward()
optimizer.step()
if count%100==0:
torch.save(model,"./pkl/cifar.pkl")
I am new to PyTorch. I'm trying to use a pre-trained Faster RCNN torchvision.models.detection.fasterrcnn_resnet50_fpn() for object detection project. I have created a CustomDataset(Dataset) class to handle the custom dataset.
Here is the custom class implementation
class ToTensor(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, sample):
image, landmarks = sample['image'], sample['meta_data']
# swap color axis because
# numpy image: H x W x C
# torch image: C X H X W
image = image.transpose((2, 0, 1))
return {'image': torch.from_numpy(image),
'meta_data': landmarks}
class CustomDataset(Dataset):
"""Custom Landmarks dataset."""
def __init__(self, data_dir, root_dir, transform=None):
"""
Args:
data_dir (string): Directory with all the labels(json).
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.data_dir = data_dir
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(os.listdir(self.data_dir))
def __getitem__(self, idx):
img_name = sorted(os.listdir(self.root_dir))[idx]
image = io.imread(self.root_dir+'/'+img_name, plugin='matplotlib')
json_file = sorted(os.listdir(self.data_dir))[idx]
with open(self.data_dir+'/'+json_file) as f:
meta_data = json.load(f)
meta_data = meta_data['annotation']['object']
sample = {'image': image, 'meta_data': meta_data}
to_tensor = ToTensor()
transformed_sample = to_tensor(sample)
if self.transform:
sample = self.transform(sample)
return transformed_sample
Here is the train_model function
def train_model(model, criterion, optimizer, lr_scheduler, num_epochs=25):
since = time.time()
best_model = model
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'test']:
if phase == 'train':
optimizer = lr_scheduler(optimizer, epoch)
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
for data in dset_loaders[phase]:
# get the inputs
inputs, labels = data['image'], data['meta_data']
inputs= inputs.to(device) # ,
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs, labels)
_, preds = torch.max(outputs.data, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item()
running_corrects += torch.sum(preds == labels).item()
epoch_loss = running_loss / dset_sizes[phase]
epoch_acc = running_corrects / dset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'test' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model = copy.deepcopy(model)
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
return best_model
While performing model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=25) I am getting "RuntimeError: _thnn_upsample_bilinear2d_forward not supported on CUDAType for Byte"
It appears your datapoints are byte tensors, i.e type uint8. Try casting your data into float32
# Replace this
inputs = inputs.to(device)
# With this
inputs = inputs.float().to(device)
Note that the torchvision models expect data to be normalized in a specific way. Check here for the procedure, which basically entails using
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
for normalizing your data.
I'm trying to use a custom loss function by extending nn.Module, but I can't get past the error
element 0 of variables does not require grad and does not have a grad_fn
Note: my labels are lists of size: num_samples, but each batch will have the same labels throughout the batch, so we shrink labels for the whole batch to be a single label by calling .diag()
My code is as follows and is based on the transfer learning tutorial:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
scheduler.step()
model.train(True) # Set model to training mode
else:
model.train(False) # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for data in dataloaders[phase]:
# get the inputs
inputs, labels = data
inputs = inputs.float()
# wrap them in Variable
if use_gpu:
inputs = Variable(inputs.cuda())
labels = Variable(labels.cuda())
else:
inputs = Variable(inputs)
labels = Variable(labels)
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs)
#outputs = nn.functional.sigmoid(outputs).round()
_, preds = torch.max(outputs, 1)
label = labels.diag().float()
preds = preds.float()
loss = criterion(preds, label)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.data[0] * inputs.size(0)
running_corrects += torch.sum(pred == label.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
and my loss function is defined below:
class CustLoss(nn.Module):
def __init__(self):
super(CustLoss, self).__init__()
def forward(self, outputs, labels):
return cust_loss(outputs, labels)
def cust_loss(pred, targets):
'''preds are arrays of size classes with floats in them'''
'''targets are arrays of all the classes from the batch'''
'''we sum the classes from the batch and find the num correct'''
r = torch.sum(pred == targets)
return r
Then I run the following to run the model:
model_ft = models.resnet18(pretrained=True)
for param in model_ft.parameters():
param.requires_grad = False
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 3)
if use_gpu:
model_ft = model_ft.cuda()
criterion = CustLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.fc.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,num_epochs=25)
I tried getting it to work with other loss functions to no avail. I always get the same error when loss.backward() is called.
It was my understanding that I wouldn't need a custom implementation of loss.backward if I extend nn.Module.
You are subclassing nn.Module to define a function, in your case Loss function. So, when you compute loss.backward(), it tries to store the gradients in the loss itself, instead of the model and there is no variable in the loss for which to store the gradients. Your loss needs to be a function and not a module. See Extending autograd.
You have two options here -
The easiest one is to directly pass cust_loss function as criterion parameter to train_model.
You can extend torch.autograd.Function to define the custom loss (and if you wish, the backward function as well).
P.S. - It is mentioned that you need to implement the backward of the custom loss functions. This is not always the case. It is required only when your loss function is non-differentiable at some point. But, I do not think so that you’ll need to do that.