I'm trying to use the Sharpness-Aware Minimization (SAM) optimizer in my code, using the already-built PyTorch implementation from here. I would also like to use gradient accumulation, but I have no idea how to make this work properly. Using the idea proposed in one of the closed issues for mixed precision:
def train(
    args, model, device, train_loader, optimizer, first_step_scaler, second_step_scaler, epoch
):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        enable_running_stats(model)

        # First forward step
        with autocast():
            output = model(data)
            loss = F.nll_loss(output, target)
        first_step_scaler.scale(loss).backward()

        # We unscale manually for two reasons: (1) SAM's first-step adds the gradient
        # to weights directly. So gradient must be unscaled; (2) unscale_ checks if any
        # gradient is inf and updates optimizer_state["found_inf_per_device"] accordingly.
        # We use optimizer_state["found_inf_per_device"] to decide whether to apply
        # SAM's first-step or not.
        first_step_scaler.unscale_(optimizer)

        optimizer_state = first_step_scaler._per_optimizer_states[id(optimizer)]

        # Check if any gradients are inf/nan
        inf_grad_cnt = sum(v.item() for v in optimizer_state["found_inf_per_device"].values())

        if inf_grad_cnt == 0:
            # if valid gradient, apply sam_first_step
            optimizer.first_step(zero_grad=True, mixed_precision=True)
            sam_first_step_applied = True
        else:
            # if invalid gradient, skip sam and revert to single optimization step
            optimizer.zero_grad()
            sam_first_step_applied = False

        # Update the scaler with no impact on the model (weights or gradient). This update step
        # resets the optimizer_state["found_inf_per_device"]. So, it is applied after computing
        # inf_grad_cnt. Note that zero_grad() has no impact on the update() operation,
        # because update() leverages optimizer_state["found_inf_per_device"]
        first_step_scaler.update()

        disable_running_stats(model)

        # Second forward step
        with autocast():
            output = model(data)
            loss = F.nll_loss(output, target)
        second_step_scaler.scale(loss).backward()

        if sam_first_step_applied:
            # If sam_first_step was applied, apply the 2nd step
            optimizer.second_step(mixed_precision=True)

        second_step_scaler.step(optimizer)
I tried something like this:
def train(
    args, model, device, train_loader, optimizer, first_step_scaler, second_step_scaler, epoch, gradient_acc=2
):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        enable_running_stats(model)

        # First forward step
        with autocast():
            output = model(data)
            loss = F.nll_loss(output, target)
            loss = loss / gradient_acc
        first_step_scaler.scale(loss).backward()

        # We unscale manually for two reasons: (1) SAM's first-step adds the gradient
        # to weights directly. So gradient must be unscaled; (2) unscale_ checks if any
        # gradient is inf and updates optimizer_state["found_inf_per_device"] accordingly.
        # We use optimizer_state["found_inf_per_device"] to decide whether to apply
        # SAM's first-step or not.
        first_step_scaler.unscale_(optimizer)

        optimizer_state = first_step_scaler._per_optimizer_states[id(optimizer)]

        # Check if any gradients are inf/nan
        inf_grad_cnt = sum(v.item() for v in optimizer_state["found_inf_per_device"].values())

        if inf_grad_cnt == 0:
            # if valid gradient, apply sam_first_step
            optimizer.first_step(zero_grad=True, mixed_precision=True)
            sam_first_step_applied = True
        else:
            # if invalid gradient, skip sam and revert to single optimization step
            optimizer.zero_grad()
            sam_first_step_applied = False

        # Update the scaler with no impact on the model (weights or gradient). This update step
        # resets the optimizer_state["found_inf_per_device"]. So, it is applied after computing
        # inf_grad_cnt. Note that zero_grad() has no impact on the update() operation,
        # because update() leverages optimizer_state["found_inf_per_device"]
        first_step_scaler.update()

        disable_running_stats(model)

        # Second forward step
        with autocast():
            output = model(data)
            loss = F.nll_loss(output, target)
            loss = loss / gradient_acc
        second_step_scaler.scale(loss).backward()

        if sam_first_step_applied:
            # If sam_first_step was applied, apply the 2nd step
            optimizer.second_step(mixed_precision=True)

        if (batch_idx + 1) % gradient_acc == 0:
            second_step_scaler.step(optimizer)
            second_step_scaler.update()
            optimizer.zero_grad()
But I noticed this makes my loss increase rather than decrease. Does anyone have any idea how to improve this?
I want to develop a lifelong learning system, so I need to prevent important parameters from changing. I read the related paper 'Memory Aware Synapses: Learning what (not) to forget', where a method was mentioned: I need to calculate the gradient of each parameter corresponding to each input image. How should I write my code in PyTorch?
You can do it using the standard optimization procedure and the .backward() method on your loss function.
First, scaling as defined in your link:
class Scaler:
    def __init__(self, parameters, delta):
        # Materialize the parameters in case a generator (e.g. model.parameters()) is passed,
        # so the scaler can be stepped more than once.
        self.parameters = list(parameters)
        self.delta = delta

    def step(self):
        """Multiplies gradients in place."""
        for param in self.parameters:
            if param.grad is None:
                raise ValueError("backward() has to be called before running scaler")
            param.grad *= self.delta
One can use it just like optimizer.step(); see the comments below:
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(10, 100), torch.nn.ReLU(), torch.nn.Linear(100, 1)
)

scaler = Scaler(model.parameters(), delta=0.001)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.MSELoss()

# Targets have shape (64, 1) to match the model's output shape
X, y = torch.randn(64, 10), torch.randn(64, 1)

# Optimization loop
EPOCHS = 10
for _ in range(EPOCHS):
    output = model(X)
    loss = criterion(output, y)
    loss.backward()  # Now model has the gradients
    optimizer.step()  # Optimize model's parameters
    print(next(model.parameters()).grad)
    scaler.step()  # Scale gradients
    optimizer.zero_grad()  # Zero gradients before next step
After scaler.step() you will have the scaled gradients available in param.grad for each parameter (just as they are accessed within Scaler's step method), so you can do whatever you want with them.
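For the MAS use case (gradients of each parameter per input image), here is a rough sketch, not a definitive MAS implementation, of how you could accumulate per-parameter importance weights from those gradients. It reuses model and X from the example above; the importance dict and the squared-output objective are just illustrative assumptions, so adapt them to the paper's exact formulation:
# Rough sketch only: per-parameter importance accumulated from single-image gradients.
# Assumes `import torch` and the `model` / `X` defined in the example above.
importance = {name: torch.zeros_like(p) for name, p in model.named_parameters()}

for x in X:                                    # one input sample at a time
    model.zero_grad()
    out = model(x.unsqueeze(0))                # forward pass for a single sample
    out.pow(2).sum().backward()                # differentiate the squared output norm
    for name, p in model.named_parameters():
        if p.grad is not None:
            importance[name] += p.grad.abs() / X.shape[0]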
The error message indicates that [torch.cuda.FloatTensor [256, 1, 4, 4]] is at version 2; expected version 1 instead, and execution breaks on d_loss.backward() — i.e., the backward call on my Discriminator.
UPDATE: Okay, I tracked it down to an optimizer.step() for my Generator that was happening before running .backward() on my Discriminator.
UPDATE 2: So once I got the model running on PyTorch 1.5 (by moving G's optimizer to after the d_loss.backward() call, as above), I noticed that losses were suddenly much higher during training. I let the model run for a few epochs and the images were basically noise. So, out of curiosity I switched back to my PyTorch 1.4 environment and ran the original for a few epochs, and the images were good again. It's a ClusterGAN that I'm training — so not the standard routine — and I'm wondering why this change is so detrimental to the output. Also, how can I get the model to run in PyTorch 1.5 without the degradation in performance? Presumably I have to keep the optimizer update where it was originally (right after ge_loss.backward(retain_graph=True)), but somehow avoid the error PyTorch 1.5 reports when we hit d_loss.backward() later in the code. I suppose I have to clone() something, but I'm not clear what... ?
[...]
# main training block
for epoch in range(n_epochs):
    for i, (imgs, itruth_label) in enumerate(dataloader):
        iter_count += 1

        # Ensure generator/encoder are trainable
        generator.train()
        encoder.train()

        # Zero gradients for models
        generator.zero_grad()
        encoder.zero_grad()
        discriminator.zero_grad()

        # Configure input
        real_imgs = Variable(imgs.type(Tensor))

        # ---------------------------
        #  Train Generator + Encoder
        # ---------------------------
        optimizer_GE.zero_grad()

        # Sample random latent variables
        zn, zc, zc_idx = sample_z(shape=imgs.shape[0],
                                  latent_dim=latent_dim,
                                  n_c=n_c)

        # Generate a batch of images
        gen_imgs = generator(zn, zc)

        # Discriminator output from real and generated samples
        D_gen = discriminator(gen_imgs)
        D_real = discriminator(real_imgs)

        # Step for Generator & Encoder, n_skip_iter times less than for discriminator
        did_update = False
        if (i % n_skip_iter == 0):
            # Encode the generated images
            enc_gen_zn, enc_gen_zc, enc_gen_zc_logits = encoder(gen_imgs)

            # Calculate losses for z_n, z_c
            zn_loss = mse_loss(enc_gen_zn, zn)
            zc_loss = xe_loss(enc_gen_zc_logits, zc_idx)

            # additional top-k step (from Sinha et al, 2020)
            if top_k <= D_gen.size()[0]:
                top_k_gen = torch.topk(D_gen, top_k, 0)
            else:
                top_k_gen = torch.topk(D_gen, D_gen.size()[0], 0)

            # Check requested metric
            if wass_metric:
                # Wasserstein GAN loss
                ge_loss = torch.mean(top_k_gen[0]) + betan * zn_loss + betac * zc_loss
            else:
                # Vanilla GAN loss
                valid = Variable(Tensor(gen_imgs.size(0), 1).fill_(1.0), requires_grad=False)
                v_loss = bce_loss(D_gen, valid)
                ge_loss = v_loss + betan * zn_loss + betac * zc_loss

            ge_loss.backward(retain_graph=True)

            # ---- ORIGINAL OPTIMIZER UPDATE ---- #
            optimizer_GE.step()
            scheduler.step(epoch + i / iters)
            did_update = True

        # ---------------------
        #  Train Discriminator
        # ---------------------
        optimizer_D.zero_grad()

        # Measure discriminator's ability to classify real from generated samples
        if wass_metric:
            # Gradient penalty term
            grad_penalty = calc_gradient_penalty(discriminator, real_imgs, gen_imgs)
            # Wasserstein GAN loss w/gradient penalty
            d_loss = torch.mean(D_real) - torch.mean(D_gen) + grad_penalty
        else:
            # Vanilla GAN loss
            fake = Variable(Tensor(gen_imgs.size(0), 1).fill_(0.0), requires_grad=False)
            real_loss = bce_loss(D_real, valid)
            fake_loss = bce_loss(D_gen, fake)
            d_loss = (real_loss + fake_loss) / 2

        d_loss.backward()

        # --- REVISED OPTIMIZER UPDATE FOR PyTorch 1.5 ------ #
        # if did_update:
        #     optimizer_GE.step()
        optimizer_D.step()
        # scheduler.step(epoch + i / iters)
[...]
If I understand correctly, the error occurs the second time you call .backward().
The problem is caused by calling .backward() on D_gen and D_real twice.
I don't know exactly what you're doing with this model, but I guess you don't have to backpropagate through and update the parameters of the Discriminator while training the Generator, right?
So, try this:
1. Set requires_grad of D.parameters() to False in the Train Generator + Encoder stage.
2. Set requires_grad of D.parameters() to True in the Train Discriminator stage.
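A minimal sketch of what that could look like in your loop (the helper name set_requires_grad is my own, not something from your code or a library):
def set_requires_grad(module, flag):
    # Enable/disable gradient computation for every parameter of the module
    for p in module.parameters():
        p.requires_grad = flag

# ---- Train Generator + Encoder ----
set_requires_grad(discriminator, False)   # freeze D while updating G/E
# ... ge_loss.backward(); optimizer_GE.step() ...

# ---- Train Discriminator ----
set_requires_grad(discriminator, True)    # unfreeze D for its own update
# ... d_loss.backward(); optimizer_D.step() ...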
I wrote a Python 3.5 script for doing street segmentation. Since I'm new to image segmentation, I did not use the predefined dataloaders from PyTorch; instead I wrote them myself (for better understanding). Until now I have only used a batch size of 1. Now I want to generalize this for arbitrary batch sizes.
This is a snippet of my Dataloader:
def augment_data(batch_size):
    # [...] defining some paths and data transformation (including ToTensor() function)

    # The images are named by numbers (frame numbers), this allows me to find the correct
    # label image for a given input image.
    all_input_image_paths = {int(elem.split('\\')[-1].split('.')[0]) : elem for idx, elem in enumerate(glob.glob(input_dir + "*"))}
    all_label_image_paths = {int(elem.split('\\')[-1].split('.')[0]) : elem for idx, elem in enumerate(glob.glob(label_dir + "*"))}

    dataloader = {"train": [], "val": []}
    all_samples = []
    img_counter = 0

    for key, value in all_input_image_paths.items():
        input_img = Image.open(all_input_image_paths[key])
        label_img = Image.open(all_label_image_paths[key])

        # Here I use my own augmentation function which crops the input and label on the
        # same position and does other things. We get a list of new augmented data.
        augmented_images = generate_augmented_images(input_img, label_img)
        for elem in augmented_images:
            input_as_tensor = data_transforms['norm'](elem[0])
            label_as_tensor = data_transforms['val'](elem[1])

            input_as_tensor.unsqueeze_(0)
            label_as_tensor.unsqueeze_(0)

            is_training_data = random.uniform(0.0, 1.0)
            if is_training_data <= 0.7:
                dataloader["train"].append([input_as_tensor, label_as_tensor])
            else:
                dataloader["val"].append([input_as_tensor, label_as_tensor])
            img_counter += 1

    shuffle(dataloader["train"])
    shuffle(dataloader["val"])

    dataloader_batched = {"train": [], "val": []}
    # Here I group my data to a given batch size
    for elem in dataloader["train"]:
        batch = []
        for i in range(batch_size):
            batch.append(elem)
        dataloader_batched["train"].append(batch)

    for elem in dataloader["val"]:
        batch = []
        for i in range(batch_size):
            batch.append(elem)
        dataloader_batched["val"].append(batch)

    return dataloader_batched
This is a snippet of my training method with batch size 1:
while epoch <= num_epochs:
    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            scheduler.step(3)
            model.train()  # Set model to training mode
        else:
            model.eval()   # Set model to evaluate mode

        running_loss = 0.0
        counter = 0

        # Iterate over data.
        for inputs, labels in dataloaders[phase]:
            counter += 1
            max_num = len(dataloaders[phase])

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            # track history if only in train
            with torch.set_grad_enabled(phase == 'train'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

            # statistics
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / dataset_sizes[phase]
If I execute this, I of course get the error:
for inputs, labels in dataloaders[phase]:
ValueError: not enough values to unpack (expected 2, got 1)
I understand why, because now I have a list of images and not only an input and label image as before. So I guessed I need a second for loop which iterates over these batches. So I tried this:
# Iterate over data.
for elem in dataloaders[phase]:
    for inputs, labels in elem:
        counter += 1
        max_num = len(dataloaders[phase])

        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward
        # track history if only in train
        with torch.set_grad_enabled(phase == 'train'):
            outputs = model(inputs)
            # _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            # backward + optimize only if in training phase
            if phase == 'train':
                loss.backward()
                optimizer.step()
But for me it looks like the optimization step (back-prop) is only applied to the last image of the batch. Is that true? And if so, how can I fix this? I guess if I indent the with block, then I again get a batch size 1 optimization.
Thanks in advance
But for me it looks like the optimization step (back-prop) is only applied to the last image of the batch.
It should not apply only to the last image; it should apply to the whole batch.
If you set bs=2, it should apply to the batch of two images.
The optimization step actually updates the params of your network. Backprop is a fancy name for PyTorch's autograd system, which computes the first-order gradients.
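As a rough sketch (not the only way, and the names simply follow the dataloader in the question), the per-sample tensors of one batch could be concatenated along dimension 0 before the forward pass, so a single backward()/step() covers the whole batch:
# Sketch only: build one batch tensor from the list of [input, label] pairs
# (each already unsqueezed to a leading batch dimension of 1 in the dataloader),
# then do one forward/backward/step per batch instead of per image.
# Assumes the rest of the training-loop variables (model, criterion, optimizer,
# device, phase) from the question.
for batch in dataloaders[phase]:
    inputs = torch.cat([pair[0] for pair in batch], dim=0).to(device)
    labels = torch.cat([pair[1] for pair in batch], dim=0).to(device)

    optimizer.zero_grad()
    with torch.set_grad_enabled(phase == 'train'):
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        if phase == 'train':
            loss.backward()   # gradients for the whole batch
            optimizer.step()  # one parameter update per batch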
When trying to train the CNN model, I came across the code shown below:
def train(n_epochs, loaders, model, optimizer, criterion):
    for epoch in range(1, n_epochs):
        train_loss = 0
        valid_loss = 0
        model.train()
        for i, (data, target) in enumerate(loaders['train']):
            # zero the parameter (weight) gradients
            optimizer.zero_grad()
            # forward pass to get outputs
            output = model(data)
            # calculate the loss
            loss = criterion(output, target)
            # backward pass to calculate the parameter gradients
            loss.backward()
            # update the parameters
            optimizer.step()
Can someone please tell me why the second for loop is used?
i.e., for i, (data, target) in enumerate(loaders['train']):
And why are optimizer.zero_grad() and optimizer.step() used?
torch.utils.data.DataLoader comes in handy when you need to prepare data batches (and perhaps shuffle them before every run).
data_train_loader = DataLoader(data_train, batch_size=64, shuffle=True)
In the above code, the first for loop iterates over the number of epochs, while the second loop iterates over the training dataset converted into batches via the above code. For example:
for batch_idx, samples in enumerate(data_train_loader):
    # samples will be a 64 x D dimensional tensor
    # batch_idx is each batch index
Learn more about torch.utils.data.DataLoader from here.
optimizer.zero_grad(): Before the backward pass, use the optimizer object to zero all of the gradients of the tensors it will update (which are the learnable weights of the model).
optimizer.step(): We generally use optimizer.step() to take the gradient descent step. Calling the step function on an Optimizer makes an update to its parameters.
Learn more about these from here.
The optimizer is first constructed with the model's parameters, like this (missing in your code):
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam takes betas rather than a momentum argument
This code
loss = criterion(output, target)
is used to calculate the loss of a single batch, where target is what you got from the tuple (data, target) and data is used as the input for the model, which gives us the output.
This step:
optimizer.zero_grad()
will zero all the gradients held by the optimizer, which is important so that gradients from the previous batch do not accumulate into the current one.
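Here is a tiny, self-contained demonstration (my own illustration, not from the question) of why this matters: gradients accumulate across backward() calls unless they are cleared.
import torch

w = torch.ones(3, requires_grad=True)

w.sum().backward()
print(w.grad)      # tensor([1., 1., 1.])

w.sum().backward()
print(w.grad)      # tensor([2., 2., 2.])  accumulated, not replaced

w.grad.zero_()     # optimizer.zero_grad() clears the gradients of all its parameters similarly
w.sum().backward()
print(w.grad)      # tensor([1., 1., 1.]) again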
The part
loss.backward()
calculates the gradients, and optimizer.step() updates our model weights and biases (parameters).
In PyTorch you typically use the DataLoader class to load the training and validation sets.
loaders['train']
is probably the full training set, which represents a single epoch.
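For completeness, a hypothetical example of how such a loaders dict is usually built (train_dataset and valid_dataset are assumed to be torch.utils.data.Dataset objects, not names from your code):
from torch.utils.data import DataLoader

loaders = {
    'train': DataLoader(train_dataset, batch_size=64, shuffle=True),   # shuffled every epoch
    'valid': DataLoader(valid_dataset, batch_size=64, shuffle=False),  # fixed order for evaluation
}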
I'm trying to work with the framework provided in the Stanford course cs231n, given the code below.
I can see the accuracy getting better and the net being trained; however, after the training process and checking the results on the validation set, how would I go about inputting one image into the model and seeing its prediction?
I have searched around and couldn't find a built-in predict function in TensorFlow like there is in Keras.
Initializing the net and its parameters
# clear old variables
tf.reset_default_graph()

# setup input (e.g. the data that changes every batch)
# The first dim is None, and gets set automatically based on batch size fed in
X = tf.placeholder(tf.float32, [None, 30, 30, 1])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)

def simple_model(X, y):
    # define our weights (e.g. init_two_layer_convnet)
    # setup variables
    Wconv1 = tf.get_variable("Wconv1", shape=[7, 7, 1, 32])  # Filter of size 7x7 with depth of 1. No. of filters is 32
    bconv1 = tf.get_variable("bconv1", shape=[32])
    W1 = tf.get_variable("W1", shape=[4608, 360])  # 4608 is 12x12x32, where 12x12 is the output of a 7x7 filter with stride 2 (VALID padding) on a 30x30 image.
    b1 = tf.get_variable("b1", shape=[360])

    # define our graph (e.g. two_layer_convnet)
    a1 = tf.nn.conv2d(X, Wconv1, strides=[1, 2, 2, 1], padding='VALID') + bconv1
    h1 = tf.nn.relu(a1)
    h1_flat = tf.reshape(h1, [-1, 4608])
    y_out = tf.matmul(h1_flat, W1) + b1
    return y_out

y_out = simple_model(X, y)

# define our loss
total_loss = tf.losses.hinge_loss(tf.one_hot(y, 360), logits=y_out)
mean_loss = tf.reduce_mean(total_loss)

# define our optimizer
optimizer = tf.train.AdamOptimizer(5e-4)  # select optimizer and set learning rate
train_step = optimizer.minimize(mean_loss)
The function for evaluating the model, whether for training or validation, and plotting the results:
def run_model(session, predict, loss_val, Xd, yd,
              epochs=1, batch_size=64, print_every=100,
              training=None, plot_losses=False):
    # Have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, 1), y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # shuffle indices
    train_indicies = np.arange(Xd.shape[0])
    np.random.shuffle(train_indicies)

    training_now = training is not None

    # setting up variables we want to compute and optimize
    # if we have a training function, add that to things we compute
    variables = [mean_loss, correct_prediction, accuracy]
    if training_now:
        variables[-1] = training

    # counter
    iter_cnt = 0
    for e in range(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        for i in range(int(math.ceil(Xd.shape[0] / batch_size))):
            # generate indices for the batch
            start_idx = (i * batch_size) % Xd.shape[0]
            idx = train_indicies[start_idx:start_idx + batch_size]

            # create a feed dictionary for this batch
            feed_dict = {X: Xd[idx, :],
                         y: yd[idx],
                         is_training: training_now}
            # get batch size
            actual_batch_size = yd[idx].shape[0]

            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            loss, corr, _ = session.run(variables, feed_dict=feed_dict)

            # aggregate performance stats
            losses.append(loss * actual_batch_size)
            correct += np.sum(corr)

            # print every now and then
            if training_now and (iter_cnt % print_every) == 0:
                print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"
                      .format(iter_cnt, loss, np.sum(corr) / actual_batch_size))
            iter_cnt += 1

        total_correct = correct / Xd.shape[0]
        total_loss = np.sum(losses) / Xd.shape[0]
        print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}"
              .format(total_loss, total_correct, e + 1))
        if plot_losses:
            plt.plot(losses)
            plt.grid(True)
            plt.title('Epoch {} Loss'.format(e + 1))
            plt.xlabel('minibatch number')
            plt.ylabel('minibatch loss')
            plt.show()
    return total_loss, total_correct
The function calls that train the model:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print('Training')
    run_model(sess, y_out, mean_loss, x_train, y_train, 1, 64, 100, train_step, True)
    print('Validation')
    run_model(sess, y_out, mean_loss, x_val, y_val, 1, 64)
You do not need to go far: you simply pass your new (test) feature matrix X_test into your network and perform a forward pass; the output layer is the prediction. So the code is something like this:
session.run(y_out, feed_dict={X: X_test})
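If you want the predicted class label rather than the raw logits, you can take the argmax over the output. A short sketch (X_new is a placeholder name for your test images, assumed to have shape [N, 30, 30, 1], and sess is the session from the training block above):
logits = sess.run(y_out, feed_dict={X: X_new})   # forward pass only, no training op
predictions = np.argmax(logits, axis=1)          # predicted class index for each image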