How can I solve the problem of accessing the BERT model - nlp

I am trying to access the Hugging Face AraBERT model, but this error appears:
`RuntimeError                              Traceback (most recent call last)
in
     28 lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)
     29 
---> 30 train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

3 frames
/usr/local/lib/python3.8/dist-packages/torch/serialization.py in __init__(self, name)
    285 class _open_zipfile_writer_file(_opener):
    286     def __init__(self, name) -> None:
--> 287         super(_open_zipfile_writer_file, self).__init__(torch._C.PyTorchFileWriter(str(name)))
    288 
    289     def __exit__(self, *args) -> None:

RuntimeError: Parent directory models/aubmindlab does not exist.`
I copied the model name and I pip-installed farasapy, but the error still occurs. How can I access the model?
Here is the function being called (train_bert):
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):
        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss. Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))
                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model = 'models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()
And here is the block that raises the error; it occurs on the last line:
`# Set all seeds to make reproducible results
set_seed(1)
# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)
net.to(device)
criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader) # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)
train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)`
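
A likely cause, judging from the traceback: the error does not come from loading AraBERT but from torch.save inside train_bert. path_to_model is built as 'models/{}_...'.format(bert_model, ...), and a Hugging Face model id such as "aubmindlab/..." contains a slash, so PyTorch tries to write into a models/aubmindlab/ directory that was never created. A minimal sketch of two possible workarounds, assuming bert_model holds the model id (the exact value "aubmindlab/bert-base-arabertv02" below is only a guess based on the error path):

import os

# Inside train_bert, just before saving:
path_to_model = 'models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
os.makedirs(os.path.dirname(path_to_model), exist_ok=True)  # creates models/aubmindlab/ if it is missing
torch.save(net_copy.state_dict(), path_to_model)

# Alternatively, keep everything in a flat models/ folder by stripping the slash
# from the model id before building the file name:
safe_name = bert_model.replace('/', '_')          # e.g. "aubmindlab_bert-base-arabertv02"
path_to_model = 'models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(safe_name, lr, round(best_loss, 5), best_ep)
torch.save(net_copy.state_dict(), path_to_model)

Either way the models/ folder (and any sub-folder implied by the name) has to exist before torch.save is called.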

Related

Find Training/Validation Accuracy & Loss of Faster-RCNN PyTorch model

I am trying to find the training/validation accuracy and loss of my model for each epoch as I train it to find the best epoch to use from now on. I appreciate that there is lots of information on this now but this topic is very new to me, and I find it very difficult to find the right answer for my situation.
I assume that I need to add in one or two bits to the train_one_epoch() and evaluate() functions in order to do this?
My model setup is:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.02, momentum=0.9, weight_decay=0.0001)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20,40], gamma=0.1)
And my training function is:
epochs = 50
for epoch in range(epochs):
    train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
    lr_scheduler.step()
    evaluate(model, val_data_loader, device=device)
    print("\n\n")
torch.save(model, f'./Models/trained_{ds}_model_Epoch{epochs}_LR0_02.pt')
I am using coco-like annotations, for example:
{'boxes': tensor([[316.9700, 242.5500, 464.1000, 442.1700], [ 39.2200, 172.6700, 169.8400, 430.9600]]), 'labels': tensor([2, 2]), 'image_id': tensor(1416), 'area': tensor([29370.1094, 33738.3789]), 'iscrowd': tensor([0, 0])}
The train_one_epoch and evaluate functions are from 'engine.py' from Torchvision.
It seems like using Tensorboard is a good tool to use, but I don't really know how to use it.
The engine.py is:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = f"Epoch: [{epoch}]"

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
The evaluate function is:
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter=" ")
    header = "Test:"

    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = list(img.to(device) for img in images)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(images)

        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
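
One way to get epoch-level numbers without modifying engine.py is to read the loss back from the MetricLogger that train_one_epoch returns, and to compute a validation loss by calling the model in training mode (torchvision detection models only return the loss dict in train mode) under torch.no_grad(). A minimal sketch, assuming the utils/engine helpers from the torchvision detection references and the val_data_loader, device and lr_scheduler from the question:

import torch

train_losses, val_losses = [], []

for epoch in range(epochs):
    metric_logger = train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
    train_losses.append(metric_logger.meters["loss"].global_avg)  # average training loss for this epoch

    # Detection models only return losses in train mode, so keep train()
    # but disable gradients for the validation pass.
    model.train()
    running_val = 0.0
    with torch.no_grad():
        for images, targets in val_data_loader:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            running_val += sum(loss for loss in loss_dict.values()).item()
    val_losses.append(running_val / len(val_data_loader))

    lr_scheduler.step()
    evaluate(model, val_data_loader, device=device)  # COCO mAP, as before

The two lists can then be plotted with matplotlib or logged to TensorBoard (torch.utils.tensorboard.SummaryWriter) to pick the best epoch.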

Strange loss curve while training EfficientNetV2 with Pytorch

I'm new to PyTorch. I use an architecture where a pre-trained EfficientNetV2 model is connected to a single fully connected layer with one neuron, followed by a ReLU activation, for a regression task. However, both the training and validation losses suddenly increase after the first epoch, stay at roughly the same value for about 50 epochs, and then suddenly drop back to roughly the value of the first epoch. Can anyone help me figure out what's happening?
Some codes for model and training process:
# hyper-parameters
image_size = 256
learning_rate = 1e-3
batch_size = 32
epochs = 60

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.net = models.efficientnet_v2_m(pretrained=True, weights='DEFAULT')
        self.net.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
        self.net.classifier = nn.Sequential(self.net.classifier, nn.ReLU())

    def forward(self, input):
        output = self.net(input)
        return output

model = Model()

# Define the loss function with Classification Cross-Entropy loss and an optimizer with Adam optimizer
loss_fn = nn.L1Loss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
# Function to test the model with the test dataset and print the accuracy for the test images
def testAccuracy():
    model.eval()
    loss = 0.0
    total = 0.0

    with torch.no_grad():
        for data in validation_loader:
            images, labels = data
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # print("The model test will be running on", device, "device")
            # get the inputs
            images = Variable(images.to(device))
            labels = Variable(labels.to(device))
            # run the model on the test set to predict labels
            outputs = model(images)
            # the label with the highest energy will be our prediction
            # print('outputs: ', outputs)
            # print('labels: ', labels)
            temp = loss_fn(outputs, labels.unsqueeze(1))
            loss += loss_fn(outputs, labels.unsqueeze(1)).item()
            total += 1

    # compute the accuracy over all test images
    mae = loss / total
    return mae
# Training function. We simply have to loop over our data iterator and feed the inputs to the network and optimize.
def train(num_epochs):
    best_accuracy = 0.0

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    train_loss_all = []
    val_loss_all = []

    for epoch in range(num_epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        total = 0

        for i, (images, labels) in tqdm(enumerate(train_loader, 0), total=len(train_loader)):
            # get the inputs
            images = Variable(images.to(device))
            labels = Variable(labels.to(device))

            # zero the parameter gradients
            optimizer.zero_grad()
            # predict classes using images from the training set
            outputs = model(images)
            # compute the loss based on model output and real labels
            loss = loss_fn(outputs, labels.unsqueeze(1))
            # backpropagate the loss
            loss.backward()
            # adjust parameters based on the calculated gradients
            optimizer.step()

            # Let's print statistics for every one batch
            running_loss += loss.item()  # extract the loss value
            total += 1

        train_loss = running_loss / total
        train_loss_all.append(train_loss)

        accuracy = testAccuracy()
        val_loss_all.append(accuracy)
        if accuracy > best_accuracy:
            saveModel()
            best_accuracy = accuracy

    history = {'train_loss': train_loss_all, 'val_loss': val_loss_all}
    return history
Loss curve:
[loss curve image]
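
One thing that stands out (an observation, not a confirmed diagnosis): the classifier ends in nn.ReLU(), so whenever the head's pre-activation output goes negative the prediction is clamped to exactly 0 and the gradient through the head dies, which can leave the L1 loss stuck at a flat value for many epochs. Also, testAccuracy() puts the model into eval() mode and nothing switches it back to train() before the next epoch. A minimal sketch of both changes, keeping the rest of the question's setup:

import torch.nn as nn
from torchvision import models

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.net = models.efficientnet_v2_m(weights='DEFAULT')
        # Plain linear output for regression: no ReLU on the final prediction.
        self.net.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)

    def forward(self, input):
        return self.net(input)

# ...and in the training loop, re-enable training mode after each validation pass:
# accuracy = testAccuracy()   # testAccuracy() calls model.eval()
# model.train()               # switch back before the next epoch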

Load batch to GPU problem in pytorch using BERT model

I have created an evaluation function. It takes as input the model and the validation data loader, and returns the validation accuracy, validation loss and weighted F1 score.
def evaluate(model, val_dataloader):
    """
    After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []
    f1_weighted = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

        # Calculate the f1 weighted score
        f1_metric = F1Score('weighted')
        f1_weighted = f1_metric(preds, b_labels)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    f1_weighted = np.mean(f1_weighted)

    return val_loss, val_accuracy, f1_weighted
The code for the F1 score can be found here:
Measuring F1 score for multiclass classification natively in PyTorch
Before the evaluation function there is a function which trains a BERT model and has the following signature:
train(model, train_dataloader, val_dataloader, epochs, evaluation).
Thus, if evaluation=True, the validation accuracy is reported at the end of each epoch.
The dataloaders are created the following way:
# Convert other data types to torch.Tensor
train_labels = torch.tensor(authors_train)
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
The dataloaders for the validation and test sets can be created in a similar way (see the sketch below).
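
For completeness, a minimal sketch of the validation dataloader built by analogy with the training one (authors_val, val_inputs and val_masks are assumed names; SequentialSampler is the usual choice since order does not matter for evaluation):

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

val_labels = torch.tensor(authors_val)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)  # no shuffling needed for validation
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)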
Update:
I changed the line
f1_weighted = f1_metric(preds, b_labels)
with this one
f1_weighted.append(f1_metric(preds, b_labels))
and now I have the following error
AttributeError Traceback (most recent call last)
<ipython-input-49-0e0f6d227c4f> in <module>()
1 set_seed(42) # Set seed for reproducibility
2 bert_classifier, optimizer, scheduler = initialize_model(epochs=4)
----> 3 train(bert_classifier, train_dataloader, val_dataloader, epochs=4, evaluation=True)
4
5 #1. 77.28
3 frames
<__array_function__ internals> in mean(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
168 ret = arr.dtype.type(ret / rcount)
169 else:
--> 170 ret = ret.dtype.type(ret / rcount)
171 else:
172 ret = ret / rcount
AttributeError: 'torch.dtype' object has no attribute 'type'
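
The AttributeError comes from calling np.mean on a list of PyTorch tensors: NumPy cannot handle the torch dtype of the elements. One way around it (a sketch, assuming the F1Score class from the linked answer returns a zero-dimensional tensor) is to store plain Python floats, or to average in torch instead of NumPy:

# Inside the batch loop: store a float, not a 0-dim tensor.
f1_weighted.append(f1_metric(preds, b_labels).item())

# ...then the existing averaging works unchanged:
f1_weighted = np.mean(f1_weighted)

# Alternatively, keep the tensors and average with torch instead of NumPy:
# f1_weighted = torch.stack(f1_weighted).mean().item()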

Expected more than 1 value per channel when training, got input size torch.Size([1, **])

I get an error when I use BatchNorm1d. Code:
##% first I set a model
class net(nn.Module):
    def __init__(self, max_len, feature_linear, rnn, input_size, hidden_size, output_dim, num__rnn_layers, bidirectional, batch_first=True, p=0.2):
        super(net, self).__init__()
        self.max_len = max_len
        self.feature_linear = feature_linear
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional == True else 1
        self.p = p
        self.batch_first = batch_first
        self.linear1 = nn.Linear(max_len, feature_linear)
        init.kaiming_normal_(self.linear1.weight, mode='fan_in')
        self.BN1 = BN(feature_linear)

    def forward(self, xb, seq_len_crt):
        rnn_input = torch.zeros(xb.shape[0], self.feature_linear, self.input_size)
        for i in range(self.input_size):
            out = self.linear1(xb[:, :, i])  # xb[:,:,i].shape: (1, 34), out.shape: (1, 100)
            out = F.relu(out)                # input: out.shape (1, 100), output: out.shape (1, 100)
            out = self.BN1(out)              # input: out.shape (1, 100), output: out.shape (1, 100)
        return y_hat.squeeze(-1)
##% make the model as a function and optimize it
input_size = 5
hidden_size = 32
output_dim = 1
num_rnn_layers = 2
bidirectional = True
rnn = nn.LSTM
batch_size = batch_size
feature_linear = 60
BN = nn.BatchNorm1d
model = net(max_len, feature_linear, rnn, input_size, hidden_size, output_dim, num_rnn_layers, bidirectional, p=0.1)
loss_func = nn.MSELoss(reduction='none')
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=0.01)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.05)
##% use this model to predict data
def predict(xb, model, seq_len):
    # xb's shape should be (batch_size, seq_len, n_features)
    if xb.ndim == 2:  # suitable for both ndarray and Tensor
        # add a {batch_size} dim
        xb = xb[None, ]
    if not isinstance(xb, torch.Tensor):
        xb = torch.Tensor(xb)
    return model(xb, seq_len)  # xb.shape (1, 34, 5)
##% create training/valid/test data
seq_len_train_iter = []
for i in range(0, len(seq_len_train), batch_size):
    if i + batch_size <= len(seq_len_train):
        seq_len_train_iter.append(seq_len_train[i:i+batch_size])
    else:
        seq_len_train_iter.append(seq_len_train[i:])

seq_len_valid_iter = []
for i in range(0, len(seq_len_valid), batch_size):
    if i + batch_size <= len(seq_len_valid):
        seq_len_valid_iter.append(seq_len_valid[i:i+batch_size])
    else:
        seq_len_valid_iter.append(seq_len_valid[i:])

seq_len_test_iter = []
for i in range(0, len(seq_len_test), batch_size):
    if i + batch_size <= len(seq_len_test):
        seq_len_test_iter.append(seq_len_test[i:i+batch_size])
    else:
        seq_len_test_iter.append(seq_len_test[i:])
##% fit model
def fit(epochs, model, loss_func, optimizer, train_dl, valid_dl, valid_ds, seq_len_train_iter, seq_len_valid_iter):
    train_loss_record = []
    valid_loss_record = []
    mean_pct_final = []
    mean_abs_final = []

    is_better = False
    last_epoch_abs_error = 0
    last_epoch_pct_error = 0
    mean_pct_final_train = []
    mean_abs_final_train = []

    for epoch in range(epochs):
        # seq_len_crt: current batch seq len
        for batches, ((xb, yb), seq_len_crt) in enumerate(zip(train_dl, seq_len_train_iter)):
            if isinstance(seq_len_crt, np.int64):
                seq_len_crt = [seq_len_crt]
            y_hat = model(xb, seq_len_crt)
            packed_yb = nn.utils.rnn.pack_padded_sequence(yb, seq_len_crt, batch_first=True, enforce_sorted=False)
            final_yb, input_sizes = nn.utils.rnn.pad_packed_sequence(packed_yb)
            final_yb = final_yb.permute(1, 0)
            # assert torch.all(torch.tensor(seq_len_crt).eq(input_sizes))
            loss = loss_func(y_hat, final_yb)
            batch_size_crt = final_yb.shape[0]
            loss = (loss.sum(-1) / input_sizes).sum() / batch_size_crt

            loss.backward()
            optimizer.step()
            # scheduler.step()
            optimizer.zero_grad()
            # print(i)

            with torch.no_grad():
                train_loss_record.append(loss.item())
                if batches % 50 == 0 and epoch % 1 == 0:
                    # print(f'Epoch {epoch}, batch {i} training loss: {loss.item()}')
                    y_hat = predict(xb[0], model, torch.tensor([seq_len_crt[0]])).detach().numpy().squeeze()  # xb[0].shape (34, 5)
                    label = yb[0][:len(y_hat)]
                    # plt.ion()
                    plt.plot(y_hat, label='predicted')
                    plt.plot(label, label='label')
                    plt.legend(loc='upper right')
                    plt.title('training mode')
                    plt.text(len(y_hat)+1, max(y_hat.max(), label.max()), f'Epoch {epoch}, batch {batches} training loss: {loss.item()}')
                    plt.show()

    return train_loss_record
But I get: Expected more than 1 value per channel when training, got input size torch.Size([1, 60])
The full error message is:
ValueError Traceback (most recent call last)
<ipython-input-119-fb062ad3f20e> in <module>
----> 1 fit(500, model, loss_func, optimizer, train_dl, valid_dl, valid_ds, seq_len_train_iter, seq_len_valid_iter)
<ipython-input-118-2eb946c379bf> in fit(epochs, model, loss_func, optimizer, train_dl, valid_dl, valid_ds, seq_len_train_iter, seq_len_valid_iter)
38 # print(f'Epoch {epoch}, batch {i} training loss: {loss.item()}')
39
---> 40 y_hat = predict(xb[0], model, torch.tensor([seq_len_crt[0]])).detach().numpy().squeeze() # xb[0].shape(34,5)
41 label = yb[0][:len(y_hat)]
42 # plt.ion()
<ipython-input-116-28afce77e325> in predict(xb, model, seq_len)
7 if not isinstance(xb, torch.Tensor):
8 xb = torch.Tensor(xb)
----> 9 return model(xb, seq_len) # xb.shape(None,34,5)
D:\Anaconda3\envs\LSTM\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
<ipython-input-114-3e9c30d20ed6> in forward(self, xb, seq_len_crt)
50 out = self.linear1(xb[:, :, i]) # xb[:,:,i].shape:(None,34), out.shape(None,100)
51 out = F.relu(out) # input: out.shape(None,100), output: out.shape(None,100)
---> 52 out = self.BN1(out) # input: out.shape(None,100), output: out.shape(None,100)
53
54 out = self.linear2(out)
D:\Anaconda3\envs\LSTM\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
D:\Anaconda3\envs\LSTM\lib\site-packages\torch\nn\modules\batchnorm.py in forward(self, input)
129 used for normalization (i.e. in eval mode when buffers are not None).
130 """
--> 131 return F.batch_norm(
132 input,
133 # If buffers are not to be tracked, ensure that they won't be updated
D:\Anaconda3\envs\LSTM\lib\site-packages\torch\nn\functional.py in batch_norm(input, running_mean, running_var, weight, bias, training, momentum, eps)
2052 bias=bias, training=training, momentum=momentum, eps=eps)
2053 if training:
-> 2054 _verify_batch_size(input.size())
2055
2056 return torch.batch_norm(
D:\Anaconda3\envs\LSTM\lib\site-packages\torch\nn\functional.py in _verify_batch_size(size)
2035 size_prods *= size[i + 2]
2036 if size_prods == 1:
-> 2037 raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
2038
2039
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 60])
I have checked and found that at out = self.BN1(out), out.shape = (1, 60); it seems that batch_size=1 is not permitted in BatchNorm1d, but I don't know how to work around it.
What does BatchNorm1d do mathematically?
Try to write down the equation for the case of batch_size=1 and you'll understand why PyTorch is angry with you (there is a small sketch after this answer).
How to solve it?
It is simple: BatchNorm has two "modes of operation": one is for training where it estimates the current batch's mean and variance (this is why you must have batch_size>1 for training).
The other "mode" is for evaluation: it uses accumulated mean and variance to normalize new inputs without re-estimating the mean and variance. In this mode there is no problem processing samples one by one.
When evaluating your model use model.eval() before and model.train() after.
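
To make the two modes concrete, here is a tiny sketch (independent of the question's model) showing that BatchNorm1d rejects a single sample in training mode but accepts it in eval mode. In training it normalizes each channel by the batch statistics, (x - mean_batch) / sqrt(var_batch + eps), which are meaningless when the batch has only one value per channel:

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(60)
x = torch.randn(1, 60)           # a single sample, 60 channels

bn.train()
try:
    bn(x)                        # training mode: needs batch statistics
except ValueError as e:
    print(e)                     # "Expected more than 1 value per channel when training, ..."

bn.eval()
print(bn(x).shape)               # eval mode: uses the running mean/var, works fine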
I ran into this problem when I loaded a model and started testing. Add model.eval() before you feed in your data; this solves the problem.
If you are using the DataLoader class, sometimes the last batch in an epoch will have only a single training example (imagine a training set of 33 examples with a batch size of 32). This can trigger the error if the network is in training mode and a batch norm layer is present.
Set the drop_last argument in the DataLoader to True like:
from torch.utils.data import DataLoader
...
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
to discard the last incomplete batch in each epoch.

PyTorch Getting Custom Loss Function Running

I'm trying to use a custom loss function by extending nn.Module, but I can't get past the error
element 0 of variables does not require grad and does not have a grad_fn
Note: my labels are lists of size: num_samples, but each batch will have the same labels throughout the batch, so we shrink labels for the whole batch to be a single label by calling .diag()
My code is as follows and is based on the transfer learning tutorial:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data
                inputs = inputs.float()

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs = Variable(inputs)
                    labels = Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                outputs = model(inputs)
                #outputs = nn.functional.sigmoid(outputs).round()
                _, preds = torch.max(outputs, 1)
                label = labels.diag().float()
                preds = preds.float()
                loss = criterion(preds, label)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.data[0] * inputs.size(0)
                running_corrects += torch.sum(pred == label.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
and my loss function is defined below:
class CustLoss(nn.Module):
    def __init__(self):
        super(CustLoss, self).__init__()

    def forward(self, outputs, labels):
        return cust_loss(outputs, labels)

def cust_loss(pred, targets):
    '''preds are arrays of size classes with floats in them'''
    '''targets are arrays of all the classes from the batch'''
    '''we sum the classes from the batch and find the num correct'''
    r = torch.sum(pred == targets)
    return r
Then I run the following to run the model:
model_ft = models.resnet18(pretrained=True)
for param in model_ft.parameters():
    param.requires_grad = False
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 3)
if use_gpu:
model_ft = model_ft.cuda()
criterion = CustLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.fc.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,num_epochs=25)
I tried getting it to work with other loss functions to no avail. I always get the same error when loss.backward() is called.
It was my understanding that I wouldn't need a custom implementation of loss.backward if I extend nn.Module.
You are subclassing nn.Module to define a function, in your case a loss function. So, when you compute loss.backward(), it tries to store the gradients in the loss itself rather than in the model, and there is no variable in the loss in which to store the gradients. Your loss needs to be a function and not a module. See Extending autograd.
You have two options here -
The easiest one is to directly pass cust_loss function as criterion parameter to train_model.
You can extend torch.autograd.Function to define the custom loss (and if you wish, the backward function as well).
P.S. - It is mentioned that you need to implement the backward of custom loss functions. This is not always the case: it is required only when your loss function is non-differentiable at some point. But I do not think you will need to do that.
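
For reference, a minimal sketch of the second option, extending torch.autograd.Function with an explicit backward. The loss below is an illustrative L1-style loss, not the counting loss from the question (which compares hard predictions and therefore has no useful gradient):

import torch

class CustLossFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, pred, target):
        ctx.save_for_backward(pred, target)
        return (pred - target).abs().mean()

    @staticmethod
    def backward(ctx, grad_output):
        pred, target = ctx.saved_tensors
        # gradient of mean(|pred - target|) with respect to pred
        grad_pred = torch.sign(pred - target) / pred.numel()
        return grad_output * grad_pred, None  # no gradient w.r.t. target

# usage: criterion = CustLossFn.apply; loss = criterion(outputs, labels)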
