Bigger batch size improves training by too much - pytorch

I am writing a classifier that takes a surname and predicts a language it belongs to. I found that small batch sizes (256 and less) perform poorly compared to big batch sizes (2048 and more). Could someone give me some insight on why this is happening and how to fix it? Thank you.
Training code:
def indices_to_packed(names, input_size):
names = [F.one_hot(item, input_size).float() for item in names]
names_packed = pack_sequence(names, enforce_sorted=False)
return names_packed
def infer(model, data, labels, lengths, device):
data_packed = indices_to_packed(data, model.rnn.input_size)
data_packed, labels, lengths = data_packed.to(device), labels.to(device), lengths.to(device)
preds = model(data_packed, lengths)
loss = loss_fn(preds, labels)
return loss, preds
results = {}
epochs = 100
for BATCH_SIZE in [4096, 2048, 256]:
train_loader = data.DataLoader(train_data, BATCH_SIZE, sampler=train_sampler, collate_fn=partial(my_collate, input_size=input_size, output_size=output_size))
val_loader = data.DataLoader(val_data, BATCH_SIZE, sampler=val_sampler, collate_fn=partial(my_collate, input_size=input_size, output_size=output_size))
model = LSTM(input_size, HIDDEN_SIZE, NUM_LAYERS, DROPOUT, output_size)
optimizer = torch.optim.Adam(model.parameters())
model.to(device)
train_losses = []
val_losses = []
cur_losses = {}
duration = 0
for epoch in range(epochs):
start = time.time()
train_loss = 0
model.train()
# Using PackedSequence
for names, langs, lengths in train_loader:
optimizer.zero_grad()
loss, _ = infer(model, names, langs, lengths, device)
loss.backward()
optimizer.step()
train_loss += loss
train_loss /= len(train_data)
train_losses.append(train_loss.cpu().detach().numpy())
model.eval()
val_loss = 0
with torch.no_grad():
for names, langs, lengths in val_loader:
loss, _ = infer(model, names, langs, lengths, device)
val_loss += loss
val_loss /= len(val_data)
val_losses.append(val_loss.cpu().detach().numpy())
cur_duration = time.time() - start
duration += cur_duration
log_line = (f"BATCH_SIZE: {BATCH_SIZE} epoch: {epoch} train loss: "
f"{train_loss:.5f} val loss: {val_loss:.5f}")
print(log_line)
cur_losses["train_losses"] = train_losses
cur_losses["val_losses"] = val_losses
results[BATCH_SIZE] = {"losses" : cur_losses, "duration" : duration, "model": model}
Model:
class LSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, dropout, output_size):
super().__init__()
self.rnn = nn.LSTM(input_size, hidden_size, num_layers, dropout=DROPOUT)
self.linear = nn.Linear(hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, x, lengths):
lstm_out, _ = self.rnn(x)
# https://discuss.pytorch.org/t/get-each-sequences-last-item-from-packed-sequence/41118/7
sum_batch_sizes = torch.cat((
torch.zeros(2, dtype=torch.int64),
torch.cumsum(lstm_out.batch_sizes, 0)
))
sorted_lengths = lengths[lstm_out.sorted_indices]
last_seq_idxs = sum_batch_sizes[sorted_lengths] + torch.arange(lengths.size(0))
last_seq_items = lstm_out.data[last_seq_idxs]
lstm_last_out = last_seq_items[lstm_out.unsorted_indices]
linear_out = self.linear(lstm_last_out)
softmax_out = self.softmax(linear_out)
return softmax_out
Losses with different batch sizes:

It looks like there issue is how the loss is calculated.
train_loss += loss line accumulates the loss. When batch size is higher, there will be fewer steps to do. The code normalizes this by dividing by the length of train data, train_loss /= len(train_data), but should probably take into account the batch size: train_loss /= (len(train_data) / BATCH_SIZE).
The same for validation loss, but the effect is different probably because of smaller data size compared to training data.

Related

Find Training/Validation Accuracy & Loss of Faster-RCNN PyTorch model

I am trying to find the training/validation accuracy and loss of my model for each epoch as I train it to find the best epoch to use from now on. I appreciate that there is lots of information on this now but this topic is very new to me, and I find it very difficult to find the right answer for my situation.
I assume that I need to add in one or two bits to the train_one_epoch() and evaluate() functions in order to do this?
My model setup is:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.02, momentum=0.9, weight_decay=0.0001)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20,40], gamma=0.1)
And my training function is:
epochs = 50
for epoch in range(epochs):
train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
lr_scheduler.step()
evaluate(model, val_data_loader, device=device)
print("\n\n")
torch.save(model, f'./Models/trained_{ds}_model_Epoch{epochs}_LR0_02.pt')
I am using coco-like annotations, for example:
{'boxes': tensor([[316.9700, 242.5500, 464.1000, 442.1700], [ 39.2200, 172.6700, 169.8400, 430.9600]]), 'labels': tensor([2, 2]), 'image_id': tensor(1416), 'area': tensor([29370.1094, 33738.3789]), 'iscrowd': tensor([0, 0])}
The train_one_epoch and evaluate functions are from 'engine.py' from Torchvision.
It seems like using Tensorboard is a good tool to use, but I don't really know how to use it.
The engine.py is:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None):
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
header = f"Epoch: [{epoch}]"
lr_scheduler = None
if epoch == 0:
warmup_factor = 1.0 / 1000
warmup_iters = min(1000, len(data_loader) - 1)
lr_scheduler = torch.optim.lr_scheduler.LinearLR(
optimizer, start_factor=warmup_factor, total_iters=warmup_iters
)
for images, targets in metric_logger.log_every(data_loader, print_freq, header):
images = list(image.to(device) for image in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
with torch.cuda.amp.autocast(enabled=scaler is not None):
loss_dict = model(images, targets)
losses = sum(loss for loss in loss_dict.values())
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
if not math.isfinite(loss_value):
print(f"Loss is {loss_value}, stopping training")
print(loss_dict_reduced)
sys.exit(1)
optimizer.zero_grad()
if scaler is not None:
scaler.scale(losses).backward()
scaler.step(optimizer)
scaler.update()
else:
losses.backward()
optimizer.step()
if lr_scheduler is not None:
lr_scheduler.step()
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
return metric_logger
The evaluate function is:
def evaluate(model, data_loader, device):
n_threads = torch.get_num_threads()
# FIXME remove this and make paste_masks_in_image run on the GPU
torch.set_num_threads(1)
cpu_device = torch.device("cpu")
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = "Test:"
coco = get_coco_api_from_dataset(data_loader.dataset)
iou_types = _get_iou_types(model)
coco_evaluator = CocoEvaluator(coco, iou_types)
for images, targets in metric_logger.log_every(data_loader, 100, header):
images = list(img.to(device) for img in images)
if torch.cuda.is_available():
torch.cuda.synchronize()
model_time = time.time()
outputs = model(images)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
evaluator_time = time.time()
coco_evaluator.update(res)
evaluator_time = time.time() - evaluator_time
metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print("Averaged stats:", metric_logger)
coco_evaluator.synchronize_between_processes()
# accumulate predictions from all images
coco_evaluator.accumulate()
coco_evaluator.summarize()
torch.set_num_threads(n_threads)
return coco_evaluator

mini-batch gradient decent bad accuracy/loss

I’m trying mini-batch gradient descent on the popular iris dataset, but somehow I don’t manage to get the accuracy of the model above 75-80%. Also, I’m not certain if I’m calculating the loss as well as the accuracy correctly. Any suggestions on how to improve my code or mistakes I’m doing are appreciated.
batch_size = 10
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
Training loop:
n_iters = 1000
steps = n_iters/10
LOSS = []
for epochs in range(n_iters):
for i,(inputs, labels) in enumerate(train_loader):
out = model(inputs)
train_labels = transform_label(labels)
l = loss(out, train_labels)
l.backward()
#update weights
optim.step()
optim.zero_grad()
LOSS.append(l.item())
if epochs%steps == 0:
print(f"\n epoch: {int(epochs+steps)}/{n_iters}, loss: {sum(LOSS)/len(LOSS)}")
#if i % 1 == 0:
#print(f" steps: {i+1}, loss : {l.item()}")
claculate accuracy:
def accuracy(model,test_loader):
sum_acc= 0
#map labels with 0,1,2
def transform_label(label_data):
data = []
for i in label_data:
if i == "Iris-setosa":
data.append(torch.tensor([0]))
if i == "Iris-versicolor":
data.append(torch.tensor([1]))
if i == "Iris-virginica":
data.append(torch.tensor([2]))
return torch.stack(data)
for i,(X_test, test_labels) in enumerate(test_loader):
test_labels = transform_label(test_labels)
x_label_pre = model(X_test)
_, x_label_pre_hat = torch.max(x_label_pre, 1)
idx = 0
number_pred = 0
while idx < len(X_test):
if x_label_pre_hat[idx].item() == test_labels[idx].item():
number_correct += 1
idx +=1
lr = 0.01
model = NeuralNetwork()
optim = torch.optim.Adam(model.parameters(), lr=lr)
#optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
loss = torch.nn.CrossEntropyLoss()
#loss = torch.nn.MSELoss()
#Weights are by default torch.32 not 64 --> error message
class NeuralNetwork(nn.Module):
def __init__(self):
super().__init__()
self.linear_stack = nn.Sequential(
nn.Linear(4,128),
nn.ReLU(),
nn.Linear(128,64),
nn.ReLU(),
nn.Linear(64,3),
)
def forward(self, x):
logits = self.linear_stack(x)
return logits

PyTorch text classification model not improving

I am training a simple LSTM model for binary text classification. Here is the model class:
class LSTM(nn.Module):
def __init__(self, vocabulary_size, embeddings_size, num_classes):
super(LSTM, self).__init__()
self.vocabulary_size = vocabulary_size
self.embeddings_size = embeddings_size
self.embedding = nn.Embedding(num_embeddings=vocabulary_size,
embedding_dim=embeddings_size,
padding_idx=0)
self.lstm = nn.LSTM(input_size=embeddings_size,
hidden_size=128,
num_layers=1,
batch_first=True)
self.fc = nn.Linear(in_features=128,
out_features=num_classes)
def forward(self, x):
out = self.embedding(x)
out, _ = self.lstm(out)
out = out[:, -1]
out = self.fc(out)
out = torch.sigmoid(out)
return out
I am using BCELoss and Adam optimizer created with the following code:
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=learning_rate)
This is the training loop that I am using:
train_steps = len(train_data_loader)
for epoch in range(epochs):
train_loss = 0
model.train()
for i, (sequences, labels) in enumerate(train_data_loader):
optimizer.zero_grad()
sequences = sequences.to(device)
labels = labels.to(device)
outputs = model(sequences)
loss = criterion(outputs, labels)
train_loss += loss.item()
loss.backward()
optimizer.step()
print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss / train_steps:.4f}')
I have experimented with different datasets, number of epochs, learning rate, batch size. However, the model does not seem to learn - the loss is always around 0.7 and only the 0 class is predicted.
Does anyone know what the issue could be?

Almost non-existent training accuracy and low test accuracy

I am really new to Machine Learning and I am not so well versed in coding in general. However there is need to look through the customers feedback at our store, that average quite a lot each year, yet we cannot tell % of positive, negative and neutral.
Currently I am trying to train a Bert Model to do simple multi labeled sentiment analysis. The input is our store's customers feedback. The customers feedback is not always so clearly defined since customers do tend to tell long and long about their experience and their sentiment is not always so clear. However we managed to get positive, negative and neutral, each set 2247 samples.
But when I try to train it the training accuracy is around 0.4% which is super low. Validation score is around 60%. F1-score is around 60% for each of the label. I wonder what can be done to improve this training accuracy. I have been stuck for a while. Please take a look at my codes and help me out with this.
I have tried changing learning rate (tried all learning rate Bert suggested and 1e-5),changing Max LEN, changing amount of EPOCH, changing drop out rate (0.1, 0.2, 0.3, 0.4, 0.5), but so far nothing yielded results.
#read dataset
df = pd.read_csv("data.csv",header=None, names=['content', 'sentiment'], sep='\;', lineterminator='\r',encoding = "ISO-8859-1",engine="python")
from sklearn.utils import shuffle
df = shuffle(df)
df['sentiment'] = df['sentiment'].replace(to_replace = [-1, 0, 1], value = [0, 1, 2])
df.head()
#Load pretrained FinBert model and get bert tokenizer from it
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#Choose sequence Length
token_lens = []
for txt in df.content:
tokens = tokenizer.encode(txt, max_length=512)
token_lens.append(len(tokens))
sns.distplot(token_lens)
plt.xlim([0, 256]);
plt.xlabel('Token count');
MAX_LEN = 260
#Make a PyTorch dataset
class FIDataset(Dataset):
def __init__(self, texts, targets, tokenizer, max_len):
self.texts = texts
self.targets = targets
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, item):
text = str(self.texts[item])
target = self.targets[item]
encoding = self.tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=self.max_len,
return_token_type_ids=False,
pad_to_max_length=True,
return_attention_mask=True,
return_tensors='pt',
)
return {
'text': text,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'targets': torch.tensor(target, dtype=torch.long)
}
#split test and train
df_train, df_test = train_test_split(
df,
test_size=0.1,
random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
df_test,
test_size=0.5,
random_state=RANDOM_SEED
)
df_train.shape, df_val.shape, df_test.shape
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
ds = FIDataset(
texts=df.content.to_numpy(),
targets=df.sentiment.to_numpy(),
tokenizer=tokenizer,
max_len=max_len
)
return DataLoader(
ds,
batch_size=batch_size,
num_workers=4
)
#Load data into train, test, val
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.1)
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
def forward(self, input_ids, attention_mask):
returned = self.bert(
input_ids=input_ids,
attention_mask=attention_mask
)
pooled_output = returned["pooler_output"]
output = self.drop(pooled_output)
return self.out(output)
#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr= 2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(
model,
data_loader,
loss_fn,
optimizer,
device,
scheduler,
n_examples
):
model = model.train()
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
return correct_predictions.double() / n_examples, np.mean(losses)
#Eval model function
def eval_model(model, data_loader, loss_fn, device, n_examples):
model = model.eval()
losses = []
correct_predictions = 0
with torch.no_grad():
torch.cuda.empty_cache()
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
return correct_predictions.double() / n_examples, np.mean(losses)
#training loop through each epochs
import torch
torch.cuda.empty_cache()
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
for epoch in range(EPOCHS):
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(
model,
train_data_loader,
loss_fn,
optimizer,
device,
scheduler,
len(df_train)
)
print(f'Train loss {train_loss} accuracy {train_acc}')
val_acc, val_loss = eval_model(
model,
val_data_loader,
loss_fn,
device,
len(df_val)
)
print(f'Val loss {val_loss} accuracy {val_acc}')
print()
history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)
if val_acc > best_accuracy:
torch.save(model.state_dict(), 'best_model_state.bin')
best_accuracy = val_acc
-- Edit: I have printed out preds and targets as well as train and val accuracy
Here _, preds = torch.max(outputs, dim=1), you probably want argmax, not max?
Print out preds and targets to better see what's going on.
Edit after preds and targets printed out. For epochs 4 and 5, preds matches targets exactly, so train accuracy should be 1. I think the issue is that the accuracy is divided by n_examples, which is a number of examples in the whole train dataset, while it should be divided by the number of examples in the epoch.

Training loss is not changing at all while training model

I’m trying to solve a VQA classification problem. my training loss is not changing at all while training the model.
I put in comment the CNN model and try to run it with the text only, but still, no change in loss value.
I pass through those models:
class question_lstm(nn.Module):
def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, output_dim, que_size):
super(question_lstm, self).__init__()
self.hid_dim = hid_dim
self.n_layers = n_layers
self.embedding = nn.Embedding(input_dim, emb_dim)
self.tanh = nn.Tanh()
self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
self.dropout = nn.Dropout(dropout)
#self.fc1=nn.Linear(n_layers*hid_dim,que_size)
self.fc1=nn.Linear(n_layers*output_dim,que_size)
def forward(self, question):
emb_question=self.embedding(question) #(batchsize, input_dim, emb_dim=256)
emb_question=self.dropout(emb_question)
emb_question=self.tanh(emb_question)
emb_question = emb_question.transpose(0, 1) #(input_dim, batchsize, emb_dim)
output, (hidden, cell) = self.lstm(emb_question)
qu_feature = torch.cat((hidden, cell), dim=2)
qu_feature = qu_feature.transpose(0, 1) #(batchsize=100, num_layer=2, hid_dim=2048)
question_output =self.fc1(qu_feature)
return question_output
class vqamodel(nn.Module):
def __init__(self, output_dim,input_dim, emb_dim, hid_dim, n_layers, dropout, answer_len, que_size,):
super(vqamodel,self).__init__()
#self.image=img_CNN(img_size,image_feature)
self.question=question_lstm(input_dim, emb_dim, hid_dim, n_layers, dropout,output_dim,que_size)
self.tanh=nn.Tanh()
self.relu=nn.ReLU()
self.dropout=nn.Dropout(dropout)
self.fc1=nn.Linear(que_size,output_dim)
self.fc2=nn.Linear(output_dim,answer_len)
def forward(self, image, question):
question_emb=self.question(question)
combine =question_emb #*img_emb
out_feature=self.fc1(combine) #(batchsize=100, output_dim=2048)
out_feature=self.relu(out_feature)
out_feature=self.dropout(out_feature)
out_feature=self.fc2(out_feature) #(batchsize=100, answer_len=1000)
return (out_feature)
I’m using cross entropy loss and Adam:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vqa_model.parameters(),lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
any idea what can cause this constant loss value?
the train loop:
def train(model,criterion,optimizer,scheduler):
start_time = time.time() #the time we start the train
for epoch in range(num_epochs):
train_loss = 0
#test_loss = 0
train_correct = 0
#test_correct = 0
vqa_model.train()
for i,sample in enumerate(train_VQAdataset_loader):
#image = sample['image'].to(device=device)
question = sample['question'].to(torch.int64).to(device=device)
label = sample['answer'].to(device=device)
output = vqa_model(image, question) # forward
loss = criterion(output, label)
optimizer.zero_grad() # Zero the gradients
loss.backward() # backprop
optimizer.step() # Update weights
scheduler.step()
# Statitcs
train_loss += loss.item() # save the loss for the entire epoch
_, predictions = torch.max(output, 1)
train_correct += (predictions == label).sum() #number of success - cumulative
train_losses.append(train_loss / len(train_VQAdataset_loader))

Resources