Training with PyTorch: error due to CUDA memory issue

I am trying to train a model on the Cityscapes dataset for segmentation. I use torchvision's deeplabv3_resnet50 model along with its Cityscapes dataset class and transforms. In case it matters, I am running the code in a Jupyter notebook.
The datasets are working, as are the dataloaders. When I attempt to train, I always get this error at the point where the first batch is passed through the network (y_ = net(xb) in the one_epoch function).
RuntimeError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 6.00 GiB total capacity; 4.20 GiB already allocated; 6.87 MiB free; 4.20 GiB reserved in total by PyTorch)
What is strange is that no matter what the batch size (bs) is, the amount of free memory reported in the error is a little less than the amount of memory that PyTorch is trying to allocate, e.g. for bs=16 I get:
RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 6.00 GiB total capacity; 2.90 GiB already allocated; 1.70 GiB free; 2.92 GiB reserved in total by PyTorch)
I have a much more complicated model running that works with bs=16. That model builds everything from scratch. But I really want to be able to use the simplicity that torchvision seems to offer with its model zoo and datasets.
My code is below, not much more than the bare essentials, just enough to show whether it runs OK on the GPU.
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms
from tqdm import tqdm, trange

def one_epoch(net, loss, dl, opt=None, metric=None):
    if opt:
        net.train()  # only affects some layers
    else:
        net.eval()
        rq_stored = []
        for p in net.parameters():
            rq_stored.append(p.requires_grad)
            p.requires_grad = False

    L, M = [], []
    dl_it = iter(dl)
    for xb, yb in tqdm(dl_it, leave=False):
        xb, yb = xb.cuda(), yb.cuda()
        y_ = net(xb)
        l = loss(y_, yb)
        if opt:
            opt.zero_grad()
            l.backward()
            opt.step()
        L.append(l.detach().cpu().numpy())
        if metric: M.append(metric(y_, yb).cpu().numpy())

    if not opt:
        for p, rq in zip(net.parameters(), rq_stored): p.requires_grad = rq
    return L, M

accuracy = lambda y_, yb: (y_.max(dim=1)[1] == yb).float().mean()

def fit(net, tr_dl, val_dl, loss=nn.CrossEntropyLoss(), epochs=3, lr=3e-3, wd=1e-3):
    opt = optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
    Ltr_hist, Lval_hist = [], []
    for epoch in trange(epochs):
        Ltr, _ = one_epoch(net, loss, tr_dl, opt)
        Lval, Aval = one_epoch(net, loss, val_dl, None, accuracy)
        Ltr_hist.append(np.mean(Ltr))
        Lval_hist.append(np.mean(Lval))
        print(f'epoch: {epoch+1}\ttraining loss: {np.mean(Ltr):0.4f}\tvalidation loss: {np.mean(Lval):0.4f}\tvalidation accuracy: {np.mean(Aval):0.2f}')
    return Ltr_hist, Lval_hist

class To3ch(object):
    def __call__(self, pic):
        if pic.shape[0] == 1: pic = pic.repeat(3, 1, 1)
        return pic

bs = 1
imagenet_stats = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

transf = transforms.Compose([
    transforms.ToTensor(),
    To3ch(),
    transforms.Normalize(*imagenet_stats)
])

train_ds = datasets.Cityscapes('C:/cityscapes_ds', split='train', target_type='semantic', transform=transf, target_transform=transf)
val_ds = datasets.Cityscapes('C:/cityscapes_ds', split='val', target_type='semantic', transform=transf, target_transform=transf)

train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=2*bs, shuffle=False, num_workers=0)

net = models.segmentation.deeplabv3_resnet50(num_classes=20)
fit(net.cuda(), train_dl, val_dl, loss=nn.CrossEntropyLoss(), epochs=1, lr=1e-4, wd=1e-4)

You didn't specify, but if you're using the original Cityscapes, this OOM is completely expected.
The original Cityscapes dataset has large images (something like 1024x2048, IIRC), and it looks like you have a 6 GB GPU. FYI, I cannot fit batch_size=2 on a 12 GB GPU with inputs of this size.
When training DeepLab models, it is common to apply transformations to the input (e.g., random crops, resizing, scaling, etc.), and it looks like you don't apply any.
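For example, a random crop can be applied jointly to the image and the mask. A minimal sketch, assuming a 512x512 crop size; the helper name joint_random_crop is illustrative and not part of your code:
import torchvision.transforms.functional as TF
from torchvision import transforms

def joint_random_crop(img, mask, size=(512, 512)):
    # Sample one crop window and apply it to both image and mask so the labels stay aligned.
    i, j, h, w = transforms.RandomCrop.get_params(img, output_size=size)
    return TF.crop(img, i, j, h, w), TF.crop(mask, i, j, h, w)
A helper like this would typically be wired into the dataset so the image and its label are cropped with the same window before normalization.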
When you say:
I have a much more complicated model running that works with bs=16.
Perhaps you're looking at a different kind of complexity, something that has less impact on memory requirements than you think.

Related

Huggingface Longformer memory issues

I am building a Huggingface Longformer-based classifier. My main code is below:
import torch
from torch import nn
from transformers import (LongformerForSequenceClassification, LongformerTokenizerFast,
                          Trainer, TrainingArguments)

model = LongformerForSequenceClassification.from_pretrained('/mnt/longformer_official/',
                                                            gradient_checkpointing=False,
                                                            attention_window=512)
tokenizer = LongformerTokenizerFast.from_pretrained('/mnt/longformer_official/', max_length=4000)

train_df_tuning_dataset_tokenized = train_df_tuning_dataset.map(tokenization, batched=True, batch_size=len(train_df_tuning_dataset))

training_args = TrainingArguments(
    output_dir="xyz",
    num_train_epochs=5,              # changed this from 5
    per_device_train_batch_size=4,   # 4, 8; adding on 18 march from huggingface example notebook
    gradient_accumulation_steps=16,  # 16, 8; adding it back 18 march even though missing in huggingface example notebook as otherwise memory issues
    per_device_eval_batch_size=16,   # 16
    evaluation_strategy="epoch",
    save_strategy="epoch",           # adding on 18 march from huggingface example notebook
    learning_rate=2e-5,              # adding on 18 march from huggingface example notebook
    load_best_model_at_end=True,
    greater_is_better=False,
    disable_tqdm=False,
    weight_decay=0.01,
    optim="adamw_torch",             # removing on 18 march from huggingface example notebook
    run_name='longformer-classification-16March2022'
)

# class weights
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 0.5243])).to(device)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)).to(device)
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_df_tuning_dataset_tokenized,
    eval_dataset=val_dataset_tokenized
)
When I try max_length=1500 in the tokenizer, the code runs fine. It fails when run with max_length=4000.
I even tried setting these parameters as per_device_train_batch_size = 1, gradient_accumulation_steps = 1, per_device_eval_batch_size = 1.
My questions:
1. Is it okay to set per_device_train_batch_size = 1, gradient_accumulation_steps = 1, per_device_eval_batch_size = 1?
2. The error that I get is below. Is there any way around it other than getting more memory?
RuntimeError: CUDA out of memory. Tried to allocate 720.00 MiB (GPU 0; 14.76 GiB total capacity; 12.77 GiB already allocated; 111.75 MiB free; 13.69 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Try setting
gradient_accumulation_steps = int(math.ceil(len(tr_inputs) / per_device_train_batch_size) / 1) * epochs
since gradient_accumulation_steps should be derived from the number of epochs and the batch size.
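A literal sketch of that computation, using the values from the question; tr_inputs is just a stand-in for the tokenized training set, so treat this as illustration rather than a confirmed fix:
import math

epochs = 5                        # num_train_epochs from the question
per_device_train_batch_size = 4   # from the question
tr_inputs = train_df_tuning_dataset_tokenized  # placeholder: the tokenized training dataset

gradient_accumulation_steps = int(math.ceil(len(tr_inputs) / per_device_train_batch_size) / 1) * epochs
# then pass gradient_accumulation_steps into TrainingArguments as in the question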

Running out of Memory Training Google Big Bird with Huggingface

I've been struggling to train Google's Big Bird model using the Huggingface transformers library due to out-of-memory errors. I have two Tesla V100 GPUs with 32 GB RAM each. I'm trying to train the google/bigbird-roberta-base model (https://huggingface.co/google/bigbird-roberta-base) on Spider (a natural-language-to-SQL dataset) using the Huggingface trainer API. I'm using a batch size of 1 and the smallest version of this model, and still get OOM errors. According to the Big Bird paper (https://arxiv.org/abs/2007.14062), Big Bird can be trained on chips with 16 GB of memory, so I'm not sure why I'm running into OOM. Has anyone encountered trouble training Big Bird due to memory problems?
Here's the code that does the training:
import datasets
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

rouge = datasets.load_metric("rouge")

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir="./",
    logging_steps=2,
    save_steps=400,
    eval_steps=4
)

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid
    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data
)

trainer.train()
Here's the exact error I get:
RuntimeError: CUDA out of memory. Tried to allocate 36.00 MiB (GPU 0; 31.75 GiB total capacity; 25.14 GiB already allocated; 21.50 MiB free; 26.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Thanks so much for sharing any experience you have with this!
I had the same issue with an A6000, which has 48 GB of RAM; after setting the batch size to 1 (down from 128), the OOM didn't occur. I guess the model is bigger than they said.
I guess it is a problem with the Huggingface implementation. There is no OOM issue with the original code at https://github.com/google-research/bigbird.
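In terms of the question's setup, the batch-size reduction suggested in the first comment would look something like the sketch below; every other argument is copied from the question, and whether batch size 1 is enough on a V100 is not something the thread confirms:
from transformers import Seq2SeqTrainingArguments

batch_size = 1  # reduced batch size, as suggested in the comment above

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir="./",
    logging_steps=2,
    save_steps=400,
    eval_steps=4
)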

Training time increases dramatically within just a few iterations using tensorflow GPU

I am training an AlexNet CNN model on a quite large dataset (DeepFashion) containing around 300,000 images. I resize the images to 96x96x3, and I am using an Nvidia Tesla K80 GPU and 4 vCPUs with 15 GB memory on GCP (Google Cloud). The thing is that when I start training, it runs really well for about 500 iterations, but then the speed drops dramatically. The GPU utilization at the start is around 50%, but it then drops to 0-7%. I really don't know what might be causing this.
Test set is 209,222 images
Val set is 40,000 images
Train set is 40,000 images
Below is my simple code snippet. Does anyone have any idea what could be causing this dramatic drop in computation speed and how I can fix it?
import os
import tensorflow as tf
from tensorflow.keras import optimizers

train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    os.path.join(dataset_path, 'train'),
    label_mode='categorical',
    seed=seed,
    image_size=(96, 96),
    batch_size=64)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    os.path.join(dataset_path, 'val'),
    label_mode='categorical',
    seed=seed,
    image_size=(96, 96),
    batch_size=64)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    os.path.join(dataset_path, 'test'),
    label_mode='categorical',
    seed=seed,
    image_size=(96, 96),
    batch_size=64)

AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

model = get_uncompiled_AlexNet(img_width=96, img_height=96, img_channel=3, num_classes=46)

model_optimizer = optimizers.Adam(learning_rate=learning_rate)
model_metrics_top_3 = tf.keras.metrics.TopKCategoricalAccuracy(k=3, name='top_3_accuracy')
model_metrics_top_5 = tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top_5_accuracy')

model.compile(
    loss="categorical_crossentropy",
    optimizer=model_optimizer,
    metrics=["accuracy", model_metrics_top_3, model_metrics_top_5])

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10)

CUDA out of memory when fine-tuning a large model

I have previously trained a VGG model (say model1) and a two-layer model (say model2) separately. Now I have to train a new model that combines those two models, with each part of the new model initialized with the learned weights of model1 and model2. I implemented it as follows:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

class TransferModel(nn.Module):
    def __init__(self, VGG, TwoLayer):
        super(TransferModel, self).__init__()
        self.vgg_layer = VGG
        self.linear = TwoLayer
        for param in self.vgg_layer.parameters():
            param.requires_grad = True

    def forward(self, x):
        h1_vgg = self.vgg_layer(x)
        y_pred = self.linear(h1_vgg)
        return y_pred

# for image_id in train_ids[0:1]:
#     img = load_image(train_id_to_file[image_id])

new_model = TransferModel(trained_vgg_instance, trained_twolayer_instance)
new_model.linear.load_state_dict(trained_twolayer_instance.state_dict())
new_model.vgg_layer.load_state_dict(trained_vgg_instance.state_dict())
new_model.cuda()
And when training, I try:
def train(model, learning_rate=0.001, batch_size=50, epochs=2):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.MultiLabelSoftMarginLoss()
    x = torch.zeros([batch_size, 3, img_size, img_size])
    y_true = torch.zeros([batch_size, 4096])
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        shuffled_indcs = torch.randperm(20000)
        for i in range(20000):
            for batch_num in range(int(20000/batch_size)):
                optimizer.zero_grad()
                for j in range(batch_size):
                    # ... some code to load batches of images into x....
                x_batch = Variable(x).cuda()
                print(batch_num)
                y_true_batch = Variable(train_labels[batch_num*batch_size:(batch_num+1)*batch_size, :]).cuda()
                y_pred = model(x_batch)
                loss = criterion(y_pred, y_true_batch)
                loss.backward()
                optimizer.step()
                running_loss += loss
                del x_batch, y_true_batch, y_pred
                torch.cuda.empty_cache()
        print("in epoch[%d] = %.8f " % (epoch, running_loss / (batch_num+1)))
        running_loss = 0.0
    print('Finished Training')

train(new_model)
In the second iteration (batch_num=1) of the first epoch, I get this error:
CUDA out of memory. Tried to allocate 153.12 MiB (GPU 0; 5.93 GiB total capacity; 4.83 GiB already allocated; 66.94 MiB free; 374.12 MiB cached)
Although I explicitly use del in my training loop, running nvidia-smi suggests it does nothing and the memory isn't being freed.
What should I do?
Change this line:
running_loss += loss
to this:
running_loss += loss.item()
By adding loss to running_loss, you are telling PyTorch to keep all the gradients with respect to loss for that batch in memory, even when you start training on the next batch. PyTorch assumes that you may want to use running_loss in some big loss function over multiple batches later, and therefore keeps all the gradients (and therefore the activations) for all batches in memory.
By adding .item() you just get the loss as a Python float, rather than a torch.FloatTensor. This float is detached from the PyTorch graph, so PyTorch knows you don't want gradients with respect to it.
If you are running an older version of PyTorch without .item(), you can try:
running_loss += float(loss.detach().cpu())
This could also be caused by a similar bug in a test() loop, if you have one.
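For context, a minimal sketch of the accumulation pattern this answer describes; model, criterion, optimizer, and loader are placeholders, not the asker's actual objects:
running_loss = 0.0
for x_batch, y_batch in loader:        # placeholder DataLoader
    optimizer.zero_grad()
    y_pred = model(x_batch.cuda())
    loss = criterion(y_pred, y_batch.cuda())
    loss.backward()
    optimizer.step()
    running_loss += loss.item()        # plain Python float, so no graph is kept alive across batches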

fit() gives different result than fit_generator() on the same dataset

I have been playing around with CNNs to try to remove coherent noise from relatively large images. Since the images are large (1500x250), I cannot load too many into memory at once. Because of this, I have tried to implement a generator which feeds the images to the network. I had been struggling for a while with bad results, but assumed the issues were in the network. I then tried using fit() with a subset of my data and got extremely good results without changing anything in the network. Testing the generator on the same subset resulted in bad results. What is the catch that I cannot see? Why does my generator fail?
My dataset is approximately 114,000 images, about 475 GB in total, which explains why I cannot load it all into memory at once. The generator does produce actual results recreating the images, but they are extremely bad. My generator class is here:
import numpy as np
import keras as k
from math import floor

class genOne(k.utils.Sequence):
    def __init__(self, img_rows, img_cols, channels, batch_size, clean_dir,
                 noisy_dir, clean_files, noisy_files, shuffle=True):
        """Initialize variables:
        img_rows, img_cols, channels: the shape of the image
        batch_size : Self explanatory
        clean_dir, noisy_dir : directories with files
        clean_files : Randomized list with clean images
        noisy_files : Randomized list with noise"""
        self.img_rows = img_rows
        self.img_cols = img_cols
        self.channels = channels
        self.batch_size = batch_size
        self.clean_dir = clean_dir
        self.noisy_dir = noisy_dir
        self.clean_files = clean_files.tolist()
        self.noisy_files = noisy_files.tolist()
        self.shuffled_noisy = []
        self.tmp_noisy = []
        self.tmp_clean = []
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Sets the number of batches per epoch"""
        return floor((len(self.noisy_files)*len(self.clean_files))/self.batch_size)

    def __getitem__(self, index):
        """Generates data for each batch:
        combine every type of noise with each image."""
        X = np.empty((self.batch_size, self.img_rows, self.img_cols,
                      self.channels))
        Y = np.zeros((self.batch_size, self.img_rows, self.img_cols,
                      self.channels))
        for i in range(self.batch_size):
            if not self.tmp_noisy:
                self.tmp_noisy = self.shuffled_noisy
                self.tmp_clean.pop(0)
            x_test = self.tmp_noisy.pop(0)
            X[i,] = np.expand_dims(np.load(self.noisy_dir + x_test).T[
                :self.img_rows, :self.img_cols], -1)
            Y[i,] = np.expand_dims(np.load(self.clean_dir + self.tmp_clean[0
                ]).T[:self.img_rows, :self.img_cols], -1)
            y_test = self.tmp_clean[0]

            # Input equals ground truth + noise
            X[i,] += Y[i,]

            # Normalize data between 0 and 1
            X[i,] = ((X[i,]/np.amax(np.absolute(X[i,])))+1)/2
            Y[i,] = ((Y[i,]/np.amax(np.absolute(Y[i,])))+1)/2
        return X, Y

    def on_epoch_end(self):
        """Refresh all data on epoch end"""
        self.tmp_noisy = self.noisy_files
        self.tmp_clean = self.clean_files
        if self.shuffle == True:
            np.random.shuffle(self.tmp_noisy)
            np.random.shuffle(self.tmp_clean)
        self.shuffled_noisy = self.tmp_noisy
I have 475 clean images and 300 images consisting of pure noise. I combine them such that each clean image is fed into the network with each type of noise. The small case which worked with fit() was simply 300 images, where every image was a different clean image with a different noise.
I am aware that my driver version is rather old, which requires an old version of TensorFlow. I cannot update it, so I'm stuck with tensorflow 1.4.1.
Specs:
2x Nvidia Geforce GTX 1080 7.9 GB
Nvidia Driver version 367.44
cuDNN 6.0.21
CUDA 8.0
Debian Wheezy 7
Tensorflow-gpu 1.4.1
Keras 2.0.8
Python 3.6.7
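For reference, a rough sketch of how a Sequence like this is typically passed to fit_generator in Keras 2.0.x; the constructor values, directories, and file arrays below are placeholders, not taken from the question:
train_gen = genOne(img_rows=1500, img_cols=250, channels=1, batch_size=4,
                   clean_dir='clean/', noisy_dir='noisy/',
                   clean_files=clean_files, noisy_files=noisy_files)

# __len__ defines the number of batches, so it can double as steps_per_epoch
model.fit_generator(train_gen,
                    steps_per_epoch=len(train_gen),
                    epochs=10)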
