Memory leakage in inference of pytorch_pretrained_bert.BertForSequenceClassification (python, memory-leaks)

I'm facing a memory leak during real-time inference with pytorch_pretrained_bert's BertForSequenceClassification model.
Although I'm running the model on the GPU, CPU memory keeps getting exhausted.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.load_state_dict(torch.load(path, map_location='cpu'))
model.eval()
model.to(device)
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
_inputs = torch.tensor(input_ids)
_masks = torch.tensor(attention_masks)
del input_ids
del attention_masks
gc.collect()
torch.cuda.empty_cache()
_data = TensorDataset(_inputs, _masks)  # , _labels)
_sampler = RandomSampler(_data)
_dataloader = DataLoader(_data, sampler=_sampler, batch_size=batch_size,
                         shuffle=False, num_workers=2)
del _data
del _sampler
predictions =[]
for batch in _dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch  # unpack the batch (these are the GPU tensors referred to below)
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = logits.detach().cpu().numpy()
    predictions.append(logits)
    gc.collect()
    torch.cuda.empty_cache()
    del logits
model, b_input_ids and b_input_mask are moved onto the GPU with .to(device).
Can anyone suggest what I can do? Memory usage is increasing gradually.
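A minimal sketch of a leaner inference loop for comparison, assuming the same model, tokenizer, device, and padded input tensors built above; the SequentialSampler, num_workers=0, and the final concatenation are my additions, not part of the original code:

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

_data = TensorDataset(_inputs, _masks)
_dataloader = DataLoader(_data,
                         sampler=SequentialSampler(_data),  # keeps predictions in input order
                         batch_size=batch_size,
                         num_workers=0)                      # no worker processes holding extra copies of the data

predictions = []
with torch.no_grad():                                        # one context around the whole loop
    for b_input_ids, b_input_mask in _dataloader:
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        predictions.append(logits.cpu().numpy())             # only NumPy copies are kept in Python

predictions = np.concatenate(predictions, axis=0)

With a loop shaped like this, the only thing that grows is the list of NumPy arrays, so CPU memory should rise roughly in proportion to the number of predictions and nothing else; if memory still climbs without bound, the leak is likely outside the snippet shown (for example in whatever code rebuilds tokenized_texts on every request).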

Related

Find Training/Validation Accuracy & Loss of Faster-RCNN PyTorch model

I am trying to find the training/validation accuracy and loss of my model for each epoch as I train it, so I can pick the best epoch to use from now on. I realise there is a lot of information on this already, but the topic is very new to me and I find it difficult to pin down the right answer for my situation.
I assume I need to add one or two bits to the train_one_epoch() and evaluate() functions in order to do this?
My model setup is:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.02, momentum=0.9, weight_decay=0.0001)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20,40], gamma=0.1)
And my training function is:
epochs = 50
for epoch in range(epochs):
    train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
    lr_scheduler.step()
    evaluate(model, val_data_loader, device=device)
    print("\n\n")
torch.save(model, f'./Models/trained_{ds}_model_Epoch{epochs}_LR0_02.pt')
I am using coco-like annotations, for example:
{'boxes': tensor([[316.9700, 242.5500, 464.1000, 442.1700], [ 39.2200, 172.6700, 169.8400, 430.9600]]), 'labels': tensor([2, 2]), 'image_id': tensor(1416), 'area': tensor([29370.1094, 33738.3789]), 'iscrowd': tensor([0, 0])}
The train_one_epoch and evaluate functions are from 'engine.py' from Torchvision.
It seems like TensorBoard would be a good tool for this, but I don't really know how to use it.
The engine.py is:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = f"Epoch: [{epoch}]"
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()
        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)
        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    return metric_logger
The evaluate function is:
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter=" ")
    header = "Test:"
    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)
    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = list(img.to(device) for img in images)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(images)
        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time
        res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()
    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
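Not an authoritative answer, but since train_one_epoch returns a MetricLogger and evaluate returns a CocoEvaluator, one hedged way to get per-epoch curves into TensorBoard is sketched below; the scalar tag names and log directory are my own, and detection models report mAP rather than a single accuracy, so the validation curve here is the COCO bbox mAP@[0.5:0.95]:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/fasterrcnn")  # hypothetical log directory

epochs = 50
for epoch in range(epochs):
    metric_logger = train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=20)
    lr_scheduler.step()
    coco_evaluator = evaluate(model, val_data_loader, device=device)

    # average of the summed training loss over this epoch
    writer.add_scalar("train/loss", metric_logger.meters["loss"].global_avg, epoch)
    # COCO bbox mAP@[0.5:0.95] on the validation set
    writer.add_scalar("val/bbox_mAP", coco_evaluator.coco_eval["bbox"].stats[0], epoch)

writer.close()

Run tensorboard --logdir runs to view the curves. A validation loss would need an extra pass with the model in train mode and targets supplied, since evaluate only produces detections, not losses.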

DDP and cuda graph in pytorch

This is my code; I am currently running it on 4 GPUs.
setup(rank, gpus)
dataset = RandomDataset(input_shape, 80 * batch_size, rank)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
data_iter = iter(dataloader)
model = model(pretrained=True).to(rank)
optimizer = optim.SGD(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()

s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    print("[MAKING DDP Model]")
    model = DDP(model)
    print("[MODEL CREATED]")
    for i in range(11):
        optimizer.zero_grad(set_to_none=True)
        inputs, labels = next(data_iter)
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

capture_input = torch.empty((batch_size, 3, input_shape, input_shape)).to(rank)
capture_target = torch.argmax(torch.from_numpy(np.eye(1000)[np.random.choice(1000, batch_size)]), axis=1).to(rank)
g = torch.cuda.CUDAGraph()
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
    capture_y_pred = model(capture_input)
    capture_loss = criterion(capture_y_pred, capture_target)
    capture_loss.backward()
    optimizer.step()
print("RECORDED")

for i in range(20):
    inputs, label = next(data_iter)
    capture_input.copy_(inputs)
    capture_target.copy_(label)
    g.replay()
    optimizer.step()
print("DATASET DONE")
RuntimeError: CUDA error: operation failed due to a previous error during capture
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Does anyone know how to solve this problem?
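Not a definitive fix, but one way to narrow the failure down is to run the whole-network capture pattern from the PyTorch CUDA-graphs docs on the plain (non-DDP) module first; the three warmup iterations, the wait_stream call after the side stream, and the name net below come from that documented pattern plus my own assumptions:

import torch

# Assumption: reuse the capture_input / capture_target tensors created above.
net = model.module if hasattr(model, "module") else model   # unwrap DDP if already wrapped

# Warmup in a side stream, then make the default stream wait on it before capturing.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        optimizer.zero_grad(set_to_none=True)
        out = net(capture_input)
        criterion(out, capture_target).backward()
        optimizer.step()
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
    out = net(capture_input)
    loss = criterion(out, capture_target)
    loss.backward()
    optimizer.step()

# If this capture succeeds, the problem is specific to capturing DDP's gradient allreduce;
# if it still fails, rerun with CUDA_LAUNCH_BLOCKING=1 to surface the actual failing op.

One detail the snippet in the question seems to miss, compared with the documented pattern, is the torch.cuda.current_stream().wait_stream(s) synchronization after the warmup stream block and before capture.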

Looking for help on why GPU is not used when I train a Pytorch model

The machine I am using for training has 4 GPUs. I am "moving" the classifier, loss function and tensors to the GPU. But when I run nvidia-smi on the machine while training is ongoing, I see that GPU utilization is very low (3%) on one GPU and 0% on the others.
The questions I have are:
Is there an easier way to ask PyTorch to use the GPU and as many cores as available, without me having to sprinkle .to(device) all over the place?
Is there something other than .to(device) that is needed to use the GPU?
Is there a way to see whether training is happening on the CPU vs the GPU, or is running nvidia-smi on the machine and looking at GPU utilization the only way?
How do I interpret a GPU utilization of 3% in nvidia-smi? Does it mean the CPU is being used in many places? If yes, is there a way to debug what is making the training use the CPU?
Will setting num_workers to the number of available cores in the DataLoader class be enough to use multiple GPU cores? Is there any generic way to automatically learn the number of GPU cores available?
Code used to train
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)
torch.cuda.manual_seed(1234)
torch.backends.cudnn.deterministic = True

start_time = time.time()
clf = MLP(len(X_training[0]), hidden_size=[100, 100, 100, 100, 100])

# Move to GPU if available
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

# Define the loss function and optimizer
optimizer = torch.optim.Adam(clf.parameters(), lr=8e-4)
clf = clf.to(device)
loss_function = nn.BCELoss()
loss_function = loss_function.to(device)

# Run the training loop
# per_epoch_precision = []
# per_epoch_recall = []
for epoch in range(0, 150):
    # Set current loss value
    current_loss = 0.0
    dataset = MyDataset(X_training, y_training, use_gpu)
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_gpu else {}
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=10000, shuffle=True, **kwargs)
    # Iterate over the DataLoader for training data
    clf.train()  # set to train mode
    for i, data in enumerate(trainloader):
        # Get inputs
        inputs, targets = data
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # Perform forward pass
        outputs = clf(inputs)
        # Compute loss
        targets = targets.float().unsqueeze(1)
        loss = loss_function(outputs, targets)
        # Perform backward pass
        loss.backward()
        # Perform optimization
        optimizer.step()
        # Print statistics
        current_loss += loss.item()
        if i % 20000 == 19999:
            print("Loss after mini-batch %5d: %.3f" % (i + 1, current_loss / 500))
            current_loss = 0.0

# Process is complete.
print("Training process has finished.")
print(f"Train time is {time.time() - start_time}")
class MyDataset(Dataset):
    def __init__(self, x, y, use_gpu=False):
        x = x.astype(np.float32)
        self.x_train = torch.from_numpy(x)
        self.y_train = torch.from_numpy(y.values)
        if use_gpu:
            device = torch.device("cuda")
            self.x_train.to(device)
            self.y_train.to(device)
        # self.y_train = torch.LongTensor(y.values, dtype=torch.int)

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx]
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, act_fn=nn.ReLU(), use_dropout=False, drop_rate=0.25):
        super(MLP, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.layers = nn.Sequential()
        if use_dropout:
            self.layers.append(nn.Dropout(p=drop_rate))
        self.layers.append(nn.Linear(self.input_size, self.hidden_size[0]))
        self.layers.append(act_fn)
        for i in range(1, len(hidden_size)):
            if use_dropout:
                self.layers.append(nn.Dropout(p=drop_rate))
            self.layers.append(nn.Linear(self.hidden_size[i - 1], self.hidden_size[i]))
            self.layers.append(act_fn)
        if use_dropout:
            self.layers.append(nn.Dropout(p=drop_rate))
        self.layers.append(nn.Linear(self.hidden_size[-1], 1))
        self.layers.append(nn.Sigmoid())

    def forward(self, x):
        return self.layers(x)
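Not a full answer to the list of questions, but a small sketch for checking where things actually live; everything below uses standard PyTorch calls, clf is the classifier from the code above, and the batch line belongs inside the training loop:

import torch

print(torch.cuda.device_count())        # how many GPUs PyTorch can see
print(next(clf.parameters()).device)    # device the model's weights are on

# inside the training loop, after inputs/targets are moved:
print(inputs.device, targets.device, inputs.is_cuda)

# A single clf.to(device) only ever uses one GPU; nn.DataParallel or
# DistributedDataParallel is needed to spread work across all four.

One thing worth noting about MyDataset: Tensor.to(device) returns a new tensor rather than moving the tensor in place, so the bare self.x_train.to(device) calls in __init__ have no effect; the copies made inside the training loop are what actually reach the GPU.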

Almost non-existent training accuracy and low test accuracy

I am really new to machine learning and not so well versed in coding in general. However, we need to look through the customer feedback at our store, which adds up to quite a lot each year, yet we cannot tell what percentage is positive, negative, or neutral.
Currently I am trying to train a BERT model to do simple multi-label sentiment analysis. The input is our store's customer feedback. The feedback is not always clear-cut, since customers tend to go on at length about their experience and their sentiment is not always obvious. However, we managed to label positive, negative, and neutral sets of 2247 samples each.
But when I try to train it, the training accuracy is around 0.4%, which is super low. The validation score is around 60%, and the F1-score is around 60% for each label. I wonder what can be done to improve this training accuracy; I have been stuck for a while. Please take a look at my code and help me out.
I have tried changing the learning rate (all the rates BERT suggests, plus 1e-5), changing MAX_LEN, changing the number of epochs, and changing the dropout rate (0.1, 0.2, 0.3, 0.4, 0.5), but so far nothing has yielded results.
#read dataset
df = pd.read_csv("data.csv",header=None, names=['content', 'sentiment'], sep='\;', lineterminator='\r',encoding = "ISO-8859-1",engine="python")
from sklearn.utils import shuffle
df = shuffle(df)
df['sentiment'] = df['sentiment'].replace(to_replace = [-1, 0, 1], value = [0, 1, 2])
df.head()
#Load pretrained FinBert model and get bert tokenizer from it
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#Choose sequence Length
token_lens = []
for txt in df.content:
    tokens = tokenizer.encode(txt, max_length=512)
    token_lens.append(len(tokens))
sns.distplot(token_lens)
plt.xlim([0, 256]);
plt.xlabel('Token count');
MAX_LEN = 260
#Make a PyTorch dataset
class FIDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
#split test and train
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
    df_test,
    test_size=0.5,
    random_state=RANDOM_SEED
)
df_train.shape, df_val.shape, df_test.shape
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = FIDataset(
        texts=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )
#Load data into train, test, val
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.1)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        returned = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = returned["pooler_output"]
        output = self.drop(pooled_output)
        return self.out(output)
#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr= 2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
#Eval model function
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        torch.cuda.empty_cache()
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
#training loop through each epochs
import torch
torch.cuda.empty_cache()
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)
        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train)
        )
        print(f'Train loss {train_loss} accuracy {train_acc}')
        val_acc, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            device,
            len(df_val)
        )
        print(f'Val loss {val_loss} accuracy {val_acc}')
        print()
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc
-- Edit: I have printed out preds and targets, as well as train and val accuracy.
Here, in _, preds = torch.max(outputs, dim=1), you probably want argmax, not max?
Print out preds and targets to better see what's going on.
Edit after preds and targets were printed out: for epochs 4 and 5, preds matches targets exactly, so train accuracy should be 1. I think the issue is that the accuracy is divided by n_examples, the number of examples in the whole train dataset, while it should be divided by the number of examples actually seen in the epoch.
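To make the denominator point concrete, here is a minimal sketch of the accuracy bookkeeping with an explicit counter of examples actually seen in the loop (seen is my own name; the rest mirrors train_epoch above):

correct_predictions = 0
seen = 0
for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    preds = torch.argmax(outputs, dim=1)          # same indices as torch.max(outputs, dim=1)[1]
    correct_predictions += (preds == targets).sum().item()
    seen += targets.size(0)                       # count what was actually iterated
    # loss / backward / optimizer and scheduler steps as in the original train_epoch
train_acc = correct_predictions / seen

If seen and len(df_train) differ (for example because a DataLoader drops the last incomplete batch or a sampler sub-samples), the two denominators give different accuracies; printing both is a quick way to confirm which one explains the low number.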

How to gather prediction result on TPU (Pytorch)?

I'm trying to fine-tune my BERT-based QA model (PyTorch) on a TPU v3-8 provided by Kaggle. In the validation step I use a ParallelLoader to make predictions on 8 cores at the same time. But after that I don't know what I should do to gather all the results back from each core (and in the correct order, corresponding to the dataset) in order to calculate the overall EM & F1 scores. Can anybody help?
Code:
def _run():
    MAX_LEN = 192  # maximum text length in the batch (cannot be too high due to memory constraints)
    BATCH_SIZE = 16  # batch size (cannot be too high due to memory constraints)
    EPOCHS = 2  # number of epochs
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        tokenized_datasets['train'],
        num_replicas=xm.xrt_world_size(),  # tell PyTorch how many devices (TPU cores) we are using for training
        rank=xm.get_ordinal(),  # tell PyTorch which device (core) we are on currently
        shuffle=True
    )
    train_data_loader = torch.utils.data.DataLoader(
        tokenized_datasets['train'],
        batch_size=BATCH_SIZE,
        sampler=train_sampler,
        drop_last=True,
        num_workers=0,
    )
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        tokenized_datasets['validation'],
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )
    valid_data_loader = torch.utils.data.DataLoader(
        tokenized_datasets['validation'],
        batch_size=BATCH_SIZE,
        sampler=valid_sampler,
        drop_last=False,
        num_workers=0
    )
    device = xm.xla_device()  # device (single TPU core)
    model = model.to(device)  # put model onto the TPU core
    xm.master_print('done loading model')
    xm.master_print(xm.xrt_world_size(), 'as size')
    lr = 0.5e-5 * xm.xrt_world_size()
    optimizer = AdamW(model.parameters(), lr=lr)  # define our optimizer
    for epoch in range(EPOCHS):
        gc.collect()
        # use ParallelLoader (provided by PyTorch XLA) for TPU-core-specific dataloading:
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        xm.master_print('parallel loader created... training now')
        gc.collect()
        # call training loop:
        train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=None)
        del para_loader
        model.eval()
        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        gc.collect()
        model.eval()
        # call evaluation loop
        print("call evaluation loop")
        start_logits, end_logits = eval_loop_fn(para_loader.per_device_loader(device), model, device)
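I don't have an authoritative answer, but the pattern I have seen for this on PyTorch XLA is xm.mesh_reduce, which passes each core's CPU/NumPy payload through a rendezvous and applies a reduce function to the list of per-core payloads; the tag strings below and the assumption that eval_loop_fn returns NumPy arrays for this core's shard are mine:

import numpy as np
import torch_xla.core.xla_model as xm

start_logits, end_logits = eval_loop_fn(para_loader.per_device_loader(device), model, device)

# Each core contributes its shard; the reduce function sees a list with one array per core.
all_start = xm.mesh_reduce('start_logits_gather', start_logits, np.concatenate)
all_end = xm.mesh_reduce('end_logits_gather', end_logits, np.concatenate)

if xm.is_master_ordinal():
    # Caveat: DistributedSampler deals examples out round-robin (core r gets r, r+8, r+16, ...),
    # so the concatenation is not in dataset order; gathering the example ids the same way and
    # sorting by them is one way to restore the order before computing EM and F1.
    print(all_start.shape, all_end.shape)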
