PyTorch RAM does not free after every epoch

I am training a multi-label text classifier using PyTorch with RoBERTa. However, after the 2nd epoch the RAM fills up and the kernel crashes. I checked, and RAM is not freed after each epoch. I have 64 GB RAM and 8 CPU cores.
What could be the problem?
Here is my PyTorch implementation:
class ReaderTextDataset(Dataset):
    def __init__(self,
                 data: pd.DataFrame,
                 tokenizer: RobertaTokenizer,
                 max_token_len: int = 512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        readerText = data_row.readerText
        labels = data_row[LABEL_COLUMNS]
        encoding = self.tokenizer.encode_plus(
            readerText,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return dict(readerText=readerText,
                    input_ids=encoding["input_ids"].flatten(),
                    attention_mask=encoding["attention_mask"].flatten(),
                    labels=torch.FloatTensor(labels))
train_dataset = ReaderTextDataset(train_df, tokenizer, max_token_len=512)
roberta_model = RobertaModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
NUM_WORKERS = 6
class ReaderTextDataModule(pl.LightningDataModule):
    def __init__(self,
                 train_df,
                 test_df,
                 tokenizer,
                 batch_size=8,
                 max_token_len=512):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = ReaderTextDataset(self.train_df, self.tokenizer,
                                               self.max_token_len)
        self.test_dataset = ReaderTextDataset(self.test_df, self.tokenizer,
                                              self.max_token_len)

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=NUM_WORKERS)

    def val_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=NUM_WORKERS)

    def test_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=NUM_WORKERS)
class ReaderTextTagger(pl.LightningModule):
    def __init__(self,
                 n_classes: int,
                 n_training_steps=None,
                 n_warmup_steps=None):
        super().__init__()
        self.bert = RobertaModel.from_pretrained(BERT_MODEL_NAME,
                                                 return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        for i, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, i], labels[:, i])
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train",
                                              class_roc_auc,
                                              self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=1e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps)
        return dict(optimizer=optimizer,
                    lr_scheduler=dict(scheduler=scheduler, interval='step'))

You return output, which is stored on the GPU, in every training_step. If you really need to keep the predictions, detach them from the graph and move them to the CPU:
return {"loss": loss, "predictions": outputs.detach().cpu(), "labels": labels}

Related

BERT NER with PyTorch Lightning

I'm trying to use pytorch-lightning for a token-classification model. I have already built a model for token classification without Lightning, and I'm confused about what changes need to be made to the existing code to integrate pytorch-lightning.
Following is my PyTorch code:
model = BertForTokenClassification.from_pretrained(
    'bert-large-cased',
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False
)

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    # ========================================
    #               Validation
    # ========================================
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                  for l_i in l if tag_values[l_i] != "PAD"]
    f1 = f1_score([valid_tags], [pred_tags])
Following is the code I tried for pytorch-lightning:
class LightningModule(pl.LightningModule):
    def __init__(self, lr, lr_backbone, weight_decay, batch_size):
        super().__init__()
        self.model = BertForTokenClassification.from_pretrained(
            "bert-large-cased",
            num_labels=len(tag2idx),
            output_attentions=False,
            output_hidden_states=False)
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay
        self.batch_size = batch_size

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.model(
            input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels
        )
        loss = outputs[0]
        logits = outputs[1]
        return loss, logits

    def training_step(self, batch, batch_idx):
        b_input_ids, b_input_mask, b_labels = batch
        outputs = self.model(b_input_ids, token_type_ids=None,
                             attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        b_input_ids, b_input_mask, b_labels = batch
        outputs = self.model(b_input_ids, token_type_ids=None,
                             attention_mask=b_input_mask, labels=b_labels)
        eval_loss = outputs[0]
        self.log("val_loss", eval_loss)
        return eval_loss

    def validation_end(self, outputs):
        eval_loss = np.mean([x["val_loss"] for x in outputs])
        self.log("val_loss", eval_loss)
        pred_tags = [tag_values[p_i] for p, l in zip(self.predictions, self.true_labels)
                     for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
        valid_tags = [tag_values[l_i] for l in self.true_labels
                      for l_i in l if tag_values[l_i] != "PAD"]
        f1 = f1_score([valid_tags], [pred_tags])
        self.log("val_f1", f1)

    def configure_optimizers(self):
        # optimizer = torch.optim.NAdam(optimizer_grouped_parameters, lr=4e-6, eps=1e-8)
        # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        return torch.optim.NAdam(optimizer_grouped_parameters, lr=4e-6, eps=1e-8)

    def train_dataloader(self):
        return train_dataloader  # return your dataloader

    def val_dataloader(self):
        return valid_dataloader  # return your validation dataloader

model = LightningModule(lr=1e-6, lr_backbone=1e-5, weight_decay=1e-4, batch_size=32)
trainer = pl.Trainer(accelerator='gpu', gradient_clip_val=0.1, max_epochs=epochs,
                     auto_scale_batch_size=None, default_root_dir="lightning_output/",
                     enable_checkpointing=False)
trainer.fit(model)
But when I run inference, I get the following error:
TypeError: forward() missing 2 required positional arguments: 'attention_mask' and 'labels'
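The inference call isn't shown in the question, but the exception means the model was invoked with only input_ids while this forward signature also requires attention_mask and labels. Two possible fixes, sketched under that assumption:

# Option 1: pass everything the current forward() requires
loss, logits = model(b_input_ids, b_input_mask, b_labels)

# Option 2: make the extra arguments optional so the model can run without labels
def forward(self, input_ids, attention_mask=None, labels=None):
    outputs = self.model(input_ids, token_type_ids=None,
                         attention_mask=attention_mask, labels=labels)
    # with labels=None the Hugging Face model computes no loss,
    # so outputs[0] is the logits in that case
    return outputs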

CUDA error: device-side assert triggered when training sample size is increased

I'm getting a "CUDA error: device-side assert triggered" error when fine-tuning a LayoutLMv3ForSequenceClassification model on images with two class labels.
I have a total of 1734 training image samples. Following is the major part of my code:
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False, ocr_lang="eng")
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
processor = LayoutLMv3Processor(feature_extractor, tokenizer)

class DocumentClassificationDataset(Dataset):
    def __init__(self, image_paths, processor):
        self.image_paths = image_paths
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image_path = self.image_paths[item]
        image = Image.open(image_path).convert("RGB")
        width, height = image.size
        json_path = image_path.with_suffix(".json")
        with open(json_path, "r") as f:
            ocr_result = json.load(f)
        width_scale = 1000 / width
        height_scale = 1000 / height
        words = []
        boxes = []
        for row in ocr_result:
            boxes.append(scale_bounding_box(row["bounding_box"], width_scale, height_scale))
            words.append(row["word"])
        encoding = processor(
            image,
            words,
            boxes=boxes,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        label = DOCUMENT_CLASSES.index(image_path.parent.name)
        return dict(
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            bbox=encoding["bbox"].flatten(end_dim=1),
            pixel_values=encoding["pixel_values"].flatten(end_dim=1),
            labels=torch.tensor(label, dtype=torch.long)
        )

train_dataset = DocumentClassificationDataset(train_images, processor)
test_dataset = DocumentClassificationDataset(test_images, processor)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2
)
test_data_loader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=2
)
class ModelModule(pl.LightningModule):
    def __init__(self, n_classes: int):
        super().__init__()
        self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
            "microsoft/layoutlmv3-base",
            num_labels=n_classes
        )
        self.train_accuracy = Accuracy(task="multiclass",
                                       num_classes=n_classes)
        self.val_accuracy = Accuracy(task="multiclass",
                                     num_classes=n_classes)

    def forward(self, input_ids, attention_mask, bbox,
                pixel_values, labels=None):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            bbox=bbox,
            pixel_values=pixel_values,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        labels = batch["labels"]
        outputs = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["bbox"],
            batch["pixel_values"],
            labels
        )
        loss = outputs.loss
        self.log("train_loss", loss)
        self.train_accuracy(outputs.logits, labels)
        self.log("train_acc", self.train_accuracy, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        labels = batch["labels"]
        outputs = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["bbox"],
            batch["pixel_values"],
            labels
        )
        loss = outputs.loss
        self.log("val_loss", loss)
        self.val_accuracy(outputs.logits, labels)
        self.log("val_acc", self.val_accuracy, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=0.00001)
model_checkpoint = ModelCheckpoint(
    filename="{epoch}-{step}-{val_loss:.4f}",
    save_last=True,
    save_top_k=3,
    monitor="val_loss",
    mode="min"
)
trainer = pl.Trainer(
    accelerator="gpu",
    precision=16,
    devices=1,
    max_epochs=4,
    callbacks=[model_checkpoint]
)
trainer.fit(model_module, train_data_loader, test_data_loader)
The model trains fine when I try it with a smaller number of samples (200); I get this error only when the number of samples increases.
From what I've found when searching, this error most often occurs because of a label mismatch, but that shouldn't be the issue in this case.
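One way to rule the label theory in or out is to scan every item before training, since a device-side assert during classification is typically an out-of-range class index that may only appear once the sample size grows. A minimal sketch, assuming the DocumentClassificationDataset above and n_classes = 2:

n_classes = 2
for i in range(len(train_dataset)):
    label = train_dataset[i]["labels"].item()
    # every label must be in [0, n_classes); anything else trips
    # the CUDA assert inside the loss computation
    assert 0 <= label < n_classes, f"bad label {label} at index {i}"

Running once on the CPU, or with the environment variable CUDA_LAUNCH_BLOCKING=1, also makes the failing operation appear at its real location in the stack trace.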

Why is my PyTorch Lightning model performing badly?

I'm trying to train a basic model for this competition. This is a multi-class problem where the values range from 1 to 5 (in steps of 0.5). The problem is that the training error is very bad, and so is the inference.
Is my model correct for this problem? I didn't use warmup at all and I only ran it for a few epochs, but I don't think that makes much difference.
Also, here's the result of an inference:
(0, tensor([[0.0024, 0.4513, 1.3045, 0.0350, 0.4182, 0.6813]]))
Is the loss 0? The predictions are also too low (they should start from 1). Clearly something is wrong with the model. What is it?
Here's my code (which is based on this article):
class LangKnolModel(pl.LightningModule):
    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 6)
        self.n_training_steps = n_training_steps
        self.max_epochs = 8,  # note: the trailing comma makes this the tuple (8,)
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        # output = torch.sigmoid(output)
        loss = 0  # placeholder: forward returns 0 whenever labels is None
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )
Training:

model = LangKnolModel(
    n_classes=len(LABELS_COLS),
    n_warmup_steps=1,
    n_training_steps=16,
)
trainer = pl.Trainer(
    gpus=1,
    max_epochs=5
)
trainer.fit(model, data_module)
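One observation that follows directly from this code: the 0 printed at inference is the placeholder loss = 0 that forward returns when labels is None, so it says nothing about the training loss. And since the head is an unbounded linear layer trained with MSELoss, raw outputs won't land on the 1-to-5 grid by themselves. A post-processing sketch (the round/clamp choices here are my assumptions, not part of the competition spec):

import torch

def to_score(pred: torch.Tensor) -> torch.Tensor:
    # snap raw outputs to the nearest 0.5 and clamp to the valid 1..5 range
    return torch.clamp(torch.round(pred * 2) / 2, min=1.0, max=5.0)

print(to_score(torch.tensor([[0.0024, 0.4513, 1.3045, 0.0350, 0.4182, 0.6813]])))
# tensor([[1.0000, 1.0000, 1.5000, 1.0000, 1.0000, 1.0000]])

Predictions clustered this far below 1 still point to a training problem (too few steps, or labels on a different scale than the outputs), not just missing post-processing.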

How to extract the encoded features after running a PyTorch LSTM autoencoder model?

I am very new to PyTorch and Python in general, and I am now struggling to extract the encoded features from my pre-trained LSTM autoencoder, which can be seen below:
import torch
import torch.nn as nn

# Building an LSTM autoencoder
class Encoder(nn.Module):
    def __init__(self, seq_len, n_features, embedding_dim=32):
        super(Encoder, self).__init__()
        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim, self.hidden_dim1, self.hidden_dim2 = embedding_dim, 4 * embedding_dim, 2 * embedding_dim
        self.rnn1 = nn.LSTM(
            input_size=n_features,
            hidden_size=self.hidden_dim1,  # 128
            num_layers=1,
            batch_first=True
        )
        self.rnn2 = nn.LSTM(
            input_size=self.hidden_dim1,
            hidden_size=self.hidden_dim2,  # 64
            num_layers=1,
            batch_first=True
        )
        self.rnn3 = nn.LSTM(
            input_size=self.hidden_dim2,
            hidden_size=embedding_dim,  # 32
            num_layers=1,
            batch_first=True
        )

    def forward(self, x):
        x = x.reshape((1, self.seq_len, self.n_features))
        x, (_, _) = self.rnn1(x)
        x, (_, _) = self.rnn2(x)
        x, (hidden_n, _) = self.rnn3(x)
        return hidden_n.reshape((self.n_features, self.embedding_dim))

class Decoder(nn.Module):
    def __init__(self, seq_len, input_dim=32, n_features=1):
        super(Decoder, self).__init__()
        self.seq_len, self.input_dim = seq_len, input_dim
        self.hidden_dim2, self.hidden_dim1, self.n_features = 4 * input_dim, 2 * input_dim, n_features
        self.rnn1 = nn.LSTM(
            input_size=input_dim,
            hidden_size=input_dim,
            num_layers=1,
            batch_first=True
        )
        self.rnn2 = nn.LSTM(
            input_size=input_dim,
            hidden_size=self.hidden_dim1,
            num_layers=1,
            batch_first=True
        )
        self.rnn3 = nn.LSTM(
            input_size=self.hidden_dim1,
            hidden_size=self.hidden_dim2,
            num_layers=1,
            batch_first=True
        )
        self.output_layer = nn.Linear(self.hidden_dim2, n_features)

    def forward(self, x):
        x = x.repeat(self.seq_len, self.n_features)
        x = x.reshape((self.n_features, self.seq_len, self.input_dim))
        x, (hidden_n, cell_n) = self.rnn1(x)
        x, (hidden_n, cell_n) = self.rnn2(x)
        x, (hidden_n, cell_n) = self.rnn3(x)
        x = x.reshape((self.seq_len, self.hidden_dim2))
        return self.output_layer(x)

class RAE(nn.Module):
    def __init__(self, seq_len, n_features, embedding_dim=32):
        super(RAE, self).__init__()
        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim = embedding_dim
        self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
        self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

### TRAINING
def train_model(model, train_dataset, val_dataset, n_epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss(reduction='mean').to(device)  # nn.L1Loss sum
    history = dict(train=[], val=[])
    for epoch in range(1, n_epochs + 1):
        model = model.train()
        train_losses = []
        for seq_true in train_dataset:
            optimizer.zero_grad()
            seq_true = seq_true.to(device)
            seq_pred = model(seq_true)
            loss = criterion(seq_pred, seq_true)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        val_losses = []
        model = model.eval()
        with torch.no_grad():
            for seq_true in val_dataset:
                seq_true = seq_true.to(device)
                seq_pred = model(seq_true)
                loss = criterion(seq_pred, seq_true)
                val_losses.append(loss.item())
        # add accuracy
        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        history['train'].append(train_loss)
        history['val'].append(val_loss)
        print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')
    return model.eval(), history
Once I had trained my model, I followed the advice given by ptrblck here and implemented it as follows:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

model.encoder.register_forward_hook(get_activation('encoder'))
x = test_dataset_SR[1]  # instead of using his random example I used one example from my training set
x = x.cuda()
output = model(x)
print(activation['encoder'])
But this gives me the following error:

      2 def get_activation(name):
      3     def hook(model, input, output):
----> 4         activation[name] = output.detach()
      5     return hook

AttributeError: 'tuple' object has no attribute 'detach'
Can you please help me solve this issue? I want to take these encoded features, store them, and use them as input to another network. I know I could probably train the encoder separately (not sure), but I will need both the encoder and the decoder, so I thought hooks would be my salvation.
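Not an authoritative fix, but two approaches follow from the code above. Since Encoder is an ordinary nn.Module whose forward() already returns the embedding, it can simply be called directly; alternatively, the hook can be made tolerant of tuple outputs (nn.LSTM modules return (output, (h_n, c_n)) tuples, which is where an AttributeError like this comes from when a hook lands on one):

# Approach 1: call the encoder directly -- its forward() returns the embedding
with torch.no_grad():
    encoded = model.encoder(x)  # shape (n_features, embedding_dim)

# Approach 2: a hook that handles both tensors and tuples
def get_activation(name):
    def hook(module, inputs, output):
        if isinstance(output, tuple):  # e.g. nn.LSTM returns (out, (h_n, c_n))
            output = output[0]
        activation[name] = output.detach()
    return hook

The first approach is the simpler one if the goal is just to store the embeddings and feed them to another network.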

Training not speeding up even after using GPU

transform = transforms.Compose([transforms.Resize(IMG_SIZE),
                                transforms.CenterCrop(CROP_SIZE),
                                transforms.ToTensor()])

class LandmarksDatasetTrain(Dataset):
    """Landmarks dataset."""

    def __init__(self, landmarks_frame, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = landmarks_frame
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.loc[idx, 'id'][0],
                                self.landmarks_frame.loc[idx, 'id'][1],
                                self.landmarks_frame.loc[idx, 'id'][2],
                                self.landmarks_frame.loc[idx, 'id'])
        img_name += ".jpg"
        image = Image.open(img_name)
        landmarks = self.landmarks_frame.loc[idx, 'landmark_id']
        sample = {'image': image, 'landmarks': landmarks}
        if self.transform:
            sample['image'] = self.transform(sample['image'])
        sample['landmarks'] = torch.tensor(sample['landmarks'])
        return sample

dataset_train = LandmarksDatasetTrain(landmarks_frame=frame,
                                      root_dir='/kaggle/input/landmark-recognition-2020/train',
                                      transform=transform)
train_loader = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=4, drop_last=False)

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(CROP_SIZE*CROP_SIZE*3, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, frame['landmark_id'].nunique())

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)

net = Net()
net.to(device)

for epoch in range(3):
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    for data in tqdm(train_loader):
        X = data['image'].to(device)
        y = data['landmarks'].to(device)
        net.zero_grad()
        output = net(X.view(-1, CROP_SIZE*CROP_SIZE*3))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()
    print(loss)
The batch size is 4, data['image'] and data['landmarks'] are tensors, device = torch.device("cuda:0"), and the deep learning library I am using is PyTorch, but the GPU is still not working for me: its usage shows 5%, and one epoch takes 3.5 to 4 hours in total.
It would be really helpful if someone pointed out my mistake.
I'm attaching an image of the resource usage and the GPU config, and an image showing that the GPU is on.
Here is the link to my notebook:
https://www.kaggle.com/hiteshsom/google-landmark-recognition
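With GPU utilization that low, the usual suspect is the input pipeline (one JPEG decoded from disk per sample) rather than the model, especially with such a small network and batch size 4. A minimal timing sketch to separate loading time from compute time, assuming net, optimizer, train_loader, and device are defined as in the question:

import time

t_load, t_compute = 0.0, 0.0
t0 = time.time()
for i, data in enumerate(train_loader):
    t_load += time.time() - t0  # time spent waiting on the DataLoader
    t1 = time.time()
    X = data['image'].to(device)
    y = data['landmarks'].to(device)
    net.zero_grad()
    loss = F.nll_loss(net(X.view(-1, CROP_SIZE*CROP_SIZE*3)), y)
    loss.backward()
    optimizer.step()
    torch.cuda.synchronize()  # make the GPU work visible to the timer
    t_compute += time.time() - t1
    t0 = time.time()
    if i == 100:
        break
print(f"loading: {t_load:.1f}s, compute: {t_compute:.1f}s over 100 batches")

If loading dominates, a larger batch size, more num_workers, pin_memory=True, or caching decoded images are the knobs to try before touching the model.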
