How to gather prediction results on TPU (PyTorch)? - nlp

I'm trying to fine-tune my BERT-based QA model (PyTorch) on a TPU v3-8 provided by Kaggle. During validation I used a ParallelLoader to run predictions on 8 cores at the same time, but afterwards I don't know what I should do to gather all the results back from each core (in the correct order corresponding to the dataset) so that I can calculate the overall EM & F1 scores. Can anybody help?
Code:
def _run():
    MAX_LEN = 192   # maximum text length in the batch (cannot be too high due to memory constraints)
    BATCH_SIZE = 16 # batch size (cannot be too high due to memory constraints)
    EPOCHS = 2      # number of epochs

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        tokenized_datasets['train'],
        num_replicas=xm.xrt_world_size(), # tell PyTorch how many devices (TPU cores) we are using for training
        rank=xm.get_ordinal(),            # tell PyTorch which device (core) we are on currently
        shuffle=True
    )
    train_data_loader = torch.utils.data.DataLoader(
        tokenized_datasets['train'],
        batch_size=BATCH_SIZE,
        sampler=train_sampler,
        drop_last=True,
        num_workers=0,
    )
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        tokenized_datasets['validation'],
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )
    valid_data_loader = torch.utils.data.DataLoader(
        tokenized_datasets['validation'],
        batch_size=BATCH_SIZE,
        sampler=valid_sampler,
        drop_last=False,
        num_workers=0
    )

    device = xm.xla_device()  # device (single TPU core)
    model = model.to(device)  # put the model onto the TPU core
    xm.master_print('done loading model')
    xm.master_print(xm.xrt_world_size(), 'as size')

    lr = 0.5e-5 * xm.xrt_world_size()
    optimizer = AdamW(model.parameters(), lr=lr)  # define our optimizer

    for epoch in range(EPOCHS):
        gc.collect()
        # use ParallelLoader (provided by PyTorch XLA) for TPU-core-specific data loading:
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        xm.master_print('parallel loader created... training now')
        gc.collect()

        # call training loop
        train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=None)
        del para_loader

        model.eval()
        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        gc.collect()

        # call evaluation loop
        print("call evaluation loop")
        start_logits, end_logits = eval_loop_fn(para_loader.per_device_loader(device), model, device)
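One way to collect the per-core results is xm.mesh_reduce, which hands every replica the list of values contributed by all cores. A minimal sketch, assuming eval_loop_fn returns the local shard's start/end logits as numpy arrays of shape (n_local, seq_len) and compute_em_f1 is a hypothetical metric helper:

import numpy as np
import torch_xla.core.xla_model as xm

def gather_across_cores(tag, array):
    # mesh_reduce collects `array` from every core into a list and applies the reduce function;
    # here we simply stack the per-core shards (DistributedSampler pads them to equal length).
    return xm.mesh_reduce(tag, array, lambda shards: np.stack(shards, axis=0))

start_shards = gather_across_cores('start_logits', start_logits)  # (num_cores, n_local, seq_len)
end_shards = gather_across_cores('end_logits', end_logits)

# With DistributedSampler(shuffle=False), core r holds examples r, r + 8, r + 16, ...
# so interleaving the shards restores the original dataset order
# (trim any padded examples at the end if the split was uneven).
num_cores, n_local = start_shards.shape[0], start_shards.shape[1]
start_logits_full = start_shards.transpose(1, 0, 2).reshape(num_cores * n_local, -1)
end_logits_full = end_shards.transpose(1, 0, 2).reshape(num_cores * n_local, -1)

# compute EM / F1 once, on the master ordinal, to avoid duplicated work
if xm.is_master_ordinal():
    em, f1 = compute_em_f1(start_logits_full, end_logits_full)
    xm.master_print(f'EM={em:.3f}  F1={f1:.3f}')

If the logits are still XLA tensors, xm.all_gather(tensor, dim=0) is a tensor-based alternative, but the same reordering caveat applies.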

Related

Strange loss curve while training EfficientNetV2 with Pytorch

I'm new to PyTorch. My architecture connects a pre-trained EfficientNetV2 model to a single fully connected layer with one neuron, followed by a ReLU activation, for a regression task. However, both the training and validation losses suddenly increase after the first epoch, stay at roughly the same value for 50 epochs, and then suddenly drop back to about the same value as in the first epoch. Can anyone help me figure out what's happening?
Some code for the model and training process:
# hyper-parameters
image_size = 256
learning_rate = 1e-3
batch_size = 32
epochs = 60

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.net = models.efficientnet_v2_m(pretrained=True, weights='DEFAULT')
        self.net.classifier[1] = nn.Linear(in_features=1280, out_features=1, bias=True)
        self.net.classifier = nn.Sequential(self.net.classifier, nn.ReLU())

    def forward(self, input):
        output = self.net(input)
        return output

model = Model()

# Define the loss function (L1 / mean absolute error) and the Adam optimizer
loss_fn = nn.L1Loss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

# Function to evaluate the model on the validation set and return the mean absolute error
def testAccuracy():
    model.eval()
    loss = 0.0
    total = 0.0
    with torch.no_grad():
        for data in validation_loader:
            images, labels = data
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # print("The model test will be running on", device, "device")
            # get the inputs
            images = Variable(images.to(device))
            labels = Variable(labels.to(device))
            # run the model on the validation images
            outputs = model(images)
            # print('outputs: ', outputs)
            # print('labels: ', labels)
            loss += loss_fn(outputs, labels.unsqueeze(1)).item()
            total += 1
    # compute the mean absolute error over all validation images
    mae = loss / total
    return mae

# Training function. We simply have to loop over our data iterator and feed the inputs to the network and optimize.
def train(num_epochs):
    best_accuracy = 0.0
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()
    train_loss_all = []
    val_loss_all = []
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        total = 0
        for i, (images, labels) in tqdm(enumerate(train_loader, 0), total=len(train_loader)):
            # get the inputs
            images = Variable(images.to(device))
            labels = Variable(labels.to(device))
            # zero the parameter gradients
            optimizer.zero_grad()
            # predict outputs using images from the training set
            outputs = model(images)
            # compute the loss based on model output and real labels
            loss = loss_fn(outputs, labels.unsqueeze(1))
            # backpropagate the loss
            loss.backward()
            # adjust parameters based on the calculated gradients
            optimizer.step()
            # accumulate statistics for every batch
            running_loss += loss.item()  # extract the loss value
            total += 1
        train_loss = running_loss / total
        train_loss_all.append(train_loss)
        accuracy = testAccuracy()
        val_loss_all.append(accuracy)
        if accuracy > best_accuracy:
            saveModel()
            best_accuracy = accuracy
    history = {'train_loss': train_loss_all, 'val_loss': val_loss_all}
    return history
Loss curve: [loss curve figure]
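Given the flat loss curve, one thing worth checking is whether the trailing ReLU on the regression head is clamping every prediction to zero; in that case the gradient through the head vanishes and the L1 loss plateaus near the mean absolute value of the targets. A minimal diagnostic sketch, assuming the model and validation_loader defined above:

import torch

model.eval()
with torch.no_grad():
    images, labels = next(iter(validation_loader))
    device = next(model.parameters()).device
    outputs = model(images.to(device))
    # if the ReLU has clamped the head, every output is exactly 0
    print("fraction of zero outputs:", (outputs == 0).float().mean().item())
    print("mean |target|:", labels.float().abs().mean().item())

For unconstrained regression targets, the usual choice is a head without the trailing ReLU, e.g. model.net.classifier = torch.nn.Sequential(torch.nn.Dropout(p=0.3), torch.nn.Linear(1280, 1)).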

Looking for help on why GPU is not used when I train a Pytorch model

The machine I am using for training has 4 GPUs. I am "moving" the classifier, the loss function and the tensors to the GPU, but when I run nvidia-smi while training is ongoing, I see that GPU utilization is very low (3%) on one GPU and 0% on the others.
The questions I have are:
Is there an easier way to ask PyTorch to use the GPU, and as many GPUs as are available, without me having to call .to(device) all over the place?
Is there something other than .to(device) that is needed to use the GPU?
Is there a way to see whether training is happening on the CPU or the GPU, or is running nvidia-smi on the machine and looking at GPU utilization the only way?
How do I interpret a GPU utilization of 3% in nvidia-smi? Does it mean the CPU is being used in many places? If so, is there a way to debug what is making the training use the CPU?
Will setting num_workers in the DataLoader to the number of available cores be enough to use multiple GPUs? Is there a generic way to automatically learn the number of GPUs available?
Code used to train
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)
torch.cuda.manual_seed(1234)
torch.backends.cudnn.deterministic = True

start_time = time.time()
clf = MLP(len(X_training[0]), hidden_size=[100, 100, 100, 100, 100])

# Move to GPU if available
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

# Define the loss function and optimizer
optimizer = torch.optim.Adam(clf.parameters(), lr=8e-4)
clf = clf.to(device)
loss_function = nn.BCELoss()
loss_function = loss_function.to(device)

# Run the training loop
# per_epoch_precision = []
# per_epoch_recall = []
for epoch in range(0, 150):
    # Set current loss value
    current_loss = 0.0
    dataset = MyDataset(X_training, y_training, use_gpu)
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_gpu else {}
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=10000, shuffle=True, **kwargs)
    # Iterate over the DataLoader for training data
    clf.train()  # set to train mode
    for i, data in enumerate(trainloader):
        # Get inputs
        inputs, targets = data
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # Perform forward pass
        outputs = clf(inputs)
        # Compute loss
        targets = targets.float().unsqueeze(1)
        loss = loss_function(outputs, targets)
        # Perform backward pass
        loss.backward()
        # Perform optimization
        optimizer.step()
        # Print statistics
        current_loss += loss.item()
        if i % 20000 == 19999:
            print("Loss after mini-batch %5d: %.3f" % (i + 1, current_loss / 500))
            current_loss = 0.0

# Process is complete.
print("Training process has finished.")
print(f"Train time is {time.time() - start_time}")

class MyDataset(Dataset):
    def __init__(self, x, y, use_gpu=False):
        x = x.astype(np.float32)
        self.x_train = torch.from_numpy(x)
        self.y_train = torch.from_numpy(y.values)
        if use_gpu:
            device = torch.device("cuda")
            self.x_train.to(device)
            self.y_train.to(device)
        # self.y_train = torch.LongTensor(y.values, dtype=torch.int)

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx]

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, act_fn=nn.ReLU(), use_dropout=False, drop_rate=0.25):
        super(MLP, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.layers = nn.Sequential()
        if use_dropout:
            self.layers.append(nn.Dropout(p=drop_rate))
        self.layers.append(nn.Linear(self.input_size, self.hidden_size[0]))
        self.layers.append(act_fn)
        for i in range(1, len(hidden_size)):
            if use_dropout:
                self.layers.append(nn.Dropout(p=drop_rate))
            self.layers.append(nn.Linear(self.hidden_size[i - 1], self.hidden_size[i]))
            self.layers.append(act_fn)
        if use_dropout:
            self.layers.append(nn.Dropout(p=drop_rate))
        self.layers.append(nn.Linear(self.hidden_size[-1], 1))
        self.layers.append(nn.Sigmoid())

    def forward(self, x):
        return self.layers(x)
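A few quick checks usually answer the device questions above. A short sketch, reusing clf and trainloader from the code; note also that Tensor.to(device) is not in-place, so the two .to(device) calls inside MyDataset.__init__ create new tensors that are immediately discarded:

import torch
import torch.nn as nn

use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')
print("CUDA available:", use_gpu)
print("visible GPUs  :", torch.cuda.device_count())       # generic way to count available GPUs
print("model weights :", next(clf.parameters()).device)   # where the model actually lives
xb, yb = next(iter(trainloader))
print("batch tensors :", xb.device)                       # stays on the CPU until .to(device)

# simplest way to spread a forward/backward pass over all visible GPUs
# (DistributedDataParallel is the recommended, faster option for serious training)
clf = nn.DataParallel(clf).to(device)

num_workers only controls how many CPU worker processes the DataLoader uses to load batches; it does not spread computation across GPUs.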

Load batch to GPU problem in pytorch using BERT model

I have created an evaluation function. It takes as input the model and the validation data loader and returns the validation accuracy, validation loss and weighted F1 score.
def evaluate(model, val_dataloader):
    """
    After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into evaluation mode. The dropout layers are disabled during
    # test time.
    model.eval()
    # Tracking variables
    val_accuracy = []
    val_loss = []
    f1_weighted = []
    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())
        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)
        # Calculate the weighted F1 score
        f1_metric = F1Score('weighted')
        f1_weighted = f1_metric(preds, b_labels)
    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    f1_weighted = np.mean(f1_weighted)
    return val_loss, val_accuracy, f1_weighted
The code for the F1 score can be found here:
Measuring F1 score for multiclass classification natively in PyTorch
Before the evaluation function there is a function which trains a BERT model; it has the following signature:
train(model, train_dataloader, val_dataloader, epochs, evaluation)
Thus, if evaluation=True, the validation accuracy is reported at the end of each epoch.
The dataloaders are created in the following way:
# Convert other data types to torch.Tensor
train_labels = torch.tensor(authors_train)
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
In a similar way you can create the dataloaders for the validation and test sets.
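For instance, a sketch of the validation loader built the same way (val_inputs, val_masks and authors_val are assumed to exist, mirroring the training tensors; a SequentialSampler keeps the evaluation order fixed):

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import torch

val_labels = torch.tensor(authors_val)                       # hypothetical validation labels
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)                    # no shuffling during evaluation
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)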
Update:
I replaced the line
f1_weighted = f1_metric(preds, b_labels)
with
f1_weighted.append(f1_metric(preds, b_labels))
and now I get the following error:
AttributeError Traceback (most recent call last)
<ipython-input-49-0e0f6d227c4f> in <module>()
1 set_seed(42) # Set seed for reproducibility
2 bert_classifier, optimizer, scheduler = initialize_model(epochs=4)
----> 3 train(bert_classifier, train_dataloader, val_dataloader, epochs=4, evaluation=True)
4
5 #1. 77.28
3 frames
<__array_function__ internals> in mean(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
168 ret = arr.dtype.type(ret / rcount)
169 else:
--> 170 ret = ret.dtype.type(ret / rcount)
171 else:
172 ret = ret / rcount
AttributeError: 'torch.dtype' object has no attribute 'type'
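The AttributeError is raised because np.mean ends up operating on a list of torch tensors. One common workaround (a sketch, not the only option) is to keep the per-batch scores as plain Python floats via .item(), or to accumulate predictions and labels across batches and compute the weighted F1 once per epoch, for example with scikit-learn:

import numpy as np
import torch
from sklearn.metrics import f1_score

all_preds, all_labels = [], []
for batch in val_dataloader:
    b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        logits = model(b_input_ids, b_attn_mask)
    preds = torch.argmax(logits, dim=1).flatten()
    all_preds.append(preds.cpu().numpy())
    all_labels.append(b_labels.cpu().numpy())

f1_weighted = f1_score(np.concatenate(all_labels),
                       np.concatenate(all_preds),
                       average='weighted')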

Error(s) in loading state_dict for RobertaForSequenceClassification

I am using a fine-tuned RoBERTa model, unbiased-toxic-roberta, trained on the Jigsaw data:
https://huggingface.co/unitary/unbiased-toxic-roberta
It is fine-tuned on 16 classes.
I am writing my code for binary classification:
A metric function that computes accuracy on the binary labels:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = np.sum(predictions == labels) / predictions.shape[0]
    return {"accuracy": acc}

import torch.nn as nn

model = tr.RobertaForSequenceClassification.from_pretrained("/home/pc/unbiased_toxic_roberta", num_labels=2)
model.to(device)

training_args = tr.TrainingArguments(
    # report_to='wandb',
    output_dir='/home/pc/1_Proj_hate_speech/results_roberta',  # output directory
    overwrite_output_dir=True,
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    learning_rate=2e-5,
    warmup_steps=1000,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs3',            # directory for storing logs
    logging_steps=1000,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = tr.Trainer(
    model=model,                      # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_data,         # training dataset
    eval_dataset=val_data,            # evaluation dataset
    compute_metrics=compute_metrics
)
When I run this, I get an error:
loading weights file /home/pc/unbiased_toxic_roberta/pytorch_model.bin
RuntimeError: Error(s) in loading state_dict for RobertaForSequenceClassification:
size mismatch for classifier.out_proj.weight: copying a param with shape torch.Size([16, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).
size mismatch for classifier.out_proj.bias: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([2]).
How can I add a linear layer and solve this error?
Load the model with ignore_mismatched_sizes=True:
model = tr.RobertaForSequenceClassification.from_pretrained(
    "/home/pc/unbiased_toxic_roberta",
    num_labels=2,
    ignore_mismatched_sizes=True)
Then you can fine-tune the model.
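For reference, a quick way to confirm that the classification head was re-initialized with the new size (a short sketch, assuming the local path and tr alias from the question):

model = tr.RobertaForSequenceClassification.from_pretrained(
    "/home/pc/unbiased_toxic_roberta",
    num_labels=2,
    ignore_mismatched_sizes=True)
print(model.classifier.out_proj.weight.shape)  # expected: torch.Size([2, 768])
print(model.config.num_labels)                 # expected: 2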

Pytorch BCE loss not decreasing for word sense disambiguation task

I am performing word sense disambiguation and have created my own vocabulary of the top 300k most common English words. My model is very simple: each word in a sentence (its index value) is passed through an embedding layer, the resulting embeddings are averaged, and the averaged embedding is then sent through a linear layer, as shown in the model below.
class TestingClassifier(nn.Module):
    def __init__(self, vocabSize, features, embeddingDim):
        super(TestingClassifier, self).__init__()
        self.embeddings = nn.Embedding(vocabSize, embeddingDim)
        self.linear = nn.Linear(features, 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        avged = torch.mean(embeds, dim=-1)
        output = self.linear(avged)
        output = self.sigmoid(output)
        return output
I am using BCELoss as the loss function and SGD as the optimizer. My problem is that the loss barely decreases as training goes on, almost as if it converges at a very high value. I have tried different learning rates (0.0001, 0.001, 0.01 and 0.1) but I get the same issue.
My training function is as follows:
def train_model(model,
                optimizer,
                lossFunction,
                batchSize,
                epochs,
                isRnnModel,
                trainDataLoader,
                validDataLoader,
                earlyStop=False,
                maxPatience=1
                ):
    validationAcc = []
    patienceCounter = 0
    stopTraining = False
    model.train()

    # Train network
    for epoch in range(epochs):
        losses = []
        if(stopTraining):
            break
        for inputs, labels in tqdm(trainDataLoader, position=0, leave=True):
            optimizer.zero_grad()
            # Predict and calculate loss
            prediction = model(inputs)
            loss = lossFunction(prediction, labels)
            losses.append(loss)
            # Backward propagation
            loss.backward()
            # Readjust weights
            optimizer.step()
        print(sum(losses) / len(losses))
        curValidAcc = check_accuracy(validDataLoader, model, isRnnModel)  # Check accuracy on validation set
        curTrainAcc = check_accuracy(trainDataLoader, model, isRnnModel)
        print("Epoch", epoch + 1, "Training accuracy", curTrainAcc, "Validation accuracy:", curValidAcc)
        # Control early stopping
        if(earlyStop):
            if(patienceCounter == 0):
                if(len(validationAcc) > 0 and curValidAcc < validationAcc[-1]):
                    benchmark = validationAcc[-1]
                    patienceCounter += 1
                    print("Patience counter", patienceCounter)
            elif(patienceCounter == maxPatience):
                print("EARLY STOP. Patience level:", patienceCounter)
                stopTraining = True
            else:
                if(curValidAcc < benchmark):
                    patienceCounter += 1
                    print("Patience counter", patienceCounter)
                else:
                    benchmark = curValidAcc
                    patienceCounter = 0
        validationAcc.append(curValidAcc)
The batch size is 32 (the training set contains 8000 rows), the vocabulary size is 300k, and the embedding dimension is 24. I have tried adding more linear layers to the network, but it makes no difference. The prediction accuracy on the training and validation sets stays at around 50% (which is horrible) even after many epochs of training. Any help is much appreciated!
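For comparison, a common baseline for this kind of binary classifier (a hedged sketch, assuming inputs of shape (batch, seq_len) and one 0/1 label per sentence) averages the embeddings over the sequence dimension (dim=1) rather than the embedding dimension, and uses a single output unit with BCEWithLogitsLoss so no explicit Sigmoid is needed:

import torch
import torch.nn as nn

class MeanEmbeddingClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, 1)    # one logit for binary classification

    def forward(self, inputs):                       # inputs: (batch, seq_len) word indices
        embeds = self.embeddings(inputs)             # (batch, seq_len, embedding_dim)
        avged = embeds.mean(dim=1)                   # average over the sequence, not dim=-1
        return self.linear(avged).squeeze(-1)        # raw logits, shape (batch,)

model = MeanEmbeddingClassifier(vocab_size=300_000, embedding_dim=24)
loss_fn = nn.BCEWithLogitsLoss()                     # applies the sigmoid internally
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

inputs = torch.randint(0, 300_000, (32, 20))         # dummy batch: 32 sentences of 20 tokens
labels = torch.randint(0, 2, (32,)).float()
loss = loss_fn(model(inputs), labels)
loss.backward()
optimizer.step()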
