How to train GPT2 with Huggingface trainer

I am trying to fine-tune GPT2 with Huggingface's Trainer class.
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments
class torchDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        self.len = len(encodings)

    def __getitem__(self, index):
        item = {torch.tensor(val[index]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return self.len

    def print(self):
        print(self.encodings)
# HYPER PARAMETERS
EPOCHS = 5
BATCH_SIZE = 2
WARMUP_STEPS = 5000
LEARNING_RATE = 1e-3
DECAY = 0
# Model ids and loading dataset
model_id = 'gpt2' # small model
# model_id = 'gpt2-medium' # medium model
# model_id = 'gpt2-large' # large model
dataset = load_dataset('wikitext', 'wikitext-2-v1') # first dataset
# dataset = load_dataset('m-newhauser/senator-tweets') # second dataset
# dataset = load_dataset('IsaacRodgz/Fake-news-latam-omdena') # third dataset
print('Loaded dataset')
# Dividing dataset into predefined splits
train_dataset = dataset['train']['text']
validation_dataset = dataset['validation']['text']
test_dataset = dataset['test']['text']
print('Divided dataset')
# loading tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(
    model_id,
    # bos_token='<|startoftext|>', eos_token='<|endoftext|>',
    pad_token='<|pad|>'
)
print('tokenizer max length:', tokenizer.model_max_length)
train_encoding = tokenizer(train_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
eval_encoding = tokenizer(validation_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
test_encoding = tokenizer(test_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
print('Converted to torch dataset')
torch_dataset_train = torchDataset(train_encoding)
torch_dataset_eval = torchDataset(eval_encoding)
torch_dataset_test = torchDataset(test_encoding)
# Setup training hyperparameters
training_args = TrainingArguments(
    output_dir='/model_dump/',
    num_train_epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=DECAY
)
model = GPT2LMHeadModel.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoding,
    eval_dataset=eval_encoding
)
trainer.train()
# model.save_pretrained('/model_dump/')
But with this code I get this error:
The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,past_key_values,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,encoder_hidden_states,encoder_attention_mask,labels,use_cache,output_attentions,output_hidden_states,return_dict,labels,label,label_ids.
When I instead pass the variables torch_dataset_train and torch_dataset_eval to Trainer's arguments, the error I get is:
TypeError: vars() argument must have __dict__ attribute
This TypeError is the same one I get if I use WikiText2 from torchtext as the dataset.
How can I fix this issue?
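A likely culprit, sketched with hedges: __getitem__ builds a Python set (the comprehension is missing the key:), so the default data collator ends up calling vars() on a set, which has no __dict__; and the raw train_encoding/eval_encoding are passed to Trainer instead of the wrapped datasets, which produces the empty-batch warning. A minimal corrected dataset, assuming the usual causal-LM convention that the labels are the input ids (the model shifts them internally), would look like:

class torchDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, index):
        # dict comprehension (key: value), not a set comprehension
        item = {key: val[index] for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()  # Trainer needs labels to compute a loss
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=torch_dataset_train,  # the Dataset wrappers, not the raw encodings
    eval_dataset=torch_dataset_eval
)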

Related

How is the loss calculated in MLM transformers training, and what is the intuition behind it?

I am training an MLM model using the XLM-RoBERTa large model.
Here is the standard code.
import pandas as pd
import torch
import transformers as tr

tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large", local_files_only=True)
model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True, local_files_only=True)
df = pd.read_csv("training_data_multilingual.csv")
train_df = df.message_text.tolist()
train_df = list(set(train_df))
train_df = [x for x in train_df if str(x) != 'nan']
train_encodings = tokenizer(train_df, truncation=True, padding=True, max_length=512, return_tensors="pt")
class SEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["attention_mask"])

train_data = SEDataset(train_encodings)
# print("train data created")
training_args = tr.TrainingArguments(
    output_dir='results_mlm_vocab_exp',
    logging_dir='logs_mlm_vocab_exp',  # directory for storing logs
    save_strategy="epoch",
    learning_rate=2e-5,
    logging_steps=6000,
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    prediction_loss_only=True,
    gradient_accumulation_steps=4,
    bf16=True,  # Ampere GPU
    optim="adamw_hf"
)
# NOTE: data_collator is not defined in the snippet above; for MLM the usual
# choice is DataCollatorForLanguageModeling, which applies the random masking:
data_collator = tr.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
trainer = tr.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data
)
trainer.train()
I have a few questions related to this:
How is the loss calculated in MLM training? During training I see logs like {'loss': 1.6117, 'learning_rate': 1.751861042183623e-05, 'epoch': 2.48}. I guess this is the training loss? If so, how is it calculated?
How do I pass validation data inside TrainingArguments? Is it prepared the same way as the training data?
Is it meaningful to compute precision, recall, and F1 score for training and validation data in MLM training? If so, how can I achieve it using Trainer?
Any reading links would also be appreciated.
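For context, a hedged sketch of the usual setup: the {'loss': ...} printed during training is the training cross-entropy averaged over the masked positions only (the collator sets the labels of all unmasked tokens to -100, which the loss function ignores). Validation data goes to Trainer, not TrainingArguments, and is prepared the same way as the training data:

# assumption: val_data is a second SEDataset built from held-out texts
training_args = tr.TrainingArguments(
    output_dir='results_mlm_vocab_exp',
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    # ... other arguments as above ...
)
trainer = tr.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data
)

Precision, recall, and F1 are rarely reported for MLM, since it is a vocabulary-sized multi-class problem; perplexity on a held-out set (the exponential of the eval loss) is the more common metric.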

Cannot create the calibration cache for the QAT model in TensorRT

I've trained a quantized model (with the help of the quantization-aware-training method in PyTorch). I want to create the calibration cache to do inference in INT8 mode with TensorRT. When creating the calib cache, I get the following warning and the cache is not created:
[03/06/2022-08:14:07] [TRT] [W] Calibrator won't be used in explicit precision mode. Use quantization aware training to generate network with Quantize/Dequantize nodes.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
I've trained the model accordingly and converted it to ONNX:
import os
import sys
import argparse
import warnings
import collections
import torch
import torch.utils.data
from torch import nn
from tqdm import tqdm
import torchvision
from torchvision import transforms
from torch.hub import load_state_dict_from_url
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib
from pytorch_quantization.tensor_quant import QuantDescriptor
from pytorch_quantization import quant_modules
import onnxruntime
import numpy as np
import models
import kornia
from prettytable import PrettyTable
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
def get_parser():
    """
    Creates an argument parser.
    """
    parser = argparse.ArgumentParser(description='Classification quantization flow script')
    parser.add_argument('--data-dir', '-d', type=str, help='input data folder', required=True)
    parser.add_argument('--model-name', '-m', default='', help='model name: default resnet50')
    parser.add_argument('--disable-pcq', '-dpcq', action="store_true", help='disable per-channel quantization for weights')
    parser.add_argument('--out-dir', '-o', default='/tmp', help='output folder: default /tmp')
    parser.add_argument('--print-freq', '-pf', type=int, default=20, help='evaluation print frequency: default 20')
    parser.add_argument('--threshold', '-t', type=float, default=-1.0, help='top1 accuracy threshold (less than 0.0 means no comparison): default -1.0')
    parser.add_argument('--batch-size-train', type=int, default=8, help='batch size for training: default 128')
    parser.add_argument('--batch-size-test', type=int, default=8, help='batch size for testing: default 128')
    parser.add_argument('--batch-size-onnx', type=int, default=20, help='batch size for onnx: default 1')
    parser.add_argument('--seed', type=int, default=12345, help='random seed: default 12345')
    checkpoint = parser.add_mutually_exclusive_group(required=True)
    checkpoint.add_argument('--ckpt-path', default='', type=str, required=False,
                            help='path to latest checkpoint (default: none)')
    checkpoint.add_argument('--ckpt-url', default='', type=str, required=False,
                            help='url to latest checkpoint (default: none)')
    checkpoint.add_argument('--pretrained', action="store_true")
    parser.add_argument('--num-calib-batch', default=8, type=int,
                        help='Number of batches for calibration. 0 will disable calibration. (default: 4)')
    parser.add_argument('--num-finetune-epochs', default=0, type=int,
                        help='Number of epochs to fine tune. 0 will disable fine tune. (default: 0)')
    parser.add_argument('--calibrator', type=str, choices=["max", "histogram"], default="max")
    parser.add_argument('--percentile', nargs='+', type=float, default=[99.9, 99.99, 99.999, 99.9999])
    parser.add_argument('--sensitivity', action="store_true", help="Build sensitivity profile")
    parser.add_argument('--evaluate-onnx', action="store_true", help="Evaluate exported ONNX")
    return parser
def prepare_model(
        model_name,
        num_class,
        data_dir,
        per_channel_quantization,
        batch_size_train,
        batch_size_test,
        batch_size_onnx,
        calibrator,
        pretrained,
        ckpt_path,
        ckpt_url=None):
    ## Initialize quantization, model and data loaders
    if per_channel_quantization:
        print('<<<<<<< Per channel quant >>>>>>>>')
        quant_desc_input = QuantDescriptor(calib_method=calibrator)
        quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
        quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)
    else:
        ## Force per tensor quantization for onnx runtime
        print('<<<<<<< Per tensor quant >>>>>>>>')
        quant_desc_input = QuantDescriptor(calib_method=calibrator, axis=None)
        quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
        quant_nn.QuantConvTranspose2d.set_default_quant_desc_input(quant_desc_input)
        quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)
        quant_desc_weight = QuantDescriptor(calib_method=calibrator, axis=None)
        quant_nn.QuantConv2d.set_default_quant_desc_weight(quant_desc_weight)
        quant_nn.QuantConvTranspose2d.set_default_quant_desc_weight(quant_desc_weight)
        quant_nn.QuantLinear.set_default_quant_desc_weight(quant_desc_weight)
    if model_name in models.__dict__:
        model = models.__dict__[model_name](pretrained=pretrained, quantize=True)
        num_feats = model.fc.in_features
        model.fc = nn.Linear(num_feats, num_class)
    else:
        print('Model is not available....downloading....')
        quant_modules.initialize()
        model = torchvision.models.__dict__[model_name](pretrained=pretrained)
        if 'resnet' in model_name:
            num_feats = model.fc.in_features
            model.fc = nn.Linear(num_feats, num_class)
        if 'densenet' in model_name:
            num_feats = model.classifier.in_features
            model.classifier = nn.Linear(num_feats, num_class)
        quant_modules.deactivate()
    if not pretrained:
        if ckpt_path:
            model = torch.load(ckpt_path)
        else:
            model = load_state_dict_from_url(ckpt_url)
            if 'state_dict' in model.keys():
                model = model['state_dict']
            elif 'model' in model.keys():
                model = model['model']
        # model.load_state_dict(checkpoint)
    model.eval()
    model.cuda()
    print(model)
    ## Prepare the data loaders
    traindir = os.path.join(data_dir, 'train')
    valdir = os.path.join(data_dir, 'test')
    _args = collections.namedtuple("mock_args", ["model", "distributed", "cache_dataset"])
    dataset, dataset_test, train_sampler, test_sampler = load_data(
        traindir, valdir, _args(model=model_name, distributed=False, cache_dataset=False))
    data_loader_train = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size_train,
        sampler=train_sampler, num_workers=4, pin_memory=True)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=batch_size_test,
        sampler=test_sampler, num_workers=4, pin_memory=True)
    data_loader_onnx = torch.utils.data.DataLoader(
        dataset_test, batch_size=batch_size_onnx,
        sampler=test_sampler, num_workers=4, pin_memory=True)
    return model, data_loader_train, data_loader_test, data_loader_onnx
def main(cmdline_args):
    parser = get_parser()
    args = parser.parse_args(cmdline_args)
    print(parser.description)
    print(args)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    args.model_name = 'resnet34'
    args.data_dir = '/home/Dataset/'
    args.disable_pcq = True  # set True to disable per-channel quantization
    args.batch_size_train = 8
    args.batch_size_test = 8
    args.batch_size_onnx = 8
    args.calibrator = 'max'
    args.pretrained = True
    args.ckpt_path = ''
    args.ckpt_url = ''
    args.num_class = 5
    ## Prepare the pretrained model and data loaders
    model, data_loader_train, data_loader_test, data_loader_onnx = prepare_model(
        args.model_name,
        args.num_class,
        args.data_dir,
        not args.disable_pcq,
        args.batch_size_train,
        args.batch_size_test,
        args.batch_size_onnx,
        args.calibrator,
        args.pretrained,
        args.ckpt_path,
        args.ckpt_url)
    kwargs = {"alpha": 0.75, "gamma": 2.0, "reduction": 'mean'}
    criterion = kornia.losses.FocalLoss(**kwargs)
    ## Calibrate the model
    with torch.no_grad():
        calibrate_model(
            model=model,
            model_name=args.model_name,
            data_loader=data_loader_train,
            num_calib_batch=args.num_calib_batch,
            calibrator=args.calibrator,
            hist_percentile=args.percentile,
            out_dir=args.out_dir)
    ## Build sensitivity profile
    if args.sensitivity:
        build_sensitivity_profile(model, criterion, data_loader_test)
    kwargs = {"alpha": 0.75, "gamma": 3.0, "reduction": 'mean'}
    criterion = kornia.losses.FocalLoss(**kwargs)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_finetune_epochs)
    for epoch in range(args.num_finetune_epochs):
        # Training a single epoch
        train_one_epoch(model, criterion, optimizer, data_loader_train, "cuda", epoch, 100)
        lr_scheduler.step()
    if args.num_finetune_epochs > 0:
        ## Evaluate after finetuning
        with torch.no_grad():
            print('Finetune evaluation:')
            top1_finetuned = evaluate(model, criterion, data_loader_test, device="cuda")
    else:
        top1_finetuned = -1.0
    ## Export to ONNX
    onnx_filename = args.out_dir + '/' + args.model_name + ".onnx"
    top1_onnx = -1.0
    if export_onnx(model, onnx_filename, args.batch_size_onnx, not args.disable_pcq) and args.evaluate_onnx:
        ## Validate ONNX and evaluate
        top1_onnx = evaluate_onnx(onnx_filename, data_loader_onnx, criterion, args.print_freq)
    ## Print summary
    print("Accuracy summary:")
    table = PrettyTable(['Stage', 'Top1'])
    table.align['Stage'] = "l"
    table.add_row(['Finetuned', "{:.2f}".format(top1_finetuned)])
    table.add_row(['ONNX', "{:.2f}".format(top1_onnx)])
    print(table)
    ## Compare results
    if args.threshold >= 0.0:
        if args.evaluate_onnx and top1_onnx < 0.0:
            print("Failed to export/evaluate ONNX!")
            return 1
        if args.num_finetune_epochs > 0:
            if top1_finetuned >= (top1_onnx - args.threshold):
                print("Accuracy threshold was met!")
            else:
                print("Accuracy threshold was missed!")
                return 1
    return 0
def evaluate_onnx(onnx_filename, data_loader, criterion, print_freq):
    print("Loading ONNX file: ", onnx_filename)
    ort_session = onnxruntime.InferenceSession(onnx_filename)
    with torch.no_grad():
        metric_logger = utils.MetricLogger(delimiter=" ")
        header = 'Test:'
        with torch.no_grad():
            for image, target in metric_logger.log_every(data_loader, print_freq, header):
                image = image.to("cpu", non_blocking=True)
                image_data = np.array(image)
                input_data = image_data
                # run the data through onnx runtime instead of torch model
                input_name = ort_session.get_inputs()[0].name
                raw_result = ort_session.run([], {input_name: input_data})
                output = torch.tensor((raw_result[0]))
                loss = criterion(output, target)
                acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
                batch_size = image.shape[0]
                metric_logger.update(loss=loss.item())
                metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
                metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print(' ONNXRuntime: Acc#1 {top1.global_avg:.3f} Acc#5 {top5.global_avg:.3f}'
          .format(top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg
def export_onnx(model, onnx_filename, batch_onnx, per_channel_quantization):
    model.eval()
    quant_nn.TensorQuantizer.use_fb_fake_quant = True  # We have to shift to pytorch's fake quant ops before exporting the model to ONNX
    if per_channel_quantization:
        opset_version = 13
    else:
        opset_version = 12
    # Export ONNX for multiple batch sizes
    print("Creating ONNX file: " + onnx_filename)
    dummy_input = torch.randn(batch_onnx, 3, 224, 224, device='cuda')  # TODO: switch input dims by model
    input_names = ['input']
    if 'resnet' in onnx_filename:
        print('Changing last layer of resnet...')
        output_names = ['Linear[fc]']  ### ResNet34
    if 'densenet' in onnx_filename:
        print('Changing last layer of densenet...')
        output_names = ['Linear[classifier]']  #### DenseNet
    dynamic_axes = {'input': {0: 'batch_size'}}
    try:
        torch.onnx.export(model, dummy_input, onnx_filename, input_names=input_names,
                          export_params=True, output_names=output_names, opset_version=opset_version,
                          dynamic_axes=dynamic_axes, verbose=True, enable_onnx_checker=False, do_constant_folding=True)
    except ValueError:
        warnings.warn(UserWarning("Per-channel quantization is not yet supported in Pytorch/ONNX RT (requires ONNX opset 13)"))
        print("Failed to export to ONNX")
        return False
    return True
def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator, hist_percentile, out_dir):
    if num_calib_batch > 0:
        print("Calibrating model")
        with torch.no_grad():
            collect_stats(model, data_loader, num_calib_batch)
        if not calibrator == "histogram":
            compute_amax(model, method="max")
            calib_output = os.path.join(
                out_dir,
                F"{model_name}-max-{num_calib_batch*data_loader.batch_size}.pth")
            # torch.save(model.state_dict(), calib_output)  # Just weights
            torch.save(model, calib_output)  # whole model
        else:
            for percentile in hist_percentile:
                print(F"{percentile} percentile calibration")
                compute_amax(model, method="percentile")
                calib_output = os.path.join(
                    out_dir,
                    F"{model_name}-percentile-{percentile}-{num_calib_batch*data_loader.batch_size}.pth")
                torch.save(model, calib_output)  # whole model
            for method in ["mse", "entropy"]:
                print(F"{method} calibration")
                compute_amax(model, method=method)
                calib_output = os.path.join(
                    out_dir,
                    F"{model_name}-{method}-{num_calib_batch*data_loader.batch_size}.pth")
                # torch.save(model.state_dict(), calib_output)
                torch.save(model, calib_output)
def collect_stats(model, data_loader, num_batches):
    # Enable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()
    # Feed data to the network for collecting stats
    for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
        model(image.cuda())
        if i >= num_batches:
            break
    # Disable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()
def compute_amax(model, **kwargs):
    # Load calib result
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    module.load_calib_amax(**kwargs)
                print(F"{name:40}: {module}")
    model.cuda()
def build_sensitivity_profile(model, criterion, data_loader_test):
    quant_layer_names = []
    for name, module in model.named_modules():
        if name.endswith("_quantizer"):
            module.disable()
            layer_name = name.replace("._input_quantizer", "").replace("._weight_quantizer", "")
            if layer_name not in quant_layer_names:
                quant_layer_names.append(layer_name)
    for i, quant_layer in enumerate(quant_layer_names):
        print("Enable", quant_layer)
        for name, module in model.named_modules():
            if name.endswith("_quantizer") and quant_layer in name:
                module.enable()
                print(F"{name:40}: {module}")
        with torch.no_grad():
            evaluate(model, criterion, data_loader_test, device="cuda")
        for name, module in model.named_modules():
            if name.endswith("_quantizer") and quant_layer in name:
                module.disable()
                print(F"{name:40}: {module}")

if __name__ == '__main__':
    res = main(sys.argv[1:])
    exit(res)
More info regarding system:
TensorRT == 8.2
Pytorch == 1.9.0+cu111
Torchvision == 0.10.0+cu111
ONNX == 1.9.0
ONNXRuntime == 1.8.1
pycuda == 2021
If the ONNX model has Q/DQ nodes in it, you may not need a calibration cache, because quantization parameters such as scale and zero point are already included in the Q/DQ nodes. You can run the Q/DQ ONNX model directly with the TensorRT execution provider in ONNX Runtime (>= v1.9.0).
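A minimal sketch of that route, assuming an ONNX Runtime build with the TensorRT execution provider; the file name is hypothetical and the provider options are assumptions that vary by version:

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(
    "resnet34.onnx",  # hypothetical path to the exported Q/DQ model
    providers=[
        ("TensorrtExecutionProvider", {"trt_int8_enable": True}),
        "CUDAExecutionProvider",  # fallback for nodes TensorRT cannot take
    ],
)
input_data = np.random.randn(8, 3, 224, 224).astype(np.float32)  # dummy batch
input_name = session.get_inputs()[0].name
outputs = session.run(None, {input_name: input_data})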

Do Hugging Face defaults allow logging MLflow artifacts and naming every MLflow run?

I am training a simple binary classification model using a Hugging Face BERT model in PyTorch.
Here is the code:
import numpy as np
from sklearn import metrics
import transformers as tr
from transformers import TFAutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = np.sum(predictions == labels) / predictions.shape[0]
    return {"accuracy": acc,
            'precision': metrics.precision_score(labels, predictions),
            'recall': metrics.recall_score(labels, predictions),
            'f1': metrics.f1_score(labels, predictions)}
training_args = tr.TrainingArguments(
    # report_to='wandb',
    output_dir='/home/pc/proj/Exp2_conv_stampy_data/results_exp0',  # output directory
    overwrite_output_dir=True,
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    learning_rate=2e-5,
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs_exp0',       # directory for storing logs
    logging_steps=137,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    run_name="final_model0"
)
# counter = 0
# results_lst = []
from transformers import TrainerCallback
from copy import deepcopy
model = tr.XLMRobertaForSequenceClassification.from_pretrained("/home/pc/multilingual_toxic_xlm_roberta",problem_type="single_label_classification", num_labels=2,ignore_mismatched_sizes=True, id2label={0: 'negative', 1: 'positive'})
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
train_data = SEDataset(train_encodings, train_labels)
val_data = SEDataset(val_encodings, val_labels)
model.to(device)
class CustomCallback(TrainerCallback):
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy
trainer = tr.Trainer(
    model=model,                     # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_data,        # training dataset
    eval_dataset=val_data,           # evaluation dataset
    compute_metrics=compute_metrics  # the callback that computes metrics of interest
)
trainer.add_callback(CustomCallback(trainer))
train = trainer.train()
trainer.save_model("/home/pc/proj/Exp2_conv_stampy_data/result_toxic_model_exp0")
I see that by default an mlruns directory is created.
What is "0", and what are the two folders inside "0"?
How can I rename them to something useful and understandable?
If I do multiple runs, how can I log each run of the model with a name like run1, run2 under the same experiment?
Also, I see the artifact folder is empty; how do I log the final model?
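A sketch of the relevant knobs, assuming a transformers version with the built-in MLflowCallback: mlruns/0 is MLflow's default experiment (experiment id 0), and each folder inside it is one run id. The experiment name, run name, and artifact logging can be set like this:

import os
import mlflow

os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "TRUE"  # ask the callback to log the saved model as an artifact
mlflow.set_experiment("stampy-classification")  # hypothetical name; replaces the default experiment "0"

training_args = tr.TrainingArguments(
    output_dir='results_exp0',
    run_name="run1",  # vary this per run (run2, run3, ...) to tell runs apart
    # ... other arguments as above ...
)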

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

I'm trying to implement sentiment analysis (positive or negative labels) using BERT, and I want to add a BiLSTM layer to see if I can increase the accuracy of the pretrained model from HuggingFace. I have the code below and a few questions:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
import re
import torch.nn as nn
device = 'cuda' if cuda.is_available() else 'cpu'
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05 #5e-5, 3e-5 or 2e-5
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.review
        self.targets = self.data.sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())
        inputs = self.tokenizer.encode_plus(comment_text, None, add_special_tokens=True, max_length=self.max_len,
                                            pad_to_max_length=True, return_token_type_ids=True)
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased', return_dict=False, num_labels=2)
        self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(256 * 2, 2)

    def forward(self, ids, mask, token_type_ids):
        sequence_output, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        lstm_output, (h, c) = self.lstm(sequence_output)  ## extract the 1st token's embeddings
        hidden = torch.cat((lstm_output[:, -1, :256], lstm_output[:, 0, 256:]), dim=-1)
        linear_output = self.linear(lstm_output[:, -1].view(-1, 256 * 2))
        return linear_output
model = BERTClass()
model.to(device)
print(model)
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    model.train()
    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)
        outputs = model(ids, mask, token_type_ids)
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

for epoch in range(EPOCHS):
    train(epoch)
With the above code I ran into the error: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2])). I checked online and tried targets = targets.unsqueeze(2), but then I get another error saying the unsqueeze dimension must be in [-2, 1]. I also tried modifying the loss function to
def loss_fn(outputs, targets):
    return torch.nn.BCELoss()(outputs, targets)
but I still receive the same error. Can someone advise if there is a solution to this problem, or what I can do to make this work? Many thanks in advance.
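A sketch of two common fixes, assuming the labels are integer class ids (0/1): either keep the two-logit head and switch to CrossEntropyLoss with long targets, or shrink the head to a single logit and keep BCEWithLogitsLoss:

import torch

outputs = torch.randn(8, 2)  # stand-in for the model's [batch, 2] logits
targets = torch.tensor([0, 1, 1, 0, 1, 0, 0, 1])

# option 1: two logits per sample -> CrossEntropyLoss with dtype=torch.long targets
loss_ce = torch.nn.CrossEntropyLoss()(outputs, targets)

# option 2: one logit per sample (nn.Linear(256 * 2, 1) in __init__) -> BCEWithLogitsLoss
single_logit = torch.randn(8, 1)  # stand-in for a [batch, 1] head
loss_bce = torch.nn.BCEWithLogitsLoss()(single_logit.squeeze(-1), targets.float())
print(loss_ce.item(), loss_bce.item())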

How to prepare the training dataset for a trainer of BERT in pytorch?

The task is to detect whether a blood label is present in a text sequence using BERT for sequence classification pre-trained model.
class BloodDataset(Dataset):
    """MIMIC Blood dataset."""

    def __init__(self, arff_file):
        """
        Args:
            arff_file (string): Path to the arff file with annotations.
        """
        self.indices, self.contents, self.labels = read_arff(arff_file)
        self.labels = torch.as_tensor(self.labels)
        self.inputs = encode(self.contents)
        self.input_ids = (self.inputs['input_ids'])
        self.attention_mask = (self.inputs['attention_mask'])

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        if idx in self.indices:
            sample_index = self.indices.index(idx)
            sample = {'index': idx,
                      'content': self.contents[sample_index],
                      'label': self.labels[sample_index],
                      'input_ids': self.input_ids[sample_index],
                      'attention_mask': self.attention_mask[sample_index]}
            return sample
        else:
            return "Sample not found!"
The tutorial from Huggingface proposes a Trainer solution:
model = BertForSequenceClassification.from_pretrained(model_type)
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    logging_dir='./logs',    # directory for storing logs
)
trainer = Trainer(
    # the instantiated 🤗 Transformers model to be trained
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset     # evaluation dataset
)
return trainer
https://huggingface.co/transformers/training.html#trainer
train_dataset = preprocess.BloodDataset("test_blood.arff")
trainer = train.run(train_dataset, train_dataset)
trainer.train()
yields the error:
/.local/lib/python3.6/site-packages/transformers/data/data_collator.py", line 38, in <listcomp>
features = [vars(f) for f in features]
TypeError: vars() argument must have __dict__ attribute
What's a proper dataset input for the trainer?
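A minimal sketch of a Trainer-friendly dataset, assuming positional indexing: every sample must be a plain dict of tensors (the default data collator calls vars() on anything else, which is exactly where the error comes from), non-tensor fields such as 'content' should be dropped, the label key the model expects is 'labels', and __getitem__ should never return a string sentinel:

import torch
from torch.utils.data import Dataset

class BloodTrainerDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings            # dict with 'input_ids' and 'attention_mask' tensors
        self.labels = torch.as_tensor(labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]     # BertForSequenceClassification expects 'labels'
        return item

    def __len__(self):
        return len(self.labels)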