I am training a model using the Hugging Face Trainer class (GPT-2 text classification). The following code does a decent job:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAXLEN, padding=True)

dataset_train = Dataset.from_pandas(train_sp, preserve_index=False)
dataset_val = Dataset.from_pandas(val_sp, preserve_index=False)

dataset_train = dataset_train.map(preprocess_function, batched=True, load_from_cache_file=False)
dataset_val = dataset_val.map(preprocess_function, batched=True, load_from_cache_file=False)

columns_to_return = ['input_ids', 'label', 'attention_mask']
dataset_train.set_format(type='torch', columns=columns_to_return)
dataset_val.set_format(type='torch', columns=columns_to_return)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir="/content/Model1",     # the output directory
    overwrite_output_dir=True,        # overwrite the content of the output directory
    num_train_epochs=3,               # number of training epochs
    per_device_train_batch_size=16,   # batch size for training
    per_device_eval_batch_size=8,     # batch size for evaluation
    eval_steps=400,                   # number of update steps between two evaluations
    save_steps=800,                   # the model is saved after this many steps
    warmup_steps=500,                 # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True,
    # remove_unused_columns=True
)
#---------------------------------------------------#
trainer = Trainer(
    model=model1,
    args=training_args,
    # data_collator=gpt2_classificaiton_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
I got this error:
_forward_unimplemented() got an unexpected keyword argument 'input_ids'
What should I do?
[Screenshots in the original post: the input_ids and label columns, the error message, and my model architecture]
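In PyTorch, _forward_unimplemented is the stub nn.Module falls back to when forward() has not been defined, so this error usually means model1 does not actually implement a forward method (for example, a custom nn.Module whose forward was misnamed, or a class passed in instead of an instance). A minimal sketch, assuming the goal is plain GPT-2 sequence classification (the checkpoint name and num_labels=2 below are placeholders), of a model whose forward accepts the tokenized columns the Trainer sends:

from transformers import GPT2ForSequenceClassification, GPT2TokenizerFast

MODEL_NAME = "gpt2"  # placeholder checkpoint; substitute your own

tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

# an instantiated model whose forward() accepts input_ids, attention_mask and labels
model1 = GPT2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model1.config.pad_token_id = tokenizer.pad_token_id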
I want to fine-tune the BLIP model on the ROCO dataset for image captioning of chest X-ray images, but I am getting an error about integer indexing.
Can anyone please help me understand the cause of the error and how to rectify it?
This is the code:
def read_data(filepath, csv_path, n_samples):
    df = pd.read_csv(csv_path)
    images = []
    capts = []
    for idx in range(len(df)):
        if 'hest x-ray' in df['caption'][idx] or 'hest X-ray' in df['caption'][idx]:
            if len(images) > n_samples:
                break
            else:
                images.append(Image.open(os.path.join(filepath, df['name'][idx])).convert('L'))
                capts.append(df['caption'][idx])
    return images, capts
def get_data():
    imgtrpath = 'all_data/train/radiology/images'
    trcsvpath = 'all_data/train/radiology/traindata.csv'
    imgtspath = 'all_data/test/radiology/images'
    tscsvpath = 'all_data/test/radiology/testdata.csv'
    imgvalpath = 'all_data/validation/radiology/images'
    valcsvpath = 'all_data/validation/radiology/valdata.csv'

    print('Extracting Training Data')
    trainimgs, traincapts = read_data(imgtrpath, trcsvpath, 1800)
    print('Extracting Testing Data')
    testimgs, testcapts = read_data(imgtrpath, trcsvpath, 100)
    print('Extracting Validation Data')
    valimgs, valcapts = read_data(imgtrpath, trcsvpath, 100)
    return trainimgs, traincapts, testimgs, testcapts, valimgs, valcapts
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
trainimgs, traincapts, testimgs, testcapts, valimgs, valcapts = get_data()

model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
metric = evaluate.load("accuracy")

traindata = processor(text=traincapts, images=trainimgs, return_tensors="pt", padding=True, truncation=True)
evaldata = processor(text=testcapts, images=testimgs, return_tensors="pt", padding=True, truncation=True)

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=traindata,
    eval_dataset=evaldata,
    compute_metrics=compute_metrics,
)

trainer.train()
The code is meant to fine-tune the BLIP model on the ROCO dataset's chest X-ray images for image captioning.
But when I run it, I get this error:
File "C:\Users\omair\anaconda3\envs\torch\lib\site-packages\transformers\feature_extraction_utils.py", line 86, in __getitem__
raise KeyError("Indexing with integers is not available when using Python based feature extractors")
KeyError: 'Indexing with integers is not available when using Python based feature extractors'
There are two issues here:
1. You're not providing the labels during training; your ...capts are passed as the model's "Question". There is an example of how to do that in the link below.
2. Fine-tuning Hugging Face's BlipForConditionalGeneration is not supported at the moment; see https://discuss.huggingface.co/t/finetune-blip-on-customer-dataset-20893/28446, where they just fixed BlipForQuestionAnswering. If you create a dataset based on that link, you will also get the error ValueError: Expected input batch_size (0) to match target batch_size (511), which can be solved if you put in the effort to reproduce the changes made on BlipForQuestionAnswering for BlipForConditionalGeneration.
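On the first point, here is a rough, unofficial sketch (assuming the images, capts, and processor variables from the question) of a captioning dataset that returns one dict per example and supplies the tokenized caption ids as labels. Wrapping the examples in a torch Dataset like this also avoids the integer-indexing KeyError, because Trainer indexes the dataset one example at a time:

import torch
from torch.utils.data import Dataset

class CaptioningDataset(Dataset):  # hypothetical helper, not part of transformers
    def __init__(self, images, captions, processor, max_length=128):
        self.images = images
        self.captions = captions
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        encoding = self.processor(
            images=self.images[idx],
            text=self.captions[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # drop the batch dimension the processor adds for a single example
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        # for captioning, the target caption ids double as the labels
        encoding["labels"] = encoding["input_ids"].clone()
        return encoding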
I am trying to fine-tune GPT-2 with Hugging Face's Trainer class.
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments
class torchDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        self.len = len(encodings)

    def __getitem__(self, index):
        item = {torch.tensor(val[index]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return self.len

    def print(self):
        print(self.encodings)
# HYPER PARAMETERS
EPOCHS = 5
BATCH_SIZE = 2
WARMUP_STEPS = 5000
LEARNING_RATE = 1e-3
DECAY = 0
# Model ids and loading dataset
model_id = 'gpt2' # small model
# model_id = 'gpt2-medium' # medium model
# model_id = 'gpt2-large' # large model
dataset = load_dataset('wikitext', 'wikitext-2-v1') # first dataset
# dataset = load_dataset('m-newhauser/senator-tweets') # second dataset
# dataset = load_dataset('IsaacRodgz/Fake-news-latam-omdena') # third dataset
print('Loaded dataset')
# Dividing dataset into predefined splits
train_dataset = dataset['train']['text']
validation_dataset = dataset['validation']['text']
test_dataset = dataset['test']['text']
print('Divided dataset')
# loading tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(
    model_id,
    # bos_token='<|startoftext|>', eos_token='<|endoftext|>',
    pad_token='<|pad|>'
)
print('tokenizer max length:', tokenizer.model_max_length)
train_encoding = tokenizer(train_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
eval_encoding = tokenizer(validation_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
test_encoding = tokenizer(test_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
print('Converted to torch dataset')
torch_dataset_train = torchDataset(train_encoding)
torch_dataset_eval = torchDataset(eval_encoding)
torch_dataset_test = torchDataset(test_encoding)
# Setup training hyperparameters
training_args = TrainingArguments(
    output_dir='/model_dump/',
    num_train_epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=DECAY
)
model = GPT2LMHeadModel.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoding,
    eval_dataset=eval_encoding
)
trainer.train()
# model.save_pretrained('/model_dump/')
But with this code I get this error
The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,past_key_values,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,encoder_hidden_states,encoder_attention_mask,labels,use_cache,output_attentions,output_hidden_states,return_dict,labels,label,label_ids.
When I use the variables torch_dataset_train and torch_dataset_eval in Trainer's arguments, the error I get is:
TypeError: vars() argument must have __dict__ attribute
This TypeError is the same one I get if I use WikiText2 from torchtext as the dataset.
How can I fix this issue?
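Both errors point at the same thing: Trainer needs a dataset whose __getitem__ returns a dict of tensors per example, but here train_encoding is a raw BatchEncoding, and torchDataset.__getitem__ builds a set because the comprehension drops the key:. A minimal sketch (assuming the train_encoding/eval_encoding from the question, and that causal-LM fine-tuning is the goal so the labels can simply mirror input_ids) of a dataset that would satisfy the default collator:

import torch
from torch.utils.data import Dataset

class LMDataset(Dataset):  # hypothetical name
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # number of examples, not number of keys in the encoding
        return self.encodings["input_ids"].shape[0]

    def __getitem__(self, index):
        # return a dict of key -> tensor so Trainer's default collator can batch it
        item = {key: val[index] for key, val in self.encodings.items()}
        # for GPT2LMHeadModel, labels are the input ids (the loss shift happens inside the model)
        item["labels"] = item["input_ids"].clone()
        return item

# torch_dataset_train = LMDataset(train_encoding)
# torch_dataset_eval = LMDataset(eval_encoding)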
I am training a simple binary classification model using Hugging Face models with PyTorch (BERT, PyTorch, Hugging Face).
Here is the code:
import numpy as np
from sklearn import metrics

import transformers as tr
from transformers import TFAutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = np.sum(predictions == labels) / predictions.shape[0]
    return {"accuracy": acc,
            "precision": metrics.precision_score(labels, predictions),
            "recall": metrics.recall_score(labels, predictions),
            "f1": metrics.f1_score(labels, predictions)}
training_args = tr.TrainingArguments(
    # report_to='wandb',
    output_dir='/home/pc/proj/Exp2_conv_stampy_data/results_exp0',  # output directory
    overwrite_output_dir=True,
    num_train_epochs=2,               # total number of training epochs
    per_device_train_batch_size=32,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    learning_rate=2e-5,
    warmup_steps=200,                 # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs_exp0',        # directory for storing logs
    logging_steps=137,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    run_name="final_model0",
)
# counter = 0
# results_lst = []
from transformers import TrainerCallback
from copy import deepcopy
model = tr.XLMRobertaForSequenceClassification.from_pretrained(
    "/home/pc/multilingual_toxic_xlm_roberta",
    problem_type="single_label_classification",
    num_labels=2,
    ignore_mismatched_sizes=True,
    id2label={0: 'negative', 1: 'positive'},
)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
train_data = SEDataset(train_encodings, train_labels)
val_data = SEDataset(val_encodings, val_labels)
model.to(device)
class CustomCallback(TrainerCallback):
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy
trainer = tr.Trainer(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_data,         # training dataset
    eval_dataset=val_data,            # evaluation dataset
    compute_metrics=compute_metrics,  # the callback that computes metrics of interest
)
trainer.add_callback(CustomCallback(trainer))

train = trainer.train()
trainer.save_model("/home/pc/proj/Exp2_conv_stampy_data/result_toxic_model_exp0")
I see that by default an mlruns directory is created.
What is the 0 folder, and what are the two folders inside 0?
How can I rename them to something useful and understandable?
If I run multiple runs, how can I log every run of the model as something like run1, run2 under the same experiment?
Also, I see the artifacts folder is empty; how do I log the final model?
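For reference, in MLflow's local file store, 0 is the ID of the default experiment and the folders inside it are individual runs keyed by their run IDs. A minimal sketch using the plain mlflow API (the experiment name, run name, and path below are placeholders, and this manages the run by hand rather than configuring Trainer's built-in MLflow callback, which may start a run of its own):

import mlflow

# give runs a readable experiment name instead of landing in the default "0" experiment
mlflow.set_experiment("toxic-classification-exp")   # placeholder experiment name

with mlflow.start_run(run_name="run1"):              # placeholder run name
    result = trainer.train()
    trainer.save_model("result_toxic_model_exp0")    # placeholder local directory
    # upload the saved model directory into this run's artifact store
    mlflow.log_artifacts("result_toxic_model_exp0", artifact_path="model")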
I am using a fine-tuned RoBERTa model, unbiased-toxic-roberta, trained on the Jigsaw data:
https://huggingface.co/unitary/unbiased-toxic-roberta
It is fine-tuned on 16 classes.
I am writing my code for binary classification:
Metric to compute accuracy on the binary labels:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = np.sum(predictions == labels) / predictions.shape[0]
    return {"accuracy": acc}
import torch.nn as nn
model = tr.RobertaForSequenceClassification.from_pretrained("/home/pc/unbiased_toxic_roberta",num_labels=2)
model.to(device)
training_args = tr.TrainingArguments(
    # report_to='wandb',
    output_dir='/home/pc/1_Proj_hate_speech/results_roberta',  # output directory
    overwrite_output_dir=True,
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    learning_rate=2e-5,
    warmup_steps=1000,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs3',            # directory for storing logs
    logging_steps=1000,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = tr.Trainer(
    model=model,                      # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_data,         # training dataset
    eval_dataset=val_data,            # evaluation dataset
    compute_metrics=compute_metrics,
)
When I run this, I get an error:
loading weights file /home/pc/unbiased_toxic_roberta/pytorch_model.bin
RuntimeError: Error(s) in loading state_dict for RobertaForSequenceClassification:
size mismatch for classifier.out_proj.weight: copying a param with shape torch.Size([16, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).
size mismatch for classifier.out_proj.bias: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([2]).
How can I add a linear layer and solve this error?
Load with ignore_mismatched_sizes=True:
model = tr.RobertaForSequenceClassification.from_pretrained(
    "/home/pc/unbiased_toxic_roberta",
    num_labels=2,
    ignore_mismatched_sizes=True,
)
This discards the checkpoint's 16-way classification head and randomly initializes a fresh 2-way head (the size-mismatch warning tells you which weights were re-initialized); then you can fine-tune the model.
I have questions regarding building a custom dataset and iterator using torchtext. I used the following code found in this post and modified it based on my case:
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
text_field = Field(sequential=True, eos_token="[CLS]", tokenize=tokenizer)
label_field = Field(sequential=False, use_vocab=False)

data_fields = [("file", None),
               ("text", text_field),
               ("label", label_field)]

train, val = train_test_split(input_dt, test_size=0.1)
train.to_csv("train_output_path", index=False)
val.to_csv("val_output_path", index=False)

train, val = TabularDataset(path="path", train="train.csv", validation="val.csv",
                            format="csv", skip_header=True, fields=data_fields)
When it comes to text_field.build_vocab(train), I got this error: TypeError: '<' not supported between instances of 'list' and 'int'.
The only difference between my code and the post is the pre-trained word embeddings: the author used GloVe, whereas I use XLNetTokenizer from the transformers package. I also searched for other posts that used a similar method, but they all used pre-trained word embeddings, so they did not run into this issue.
Does anyone know how to fix this? Many thanks!
I think that since you are using a predefined tokenizer, you don't need to build a vocab; you can follow these steps instead. Here is an example of how to do it with the BERT tokenizer, where sentences is a list of text data and labels is the list of associated labels.
# tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
        sent,                        # Sentence to encode.
        add_special_tokens=True,     # Add '[CLS]' and '[SEP]'.
        max_length=100,              # Pad & truncate all sentences.
        pad_to_max_length=True,
        return_attention_mask=True,  # Construct attention masks.
        return_tensors='pt',         # Return PyTorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
### Now combine the input ids, attention masks, and labels, and divide the dataset:
from torch.utils.data import TensorDataset, random_split
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)
# Create a 90-10 train-validation split.
# Calculate the number of samples to include in each set.
train_size = int(0.90 * len(dataset))
val_size = len(dataset) - train_size
# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
### Now you create the DataLoaders for these datasets:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 32
# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler=RandomSampler(train_dataset),    # Select batches randomly.
    batch_size=batch_size                    # Trains with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                             # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size                    # Evaluate with this batch size.
)
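As a quick usage check (assuming the TensorDataset ordering above of input_ids, attention_masks, labels), one batch can be unpacked like this:

# pull a single batch and unpack it in the same order the TensorDataset was built
batch = next(iter(train_dataloader))
b_input_ids, b_attention_mask, b_labels = batch
print(b_input_ids.shape, b_attention_mask.shape, b_labels.shape)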