Setting `remove_unused_columns=False` causes error in HuggingFace Trainer class - pytorch

I am training a model using HuggingFace Trainer class. The following code does a decent job:
!pip install datasets
!pip install transformers
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
dataset = load_dataset('glue', 'mnli')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
def preprocess_function(examples):
    """Tokenize a batch of premise/hypothesis pairs.

    Passes ``examples["premise"]`` and ``examples["hypothesis"]`` as the
    first and second text arguments to the module-level ``tokenizer``, with
    ``truncation=True`` and ``padding=True``, and returns the tokenizer's
    result unchanged.
    """
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        padding=True,
    )
encoded_dataset = dataset.map(preprocess_function, batched=True)
args = TrainingArguments(
"test-glue",
learning_rate=3e-5,
per_device_train_batch_size=8,
num_train_epochs=3,
remove_unused_columns=True
)
trainer = Trainer(
model,
args,
train_dataset=encoded_dataset["train"],
tokenizer=tokenizer
)
trainer.train()
However, setting remove_unused_columns=False results in the following error:
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
704 if not is_tensor(value):
--> 705 tensor = as_tensor(value)
706
ValueError: too many dimensions 'str'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
8 frames
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
720 )
721 raise ValueError(
--> 722 "Unable to create tensor, you should probably activate truncation and/or padding "
723 "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
724 )
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.
Any suggestions are highly appreciated.

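It fails because the value at line 705 is a list of str coming from the hypothesis column, which cannot be converted to a tensor. hypothesis is one of the ignored_columns in trainer.py, so it is only dropped before batching when remove_unused_columns=True.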
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
704 if not is_tensor(value):
--> 705 tensor = as_tensor(value)
See the below snippet from trainer.py for the remove_unused_columns flag:
def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
if not self.args.remove_unused_columns:
return dataset
if self._signature_columns is None:
# Inspect model forward signature to keep only the arguments it accepts.
signature = inspect.signature(self.model.forward)
self._signature_columns = list(signature.parameters.keys())
# Labels may be named label or label_ids, the default data collator handles that.
self._signature_columns += ["label", "label_ids"]
columns = [k for k in self._signature_columns if k in dataset.column_names]
ignored_columns = list(set(dataset.column_names) - set(self._signature_columns))
There could be a potential pull request on HuggingFace to provide a fallback option in case the flag is False. But in general, it looks like the flag implementation is not complete; for example, it can't be used with TensorFlow.
On the other hand, it doesn't hurt to keep it True, unless there is some special need.

Related

How to encode empty string using BERT

I have recently been trying to encode an empty string with CamemBERT (BERT model for French). I wasn't sure how to do that. If I try to simply encode an empty string,
from transformers import CamembertModel, CamembertTokenizer
import torch
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
camembert = CamembertModel.from_pretrained("camembert-base")
tokenized_sentence = tokenizer.tokenize("")
encoded_sentence = tokenizer.encode(tokenized_sentence, return_tensors='pt')
embeddings = camembert(encoded_sentence)
embeddings.last_hidden_state.squeeze()[0] # embedding of the CLS token
I get the error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-553400f369a8> in <module>
1 # Tokenize in sub-words with SentencePiece
2 tokenized_sentence = tokenizer.tokenize("")
----> 3 encoded_sentence = tokenizer.encode(tokenized_sentence, return_tensors='pt')
4 embeddings = camembert(encoded_sentence)
5 embeddings.last_hidden_state.squeeze()[0] # embeddings.last_hidden_state[0][0]
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in encode(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)
2057 ``convert_tokens_to_ids`` method).
2058 """
-> 2059 encoded_inputs = self.encode_plus(
2060 text,
2061 text_pair=text_pair,
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils_base.py in encode_plus(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2376 )
2377
-> 2378 return self._encode_plus(
2379 text=text,
2380 text_pair=text_pair,
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils.py in _encode_plus(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
459 )
460
--> 461 first_ids = get_input_ids(text)
462 second_ids = get_input_ids(text_pair) if text_pair is not None else None
463
~/anaconda3/envs/r_nlp2/lib/python3.8/site-packages/transformers/tokenization_utils.py in get_input_ids(text)
446 )
447 else:
--> 448 raise ValueError(
449 f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
450 )
ValueError: Input [] is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
Which I think is expected behavior. I have tried with spaCy's French transformer model but have also been unsuccessful. Here's the code I used for spaCy :
from transformers import BertTokenizer, BertModel
import spacy
#!python -m spacy download fr_dep_news_trf
trf_fr = spacy.load("fr_dep_news_trf")
example = trf_fr("")
example._.trf_data.tensors[1].flatten() # embedding of the CLS token
And the error is
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-27-c53de04d2e6f> in <module>
1 example = trf_fr("")
----> 2 example._.trf_data.tensors[1].flatten()
IndexError: list index out of range
simply because the model returns [].
I guess that at this point, my question is theoretical: what would be the best or a good way to encode an empty string using CamemBERT or spaCy? Would "forcing" the model to return a vector of zeros be a good thing? Would returning "impossible" values such as a vector (10, ..., 10) be a good possibility? Should I force the tokenizer to create a sequence of [PAD] tokens? In this case, how would I implement that using spaCy and/or CamemBERT?
Thanks!
PS : I'm using
Python 3.8.10
spaCy 3.0.6
transformers 4.6.1

Getting a Mixed Precision CUDA error while running a cell to fine-tune a Wav2Vec2 medical vocabulary model

Following is my Cell Code:
from transformers import TrainingArguments
training_args = TrainingArguments(
output_dir="wav2vec2-medical",
group_by_length=True,
per_device_train_batch_size=32,
evaluation_strategy="steps",
num_train_epochs=30,
fp16=True,
save_steps=500,
eval_steps=500,
logging_steps=500,
learning_rate=1e-4,
weight_decay=0.005,
warmup_steps=1000,
save_total_limit=2,
)
And here is the error that I am getting. I am not sure what to make of it.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-26-f9014a6221db> in <module>
1 from transformers import TrainingArguments
2
----> 3 training_args = TrainingArguments(
4 # output_dir="/content/gdrive/MyDrive/wav2vec2-base-timit-demo",
5 output_dir="./wav2vec2-medical",
~/Library/Python/3.8/lib/python/site-packages/transformers/training_args.py in __init__(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, warmup_ratio, warmup_steps, logging_dir, logging_strategy, logging_first_step, logging_steps, save_strategy, save_steps, save_total_limit, no_cuda, seed, fp16, fp16_opt_level, fp16_backend, fp16_full_eval, local_rank, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, sharded_ddp, deepspeed, label_smoothing_factor, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, dataloader_pin_memory, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, mp_parameters)
~/Library/Python/3.8/lib/python/site-packages/transformers/training_args.py in __post_init__(self)
609
610 if is_torch_available() and self.device.type != "cuda" and (self.fp16 or self.fp16_full_eval):
--> 611 raise ValueError(
612 "Mixed precision training with AMP or APEX (`--fp16`) and FP16 evaluation can only be used on CUDA devices."
613 )
ValueError: Mixed precision training with AMP or APEX (`--fp16`) and FP16 evaluation can only be used on CUDA devices.
I tried to run it in a Jupyter notebook on my local device and also on Google Colab, but I still got the same error.
You should remove fp16=True or run on a CUDA GPU; this is a GPU-only parameter.

RuntimeError: Unknown device when trying to run AlbertForMaskedLM on colab tpu

I am running the following code on colab taken from the example here: https://huggingface.co/transformers/model_doc/albert.html#albertformaskedlm
import os
import torch
import torch_xla
import torch_xla.core.xla_model as xm
assert os.environ['COLAB_TPU_ADDR']
dev = xm.xla_device()
from transformers import AlbertTokenizer, AlbertForMaskedLM
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMaskedLM.from_pretrained('albert-base-v2').to(dev)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
data = input_ids.to(dev)
outputs = model(data, masked_lm_labels=data)
loss, prediction_scores = outputs[:2]
I haven't done anything to the example code except move input_ids and model onto the TPU device using .to(dev). It seems everything is moved to the TPU without a problem, since when I print data I get the following output: tensor([[ 2, 10975, 15, 51, 1952, 25, 10901, 3]], device='xla:1')
However when I run this code I get the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-5-f756487db8f7> in <module>()
1
----> 2 outputs = model(data, masked_lm_labels=data)
3 loss, prediction_scores = outputs[:2]
9 frames
/usr/local/lib/python3.6/dist-packages/transformers/modeling_albert.py in forward(self, hidden_states, attention_mask, head_mask)
277 attention_output = self.attention(hidden_states, attention_mask, head_mask)
278 ffn_output = self.ffn(attention_output[0])
--> 279 ffn_output = self.activation(ffn_output)
280 ffn_output = self.ffn_output(ffn_output)
281 hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
RuntimeError: Unknown device
Anyone know what's going on?
Solution is here: https://github.com/pytorch/xla/issues/1909
Before calling model.to(dev), you need to call xm.send_cpu_data_to_device(model, xm.xla_device()):
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
model = xm.send_cpu_data_to_device(model, dev)
model = model.to(dev)
There are also some issues with getting the gelu activation function ALBERT uses to work on the TPU, so you need to use the following branch of transformers when working on TPU: https://github.com/huggingface/transformers/tree/fix-jit-tpu
See the following colab notebook (by https://github.com/jysohn23) for full solution: https://colab.research.google.com/gist/jysohn23/68d620cda395eab66289115169f43900/getting-started-with-pytorch-on-cloud-tpus.ipynb

How to find Top features from Naive Bayes using sklearn pipeline

How to find Top features from Naive Bayes using sklearn pipeline
Hi all,
I am trying to apply Naive Bayes (MultinomialNB) using pipelines, and I came up with the code below. I am interested in finding the top 10 positive and negative words, but have not been able to succeed. When I searched, I found the code for finding top features, which I mention below. However, when I tried using that code with a pipeline, I got the errors shown below. I searched exhaustively, but only found code that does not use a pipeline, and when I use it with the output from my pipeline it does not work. Could you please help me find the feature importance from the pipeline output?
# Pipeline dictionary
pipelines = {
'bow_MultinomialNB' : make_pipeline(
CountVectorizer(),
preprocessing.Normalizer(),
MultinomialNB()
)
}
# List tuneable hyperparameters of our pipeline
pipelines['bow_MultinomialNB'].get_params()
# BOW - MultinomialNB hyperparameters
bow_MultinomialNB_hyperparameters = {
'multinomialnb__alpha' : [1000,500,100,50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001]
}
# Create hyperparameters dictionary
hyperparameters = {
'bow_MultinomialNB' : bow_MultinomialNB_hyperparameters
}
tscv = TimeSeriesSplit(n_splits=3) #For time based splitting
for name, pipeline in pipelines.items():
print("NAME:",name)
print("PIPELINE:",pipeline)
%time
# Create empty dictionary called fitted_models
fitted_models = {}
# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
# Create cross-validation object from pipeline and hyperparameters
model = GridSearchCV(pipeline, hyperparameters[name], cv=tscv, n_jobs=1,verbose=1)
# Fit model on X_train, y_train
model.fit(X_train, y_train)
# Store model in fitted_models[name]
fitted_models[name] = model
# Print '{name} has been fitted'
print(name, 'has been fitted.')
FEATURE IMPORTANCE:
pipelines['bow_MultinomialNB'].steps[2][1].classes__
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-125-7d45b007e86b> in <module>()
----> 1 pipelines['bow_MultinomialNB'].steps[2][1].classes_
AttributeError: 'MultinomialNB' object has no attribute 'classes_'
pipelines['bow_MultinomialNB'].steps[0][1].get_feature_names()
---------------------------------------------------------------------------
NotFittedError Traceback (most recent call last)
<ipython-input-126-2883929221d1> in <module>()
----> 1 pipelines['bow_MultinomialNB'].steps[0][1].get_feature_names()
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in get_feature_names(self)
958 def get_feature_names(self):
959 """Array mapping from feature integer indices to feature name"""
--> 960 self._check_vocabulary()
961
962 return [t for t, i in sorted(six.iteritems(self.vocabulary_),
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _check_vocabulary(self)
301 """Check if vocabulary is empty or missing (not fit-ed)"""
302 msg = "%(name)s - Vocabulary wasn't fitted."
--> 303 check_is_fitted(self, 'vocabulary_', msg=msg),
304
305 if len(self.vocabulary_) == 0:
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_is_fitted(estimator, attributes, msg, all_or_any)
766
767 if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
--> 768 raise NotFittedError(msg % {'name': type(estimator).__name__})
769
770
NotFittedError: CountVectorizer - Vocabulary wasn't fitted.
x=pipelines['bow_MultinomialNB'].steps[0][1]._validate_vocabulary()
x.get_feature_names()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-120-f620c754a34e> in <module>()
----> 1 x.get_feature_names()
AttributeError: 'NoneType' object has no attribute 'get_feature_names'
Regards,
Shree

Problems with cross validation and Pool in catboost

I am using catboost (version = catboost-0.10.4.1 enum34-1.1.6) on a Mac (Python 3.6.3 | Anaconda) and I have run into a problem with the cv function (with the cat_features parameter of the Pool). Here is the example:
1) Using catboost in a single fit:
from catboost import CatBoostClassifier, cv,
model = CatBoostClassifier(early_stopping_rounds=30,cat_features = [0,1,2,3,4,5,6,7,8],iterations=2000,eval_metric='AUC',learning_rate=0.1,)
model.fit(X_train, y_train,eval_set=(X_test,y_test))
Everything works just fine and I got my outputs!
2) When I try to use basically the same thing with the cv function, the problem appears:
cv_data = cv(pool=Pool(X_train, y_train,cat_features = [0,1,2,3,4,5,6,7,8]),params=model.get_params())
Output :
---------------------------------------------------------------------------
CatboostError Traceback (most recent call last)
<ipython-input-242-675cf317e148> in <module>()
----> 1 cv_data = cv(pool=Pool(X_train, y_train,cat_features=[0,1,2,3,4,5,6,7,8]),params=model.get_params())
~/anaconda3/lib/python3.6/site-packages/catboost/core.py in cv(pool, params, dtrain, iterations, num_boost_round, fold_count, nfold, inverted, partition_random_seed, seed, shuffle, logging_level, stratified, as_pandas, metric_period, verbose, verbose_eval, plot, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval)
2900
2901 with log_fixup(), plot_wrapper(plot, params):
-> 2902 return _cv(params, pool, fold_count, inverted, partition_random_seed, shuffle, stratified, as_pandas)
2903
2904
_catboost.pyx in _catboost._cv()
_catboost.pyx in _catboost._cv()
CatboostError: catboost/libs/options/plain_options_helper.cpp:272: Error: unknown option cat_features with value [0,1,2,3,4,5,6,7,8]
The indexes are right for the categorical variables (and worked for a single fit).
I can't understand what is happening and couldn't find related content.
Can somebody help?

Resources