Unable to create tensor - nlp

I am trying to train an NLP model for MLM problem, but the trainer.train function is throwing:
Unable to create tensor, you should probably activate truncation
and/or padding with 'padding=True' 'truncation=True' to have batched
tensors with the same length. Perhaps your features (input_ids in
this case) have excessive nesting (inputs type list where type int
is expected).
I really don't know what's going on because I followed the hugging-face tutorials.
Code:
from transformers import AutoTokenizer,AutoModelForMaskedLM
cp= "tau/tavbert-he"
model=AutoModelForMaskedLM.from_pretrained(cp)
tokenizer=AutoTokenizer.from_pretrained(cp)
import datasets
ds=datasets.load_dataset("csv", data_files='/content/drive/Shareddrives/Embible/data.csv')
ds=ds['train'].train_test_split(train_size=0.8, seed=42)
def tokenize_function(dataset):
return tokenizer(str(dataset["verse"]),truncation=True,padding=True ,
max_length=512, return_overflowing_tokens=True)
tokenized_ds=ds.map(tokenize_function)
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
tokenized_ds=tokenized_ds.remove_columns(ds["train"].column_names)
from transformers import TrainingArguments
from transformers import Trainer
training_args = TrainingArguments("test-trainer")
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_ds['train'],
eval_dataset=tokenized_ds["test"],
data_collator=data_collator,
tokenizer=tokenizer,
)
trainer.train()

Related

How to convert sklearn model using pipeline to ONNX format for real time inferencing

It is a multi-class classification model with sklearn.
I am using OneVsOneClassifier model to train and predict 150 intents. Its a multi-class classification problem.
Data:
text intents
text1 int1
text2 int2
I convert these intents in labels using:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)
Expectation:
Without changing the training pipeline or parameters, note the inference time. Currently, it's slow, ~1second for 1 inference. So to convert pipeline to ONNX format and then use for inferencing on 1 example.
Code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC,LinearSVC
def create_pipe(clf):
# Each pipeline uses the same column transformer.
column_trans = ColumnTransformer(
[('Text', TfidfVectorizer(), 'text')
],
remainder='drop')
pipeline = Pipeline([('prep',column_trans),
('clf', clf)])
return pipeline
def fit_and_print(pipeline):
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred,
target_names=le.classes_,
digits=3))
clf = OneVsOneClassifier(LinearSVC(random_state=42, class_weight='balanced'))
pipeline = create_pipe(clf)
%time fit_and_print(pipeline)
# convert input to df
def create_test_data(x):
d = {'text' : x}
df = pd.DataFrame(d, index=[0])
return df
revs=[]
for idx in [948, 5717, 458]:
cur = test.loc[idx, 'text']
revs.append(cur)
print(revs)
revs=sam['text'].values
%%time
for rev in revs:
c_res = pipeline.predict(create_test_data(rev))
print(rev, '=', labels[c_res[0]])
ONNX conversion code
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
initial_type = [('UTTERANCE', StringTensorType([None, 2]))]
model_onnx = convert_sklearn(pipeline, initial_types=initial_type)
Error
MissingShapeCalculator: Unable to find a shape calculator for type '<class 'sklearn.multiclass.OneVsOneClassifier'>'.
It usually means the pipeline being converted contains a
transformer or a predictor with no corresponding converter
implemented in sklearn-onnx. If the converted is implemented
in another library, you need to register
the converted so that it can be used by sklearn-onnx (function
update_registered_converter). If the model is not yet covered
by sklearn-onnx, you may raise an issue to
https://github.com/onnx/sklearn-onnx/issues
to get the converter implemented or even contribute to the
project. If the model is a custom model, a new converter must
be implemented. Examples can be found in the gallery.
How to resolve this? Also how to do prediction after converting to ONNX format?

How can I use LSTM with pretrained static word vectors on aclImdb dataset

I am trying to use Sentiment Classification with LSTM and pre-trained BERT embeddings, and later language translation with Transformer
first of all I downloaded
!pip install ktrain
!pip install tensorflow_text
And I imported the necessary lib
import pathlib
import random
import numpy as np
from typing import Tuple, List
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
# tensoflow imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import (
TextVectorization, LSTM, Dense, Embedding, Dropout,
Layer, Input, MultiHeadAttention, LayerNormalization)
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.initializers import Constant
from tensorflow.keras import backend as K
import tensorflow_text as tf_text
import ktrain
from ktrain import text
And I downloaded and extracted Large Movie dataset from Stanford
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz
1- I try to use LSTM with train by Creating the training and test sets with the texts_from_folder function of the ktrain.text module
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";
DATADIR ='/content/aclImdb'
trn, val, preproc = text.texts_from_folder(DATADIR,max_features=20000, maxlen=400, ngram_range=1, preprocess_mode='standard', train_test_names=['train', 'test'],classes=['pos', 'neg'])
And I am trying to build LSTM model her
K.clear_session()
def build_LSTM_model(
embedding_size: int,
total_words: int,
lstm_hidden_size: int,
dropout_rate: float) -> Sequential:
model.add(Embedding(input_dim = total_words,output_dim=embedding_size,input_length=total_words))
model.add(LSTM(lstm_hidden_size,return_sequences=True,name="lstm_layer"))
model.add(GlobalMaxPool1D())
# model.add(Dense(total_words, activation='softmax'))
model.add(Dropout(dropout_rate))
model.add(Dense(MAX_SEQUENCE_LEN, activation="relu"))
# adam = Adam(lr=0.01)
model.compile(loss='CategoricalCrossentropy', optimizer=Adam(lr=0.01), metrics=['CategoricalAccuracy'])
model.summary()
model = Sequential()
with the following requirements for a sequential model The model should include:
One Embedding layer at the beginning. (Watch out for proper parameterization!)
At least one LSTM layer.At least one Dropout layer for regularization.One final Dense layer mapping to the outputs.
compile model, with categorical_crossentropy loss and the adam optimizer. or might want to add other types of metrics for example CategoricalAccuracy makes sense here.
And then I want to use the ktrain library's get_learner method to create an easily trainable version of the previous model. and to use test set as the val_data, to see the performance. ( not include the proper train-validation-test split, but it could be extended if required.)
I am using the learner's lr_find and lr_plot methods to determine the most effective learning rate for the model. by Specifying the max_epochs parameter of lr_find to limit the time this takes. a couple of epochs! to determine the best learning rate based on the plot. and find balance between the fastest convergence and stability
learner: ktrain.Learner
model = text.text_classifier('bert', trn , preproc=preproc)
learner.lr_find()
learner.lr_plot()
learner.fit_onecycle(1e-4, 1)
I faced following errors
ValueError Traceback (most recent call last)
in ()
6 # workers=8, use_multiprocessing=False, batch_size=64)
7
----> 8 model = text.text_classifier('bert', trn , preproc=preproc)
10 # learner.lr_find()
1 frames
/usr/local/lib/python3.7/dist-packages/ktrain/text/models.py in _text_model(name, train_data, preproc, multilabel, classification, metrics, verbose)
109 raise ValueError(
110 "if '%s' is selected model, then preprocess_mode='%s' should be used and vice versa"
--> 111 % (BERT, BERT)
112 ) 113 is_huggingface = U.is_huggingface(data=train_data)
ValueError: if 'bert' is selected model, then preprocess_mode='bert' should be used and vice versa
And next step to make it with LSTM with pretrained static word vectors
If you're using BERT for pretrained word vectors supplied as features to an LSTM, then you don't need to build a separate BERT classification model. You can use TransformerEmbedding to generate word vectors for your dataset (or use sentence-transformers):
In [1]: from ktrain.text import TransformerEmbedding
In [2]: te = TransformerEmbedding('bert-base-cased')
In [3]: te.embed('George Washington went to Washington .').shape
Out[3]: (1, 6, 768)
This is what the included NER models in ktrain do under-the-hood.
Also, the input feature format for a BERT model is completely different than input features for an LSTM. As the error message indicates, to preprocess your texts for BERT classification model, you'll need to supply preprocess_mode='bert' to texts_from_folder.

Why is keras tokenizer applying lowercase() to it's own tokens?

I am running my first cnn text-classifier using the IMDB dataset with the in-built
tf.keras.datasets.imdb.load_data()
I understand the AttributeError: 'int' object has no attribute 'lower' error indicates that a lowercase function is being applied to int objects (seems to be from the tokenizer). However, I don't know why it is throwing this in this case as I am loading it directly though the in-built tf.keras.datasets.imdb.load_data().
I am not experienced with using embedding in text-classification.
The code excluding the CNN model is:
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, LSTM
from keras.layers import Conv1D, Flatten, MaxPooling1D
from keras.datasets import imdb
import wandb
from wandb.keras import WandbCallback
import numpy as np
from keras.preprocessing import text
import imdb
wandb.init(mode="disabled") # disabled for debugging
config = wandb.config
# set parameters:
config.vocab_size = 1000
config.maxlen = 1000
config.batch_size = 32
config.embedding_dims = 10
config.filters = 16
config.kernel_size = 3
config.hidden_dims = 250
config.epochs = 10
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data()
tokenizer = text.Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_matrix(X_train)
X_test = tokenizer.texts_to_matrix(X_test)
X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
Line 34 referred to in the error is tokenizer = text.Tokenizer(num_words=config.vocab_size)
The exact error thrown (includes Deprecation warnings) is:
C:\Users\Keegan\anaconda3\envs\oldK\lib\site-
packages\tensorflow_core\python\keras\datasets\imdb.py:129:
VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-
or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If
you meant to do this, you must specify 'dtype=object' when creating the ndarray.
x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
C:\Users\Keegan\anaconda3\envs\oldK\lib\site-
packages\tensorflow_core\python\keras\datasets\imdb.py:130: VisibleDeprecationWarning: Creating
an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or
ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must
specify 'dtype=object' when creating the ndarray.
x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
Traceback (most recent call last):
File "imdb-cnn.py", line 34, in <module>
tokenizer.fit_on_texts(X_train)
File "C:\Users\Keegan\anaconda3\envs\oldK\lib\site-packages\keras_preprocessing\text.py",
line 217, in fit_on_texts
text = [text_elem.lower() for text_elem in text]
File "C:\Users\Keegan\anaconda3\envs\oldK\lib\site-packages\keras_preprocessing\text.py", line 217, in <listcomp>
text = [text_elem.lower() for text_elem in text]
AttributeError: 'int' object has no attribute 'lower'
The Anaconda venv has Python 3.7.1, Tensorflow 2.1.0 and Keras 2.3.1
The Keras tokenizer has an attribute lower which can be set either to True or False.
I guess the reason why the pre-packaged IMDB data is by default lower-cased is that the dataset is pretty small. If you did not lower-case it the capitalized and lower-cased words would get different embeddings, but the capitalized forms probably are not frequently enough in the training data to train the embeddings appropriately. This of course changes, once you use pre-trained embeddings or pre-trained contextualized models such as BERT which were pre-trained on large data.

Model overfits after first epoch

I'm trying to use hugging face's BERT-base-uncased model to train on emoji prediction on tweets, and it seems that after the first epoch, the model immediately starts to overfit. I have tried the following:
Increasing the training data (I increased this from 1x to 10x with no effect)
Changing the learning rate (no differences there)
Using different models from hugging face (the results were the same again)
Changing the batch size (went from 32, 72, 128, 256, 512, 1024)
Creating a model from scratch, but I ran into issues and decided to post here first to see if I was missing anything obvious.
At this point, I'm concerned that the individual tweets don't give enough information for the model to make a good guess, but wouldn't it be random in that case, rather than overfitting?
Also, training time seems to be ~4.5 hours on Colab's free GPUs, is there any way to speed that up? I tried their TPU, but it doesn't seem to be recognized.
This is what the data looks like
And this is my code below:
import pandas as pd
import json
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score
import numpy as np
# opening up the data and removing all symbols
df = pd.read_json('/content/drive/MyDrive/computed_results.json.bz2')
df['text_no_emoji'] = df['text_no_emoji'].apply(lambda text: re.sub(r'[^\w\s]', '', text))
# loading the tokenizer and the model from huggingface
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5).to('cuda')
# test train split
train, test = train_test_split(df[['text_no_emoji', 'emoji_codes']].sample(frac=1), test_size=0.2)
# defining a dataset class that generates the encoder and labels on the fly to minimize memory usage
class Dataset(torch.utils.data.Dataset):
def __init__(self, input, labels=None):
self.input = input
self.labels = labels
def __getitem__(self, pos):
encoded = tokenizer(self.input[pos], truncation=True, max_length=15, padding='max_length')
label = self.labels[pos]
ret = {key: torch.tensor(val) for key, val in encoded.items()}
ret['labels'] = torch.tensor(label)
return ret
def __len__(self):
return len(self.labels)
# training and validation datasets are defined here
train_dataset = Dataset(train['text_no_emoji'].tolist(), train['emoji_codes'].tolist())
val_dataset = Dataset(train['text_no_emoji'].tolist(), test['emoji_codes'].tolist())
# defining the training arguments
args = TrainingArguments(
output_dir="output",
evaluation_strategy="epoch",
logging_steps = 10,
per_device_train_batch_size=1024,
per_device_eval_batch_size=1024,
num_train_epochs=5,
save_steps=3000,
seed=0,
load_best_model_at_end=True,
weight_decay=0.2,
)
# defining the model trainer
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=val_dataset
)
# Training the model
trainer.train()
Results: After this, the training generally stops pretty fast due to the early stopper
The dataset can be found here (39 Mb compressed)

How to add a neural network model with ML models in VotingRegressor?

Background of the Problem
I was trying to use a KerasRegressor model with the ML models (e.g. Lasso, Gradient Boost Regressor) for the purpose of building an ensemble method. I used the VotingRegressor() function of sklearn to group the models. However, when I add the KerasRegressor model in VotingRegressor(), I get the following error.
ValueError: The estimator KerasRegressor should be a regressor.
How Did I Try to Solve the Problem?
I searched on google by the error and I found only this page where I do not find the solution. Moreover, I tried to understand the document of the KerasRegressor. However, I do not know why I get the error as the document says that it is the implementation of the scikit-learn regressor API for Keras.
Then, My Question
Why did I get the error and what can I do to solve it?
Any help will be greatly appreciated :). Thanks!
From this issue there is no solution using keras as sklearn wrapper is not maintained and will be removed
Fortunately scikeras package solve this issue.
I advice you to read docs or tutorials but here a simple example using subclassing:
!pip install scikeras
import scikeras
from tensorflow import keras
from sklearn.datasets import make_regression
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
class MLPRegressor(KerasRegressor):
def __init__(
self,
hidden_layer_sizes=(100, ),
optimizer="adam",
optimizer__learning_rate=0.001,
epochs=10,
verbose=0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_layer_sizes = hidden_layer_sizes
self.optimizer = optimizer
self.epochs = epochs
self.verbose = verbose
def _keras_build_fn(self, compile_kwargs):
model = keras.Sequential()
inp = keras.layers.Input(shape=(self.n_features_in_))
model.add(inp)
for hidden_layer_size in self.hidden_layer_sizes:
layer = keras.layers.Dense(hidden_layer_size, activation="relu")
model.add(layer)
out = keras.layers.Dense(1)
model.add(out)
model.compile(loss="mse", optimizer=compile_kwargs["optimizer"])
return model
# simple linear regression
r1 = LinearRegression()
# keras model wrapper
r2= MLPRegressor(epochs=20)
X = (y/2).reshape(-1, 1)
y = np.arange(100)
#defining votting classifier
vr = VotingRegressor([('lr', r1), ('MLPReg', r2)])
vr.fit(X,y)
VotingRegressor(estimators=[('lr',
LinearRegression(copy_X=True, fit_intercept=True,
n_jobs=None, normalize=False)),
('MLPReg',
MLPRegressor(batch_size=None, build_fn=None, callbacks=None, epochs=20, hidden_layer_sizes=(100,),
loss=None, metrics=None, model=None, optimizer='adam',
random_state=None, run_eagerly=False, shuffle=True,
validation_batch_size=None, validation_split=0.0, verbose=0,
warm_start=False))],
n_jobs=None, weights=None)

Resources