Saving Patsy for sklean inference - scikit-learn

I am a huge lover of your sklego project, especially patsy implementation within sklean.
However, there is one thing I still would like your opinion on - how do you use a pipeline containing PatsyTransformer only for inference?
As the pickling is not yet supported on the patsy side I came up with a workaround.
import seaborn as sns
from joblib import dump, load
from sklego.preprocessing import PatsyTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Load The data
data = sns.load_dataset("tips")
# Basic Pipeline
pipe = Pipeline([
("patsy", PatsyTransformer("tip + C(day)")),
("model", LinearRegression())
])
data
# Train the pipeline
pipe.fit(data, data['total_bill'])
from sklearn.base import BaseEstimator, TransformerMixin
# Class for inferencing with pre-trained model (fit only passes, no training happens)
class Model_Inferencer(BaseEstimator, TransformerMixin):
"""
Function that applyes pre-trained models within a pipeline setting.
"""
def __init__(self, pre_trained_model=None):
self.pre_trained_model = pre_trained_model
def transform(self, X):
preds = self.pre_trained_model.predict(X)
return preds
def predict(self, X):
preds = self.pre_trained_model.predict(X)
return preds
def fit(self, X, y=None, **fit_params):
return self
pipe.predict(data)[:10]
# Save the model
dump(pipe['model'], 'model_github.joblib')
# Load The model
loaded_model = load('model_github.joblib')
# Create Inference Pipeline
pipe_inference = Pipeline([
("patsy", PatsyTransformer("tip + C(day)")),
("inferencer", Model_Inferencer(loaded_model))
])
# Inference pipeline needs to be fitted
# pipe_inference.fit(data)
# Save predictions (works only when fitted)
pipe_inference.predict(data)
I also tried saving the info by hand:
import h5py
def save_patsy(patsy_step, filename):
"""Save the coefficients of a linear model into a .h5 file."""
with h5py.File(filename, 'w') as hf:
hf.create_dataset("design_info", data=patsy_step.design_info_)
def load_coefficients(patsy_step, filename):
"""Attach the saved coefficients to a linear model."""
with h5py.File(filename, 'r') as hf:
design_info = hf['design_info'][:]
patsy_step.design_info_ = design_info
save_patsy(pipe['patsy'], "clf.h5")
However, a bummer error will occur.
Object dtype dtype('O') has no native HDF5 equivalent

Related

Does Hugging face defaults allow to log mlflow artifacts and name every run of mlflow log?

I am training a simple binary classification model using Hugging face models using pytorch.
Bert PyTorch HuggingFace.
Here is the code:
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from transformers import AutoTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup,BertConfig
def compute_metrics(eval_pred):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
acc = np.sum(predictions == labels) / predictions.shape[0]
return {"accuracy": acc,
'precision': metrics.precision_score(labels, predictions),
'recall': metrics.recall_score(labels, predictions),
'f1': metrics.f1_score(labels, predictions)}
training_args = tr.TrainingArguments(
#report_to = 'wandb',
output_dir='/home/pc/proj/Exp2_conv_stampy_data/results_exp0', # output directory
overwrite_output_dir = True,
num_train_epochs=2, # total number of training epochs
per_device_train_batch_size=32, # batch size per device during training
per_device_eval_batch_size=32, # batch size for evaluation
learning_rate=2e-5,
warmup_steps=200, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs_exp0', # directory for storing logs
logging_steps=137,
evaluation_strategy="epoch"
,save_strategy="epoch"
,load_best_model_at_end=True
,fp16=True
,run_name="final_model0"
)
# counter = 0
# results_lst = []
from transformers import TrainerCallback
from copy import deepcopy
model = tr.XLMRobertaForSequenceClassification.from_pretrained("/home/pc/multilingual_toxic_xlm_roberta",problem_type="single_label_classification", num_labels=2,ignore_mismatched_sizes=True, id2label={0: 'negative', 1: 'positive'})
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
train_data = SEDataset(train_encodings, train_labels)
val_data = SEDataset(val_encodings, val_labels)
model.to(device)
class CustomCallback(TrainerCallback):
def __init__(self, trainer) -> None:
super().__init__()
self._trainer = trainer
def on_epoch_end(self, args, state, control, **kwargs):
if control.should_evaluate:
control_copy = deepcopy(control)
self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
return control_copy
trainer = tr.Trainer(
model=model, # the instantiated Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_data, # training dataset
eval_dataset=val_data, # evaluation dataset
compute_metrics=compute_metrics # the callback that computes metrics of interest
)
trainer.add_callback(CustomCallback(trainer))
train = trainer.train()
trainer.save_model("/home/pc/proj/Exp2_conv_stampy_data/result_toxic_model_exp0")
I see by default mlruns directory is created.
What is 0' and what are these 2 folders inside 0`?
How can rename to something useful and understandable.?
If I run multiple runs, how can I log every run of model with something like run1, run2 under same experiment?
Also I see artifact folder is empty, how to log final model?

Model overfits after first epoch

I'm trying to use hugging face's BERT-base-uncased model to train on emoji prediction on tweets, and it seems that after the first epoch, the model immediately starts to overfit. I have tried the following:
Increasing the training data (I increased this from 1x to 10x with no effect)
Changing the learning rate (no differences there)
Using different models from hugging face (the results were the same again)
Changing the batch size (went from 32, 72, 128, 256, 512, 1024)
Creating a model from scratch, but I ran into issues and decided to post here first to see if I was missing anything obvious.
At this point, I'm concerned that the individual tweets don't give enough information for the model to make a good guess, but wouldn't it be random in that case, rather than overfitting?
Also, training time seems to be ~4.5 hours on Colab's free GPUs, is there any way to speed that up? I tried their TPU, but it doesn't seem to be recognized.
This is what the data looks like
And this is my code below:
import pandas as pd
import json
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score
import numpy as np
# opening up the data and removing all symbols
df = pd.read_json('/content/drive/MyDrive/computed_results.json.bz2')
df['text_no_emoji'] = df['text_no_emoji'].apply(lambda text: re.sub(r'[^\w\s]', '', text))
# loading the tokenizer and the model from huggingface
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5).to('cuda')
# test train split
train, test = train_test_split(df[['text_no_emoji', 'emoji_codes']].sample(frac=1), test_size=0.2)
# defining a dataset class that generates the encoder and labels on the fly to minimize memory usage
class Dataset(torch.utils.data.Dataset):
def __init__(self, input, labels=None):
self.input = input
self.labels = labels
def __getitem__(self, pos):
encoded = tokenizer(self.input[pos], truncation=True, max_length=15, padding='max_length')
label = self.labels[pos]
ret = {key: torch.tensor(val) for key, val in encoded.items()}
ret['labels'] = torch.tensor(label)
return ret
def __len__(self):
return len(self.labels)
# training and validation datasets are defined here
train_dataset = Dataset(train['text_no_emoji'].tolist(), train['emoji_codes'].tolist())
val_dataset = Dataset(train['text_no_emoji'].tolist(), test['emoji_codes'].tolist())
# defining the training arguments
args = TrainingArguments(
output_dir="output",
evaluation_strategy="epoch",
logging_steps = 10,
per_device_train_batch_size=1024,
per_device_eval_batch_size=1024,
num_train_epochs=5,
save_steps=3000,
seed=0,
load_best_model_at_end=True,
weight_decay=0.2,
)
# defining the model trainer
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=val_dataset
)
# Training the model
trainer.train()
Results: After this, the training generally stops pretty fast due to the early stopper
The dataset can be found here (39 Mb compressed)

What can be the cause of the validation loss increasing and the accuracy remaining constant to zero while the train loss decreases?

I am trying to solve a multiclass text classification problem. Due to specific requirements from my project I am trying to use skorch (https://skorch.readthedocs.io/en/stable/index.html) to wrap pytorch for the sklearn pipeline. What I am trying to do is fine-tune a pretrained version of BERT from Huggingface (https://huggingface.co) with my dataset. I have tried, in the best of my knowledge, to follow the instructions from skorch on how I should input my data, structure the model etc. Still during the training the train loss decreases until the 8th epoch where it starts fluctuating, all while the validation loss increases from the beginning and the validation accuracy remains constant to zero. My pipeline setup is
from sklearn.pipeline import Pipeline
pipeline = Pipeline(
[
("tokenizer", Tokenizer()),
("classifier", _get_new_transformer())
]
in which I am using a tokenizer class to preprocess my dataset, tokenizing it for BERT and creating the attention masks. It looks like this
import torch
from transformers import AutoTokenizer, AutoModel
from torch import nn
import torch.nn.functional as F
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np
class Tokenizer(BaseEstimator, TransformerMixin):
def __init__(self):
super(Tokenizer, self).__init__()
self.tokenizer = AutoTokenizer.from_pretrained(/path/to/model)
def _tokenize(self, X, y=None):
tokenized = self.tokenizer.encode_plus(X, max_length=20, add_special_tokens=True, pad_to_max_length=True)
tokenized_text = tokenized['input_ids']
attention_mask = tokenized['attention_mask']
return np.array(tokenized_text), np.array(attention_mask)
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
word_tokens, attention_tokens = np.array([self._tokenize(string)[0] for string in tqdm(X)]), \
np.array([self._tokenize(string)[1] for string in tqdm(X)])
X = word_tokens, attention_tokens
return X
def fit_transform(self, X, y=None, **fit_params):
self = self.fit(X, y)
return self.transform(X, y)
then I initialize the model I want to fine-tune as
class Transformer(nn.Module):
def __init__(self, num_labels=213, dropout_proba=.1):
super(Transformer, self).__init__()
self.num_labels = num_labels
self.model = AutoModel.from_pretrained(/path/to/model)
self.dropout = torch.nn.Dropout(dropout_proba)
self.classifier = torch.nn.Linear(768, num_labels)
def forward(self, X, **kwargs):
X_tokenized, attention_mask = torch.stack([x.unsqueeze(0) for x in X[0]]),\
torch.stack([x.unsqueeze(0) for x in X[1]])
_, X = self.model(X_tokenized.squeeze(), attention_mask.squeeze())
X = F.relu(X)
X = self.dropout(X)
X = self.classifier(X)
return X
I initialize the model and create the classifier with skorch as follows
from skorch import NeuralNetClassifier
from skorch.dataset import CVSplit
from skorch.callbacks import ProgressBar
import torch
from transformers import AdamW
def _get_new_transformer() -> NeuralNetClassifier:
transformer = Transformer()
net = NeuralNetClassifier(
transformer,
lr=2e-5,
max_epochs=10,
criterion=torch.nn.CrossEntropyLoss,
optimizer=AdamW,
callbacks=[ProgressBar(postfix_keys=['train_loss', 'valid_loss'])],
train_split=CVSplit(cv=2, random_state=0)
)
return net
and I use fit like that
pipeline.fit(X=dataset.training_samples, y=dataset.training_labels)
in which my training samples are lists of strings and my labels are the an array containing the indexes of each class, as pytorch requires.
This is a sample of what happens
training history
I have tried to keep train only the fully connected layer and not BERT but I have the same issue again. I also tested the train accuracy after the training process and it was only 0,16%. I would be grateful for any advice or insight on how to solve my problem! I am pretty new with skorch and not so comfortable with pytorch yet and I believe that I am missing something really simple. Thank you very much in advance!

Combining w2vec and feature selection in pipeline

Based on this article: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ I am trying to implement a gensim word2vec model with the pretrained vectors of GloVe in a text classification task. However, I would like to do FeatureSelection also in my text data. I tried multiple sequences in the pipeline but i get fast a memory error which points to the transform part of TfidfEmbeddingVectorizer.
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
If I replace the TfidfEmbeddingVectorizer class with a regular TfIdfVectorizer it works properly. Is there a way I could combine SelectFromModel and W2vec in the pipeline?
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support as score, f1_score
import pickle
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
import gensim
import collections
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
return (X[self.column])
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'REPORT_M': text}
for text in posts]
class TfidfEmbeddingVectorizer(object):
def __init__(self, word2vec):
self.word2vec = word2vec
self.word2weight = None
self.dim = len(word2vec.values())
def fit(self, X, y):
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(X)
# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of
# known idf's
max_idf = max(tfidf.idf_)
self.word2weight = collections.defaultdict(
lambda: max_idf,
[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
])
# training model
def train(data_train, data_val):
with open("glove.6B/glove.6B.50d.txt", "rb") as lines:
w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
for line in lines}
classifier = Pipeline([
('union', FeatureUnion([
('text', Pipeline([
('selector', ItemSelector(column='TEXT')),
("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False),threshold=0.01))
])),
('category', Pipeline([
('selector', ItemSelector(column='category')),
('stats', TextStats()),
('vect', DictVectorizer())
]))
])),
('clf',ExtraTreesClassifier(n_estimators=200, max_depth=500, min_samples_split=6, class_weight= 'balanced'))])
classifier.fit(data_train,data_train.CLASSES)
predicted = classifier.predict(data_val)
I think in here self.dim = len(word2vec.values()) you should specify the dimension of the model. If you are using glove.6B.50d.txt, then the dimension should be 50.
len(word2vec.values()) is the total number of words, thus will create a huge matrix, i.e., memory error.

Scikit-learn pipeline for same data and steps fails to classifiy

I have vectors of floats that I created from doc2vec algorithm, and their labels. When i use them with a simple classifier, it works normally and gives an expected accuracy. Working code is below:
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
train_vecs #ndarray (20418,100)
#train_vecs = [[0.3244, 0.3232, -0.5454, 1.4543, ...],...]
y_train #labels
test_vecs #ndarray (6885,100)
y_test #labels
classifier = LinearSVC()
classifier.fit(train_vecs, y_train )
print('Test Accuracy: %.2f'%classifier.score(test_vecs, y_test))
However now I want to move it into a pipeline, because in the future I plan to do a feature union with different features. What I do is move the vectors into a dataframe, then use 2 custom transformers to i)select the column, ii) change the array type. Strangely the exact same data, with exact same shape, dtype and type.. gives 0.0005 accuracy. Which it does not make sense to me at all, it should give almost equal accuracy. After the ArrayCaster transformer the shapes and types of the inputs are exactly the same as before. The whole thing has been really frustrating.
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
# transformer that picks a column from the dataframe
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
print('item selector type',type(X[self.column]))
print('item selector shape',len(X[self.column]))
print('item selector dtype',X[self.column].dtype)
return (X[self.column])
# transformer that converts the series into an ndarray
class ArrayCaster(BaseEstimator, TransformerMixin):
def fit(self, x, y=None):
return self
def transform(self, data):
print('array caster type',type(np.array(data.tolist())))
print('array caster shape',np.array(data.tolist()).shape)
print('array caster dtype',np.array(data.tolist()).dtype)
return np.array(data.tolist())
train_vecs #ndarray (20418,100)
y_train #labels
test_vecs #ndarray (6885,100)
y_test #labels
train['vecs'] = pd.Series(train_vecs.tolist())
val['vecs'] = pd.Series(test_vecs.tolist())
classifier = Pipeline([
('selector', ItemSelector(column='vecs')),
('array', ArrayCaster()),
('clf',LinearSVC())])
classifier.fit(train, y_train)
print('Test Accuracy: %.2f'%classifier.score(test, y_test))
Ok sorry about that.. I figured it out. The error is pretty annoying to notice. All I had to do is cast them as list and place them into the dataframe, instead of converting them to series.
Change this
train['vecs'] = pd.Series(train_vecs.tolist())
val['vecs'] = pd.Series(test_vecs.tolist())
into:
train['vecs'] = list(train_vecs)
val['vecs'] = list(test_vecs)

Resources