Can't pass check_estimator for sklearn - scikit-learn

I can't understand why I keep getting this error. Does anybody know?
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import check_X_y, check_is_fitted

class AdaBoost(BaseEstimator, ClassifierMixin):
    def __init__(self, M=1, tree_depth=1, random_state=None):
        self.M = M
        self.tree_depth = tree_depth
        self.random_state = random_state

    def get_params(self, deep=True):
        return {"tree_depth": self.tree_depth, "M": self.M, "random_state": self.random_state}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        self.classes_, y = np.unique(y, return_inverse=True)
        self.X_ = X
        self.y_ = y
        X, y = check_X_y(X, y)
        self.models = []
        self.alphas = []
        n_samples, _ = X.shape
        w = np.ones(n_samples) / n_samples
        for m in range(self.M):
            clf = DecisionTreeClassifier(max_depth=self.tree_depth)
            clf.fit(X, y, sample_weight=w)
            pred = clf.predict(X)
            error = w.dot(pred != y)
            alpha = 0.5 * (np.log(1 - error) - np.log(error))
            w = w * np.exp(-alpha * y * pred)
            w = w / w.sum()  # normalise to sum to 1
            self.models.append(clf)
            self.alphas.append(alpha)

    def predict(self, X):
        check_is_fitted(self, ['X_', 'y_', 'classes_'])
        n_samples, _ = X.shape
        ada = np.zeros(n_samples)
        for alpha, clf in zip(self.alphas, self.models):
            ada += alpha * clf.predict(X)
        return np.sign(ada)

    def score(self, X, y):
        pred = self.predict(X)
        accuracy = 100 * sum(pred == y) / len(y)
        return accuracy
Error:
Traceback (most recent call last):
File "C:\Users\usethis.py", line 81, in <module>
check_estimator(AdaBoost)
File "C:\Users\AppData\Local\Programs\Python\Python37-32\lib\site-packages\sklearn\utils\estimator_checks.py", line 302, in check_estimator
check(name, estimator)
File "C:\AppData\Local\Programs\Python\Python37-32\lib\site-packages\sklearn\utils\testing.py", line 355, in wrapper
return fn(*args, **kwargs)
File "C:\Users\AppData\Local\Programs\Python\Python37-32\lib\site-packages\sklearn\utils\estimator_checks.py", line 1646, in check_estimators_fit_returns_self
assert estimator.fit(X, y) is estimator
AssertionError
[Finished in 1.7s with exit code 1]

The scikit-learn API requires that an estimator's fit method return the object itself after fitting. You can do this by adding return self as the last line of fit:
class AdaBoost(BaseEstimator, ClassifierMixin):
    ...
    def fit(self, X, y):
        ...
        return self
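For reference, returning self is exactly what the failing check asserts, and it is also what makes call chaining work. A minimal sketch with hypothetical toy data:

import numpy as np

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)

clf = AdaBoost(M=5, tree_depth=2)
assert clf.fit(X, y) is clf        # the exact assertion from the traceback
pred = clf.fit(X, y).predict(X)    # chaining relies on fit returning self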

Related

error with classifier.score() python-weka

I'm trying to build a Weka classifier for Python, to use Python libraries with Weka models like fracpete does in https://github.com/fracpete/sklearn-weka-plugin, but on my own.
Right now I have the class below, and it works for predictions, using the SHAP library, etc.
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from weka.classifiers import Classifier
from weka.core.dataset import Attribute, Instance, Instances

class weka_classifier(BaseEstimator):
    def __init__(self, classifier=None, dataset=None, index=None):
        # Classifier: the pww3/weka model
        # Dataset: data used to fit the model
        if classifier is not None:
            self.classifier = classifier
        elif dataset is not None:
            self.dataset = dataset
            self.dataset.class_is_last()

    def fit(self, X, y):
        return self.fit()

    def fit(self):
        return self.classifier.build_classifier(self.dataset)

    def predict_instance(self, x):
        x.append(0.0)
        inst = Instance.create_instance(x, classname='weka.core.DenseInstance', weight=1.0)
        inst.dataset = self.dataset
        return self.classifier.classify_instance(inst)

    def predict_proba_instance(self, x):
        x.append(0.0)
        inst = Instance.create_instance(x, classname='weka.core.DenseInstance', weight=1.0)
        inst.dataset = self.dataset
        return self.classifier.distribution_for_instance(inst)

    def predict_proba(self, X):
        prediction = []
        for i in range(X.shape[0]):
            instance = []
            for j in range(X.shape[1]):
                instance.append(X[i][j])
            instance.append(0.0)
            instance = Instance.create_instance(instance, classname='weka.core.DenseInstance', weight=1.0)
            instance.dataset = self.dataset
            prediction.append(self.classifier.distribution_for_instance(instance))
        return np.asarray(prediction)

    def predict(self, X):
        prediction = []
        for i in range(X.shape[0]):
            instance = []
            for j in range(X.shape[1]):
                instance.append(X[i][j])
            instance.append(0.0)
            instance = Instance.create_instance(instance, classname='weka.core.DenseInstance', weight=1.0)
            instance.dataset = self.dataset
            prediction.append(self.classifier.classify_instance(instance))
        return np.asarray(prediction)

    def set_data(self, dataset):
        self.dataset = dataset
        self.dataset.class_is_last()

    def score(self, X, y):
        y_pred = self.predict(X)
        score = accuracy_score(y, y_pred)
        return score
But when I try to evaluate the classifier with this function:
evaluate_models_cv(sci_Model_1, X, y, 10)

from sklearn.model_selection import cross_validate
import pandas as pd

def evaluate_models_cv(weka_model, X, y, cv, scoring=None):
    return_train_score = True
    n_jobs = -1
    scores = cross_validate(weka_model, X, y, cv=cv, n_jobs=n_jobs, scoring=scoring, return_train_score=return_train_score)
    scores = pd.DataFrame(scores)
    means = scores.mean(axis=0)
    sds = scores.std(axis=0)
    results = dict(mean=means, sd=sds)
    results = pd.DataFrame(results)
    return results
I'm getting this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_2608/4224958314.py in <module>
----> 1 evaluate_models_cv(sci_Model_1,X,y,10)
/tmp/ipykernel_2608/3738102976.py in evaluate_models_cv(weka_model, X, y, cv, scoring)
5 n_jobs = -1
6
----> 7 scores = cross_validate(weka_model, X, y, cv = cv, n_jobs = n_jobs, scoring = scoring, return_train_score = return_train_score)
8
9 scores = pd.DataFrame(scores)
/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
258 scorers = scoring
259 elif scoring is None or isinstance(scoring, str):
--> 260 scorers = check_scoring(estimator, scoring)
261 else:
262 scorers = _check_multimetric_scoring(estimator, scoring)
/usr/local/lib/python3.8/dist-packages/sklearn/metrics/_scorer.py in check_scoring(estimator, scoring, allow_none)
475 return None
476 else:
--> 477 raise TypeError(
478 "If no scoring is specified, the estimator passed should "
479 "have a 'score' method. The estimator %r does not." % estimator
TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator weka_classifier(classifier=AttributeSelectedClassifier:
So I tried to solve it by adding the score() function to the class, but that didn't work.
Am I doing something wrong?
OK, it works now. I just hadn't extended the ClassifierMixin class, like this:
class weka_classifier(BaseEstimator, ClassifierMixin):
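For context, ClassifierMixin is what supplies the default score method (mean accuracy of predict(X) against y), which is what check_scoring looks for when no scoring is given. A minimal sketch with a hypothetical toy estimator:

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class AlwaysZero(BaseEstimator, ClassifierMixin):
    # Hypothetical estimator: predicts class 0 for every sample.
    def fit(self, X, y):
        return self

    def predict(self, X):
        return np.zeros(len(X), dtype=int)

est = AlwaysZero().fit([[1], [2]], [0, 1])
print(est.score([[1], [2]], [0, 1]))  # 0.5, provided by ClassifierMixin.score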

Inference on Multiple Label Sentiment analysis using pytorch_lightning

I have trained an LSTM model for sentiment analysis using pytorch_lightning, but I've been having difficulty incorporating inference.
This is my model:
class LSTM(pl.LightningModule):
    def __init__(self, n_vocab, n_embed,
                 n_hidden, n_output, n_layers, learning_rate, embedding_matrix=None):
        super().__init__()
        self.n_vocab = n_vocab
        self.n_layer = n_layers
        self.n_hidden = n_hidden
        self.embedding = nn.Embedding(n_vocab, n_embed, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * n_hidden, n_output)
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()
        self.batch_size = batch_size
        self.learning_rate = learning_rate

    def forward(self, input_words):
        embedded_words = self.embedding(input_words)
        lstm_out, _ = self.lstm(embedded_words)
        lstm_out_f = lstm_out[:, -1, :300]
        lstm_out_b = lstm_out[:, 0, 300:]
        lstm_out_final = torch.cat([lstm_out_f, lstm_out_b], dim=-1)
        lstm_out_final = self.dropout(lstm_out_final)
        fc_out = self.fc(lstm_out_final)
        return fc_out

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def training_step(self, batch, batch_nb):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_nb):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y).to(device='cuda')
        f1 = torchmetrics.F1(num_classes=5).to(device='cuda')
        f1_score = f1(y_hat, y)
        accuracy = Accuracy().to(device='cuda')
        accur = accuracy(y_hat, y)
        self.log("val_loss", loss)
        self.log("f1_score", f1_score)
        self.log("accuracy", accur)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        print(logits)
        logitsies = softmax(logits)
        choice = argmax(logitsies)
        loss = F.nll_loss(logits, y)
        self.log("test_loss", loss)
        return choice

    def predict_step(self, batch, batch_idx, dataloader_idx):
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        logitsies = softmax(logits)
        choice = argmax(logitsies)
        loss = F.nll_loss(logits, y)
        self.log("predict_loss", loss)
        return choice
I called the model as such:
model = LSTM(
    n_vocab=size_of_emb_matrix,
    n_embed=embed_vector_len,
    n_hidden=150,
    n_output=5,
    n_layers=1,
    learning_rate=1e-4,
    embedding_matrix=embedding_matrix
)
Now I am trying to write a function for inference. I have managed to successfully tokenize the input sentence and encode it through already-defined functions, yet I keep getting errors no matter what I try. I am stuck and don't know how to continue. This is my function so far:
def get_sentiment(text):
    x = encode_sentence(text, vocab2index)
    x_bar = x[0]
    y_hat = torch.tensor(x_bar)
    trainer.predict(model, y_hat)
The encode_sentence function is as follows:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length
In the get_sentiment function I call trainer.predict, which is what runs inference.
But I keep getting the following issue:
AttributeError                            Traceback (most recent call last)
<ipython-input-195-825c536cbbb2> in <module>()
----> 1 get_sentiment("love that dress")

13 frames
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/prediction_epoch_loop.py in _store_batch_indices(self, dataloader_idx)
    162 def _store_batch_indices(self, dataloader_idx: int) -> None:
    163     """Stores the batch indices if the predictions should be stored"""
--> 164     batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler
    165     if isinstance(batch_sampler, IndexBatchSamplerWrapper):
    166         self.current_batch_indices = batch_sampler.batch_indices

AttributeError: 'Tensor' object has no attribute 'batch_sampler'
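From the traceback, Lightning is treating the raw tensor passed to trainer.predict as if it were a list of dataloaders, which is why predict_dataloaders[dataloader_idx].batch_sampler ends up being looked up on a Tensor. trainer.predict expects a DataLoader (or a LightningDataModule), so one fix, sketched here under that assumption with a dummy label to satisfy the x, y = batch unpacking in predict_step, is:

from torch.utils.data import DataLoader, TensorDataset

def get_sentiment(text):
    # Sketch: wrap the encoded sentence in a one-element DataLoader
    # so trainer.predict receives a dataloader, not a bare tensor.
    x, length = encode_sentence(text, vocab2index)
    x = torch.tensor(x).unsqueeze(0)        # shape (1, N): a batch of one
    y = torch.zeros(1, dtype=torch.long)    # dummy label for the (x, y) unpacking
    loader = DataLoader(TensorDataset(x, y), batch_size=1)
    return trainer.predict(model, loader)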

AttributeError: 'tuple' object has no attribute 'train_dataloader'

I have three files. In the datamodule file, I have created the data using the basic PyTorch Lightning format. In linear_model I made a linear regression model based on this page. Finally, I have a train file where I call the model and try to fit the data. But I am getting this error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
AttributeError: 'tuple' object has no attribute 'train_dataloader'
Sample datamodule file
class DataModuleClass(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.sigma = 5
        self.batch_size = 10
        self.prepare_data()

    def prepare_data(self):
        x = np.random.uniform(0, 10, 10)
        e = np.random.normal(0, self.sigma, len(x))
        y = x + e
        X = np.transpose(np.array([x, e]))
        self.x_train_tensor = torch.from_numpy(X).float().to(device)
        self.y_train_tensor = torch.from_numpy(y).float().to(device)
        training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
        self.training_dataset = training_dataset

    def setup(self):
        data = self.training_dataset
        self.train_data, self.val_data = random_split(data, [8, 2])
        return self.train_data, self.val_data

    def train_dataloader(self):
        return DataLoader(self.train_data)

    def val_dataloader(self):
        return DataLoader(self.val_data)
Sample training file
from . import datamodule, linear_model

model = linear_model.LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()
trainer.fit(model,
            train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
            val_dataloaders=datamodule.DataModuleClass().setup().val_dataloaders())
Let me know if you need more code or explanation.
Update (Based on the comment)
Now, I am getting the following error after removing self.prepare_data() from the __init__() of DataModuleClass(), removing return self.train_data, self.val_data from setup(), and changing the test file to:
data_module = datamodule.DataModuleClass()
trainer = pl.Trainer()
trainer.fit(model, data_module)
Error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().train_dataloader(),
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/datamodule.py", line 54, in train_dataloader
return DataLoader(self.train_data)
AttributeError: 'DataModuleClass' object has no attribute 'train_data'
Most of the things were correct, except for a few details:
def prepare_data(self):
This function was right, except that it should not return anything.
Another thing was:
def setup(self, stage=None):
setup requires a stage argument, which can default to None when we don't need to switch between different train and test stages.
Putting everything together, here is the code:
from argparse import ArgumentParser

import numpy as np
import pytorch_lightning as pl
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.optimizer import Optimizer
from torch.utils.data import random_split, DataLoader, TensorDataset
from torchvision import transforms

class LinearRegression(pl.LightningModule):
    def __init__(
        self,
        input_dim: int = 2,
        output_dim: int = 1,
        bias: bool = True,
        learning_rate: float = 1e-4,
        optimizer: Optimizer = Adam,
        l1_strength: float = 0.0,
        l2_strength: float = 0.0
    ):
        super().__init__()
        self.save_hyperparameters()
        self.optimizer = optimizer
        self.linear = nn.Linear(in_features=self.hparams.input_dim, out_features=self.hparams.output_dim, bias=bias)

    def forward(self, x):
        y_hat = self.linear(x)
        return y_hat

    def training_step(self, batch, batch_idx):
        x, y = batch
        # flatten any input
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='sum')
        # L1 regularizer
        if self.hparams.l1_strength > 0:
            l1_reg = sum(param.abs().sum() for param in self.parameters())
            loss += self.hparams.l1_strength * l1_reg
        # L2 regularizer
        if self.hparams.l2_strength > 0:
            l2_reg = sum(param.pow(2).sum() for param in self.parameters())
            loss += self.hparams.l2_strength * l2_reg
        loss /= x.size(0)
        tensorboard_logs = {'train_mse_loss': loss}
        progress_bar_metrics = tensorboard_logs
        return {'loss': loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        return {'val_loss': F.mse_loss(y_hat, y)}

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_mse_loss': val_loss}
        progress_bar_metrics = tensorboard_logs
        return {'val_loss': val_loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}

    def configure_optimizers(self):
        return self.optimizer(self.parameters(), lr=self.hparams.learning_rate)

np.random.seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class DataModuleClass(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.sigma = 5
        self.batch_size = 10

    def prepare_data(self):
        x = np.random.uniform(0, 10, 10)
        e = np.random.normal(0, self.sigma, len(x))
        y = x + e
        X = np.transpose(np.array([x, e]))
        self.x_train_tensor = torch.from_numpy(X).float().to(device)
        self.y_train_tensor = torch.from_numpy(y).float().to(device)
        training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
        self.training_dataset = training_dataset

    def setup(self, stage=None):
        data = self.training_dataset
        self.train_data, self.val_data = random_split(data, [8, 2])

    def train_dataloader(self):
        return DataLoader(self.train_data)

    def val_dataloader(self):
        return DataLoader(self.val_data)

model = LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()
dummy = DataModuleClass()
trainer.fit(model, dummy)
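For what it's worth, the reason trainer.fit(model, dummy) now works is that Lightning drives the DataModule hooks itself. Roughly the equivalent manual calls (a sketch, not the actual Trainer internals):

dm = DataModuleClass()
dm.prepare_data()              # builds self.training_dataset
dm.setup(stage='fit')          # splits into train_data / val_data
train_loader = dm.train_dataloader()
val_loader = dm.val_dataloader()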

AttributeError: dataset object has no attribute 'c' FastAI

I am trying to train a ResNet-based UNet for image segmentation. I have the locations of the images and mask images in a csv file, which is why I have created my own dataset class, as follows:
X = list(df['input_img'])
y = list(df['mask_img'])

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=42)

class NumbersDataset():
    def __init__(self, inputs, labels):
        self.X = inputs
        self.y = labels

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        img_train = cv2.imread(self.X[idx])
        img_mask = cv2.imread(self.y[idx])
        img_train = cv2.resize(img_train, (427, 240), interpolation=cv2.INTER_LANCZOS4)
        img_mask = cv2.resize(img_mask, (427, 240), interpolation=cv2.INTER_LANCZOS4)
        return img_train, img_mask
I then use this dataset in the __main__ function:
if __name__ == '__main__':
    dataset_train = NumbersDataset(X_train, y_train)
    dataloader_train = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=2)
    dataset_valid = NumbersDataset(X_valid, y_valid)
    dataloader_valid = DataLoader(dataset_valid, batch_size=4, shuffle=True, num_workers=2)
    datas = DataBunch(train_dl=dataloader_train, valid_dl=dataloader_valid)
    leaner = unet_learner(data=datas, arch=models.resnet34)
But I end up getting the following error:
Traceback (most recent call last):
File "dataset_test.py", line 70, in <module>
leaner = unet_learner(data = datas, arch = models.resnet34)
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/vision/learner.py", line 118, in unet_learner
model = to_device(models.unet.DynamicUnet(body, n_classes=data.c, img_size=size, blur=blur, blur_final=blur_final,
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/basic_data.py", line 122, in __getattr__
def __getattr__(self,k:int)->Any: return getattr(self.train_dl, k)
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/basic_data.py", line 38, in __getattr__
def __getattr__(self,k:str)->Any: return getattr(self.dl, k)
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/basic_data.py", line 20, in DataLoader___getattr__
def DataLoader___getattr__(dl, k:str)->Any: return getattr(dl.dataset, k)
AttributeError: 'NumbersDataset' object has no attribute 'c'
I tried searching and even tried using SegmentationItemList.from_df, but nothing helped. What am I doing wrong here?
You should add the attribute c to your NumbersDataset, like this:
def __init__(self, inputs, labels, c):
    self.X = inputs
    self.y = labels
    self.c = c
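Here c is the number of classes fastai expects; unet_learner reads data.c from the DataBunch, which (as the traceback's __getattr__ chain shows) forwards the attribute lookup to the underlying dataset. A hypothetical instantiation, assuming binary masks (background/foreground):

dataset_train = NumbersDataset(X_train, y_train, c=2)
dataset_valid = NumbersDataset(X_valid, y_valid, c=2)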

custom layer causes "tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [128] vs. [128,256,256]"

I implemented a custom layer called "MultiHeadAttention". When I try to use it, it causes:
tensorflow.python.framework.errors_impl.InvalidArgumentError:
Incompatible shapes: [128] vs. [128,256,256]
...(omit)...(training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape,
training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
MultiHeadAttention code:
class MultiHeadAttention(Layer):
    def __init__(self, n_head: int, model_dim: int, **kwargs):
        self.n_head = n_head
        self.model_dim = model_dim
        self.dim_per_head = model_dim // n_head
        super(MultiHeadAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        self.query_kernel = self.add_weight(name='query_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)
        self.key_kernel = self.add_weight(name='key_kernel',
                                          shape=(input_shape[2], self.dim_per_head * self.n_head),
                                          initializer='uniform', trainable=True)
        self.value_kernel = self.add_weight(name='value_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)
        self.output_kernel = self.add_weight(name='output_kernel',
                                             shape=(self.dim_per_head * self.n_head, self.model_dim),
                                             initializer='uniform', trainable=True)
        self.output_bias = self.add_weight(name='output_bias',
                                           shape=(self.model_dim,),
                                           initializer='zeros', trainable=True)
        super(MultiHeadAttention, self).build(input_shape)

    def call(self, x):
        if isinstance(x, list):
            attn, attn_mask = x
            attn_mask = K.repeat_elements(attn_mask, self.n_head, 0)
        else:
            attn = x
            attn_mask = None
        query_big = K.dot(attn, self.query_kernel)
        key_big = K.dot(attn, self.key_kernel)
        value_big = K.dot(attn, self.value_kernel)  # batch, seq_len, hid*n_head

        def reshape1(x):
            s = list(x.shape)
            x = K.reshape(x, [-1, s[1], self.n_head, s[2] // self.n_head])
            x = K.permute_dimensions(x, [2, 0, 1, 3])
            x = K.reshape(x, [-1, s[1], s[2] // self.n_head])
            return x

        query_big = reshape1(query_big)
        key_big = reshape1(key_big)
        value_big = reshape1(value_big)
        # print(value_big.shape)
        result = scale_dot_product(query_big, key_big, value_big, attn_mask)  # n_head * batch, seq_len, hid

        def reshape2(x):
            s = list(x.shape)  # [n_head * batch_size, len_v, d_v]
            x = K.reshape(x, [self.n_head, -1, s[1], s[2]])
            x = K.permute_dimensions(x, [1, 2, 0, 3])
            x = K.reshape(x, [-1, s[1], self.n_head * s[2]])  # [batch_size, len_v, n_head * d_v]
            return x

        result = reshape2(result)
        result = K.dot(result, self.output_kernel) + self.output_bias
        return result

    def compute_output_shape(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        return (input_shape[0], input_shape[1], self.model_dim)

    def compute_mask(self, inputs, mask=None):
        return None

def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):
    shape_list = list(value.shape)
    mul = K.batch_dot(query, K.permute_dimensions(key, (0, 2, 1)))
    if attn_mask is not None:
        attn_mask = K.cast(attn_mask, dtype=tf.float32)
        mul = attn_mask * mul + (1.0 - attn_mask) * neg_inf
    scale = mul / K.sqrt(K.cast(shape_list[-1], mul.dtype))
    softmax = K.softmax(scale)
    result = K.batch_dot(softmax, value)
    return result
A simple example:
import numpy as np
import keras.backend as K
from keras.optimizers import SGD
from keras import Input, Model, losses
from keras.layers import Embedding, Lambda, Dense
import MultiHeadAttention

if __name__ == "__main__":
    max_len = 256
    word_dim = 200
    vacab_size = 10000
    input = Input(shape=(max_len,), name="Input-Sentence")
    word_embedding = Embedding(vacab_size, word_dim, input_length=max_len,
                               mask_zero=False, trainable=True)(input)
    inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1), name="Input_mask")(input)
    out = word_embedding
    # There is something wrong with the custom MultiHeadAttention layer.
    # If the line below is commented out, everything runs fine.
    out = MultiHeadAttention(n_head=8, model_dim=word_dim)([out, inp_mask])
    out = Dense(2, activation="softmax")(out)
    model = Model(inputs=input, outputs=out)
    model.summary()
    model.compile(optimizer=SGD(), loss=losses.sparse_categorical_crossentropy)
    # example data
    data_num = 1024
    x = np.array(np.random.randint(0, vacab_size, (data_num, max_len)).tolist())
    y = np.array(np.random.randint(0, 2, (data_num, max_len, 1)).tolist())
    print(x.shape, y.shape)
    model.fit(x, y, epochs=24, batch_size=16)
keras==2.2.4
tf == 1.13.1
Error information:
Traceback (most recent call last):
  File "D:\PyCharm Community Edition 2018.1.4\helpers\pydev\pydev_run_in_console.py", line 52, in run_file
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "D:\PyCharm Community Edition 2018.1.4\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents + "\n", file, 'exec'), glob, loc)
  File "C:/Users/Administrator/PyProgram/InfosExtractor/code/BERT/MultiAttentionTest.py", line 30, in <module>
    model.fit(x, y, epochs=24, batch_size=16)
  File "D:\Anaconda3.7\lib\site-packages\keras\engine\training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "D:\Anaconda3.7\lib\site-packages\keras\engine\training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "D:\Anaconda3.7\lib\site-packages\tensorflow\python\client\session.py", line 1454, in __call__
    self._session._session, self._handle, args, status, None)
  File "D:\Anaconda3.7\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 519, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [128] vs. [128,256,256]
  [[Node: training/SGD/gradients/multi_head_attention_1/mul_1_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@training/SGD/gradients/multi_head_attention_1/mul_1_grad/Reshape_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape, training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
The attn_mask's shape doesn't match mul in the scale_dot_product method, so I made some changes:
First, add the keep_dim parameter in "inp_mask": inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1, keep_dim=True), name="Input_mask")(input). But it still doesn't work.
Second, comment out the line attn_mask = K.repeat_elements(attn_mask, self.n_head, 0) and add a new method called "reshape_mask":
def reshape_mask(mask, head_num):
    if mask is None:
        return mask
    seq_len = K.shape(mask)[1]
    mask = K.expand_dims(mask, axis=1)        # (batch, 1, seq_len)
    mask = K.tile(mask, [1, head_num, 1])     # (batch, head_num, seq_len)
    return K.reshape(mask, (-1, seq_len))     # (batch * head_num, seq_len)
Third, rewrite the scale_dot_product method:
def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):
    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    if attn_mask is not None:
        e *= K.cast(K.expand_dims(attn_mask, axis=-2), K.floatx())
    a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())
    v = K.batch_dot(a, value)
    return v
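For these reshapes to line up, the mask fed into the layer presumably has to stay per-token with shape (batch, seq_len), i.e. the K.any(..., axis=-1) reduction in the original Lambda has to go. A hypothetical version of that mask layer under this assumption:

# Per-token padding mask of shape (batch, seq_len); 0 marks padding tokens.
inp_mask = Lambda(lambda t: K.cast(K.not_equal(t, 0), K.floatx()),
                  name="Input_mask")(input)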
Cheers! The problem has been solved!
