I'm trying to write a Weka classifier for Python so I can use Python libraries with Weka models, like #fracpete does in https://github.com/fracpete/sklearn-weka-plugin, but on my own.
Right now I have the following, and it works for making predictions, using the SHAP library, etc.
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from weka.classifiers import Classifier
from weka.core.dataset import Attribute, Instance, Instances

class weka_classifier(BaseEstimator):
    def __init__(self, classifier=None, dataset=None, index=None):
        # classifier: the python-weka-wrapper3 (pww3) model
        # dataset: the data used to fit the model
        # sklearn expects every __init__ parameter to be stored as an attribute
        self.classifier = classifier
        self.dataset = dataset
        self.index = index
        if dataset is not None:
            self.dataset.class_is_last()

    def fit(self, X, y):
        # Weka builds the classifier from the stored Instances,
        # so X and y are unused here; return self so sklearn can chain calls.
        self.classifier.build_classifier(self.dataset)
        return self

    def predict_instance(self, x):
        x.append(0.0)  # placeholder for the (unknown) class attribute
        inst = Instance.create_instance(x, classname='weka.core.DenseInstance', weight=1.0)
        inst.dataset = self.dataset
        return self.classifier.classify_instance(inst)

    def predict_proba_instance(self, x):
        x.append(0.0)
        inst = Instance.create_instance(x, classname='weka.core.DenseInstance', weight=1.0)
        inst.dataset = self.dataset
        return self.classifier.distribution_for_instance(inst)

    def predict_proba(self, X):
        prediction = []
        for i in range(X.shape[0]):
            instance = [X[i][j] for j in range(X.shape[1])]
            instance.append(0.0)  # placeholder for the class attribute
            instance = Instance.create_instance(instance, classname='weka.core.DenseInstance', weight=1.0)
            instance.dataset = self.dataset
            prediction.append(self.classifier.distribution_for_instance(instance))
        return np.asarray(prediction)

    def predict(self, X):
        prediction = []
        for i in range(X.shape[0]):
            instance = [X[i][j] for j in range(X.shape[1])]
            instance.append(0.0)  # placeholder for the class attribute
            instance = Instance.create_instance(instance, classname='weka.core.DenseInstance', weight=1.0)
            instance.dataset = self.dataset
            prediction.append(self.classifier.classify_instance(instance))
        return np.asarray(prediction)

    def set_data(self, dataset):
        self.dataset = dataset
        self.dataset.class_is_last()

    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)
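For reference, a hedged usage sketch of the wrapper (it assumes the JVM has been started via python-weka-wrapper3; the ARFF path and the J48 classifier are illustrative, not from the original post):

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier

jvm.start()

# Load a dataset (hypothetical path) and wrap a Weka model.
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("dataset.arff")

sci_Model_1 = weka_classifier(
    classifier=Classifier(classname="weka.classifiers.trees.J48"),
    dataset=data,
)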
But when I try to evaluate the classifier with this function:
evaluate_models_cv(sci_Model_1, X, y, 10)
import pandas as pd
from sklearn.model_selection import cross_validate

def evaluate_models_cv(weka_model, X, y, cv, scoring=None):
    return_train_score = True
    n_jobs = -1

    scores = cross_validate(weka_model, X, y, cv=cv, n_jobs=n_jobs,
                            scoring=scoring, return_train_score=return_train_score)

    scores = pd.DataFrame(scores)
    means = scores.mean(axis=0)
    sds = scores.std(axis=0)

    results = pd.DataFrame(dict(mean=means, sd=sds))
    return results
I'm getting this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_2608/4224958314.py in <module>
----> 1 evaluate_models_cv(sci_Model_1,X,y,10)
/tmp/ipykernel_2608/3738102976.py in evaluate_models_cv(weka_model, X, y, cv, scoring)
5 n_jobs = -1
6
----> 7 scores = cross_validate(weka_model, X, y, cv = cv, n_jobs = n_jobs, scoring = scoring, return_train_score = return_train_score)
8
9 scores = pd.DataFrame(scores)
/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
258 scorers = scoring
259 elif scoring is None or isinstance(scoring, str):
--> 260 scorers = check_scoring(estimator, scoring)
261 else:
262 scorers = _check_multimetric_scoring(estimator, scoring)
/usr/local/lib/python3.8/dist-packages/sklearn/metrics/_scorer.py in check_scoring(estimator, scoring, allow_none)
475 return None
476 else:
--> 477 raise TypeError(
478 "If no scoring is specified, the estimator passed should "
479 "have a 'score' method. The estimator %r does not." % estimator
TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator weka_classifier(classifier=AttributeSelectedClassifier:
So I tried to solve it by adding the score() function to the class, but that didn't work.
Am I doing something wrong?
OK, it works now. I hadn't extended the ClassifierMixin class:
class weka_classifier(BaseEstimator, ClassifierMixin):
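For context, a minimal sketch of why this matters: ClassifierMixin marks the estimator as a classifier and supplies a default score method, which is what sklearn's check_scoring() requires when no scoring argument is given.

from sklearn.base import BaseEstimator, ClassifierMixin

class weka_classifier(BaseEstimator, ClassifierMixin):
    # ClassifierMixin sets _estimator_type = "classifier" and provides
    # a default score(X, y) returning mean accuracy, satisfying
    # check_scoring() when scoring=None.
    ...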
I have trained an LSTM model for sentiment analysis using pytorch_lightning, but I've been having difficulty with inference.
This is my model:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Accuracy

class LSTM(pl.LightningModule):
    def __init__(self, n_vocab, n_embed,
                 n_hidden, n_output, n_layers, learning_rate, embedding_matrix=None):
        super().__init__()
        self.n_vocab = n_vocab
        self.n_layer = n_layers
        self.n_hidden = n_hidden
        self.embedding = nn.Embedding(n_vocab, n_embed, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * n_hidden, n_output)
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()
        self.learning_rate = learning_rate

    def forward(self, input_words):
        embedded_words = self.embedding(input_words)
        lstm_out, _ = self.lstm(embedded_words)
        # Last time step of the forward direction, first time step of the
        # backward direction (slice by n_hidden rather than a hard-coded size).
        lstm_out_f = lstm_out[:, -1, :self.n_hidden]
        lstm_out_b = lstm_out[:, 0, self.n_hidden:]
        lstm_out_final = torch.cat([lstm_out_f, lstm_out_b], dim=-1)
        lstm_out_final = self.dropout(lstm_out_final)
        fc_out = self.fc(lstm_out_final)
        return fc_out

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def training_step(self, batch, batch_nb):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_nb):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        f1 = torchmetrics.F1(num_classes=5).to(self.device)
        f1_score = f1(y_hat, y)
        accuracy = Accuracy().to(self.device)
        accur = accuracy(y_hat, y)
        self.log("val_loss", loss)
        self.log("f1_score", f1_score)
        self.log("accuracy", accur)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        logitsies = torch.softmax(logits, dim=-1)
        choice = torch.argmax(logitsies, dim=-1)
        loss = F.cross_entropy(logits, y)  # cross_entropy expects raw logits
        self.log("test_loss", loss)
        return choice

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        logitsies = torch.softmax(y_hat, dim=-1)
        choice = torch.argmax(logitsies, dim=-1)
        loss = F.cross_entropy(y_hat, y)
        self.log("predict_loss", loss)
        return choice
I instantiate the model like this:
model = LSTM(
    n_vocab=size_of_emb_matrix,
    n_embed=embed_vector_len,
    n_hidden=150,
    n_output=5,
    n_layers=1,
    learning_rate=1e-4,
    embedding_matrix=embedding_matrix,
)
Now I am trying to write a function that lets me run inference. I have managed to successfully tokenize the input sentence and encode it through already pre-defined functions, yet I keep getting errors no matter what I try. I am stuck and don't know how to continue. This is my function so far:
def get_sentiment(text):
    x = encode_sentence(text, vocab2index)
    x_bar = x[0]
    y_hat = torch.tensor(x_bar)
    trainer.predict(model, y_hat)
The encode_sentence function is as follows:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length
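Note that encode_sentence returns a tuple rather than just the array, which is why get_sentiment takes x[0]. A quick illustration (with a hypothetical vocab2index):

# encode_sentence returns (encoded_array, original_length).
encoded, length = encode_sentence("love that dress", vocab2index)
print(encoded.shape)  # (70,) -- padded/truncated to N tokens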
When I call get_sentiment, I use the trainer.predict function, which should let me run inference.
But I have been getting the following error:
AttributeError                            Traceback (most recent call last)
<ipython-input-195-825c536cbbb2> in <module>()
----> 1 get_sentiment("love that dress")

13 frames
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/prediction_epoch_loop.py in _store_batch_indices(self, dataloader_idx)
    162     def _store_batch_indices(self, dataloader_idx: int) -> None:
    163         """Stores the batch indices if the predictions should be stored"""
--> 164         batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler
    165         if isinstance(batch_sampler, IndexBatchSamplerWrapper):
    166             self.current_batch_indices = batch_sampler.batch_indices

AttributeError: 'Tensor' object has no attribute 'batch_sampler'
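The traceback suggests trainer.predict is treating the second argument as a DataLoader, not a raw tensor. A hedged sketch of a workaround, assuming predict_step unpacks (x, y) pairs as defined above (the dummy label is only there to satisfy that unpacking):

from torch.utils.data import DataLoader, TensorDataset

def get_sentiment(text):
    encoded, length = encode_sentence(text, vocab2index)
    x = torch.tensor(encoded).unsqueeze(0)      # shape (1, 70): a batch of one
    dummy_y = torch.zeros(1, dtype=torch.long)  # placeholder label for (x, y) unpacking
    loader = DataLoader(TensorDataset(x, dummy_y), batch_size=1)
    return trainer.predict(model, dataloaders=loader)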
I have 3 files. In the datamodule file, I have created the data and used the basic PyTorch Lightning format. In linear_model I made a linear regression model based on this page. Finally, I have a train file where I instantiate the model and try to fit the data. But I am getting this error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
AttributeError: 'tuple' object has no attribute 'train_dataloader'
Sample datamodule file
class DataModuleClass(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.sigma = 5
        self.batch_size = 10
        self.prepare_data()

    def prepare_data(self):
        x = np.random.uniform(0, 10, 10)
        e = np.random.normal(0, self.sigma, len(x))
        y = x + e
        X = np.transpose(np.array([x, e]))
        self.x_train_tensor = torch.from_numpy(X).float().to(device)
        self.y_train_tensor = torch.from_numpy(y).float().to(device)
        training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
        self.training_dataset = training_dataset

    def setup(self):
        data = self.training_dataset
        self.train_data, self.val_data = random_split(data, [8, 2])
        return self.train_data, self.val_data

    def train_dataloader(self):
        return DataLoader(self.train_data)

    def val_dataloader(self):
        return DataLoader(self.val_data)
Sample training file
from . import datamodule, linear_model

model = linear_model.LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()

trainer.fit(model,
            train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
            val_dataloaders=datamodule.DataModuleClass().setup().val_dataloaders())
Let me know if you need more code or explanation.
Update (Based on the comment)
Now, I am getting the following error after removing self.prepare_data() from the __init__() of DataModuleClass(), removing return self.train_data, self.val_data from setup(), and changing the test file to
data_module = datamodule.DataModuleClass()
trainer = pl.Trainer()
trainer.fit(model, data_module)
Error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().train_dataloader(),
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/datamodule.py", line 54, in train_dataloader
return DataLoader(self.train_data)
AttributeError: 'DataModuleClass' object has no attribute 'train_data'
Most of the things were correct, except a few details:
def prepare_data(self):
This function was right, except that it should not return anything.
Another thing was:
def setup(self, stage=None):
setup requires a stage argument, which can default to None if we don't want to switch between different test and train stages.
Putting everything together, here is the code:
from argparse import ArgumentParser

import numpy as np
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.optimizer import Optimizer
from torch.utils.data import random_split, DataLoader, TensorDataset
from torchvision import transforms
class LinearRegression(pl.LightningModule):
    def __init__(
        self,
        input_dim: int = 2,
        output_dim: int = 1,
        bias: bool = True,
        learning_rate: float = 1e-4,
        optimizer: Optimizer = Adam,
        l1_strength: float = 0.0,
        l2_strength: float = 0.0
    ):
        super().__init__()
        self.save_hyperparameters()
        self.optimizer = optimizer
        self.linear = nn.Linear(in_features=self.hparams.input_dim,
                                out_features=self.hparams.output_dim, bias=bias)

    def forward(self, x):
        y_hat = self.linear(x)
        return y_hat

    def training_step(self, batch, batch_idx):
        x, y = batch

        # flatten any input
        x = x.view(x.size(0), -1)

        y_hat = self(x)

        loss = F.mse_loss(y_hat, y, reduction='sum')

        # L1 regularizer
        if self.hparams.l1_strength > 0:
            l1_reg = sum(param.abs().sum() for param in self.parameters())
            loss += self.hparams.l1_strength * l1_reg

        # L2 regularizer
        if self.hparams.l2_strength > 0:
            l2_reg = sum(param.pow(2).sum() for param in self.parameters())
            loss += self.hparams.l2_strength * l2_reg

        loss /= x.size(0)

        tensorboard_logs = {'train_mse_loss': loss}
        progress_bar_metrics = tensorboard_logs
        return {'loss': loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        return {'val_loss': F.mse_loss(y_hat, y)}

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_mse_loss': val_loss}
        progress_bar_metrics = tensorboard_logs
        return {'val_loss': val_loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}

    def configure_optimizers(self):
        return self.optimizer(self.parameters(), lr=self.hparams.learning_rate)
np.random.seed(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class DataModuleClass(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.sigma = 5
        self.batch_size = 10

    def prepare_data(self):
        x = np.random.uniform(0, 10, 10)
        e = np.random.normal(0, self.sigma, len(x))
        y = x + e
        X = np.transpose(np.array([x, e]))
        self.x_train_tensor = torch.from_numpy(X).float().to(device)
        self.y_train_tensor = torch.from_numpy(y).float().to(device)
        training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
        self.training_dataset = training_dataset

    def setup(self, stage=None):
        data = self.training_dataset
        self.train_data, self.val_data = random_split(data, [8, 2])

    def train_dataloader(self):
        return DataLoader(self.train_data)

    def val_dataloader(self):
        return DataLoader(self.val_data)

model = LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()

dummy = DataModuleClass()
trainer.fit(model, dummy)
I am trying to train a ResNet-based UNet for image segmentation. I have the locations of the images and mask images in a csv file, which is why I created my own dataset class, as follows:
import cv2
from sklearn.model_selection import train_test_split

X = list(df['input_img'])
y = list(df['mask_img'])

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=42)

class NumbersDataset():
    def __init__(self, inputs, labels):
        self.X = inputs
        self.y = labels

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        img_train = cv2.imread(self.X[idx])
        img_mask = cv2.imread(self.y[idx])
        img_train = cv2.resize(img_train, (427, 240), interpolation=cv2.INTER_LANCZOS4)
        img_mask = cv2.resize(img_mask, (427, 240), interpolation=cv2.INTER_LANCZOS4)
        return img_train, img_mask
I then use this dataset in the __main__ block:
if __name__ == '__main__':
    dataset_train = NumbersDataset(X_train, y_train)
    dataloader_train = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=2)

    dataset_valid = NumbersDataset(X_valid, y_valid)
    dataloader_valid = DataLoader(dataset_valid, batch_size=4, shuffle=True, num_workers=2)

    datas = DataBunch(train_dl=dataloader_train, valid_dl=dataloader_valid)
    leaner = unet_learner(data=datas, arch=models.resnet34)
But I end up getting the following error:
Traceback (most recent call last):
File "dataset_test.py", line 70, in <module>
leaner = unet_learner(data = datas, arch = models.resnet34)
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/vision/learner.py", line 118, in unet_learner
model = to_device(models.unet.DynamicUnet(body, n_classes=data.c, img_size=size, blur=blur, blur_final=blur_final,
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/basic_data.py", line 122, in __getattr__
def __getattr__(self,k:int)->Any: return getattr(self.train_dl, k)
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/basic_data.py", line 38, in __getattr__
def __getattr__(self,k:str)->Any: return getattr(self.dl, k)
File "/home/sarvagya/miniconda3/envs/gr/lib/python3.6/site-packages/fastai/basic_data.py", line 20, in DataLoader___getattr__
def DataLoader___getattr__(dl, k:str)->Any: return getattr(dl.dataset, k)
AttributeError: 'NumbersDataset' object has no attribute 'c'
I tried searching and even tried using SegmentationItemList.from_df, but nothing helped. What am I getting wrong here?
You should add the attribute c to your NumbersDataset, like this (keeping the attribute names self.X and self.y so __len__ and __getitem__ still work):
def __init__(self, inputs, labels, c):
    self.X = inputs
    self.y = labels
    self.c = c
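For context, fastai v1's unet_learner reads data.c (the number of classes) from the underlying dataset to size the network's output. A hypothetical usage, assuming binary masks:

# c is the number of segmentation classes the UNet should predict.
dataset_train = NumbersDataset(X_train, y_train, c=2)  # e.g. background + foreground
dataset_valid = NumbersDataset(X_valid, y_valid, c=2)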
I implemented a custom layer called "MultiHeadAttention". When I try to use it, it causes:
tensorflow.python.framework.errors_impl.InvalidArgumentError:
Incompatible shapes: [128] vs. [128,256,256]
...(omit)...(training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape,
training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
MultiHeadAttention code:
import tensorflow as tf
from keras import backend as K
from keras.layers import Layer

NEG_INF = -1e9  # large negative value used to mask out attention scores

class MultiHeadAttention(Layer):
    def __init__(self, n_head: int, model_dim: int, **kwargs):
        self.n_head = n_head
        self.model_dim = model_dim
        self.dim_per_head = model_dim // n_head
        super(MultiHeadAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        self.query_kernel = self.add_weight(name='query_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)
        self.key_kernel = self.add_weight(name='key_kernel',
                                          shape=(input_shape[2], self.dim_per_head * self.n_head),
                                          initializer='uniform', trainable=True)
        self.value_kernel = self.add_weight(name='value_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)
        self.output_kernel = self.add_weight(name='output_kernel',
                                             shape=(self.dim_per_head * self.n_head, self.model_dim),
                                             initializer='uniform', trainable=True)
        self.output_bias = self.add_weight(name='output_bias',
                                           shape=(self.model_dim,),
                                           initializer='zeros', trainable=True)
        super(MultiHeadAttention, self).build(input_shape)

    def call(self, x):
        if isinstance(x, list):
            attn, attn_mask = x
            attn_mask = K.repeat_elements(attn_mask, self.n_head, 0)
        else:
            attn = x
            attn_mask = None
        query_big = K.dot(attn, self.query_kernel)
        key_big = K.dot(attn, self.key_kernel)
        value_big = K.dot(attn, self.value_kernel)  # batch, seq_len, hid * n_head

        def reshape1(x):
            s = list(x.shape)
            x = K.reshape(x, [-1, s[1], self.n_head, s[2] // self.n_head])
            x = K.permute_dimensions(x, [2, 0, 1, 3])
            x = K.reshape(x, [-1, s[1], s[2] // self.n_head])
            return x

        query_big = reshape1(query_big)
        key_big = reshape1(key_big)
        value_big = reshape1(value_big)

        result = scale_dot_product(query_big, key_big, value_big, attn_mask)  # n_head * batch, seq_len, hid

        def reshape2(x):
            s = list(x.shape)  # [n_head * batch_size, len_v, d_v]
            x = K.reshape(x, [self.n_head, -1, s[1], s[2]])
            x = K.permute_dimensions(x, [1, 2, 0, 3])
            x = K.reshape(x, [-1, s[1], self.n_head * s[2]])  # [batch_size, len_v, n_head * d_v]
            return x

        result = reshape2(result)
        result = K.dot(result, self.output_kernel) + self.output_bias
        return result

    def compute_output_shape(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        return (input_shape[0], input_shape[1], self.model_dim)

    def compute_mask(self, inputs, mask=None):
        return None

def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):
    shape_list = list(value.shape)
    mul = K.batch_dot(query, K.permute_dimensions(key, (0, 2, 1)))
    if attn_mask is not None:
        attn_mask = K.cast(attn_mask, dtype=tf.float32)
        mul = attn_mask * mul + (1.0 - attn_mask) * NEG_INF
    scale = mul / K.sqrt(K.cast(shape_list[-1], mul.dtype))
    softmax = K.softmax(scale)
    result = K.batch_dot(softmax, value)
    return result
A simple example:
import numpy as np
import keras.backend as K
from keras.optimizers import SGD
from keras import Input, Model, losses
from keras.layers import Embedding, Lambda, Dense
from MultiHeadAttention import MultiHeadAttention

if __name__ == "__main__":
    max_len = 256
    word_dim = 200
    vocab_size = 10000
    input = Input(shape=(max_len,), name="Input-Sentence")
    word_embedding = Embedding(vocab_size, word_dim, input_length=max_len,
                               mask_zero=False, trainable=True)(input)
    inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1), name="Input_mask")(input)
    out = word_embedding
    # There is something wrong with the custom MultiHeadAttention layer.
    # If the line below is commented out, everything works.
    out = MultiHeadAttention(n_head=8, model_dim=word_dim)([out, inp_mask])
    out = Dense(2, activation="softmax")(out)
    model = Model(inputs=input, outputs=out)
    model.summary()
    model.compile(optimizer=SGD(), loss=losses.sparse_categorical_crossentropy)

    # example data
    data_num = 1024
    x = np.random.randint(0, vocab_size, (data_num, max_len))
    y = np.random.randint(0, 2, (data_num, max_len, 1))
    print(x.shape, y.shape)
    model.fit(x, y, epochs=24, batch_size=16)
Versions: keras==2.2.4, tensorflow==1.13.1
Error information:
Traceback (most recent call last):
File "D:\PyCharm Community Edition
2018.1.4\helpers\pydev\pydev_run_in_console.py", line 52, in run_file
pydev_imports.execfile(file, globals, locals) # execute the script
File "D:\PyCharm Community Edition
2018.1.4\helpers\pydev_pydev_imps_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File
"C:/Users/Administrator/PyProgram/InfosExtractor/code/BERT/MultiAttentionTest.py",
line 30, in
model.fit(x, y, epochs=24, batch_size=16)
File "D:\Anaconda3.7\lib\site-packages\keras\engine\training.py", line
1039, in fit
validation_steps=validation_steps)
File
"D:\Anaconda3.7\lib\site-packages\keras\engine\training_arrays.py",
line 199, in fit_loop
outs = f(ins_batch)
File
"D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py",
line 2715, in call
return self._call(inputs)
File
"D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py",
line 2675, in _call
fetched = self._callable_fn(*array_vals)
File
"D:\Anaconda3.7\lib\site-packages\tensorflow\python\client\session.py",
line 1454, in call
self._session._session, self._handle, args, status, None)
File
"D:\Anaconda3.7\lib\site-packages\tensorflow\python\framework\errors_impl.py",
line 519, in exit
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError:
Incompatible shapes: [128] vs. [128,256,256]
[[Node:
training/SGD/gradients/multi_head_attention_1/mul_1_grad/BroadcastGradientArgs
= BroadcastGradientArgs[T=DT_INT32, _class=["loc:#training/SGD/gradients/multi_head_attention_1/mul_1_grad/Reshape_1"],
_device="/job:localhost/replica:0/task:0/device:CPU:0"](training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape,
training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
The problem was that attn_mask's shape didn't match mul in the scale_dot_product method. So I made some changes:
First, I added the keepdims parameter to inp_mask: inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1, keepdims=True), name="Input_mask")(input). But it still didn't work.
Second, I commented out the line attn_mask = K.repeat_elements(attn_mask, self.n_head, 0) and wrote a new helper called reshape_mask (its wiring into call() is sketched after the helper):
def reshape_mask(mask, head_num):
    if mask is None:
        return mask
    seq_len = K.shape(mask)[1]
    mask = K.expand_dims(mask, axis=1)
    mask = K.tile(mask, [1, head_num, 1])
    return K.reshape(mask, (-1, seq_len))
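The post doesn't show where reshape_mask is hooked in; presumably it replaces the commented repeat_elements line inside call(), something like this sketch:

# Hypothetical wiring inside MultiHeadAttention.call(), replacing
# the old K.repeat_elements(...) line:
attn_mask = reshape_mask(attn_mask, self.n_head)  # -> (batch * n_head, seq_len)
result = scale_dot_product(query_big, key_big, value_big, attn_mask)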
Third, I rewrote the scale_dot_product method:
def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):
    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
    # numerically stable softmax with optional masking
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    if attn_mask is not None:
        e *= K.cast(K.expand_dims(attn_mask, axis=-2), K.floatx())
    a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())
    v = K.batch_dot(a, value)
    return v
Cheers! The problem has been solved!