I am developing a Bi-LSTM model and want to add an attention layer to it, but I can't figure out how to add it.
My current code for the model is
# Bi-LSTM binary classifier. The embedding is initialised from `embeddings`;
# batch normalisation, tanh and dropout follow both the embedding and the
# bidirectional LSTM before the single sigmoid output unit.
model = Sequential([
    Embedding(max_words, 1152, input_length=max_len, weights=[embeddings]),
    BatchNormalization(),
    Activation('tanh'),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    BatchNormalization(),
    Activation('tanh'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()
And the model summary is
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 1152, 1152) 278396928
_________________________________________________________________
batch_normalization_1 (Batch (None, 1152, 1152) 4608
_________________________________________________________________
activation_1 (Activation) (None, 1152, 1152) 0
_________________________________________________________________
dropout_1 (Dropout) (None, 1152, 1152) 0
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64) 303360
_________________________________________________________________
batch_normalization_2 (Batch (None, 64) 256
_________________________________________________________________
activation_2 (Activation) (None, 64) 0
_________________________________________________________________
dropout_2 (Dropout) (None, 64) 0
_________________________________________________________________
dense_1 (Dense) (None, 1) 65
=================================================================
Total params: 278,705,217
Trainable params: 278,702,785
Non-trainable params: 2,432
One possible solution is a custom layer that computes attention over the positional/temporal dimension:
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
class Attention(Layer):
    """Additive attention over the time axis of a 3D tensor.

    For an input ``x`` of shape ``(batch, timesteps, features)`` the layer
    computes one score per timestep, ``e = tanh(x . W + b)``, normalises the
    scores with a softmax along the time axis and re-weights ``x`` with them.

    Args:
        return_sequences: if True, return the re-weighted sequence (3D);
            if False, sum it over the time axis and return a 2D
            ``(batch, features)`` tensor.
        **kwargs: standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).

    Note:
        The bias holds one entry per timestep (``input_shape[1]``), so the
        sequence length must be known when the layer is built.
    """

    def __init__(self, return_sequences=True, **kwargs):
        # Initialise the base Layer first and forward name/dtype/etc.; this
        # is also what lets the layer be rebuilt from its config.
        super(Attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the scoring weights: W is (features, 1), b is (timesteps, 1)."""
        self.W = self.add_weight(name="att_weight",
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias",
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(Attention, self).build(input_shape)

    def call(self, x):
        """Apply attention to ``x`` and return the weighted sequence or its sum."""
        e = K.tanh(K.dot(x, self.W) + self.b)  # (batch, timesteps, 1) scores
        a = K.softmax(e, axis=1)               # normalise across timesteps
        output = x * a                         # weights broadcast over features
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

    def get_config(self):
        """Return the constructor arguments so a saved model can be reloaded."""
        config = super(Attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config
It is built to receive 3D tensors and to output either 3D tensors (return_sequences=True) or 2D tensors (return_sequences=False). Below is a dummy example.
# Synthetic toy dataset: random integer token ids with random 0/1 targets.
max_len, max_words = 100, 333
emb_dim, n_sample = 126, 5
X = np.random.randint(low=0, high=max_words, size=(n_sample, max_len))
Y = np.random.randint(low=0, high=2, size=n_sample)
with return_sequences=True
# Attention keeps the time axis here (3D in, 3D out), so another LSTM can be
# stacked on top of it.
model = Sequential([
    Embedding(max_words, emb_dim, input_length=max_len),
    Bidirectional(LSTM(32, return_sequences=True)),
    Attention(return_sequences=True),
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X, Y, epochs=3)
with return_sequences=False
# Attention sums over the time axis here (3D in, 2D out), so its output can
# feed the Dense classifier directly.
model = Sequential([
    Embedding(max_words, emb_dim, input_length=max_len),
    Bidirectional(LSTM(32, return_sequences=True)),
    Attention(return_sequences=False),
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X, Y, epochs=3)
You can integrate it into your networks easily
here the running notebook
In case someone is using only TensorFlow (tf.keras) rather than the standalone Keras package, this is the way to do it:
import tensorflow as tf
class Attention(tf.keras.layers.Layer):
    """Additive attention over the time axis that also exposes its weights.

    For an input ``x`` of shape ``(batch, timesteps, features)`` the layer
    scores every timestep with ``tanh(x . W + b)``, normalises the scores with
    a softmax along the time axis and re-weights ``x`` with them.

    Args:
        return_sequences: if True the second returned tensor is the
            re-weighted sequence (3D); if False it is that sequence summed
            over the time axis (2D).
        name: optional layer name.
        **kwargs: further ``tf.keras.layers.Layer`` keyword arguments.

    Returns (from ``call``):
        A tuple ``(a, output)`` where ``a`` holds the attention weights of
        shape ``(batch, timesteps, 1)``.

    Note:
        The bias holds one entry per timestep (``input_shape[1]``), so the
        sequence length must be known when the layer is built.
    """

    def __init__(self, return_sequences=True, name=None, **kwargs):
        # Call the base initialiser exactly once: a second call without
        # `name` would re-initialise the layer and discard the given name.
        super(Attention, self).__init__(name=name, **kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the scoring weights: W is (features, 1), b is (timesteps, 1)."""
        self.W = self.add_weight(name="att_weight",
                                 shape=(input_shape[-1], 1),
                                 initializer="glorot_uniform",
                                 trainable=True)
        self.b = self.add_weight(name="att_bias",
                                 shape=(input_shape[1], 1),
                                 initializer="glorot_uniform",
                                 trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        """Return the attention weights together with the attended output."""
        e = tf.keras.activations.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.activations.softmax(e, axis=1)  # normalise across timesteps
        output = x * a                               # weights broadcast over features
        if self.return_sequences:
            return a, output
        return a, tf.keras.backend.sum(output, axis=1)

    def get_config(self):
        """Return the constructor arguments so a saved model can be reloaded."""
        config = super().get_config().copy()
        config.update({
            'return_sequences': self.return_sequences
        })
        return config
Related
I am training a RNN based English to Hindi Neural Machine Translation model. I have an LSTM layer in it and attention layer, too. I am getting an error that (0) Invalid argument: logits and labels must be broadcastable: logits_size=[384,2971] labels_size=[864,2971]
My model summary is
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, None)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, None)] 0
__________________________________________________________________________________________________
embedding (Embedding) (None, None, 40) 93760 input_1[0][0]
__________________________________________________________________________________________________
embedding_1 (Embedding) (None, None, 40) 118840 input_2[0][0]
__________________________________________________________________________________________________
conv1d (Conv1D) (None, None, 16) 7056 embedding[0][0]
__________________________________________________________________________________________________
conv1d_1 (Conv1D) (None, None, 16) 10256 embedding_1[0][0]
__________________________________________________________________________________________________
lstm (LSTM) [(None, 40), (None, 9120 conv1d[0][0]
__________________________________________________________________________________________________
lstm_1 (LSTM) [(None, None, 40), ( 9120 conv1d_1[0][0]
lstm[0][1]
lstm[0][2]
__________________________________________________________________________________________________
dense (Dense) (None, None, 2971) 121811 lstm_1[0][0]
==================================================================================================
Total params: 369,963
Trainable params: 369,963
Non-trainable params: 0
__________________________________________________________________________________________________
Model compile code is
# Compile the model, then derive the sample counts and fix the training
# hyper-parameters used by the fit call.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
train_samples, val_samples = len(X_train), len(X_test)
batch_size, epochs = 32, 100
And fit is
# NOTE: `Model.fit_generator` is deprecated in recent TensorFlow releases;
# `Model.fit` accepts generators directly.
train_batches = generate_batch(X_train, y_train, batch_size=batch_size)
val_batches = generate_batch(X_test, y_test, batch_size=batch_size)
history = model.fit_generator(
    generator=train_batches,
    steps_per_epoch=train_samples // batch_size,
    epochs=epochs,
    validation_data=val_batches,
    validation_steps=val_samples // batch_size,
)
And the error is
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:1844: UserWarning: `Model.fit_generator` is deprecated and will be removed in a future version. Please use `Model.fit`, which supports generators.
warnings.warn('`Model.fit_generator` is deprecated and '
Epoch 1/100
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-39-dc64566948be> in <module>()
3 epochs=epochs,
4 validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
----> 5 validation_steps = val_samples//batch_size)
7 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: logits and labels must be broadcastable: logits_size=[384,2971] labels_size=[864,2971]
[[node categorical_crossentropy/softmax_cross_entropy_with_logits (defined at <ipython-input-39-dc64566948be>:5) ]]
[[gradient_tape/model_1/embedding_3/embedding_lookup/Reshape/_56]]
(1) Invalid argument: logits and labels must be broadcastable: logits_size=[384,2971] labels_size=[864,2971]
[[node categorical_crossentropy/softmax_cross_entropy_with_logits (defined at <ipython-input-39-dc64566948be>:5) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_11885]
Function call stack:
train_function -> train_function
You have to properly connect your embedding to CNN and your CNN to LSTM
Encoder:
# Encoder: token ids -> embedding -> Conv1D -> LSTM. The LSTM's hidden and
# cell states are collected in `encoder_states`.
num_encoder_tokens, latent_dim = 333, 128
encoder_inputs = Input(shape=(None,))
enc_emb_layer = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)
enc_emb = enc_emb_layer(encoder_inputs)
encoder_conv_layer = Conv1D(16, kernel_size=11, activation='relu')
encoder_CNN = encoder_conv_layer(enc_emb)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_CNN)
encoder_states = [state_h, state_c]
Decoder:
# Decoder: token ids -> embedding -> Conv1D -> LSTM started from the encoder
# states -> softmax over the decoder vocabulary at every timestep.
num_decoder_tokens = 333
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_conv_layer = Conv1D(16, kernel_size=11, activation='relu')
decoder_CNN = decoder_conv_layer(dec_emb)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_CNN, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full training model: maps [encoder ids, decoder ids] to token probabilities.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
Summary:
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_2 (InputLayer) [(None, None)] 0
__________________________________________________________________________________________________
input_4 (InputLayer) [(None, None)] 0
__________________________________________________________________________________________________
embedding (Embedding) (None, None, 128) 42624 input_2[0][0]
__________________________________________________________________________________________________
embedding_1 (Embedding) (None, None, 128) 42624 input_4[0][0]
__________________________________________________________________________________________________
conv1d (Conv1D) (None, None, 16) 22544 embedding[0][0]
__________________________________________________________________________________________________
conv1d_1 (Conv1D) (None, None, 16) 22544 embedding_1[0][0]
__________________________________________________________________________________________________
lstm (LSTM) [(None, 128), (None, 74240 conv1d[0][0]
__________________________________________________________________________________________________
lstm_1 (LSTM) [(None, None, 128), 74240 conv1d_1[0][0]
lstm[0][1]
lstm[0][2]
__________________________________________________________________________________________________
dense (Dense) (None, None, 333) 42957 lstm_1[0][0]
==================================================================================================
I'm trying to train a multilabel classification from text input.
I first tokenize the text
# Fit the tokenizer on the text column, turn every text into a sequence of
# word indices and bring all sequences to length `maxlen`.
texts = df['text']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)
getting the following shape:
Shape of data tensor: (1333, 100) Shape of label tensor: (1333,)
Then I split in train and validations
# The first `training_samples` rows are used for training; the following
# `validation_samples` rows are used for validation.
train_end = training_samples
val_end = training_samples + validation_samples
x_train, y_train = data[:train_end], labels[:train_end]
x_val, y_val = data[train_end:val_end], labels[train_end:val_end]
I use Glove for word representations
# Parse the GloVe text file: every line is a word followed by its vector.
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps decoding independent of the platform default.
embeddings_index = {}
glove_path = os.path.join(glove_dir, 'glove.6B.100d.txt')
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Build the (max_words, embedding_dim) matrix; rows of words missing from
# GloVe stay all-zero.
# NOTE(review): `word_index` is presumably the tokenizer's word -> index
# mapping (e.g. `tokenizer.word_index`) -- confirm it is defined beforehand.
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue  # index falls outside the vocabulary kept by the model
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
I build the Keras model
# Embedding -> flatten -> one hidden layer -> 16-way softmax output.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(16, activation='softmax'),
])
model.summary()
Ending up with
Model: "sequential_32"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_27 (Embedding) (None, 100, 100) 1000000
_________________________________________________________________
flatten_21 (Flatten) (None, 10000) 0
_________________________________________________________________
dense_56 (Dense) (None, 64) 640064
_________________________________________________________________
dense_57 (Dense) (None, 16) 1040
=================================================================
Total params: 1,641,104
Trainable params: 1,641,104
Non-trainable params: 0
I set the weights of the embedding layer:
# Load the GloVe matrix into the embedding layer and freeze it, then compile
# and train.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)
But I end up with the error
ValueError: Shapes (None, 1) and (None, 16) are incompatible
Everything works right if I do a single-label classification (using Dense(1) as last layer and sigmoid activation), but I can't understand why this is happening.
You should encode your labels in one-hot format if you use categorical_crossentropy.
Otherwise, you can use sparse_categorical_crossentropy as the loss function, which accepts integer labels in your current format (more info at the link below).
https://stats.stackexchange.com/questions/326065/cross-entropy-vs-sparse-cross-entropy-when-to-use-one-over-the-other
I have created the following siamese neural network with categorical labels:
# Random toy data for the siamese network: two feature matrices with values
# in [0, 1) and one random binary label per example.
EXAMPLES = 10000
FEATURES = 30
LEFT = np.random.random((EXAMPLES, FEATURES))
RIGHT = np.random.random((EXAMPLES, FEATURES))
# Draw all labels in a single vectorised call instead of appending to a list
# one element at a time.
LABELS = np.random.randint(0, 2, size=EXAMPLES)
# One-hot encode the integer labels (used with the 2-unit softmax output).
LABELSOFTMAX=to_categorical(LABELS)
def cosine_distance(vecs):
    """Return the per-sample cosine distance between two batches of vectors.

    Args:
        vecs: pair ``(a, b)`` of tensors whose last axis holds the features.

    Returns:
        Tensor with the feature axis reduced to size 1, holding
        ``1 - cos(a, b)`` for every pair of rows.
    """
    left, right = vecs
    left = K.l2_normalize(left, axis=-1)
    right = K.l2_normalize(right, axis=-1)
    # keepdims=True keeps one distance per sample, which is the (batch, 1)
    # shape declared by cosine_dist_output_shape. Averaging with K.mean would
    # collapse the whole batch into a single scalar.
    return 1 - K.sum(left * right, axis=-1, keepdims=True)
def cosine_dist_output_shape(shapes):
    """Return the output shape ``(batch, 1)`` for a pair of input shapes.

    The batch size is taken from the first shape; the computed shape is also
    printed.
    """
    first_shape, _ = shapes
    out_shape = (first_shape[0], 1)
    print(out_shape)
    return out_shape
# Input placeholders; the left/right pair feeds the two siamese branches.
feature_shape = (FEATURES,)
inputShape = Input(shape=feature_shape)
left_input = Input(shape=feature_shape)
right_input = Input(shape=feature_shape)
First Implementation
# Shared encoder: the same Sequential model (same weights) is applied to both
# inputs.
model = Sequential([
    Dense(20, activation='relu', input_shape=(30,)),
    BatchNormalization(),
    Dense(10, activation='relu'),
])
encoded_l = model(left_input)
encoded_r = model(right_input)

# The Lambda output is used directly as the network's prediction.
distance_layer = Lambda(cosine_distance, output_shape=cosine_dist_output_shape)
L1_Distance = distance_layer([encoded_l, encoded_r])
siamese_net = Model([left_input, right_input], L1_Distance)
siamese_net.summary()
siamese_net.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0001))
siamese_net.fit(x=[LEFT, RIGHT], y=LABELS, batch_size=64, epochs=100)
First model summary
(None, 1)
Model: "model_28"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_132 (InputLayer) (None, 30) 0
__________________________________________________________________________________________________
input_133 (InputLayer) (None, 30) 0
__________________________________________________________________________________________________
sequential_44 (Sequential) (None, 10) 910 input_132[0][0]
input_133[0][0]
__________________________________________________________________________________________________
lambda_37 (Lambda) (None, 1) 0 sequential_44[1][0]
sequential_44[2][0]
==================================================================================================
Total params: 910
Trainable params: 870
Non-trainable params: 40
Second Implementation
# Shared encoder: the same Sequential model (same weights) is applied to both
# inputs.
model = Sequential()
model.add(Dense(20, activation='relu', input_shape=(30,)))
model.add(BatchNormalization())
model.add(Dense(10, activation='relu'))
encoded_l = model(left_input)
encoded_r = model(right_input)

# Create the Lambda layer once and call it once. Previously `L1_Layer` was
# bound to the layer's output tensor and an undefined `L1_layer` was then
# called on the encodings.
L1_Layer = Lambda(cosine_distance, output_shape=cosine_dist_output_shape)
L1_Diatance = L1_Layer([encoded_l, encoded_r])

# Two-unit softmax head on top of the distance, to be trained against the
# one-hot labels.
prediction = Dense(2, activation='softmax')(L1_Diatance)
siamese_net = Model([left_input, right_input], prediction)
siamese_net.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.001))
siamese_net.summary()
Second Model Summary
Model: "model_29"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_135 (InputLayer) (None, 30) 0
__________________________________________________________________________________________________
input_136 (InputLayer) (None, 30) 0
__________________________________________________________________________________________________
sequential_45 (Sequential) (None, 10) 910 input_135[0][0]
input_136[0][0]
__________________________________________________________________________________________________
lambda_19 (Lambda) multiple 0 sequential_45[1][0]
sequential_45[2][0]
__________________________________________________________________________________________________
dense_140 (Dense) (None, 2) 22 lambda_19[10][0]
==================================================================================================
Total params: 932
Trainable params: 892
Non-trainable params: 40
So my question is: which one is the correct and better implementation? Both run without errors, but are there subtle issues with either of these models? I believe the cosine similarity layer just gives a scalar tensor, which is what confuses me in this case.
I'm trying to make a prediction with my model, where the shape of the input array is (3084, 32, 32).
I'm getting a ValueError (the error was posted as an image).
Here is my model
# Three fully connected layers; the output width is the number of classes
# known to the label binarizer `lb`.
for layer in (
    Dense(1028, input_shape=(3084,), activation="sigmoid"),
    Dense(514, activation="sigmoid"),
    Dense(len(lb.classes_), activation="softmax"),
):
    model.add(layer)
summary
Model: "sequential_21"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_57 (Dense) (None, 1028) 3171380
_________________________________________________________________
dense_58 (Dense) (None, 514) 528906
_________________________________________________________________
dense_59 (Dense) (None, 4) 2060
=================================================================
Total params: 3,702,346
Trainable params: 3,702,346
Non-trainable params: 0
_________________________________________________________________
trying to fit using
# Plain SGD with categorical cross-entropy; train for 75 epochs and evaluate
# on the held-out test split after every epoch.
opt = SGD(lr=0.01)
model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])
H = model.fit(
    train_X,
    train_Y,
    validation_data=(test_X, test_Y),
    epochs=75,
    batch_size=32,
)
You need to specify the input shape correctly, the following model should work.
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
# Each sample is a 32x32 array: the first Dense layer acts on the last axis,
# the result is flattened, and two more Dense layers produce 4 class scores.
model = Sequential([
    Dense(1028, input_shape=(32, 32), activation="sigmoid"),
    Flatten(),
    Dense(514, activation="sigmoid"),
    Dense(4, activation="softmax"),
])
model.summary()
I am using Keras for multiclass text classification; the dataset contains 25,000 Arabic tweets with 10 class labels.
I use this code :
# Two hidden blocks (Dense -> ReLU -> Dropout) followed by a 10-way softmax.
model = Sequential([
    Dense(512, input_shape=(10902,)),  # 10902
    Activation('relu'),
    Dropout(0.3),
    Dense(512),
    Activation('relu'),
    Dropout(0.3),
    Dense(10),
    Activation('softmax'),
])
model.summary()
# categorical_crossentropy
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
..
# Train for 30 epochs in batches of 100, holding out half of the data for
# validation.
fit_options = dict(batch_size=100, epochs=30, verbose=1, validation_split=0.5)
history = model.fit(X_train, y_train, **fit_options)
Summary:
Layer (type) Output Shape Param #
=================================================================
dense_23 (Dense) (None, 512) 5582336
_________________________________________________________________
activation_22 (Activation) (None, 512) 0
_________________________________________________________________
dropout_15 (Dropout) (None, 512) 0
_________________________________________________________________
dense_24 (Dense) (None, 512) 262656
_________________________________________________________________
activation_23 (Activation) (None, 512) 0
_________________________________________________________________
dropout_16 (Dropout) (None, 512) 0
_________________________________________________________________
dense_25 (Dense) (None, 10) 5130
_________________________________________________________________
activation_24 (Activation) (None, 10) 0
=================================================================
Total params: 5,850,122
Trainable params: 5,850,122
Non-trainable params: 0
but i get error:
could not convert string to float: 'food'
where food is a class name
when i change loss to categorical_crossentropy i get the error
Error when checking target: expected activation_24 to have shape (10,) but got array with shape (1,)
Update
# Map the class names to integer ids 0..9. to_categorical treats labels as
# zero-based, so ids 1..10 would produce 11 columns and no longer match the
# 10-unit output layer.
class_names = ['ads', 'Politic', 'eco', 'food', 'health', 'porno',
               'religion', 'sports', 'tech', 'tv']
nd = data.replace(class_names, list(range(len(class_names))))

model = Sequential()
# One feature vector of 10902 values (no. of words) per tweet; the batch axis
# is not part of input_shape.
model.add(Dense(512, input_shape=(10902,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(len(class_names)))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
              metrics=['accuracy'])

# One-hot encode the integer labels to shape (n_samples, 10), as required by
# categorical_crossentropy.
# NOTE(review): assumes y_train holds the integer ids produced above -- confirm.
y_train = keras.utils.to_categorical(y_train, num_classes=len(class_names))
history = model.fit(X_train, y_train,
                    batch_size=100,
                    epochs=30,
                    verbose=1,
                    validation_split=0.5)
You correctly used Dense(10) at the end in order to produce ten outputs, one for each class.
But your target y_train must also be shaped with 10 classes.
It should have shape (numberOfTweets, 10).
To achieve this:
If you have an array of integer class indices, transform it with the Keras function y_train=to_categorical(y_train).
If you have the labels as strings, you must first convert them to integer indices and then use to_categorical.