I don't know the name of what I'm looking for, but I want to make a layer in keras where each input is multiplied by its own, independent weight and bias. E.g. if there were 10 inputs, there would be 10 weights, and 10 biases, and each input would be multiplied by its weight and summed with its bias to get 10 outputs.
For example here is a simple Dense network:
from keras.layers import Input, Dense
from keras.models import Model
# Number of input features, also used as the number of Dense units.
N = 10
# Symbolic input carrying N features per sample.
input = Input((N,))
# Fully connected layer: N * N weights plus N biases (110 parameters for N = 10).
output = Dense(N)(input)
model = Model(input, output)
As you can see, this model has 110 parameters, because it is fully connected:
Layer (type) Output Shape Param #
input_2 (InputLayer) (None, 10) 0
dense_2 (Dense) (None, 10) 110
Total params: 110
Trainable params: 110
Non-trainable params: 0
I want to replace output = Dense(N)(input) with something like output = SinglyConnected()(input), such that the model now has 20 parameters: 10 weights and 10 biases.

Create a custom layer:
class SingleConnected(Layer):
    """Element-wise affine layer: ``outputs = inputs * kernel + bias``.

    Every feature on the last axis gets its own independent weight and
    bias, so an input with N features yields 2 * N trainable parameters
    (instead of the N * N + N of a Dense layer with N units).
    """

    def __init__(self, **kwargs):
        super(SingleConnected, self).__init__(**kwargs)

    # creates weights
    def build(self, input_shape):
        """Create one weight and one bias per feature of the last axis."""
        # Leading axes are 1 so the weights broadcast over batch (and any
        # other leading) dimensions.
        weight_shape = (1,) * (len(input_shape) - 1)
        # The trailing comma matters: (x) is just x, and tuple + int raises
        # a TypeError.
        weight_shape = weight_shape + (input_shape[-1],)  # (1, ..., input)
        self.kernel = self.add_weight(name='kernel',
                                      shape=weight_shape,
                                      initializer='uniform',
                                      trainable=True)
        self.bias = self.add_weight(name='bias',
                                    shape=weight_shape,
                                    initializer='zeros',
                                    trainable=True)
        super(SingleConnected, self).build(input_shape)

    def call(self, inputs):
        """Scale each input by its own weight, then add its own bias."""
        return (inputs * self.kernel) + self.bias

    # output shape
    def compute_output_shape(self, input_shape):
        """The layer is element-wise, so the shape is unchanged."""
        return input_shape

    # for saving the model - only necessary if you have parameters in __init__
    def get_config(self):
        config = super(SingleConnected, self).get_config()
        return config
Use the layer:


None Output from MixtureSameFamily Tensorflow

I am trying to learn a multimodal distribution using a neural network and a Gaussian mixture model.
However, in my model summary the output layer has a None output shape, which comes from using MixtureSameFamily.
Can you help me resolve this?
def zero_inf(out):
    """Turn raw network outputs into a Gaussian mixture distribution.

    ``out`` is split into three equal parts along its last axis: the
    component means, the raw component scales (made positive with
    softplus) and the raw mixture weights (normalized with softmax).

    Returns a ``tfd.MixtureSameFamily`` with Normal components.
    """
    locs, raw_scales, raw_probs = tf.split(out, num_or_size_splits=3, axis=-1)
    # Normal requires a strictly positive scale.
    scales = tf.nn.softplus(raw_scales)
    # Categorical mixture weights must sum to one.
    mix_probs = tf.nn.softmax(raw_probs)
    mixture = tfd.Categorical(probs=mix_probs)
    components = tfd.Normal(loc=locs, scale=scales)
    return tfd.MixtureSameFamily(
        mixture_distribution=mixture,
        components_distribution=components)
## Definition of the custom parametrized distribution
# Input with 5 features per sample.
inputs = tf.keras.layers.Input(shape=(5,))
# 6 raw outputs, which zero_inf splits three ways: 2 locs, 2 scales, 2 mixture weights.
out = Dense(6)(inputs)#A
# Wrap the raw outputs in a distribution so the model's output is the mixture itself.
p_y_zi = tfp.layers.DistributionLambda(lambda t: zero_inf(t))(out)
model_zi = Model(inputs=inputs, outputs=p_y_zi)
# def NLL(y_true, y_hat):
# return -y_hat.log_prob(tf.reshape(y_true,(-1,)))
# model_zi.compile(optimizer="adam", loss=NLL)
Below is the model summary.
Model: "model_70"
Layer (type) Output Shape Param #
input_30 (InputLayer) [(None, 5)] 0
dense_130 (Dense) (None, 6) 36
distribution_lambda_36 (Dist ((None,), (None,)) 0
Total params: 36
Trainable params: 36
Non-trainable params: 0
Because of this empty/None output shape, I am unable to train the model, as I get shape-related errors:
InvalidArgumentError: Cannot update variable with shape [] using a Tensor with shape [32], shapes must be equal.
[[{{node metrics_60/mae/AssignAddVariableOp}}]]
Tensorflow version: 2.6.2
Tensorflow-probability: 0.14.0
Python: 3.6

How to implement hierarchical Transformer for document classification in Keras?

Hierarchical attention mechanism for document classification has been presented by Yang et al.
Its implementation is available on
Also, the implementation of the document classification with Transformer is available on
But, it's not hierarchical.
I have googled a lot but didn't find any implementation of a hierarchical Transformer. Does anyone know how to implement a hierarchical transformer for document classification in Keras?
My implementation is as follows. Note that it is extended from Nandan's implementation for document classification.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: Size of the input/output embedding; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(output, weights)`` of scaled dot-product attention."""
        score = tf.matmul(query, key, transpose_b=True)
        # Scale by sqrt of the key depth before the softmax.
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape to (batch_size, num_heads, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        # Mix the concatenated heads back into a single embedding.
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to that
    sub-layer's input, and is then layer-normalized.

    Args:
        embed_dim: Embedding size of the inputs and outputs.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        dropout_rate: Dropout rate applied after each sub-layer.
        name: Optional layer name.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
        super(TransformerBlock, self).__init__(name=name)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim), ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training=None):
        # `training` defaults to None so the layer can also be called
        # without the flag; Dropout then resolves the mode itself.
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions the position embedding can index.
        vocab_size: Number of token ids the token embedding can index.
        embed_dim: Size of both embeddings.
        name: Optional layer name.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, name=None):
        super(TokenAndPositionEmbedding, self).__init__(name=name)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Position ids 0 .. seq_len - 1, taken from the last axis of x.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the leading (batch) axis.
        return token_vectors + position_vectors

    def compute_output_shape(self, input_shape):
        # (batch_size, maxlen) becomes (batch_size, maxlen, embed_dim)
        embed_dim = self.pos_emb.output_dim
        return input_shape + (embed_dim,)
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1 # Dropout rate shared by the transformer blocks and Dropout layers
vocab_size = 1000 # Number of token ids the token embedding can index
class_number = 5 # Number of output classes of the final softmax
max_docs = 10000 # Not referenced by the code shown here
max_sentences = 15 # Sentences per document (first axis of sentence_input)
max_words = 60 # Words per sentence (length of word_input)
# Word-level encoder: maps one sentence of token ids to one vector.
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
# Average over the word axis to get a single vector per sentence.
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate, name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu", name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
# =========================================================================
# Upper level (produce a representation of each document):
L2_dense_units = 100 # Not referenced by the code shown here
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
# Apply the same word-level encoder to each of the max_sentences sentences.
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
# embed_dim must equal the size of the sentence vectors (L1_dense_units).
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
# Average over the sentence axis to get a single vector per document.
sentence_pool = layers.GlobalAveragePooling1D(name='sentence_pooling')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_pool)
preds = layers.Dense(class_number , activation='softmax', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
The summary of the model is as follows:
Model: "model_1"
Layer (type) Output Shape Param #
word_input (InputLayer) [(None, 60)] 0
word_embedding (TokenAndPos (None, 60, 100) 106000
word_transformer (Transform (None, 60, 100) 53764
word_pooling (GlobalAverage (None, 100) 0
word_drop (Dropout) (None, 100) 0
word_dense (Dense) (None, 100) 10100
Total params: 169,864
Trainable params: 169,864
Non-trainable params: 0
Model: "model_2"
Layer (type) Output Shape Param #
sentence_input (InputLayer) [(None, 15, 60)] 0
sentence_encoder (TimeDistr (None, 15, 100) 169864
sentence_transformer (Trans (None, 15, 100) 53764
sentence_pooling (GlobalAve (None, 100) 0
dropout_9 (Dropout) (None, 100) 0
sentence_output (Dense) (None, 5) 505
Total params: 224,133
Trainable params: 224,133
Non-trainable params: 0
Everything works, and you can copy and paste this code into Colab to see the model summary.
My problem, however, is positional encoding at the sentence level.
How can I apply positional encoding at the sentence level?
The implementation is recursive in the sense that you treat the average of your outputs of transformer x as the input to transformer x+1.
So let's say your data is structured as (batch, chapter, paragraph, sentence, token).
After the first transformation you end up with (batch, chapter, paragraph, sentence, embedding) so then you average and get (batch, chapter, paragraph, sentence_embedding_in).
Apply another transformation and get (batch, chapter, paragraph, sentence_embedding_out).
Average again and get (batch, chapter, paragraph_embedding). Rinse & Repeat.
The implementation of the paper is actually in a different repository:
They actually do something different from what I've described and apply transformers at the bottom and RNN at the top. In theory you could do the opposite or apply RNN at each layer (that would be really slow). As far as the implementation is concerned you can abstract from that - the principle remains the same: you apply a transformation, average the outputs and feed it into the next higher-level "layer" (or "module" using torch lingo).

Square brackets in keras model output shape

I've recently encountered this when looking at a model's summary.
I was wondering, what's the difference between [(None, 16)] and (None, 16)? Why does the Input layer have such input shape?
Source: model.summary() can't print output shape while using subclass model
The issue is how you are defining the input_shape. Parentheses alone do not create a tuple in Python: (32) is just the integer 32, whereas (32,) - with a trailing comma - is a single-element tuple, as you can see below -
# Plain integer.
input_shape0 = 32
# Parentheses only group the expression: this is still the integer 32.
input_shape1 = (32)
# The trailing comma is what makes this a one-element tuple.
input_shape2 = (32,)
print(input_shape0, input_shape1, input_shape2)
32 32 (32,)
Since the Keras functional API Input needs the input shape as a tuple, you have to pass it in the form (n,) instead of n.
It's weird that you get a square bracket because when I run the exact same code, I get an error.
TypeError Traceback (most recent call last)
<ipython-input-828-b564be68c80d> in <module>
34 if __name__ == '__main__':
---> 35 mlp = MLP((16))
36 mlp.summary()
<ipython-input-828-b564be68c80d> in __init__(self, input_shape, **kwargs)
6 super(MLP, self).__init__(**kwargs)
7 # Add input layer
----> 8 self.input_layer = klayers.Input(input_shape)
10 self.dense_1 = klayers.Dense(64, activation='relu')
~/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/ in Input(shape, batch_size, name, dtype, sparse, tensor, **kwargs)
229 dtype=dtype,
230 sparse=sparse,
--> 231 input_tensor=tensor)
232 # Return tensor including `_keras_history`.
233 # Note that in this case train_output and test_output are the same pointer.
~/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/ in __init__(self, input_shape, batch_size, dtype, input_tensor, sparse, name, **kwargs)
89 if input_tensor is None:
90 if input_shape is not None:
---> 91 batch_input_shape = (batch_size,) + tuple(input_shape)
92 else:
93 batch_input_shape = None
TypeError: 'int' object is not iterable
Therefore, the right way to do it (which should fix your model summary as well) is as below -
from tensorflow import keras
from tensorflow.keras import layers as klayers
class MLP(keras.Model):
    """Two-layer perceptron whose summary reports per-layer output shapes.

    The model is first set up as a subclassed model to create its layers,
    then re-initialized as a functional model from a symbolic input and
    the output obtained by running ``call`` on it.

    Args:
        input_shape: Per-sample input shape as a tuple, e.g. ``(16,)``.
            A bare int such as ``(16)`` is not a tuple and is rejected by
            ``Input``.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, input_shape=(32,), **kwargs):
        super(MLP, self).__init__(**kwargs)
        # Add input layer
        self.input_layer = klayers.Input(input_shape)
        self.dense_1 = klayers.Dense(64, activation='relu')
        self.dense_2 = klayers.Dense(10)
        # Get output layer with `call` method
        self.out = self.call(self.input_layer)
        # Reinitialize as a functional model with explicit inputs/outputs
        super(MLP, self).__init__(
            inputs=self.input_layer,
            outputs=self.out,
            **kwargs)

    def build(self):
        # Initialize the graph
        # NOTE(review): relies on a private Keras attribute; confirm it is
        # still honoured by the TensorFlow version in use.
        self._is_graph_network = True

    def call(self, inputs):
        x = self.dense_1(inputs)
        return self.dense_2(x)


if __name__ == '__main__':
    mlp = MLP((16,))
    mlp.summary()
Layer (type) Output Shape Param #
input_19 (InputLayer) (None, 16) 0
dense_8 (Dense) (None, 64) 1088
dense_9 (Dense) (None, 10) 650
Total params: 1,738
Trainable params: 1,738
Non-trainable params: 0

Bias only Layer in Keras

How could one build a layer in Keras which maps an input x to an output of the form x+b where b is a trainable weight of the same dimension? (Also the activation function here would be the identity).
You can always build a custom layer by extending tf.keras.layers.Layer class, here is how I'd do it
import tensorflow as tf
print('TensorFlow:', tf.__version__)
class BiasLayer(tf.keras.layers.Layer):
    """Layer computing ``x + b`` with a trainable bias ``b``.

    The bias has the per-sample shape of the input (every axis except the
    batch axis), so a ``(None, 5)`` input yields 5 trainable parameters.
    """

    def __init__(self, *args, **kwargs):
        super(BiasLayer, self).__init__(*args, **kwargs)

    def build(self, input_shape):
        # Drop the batch axis: one bias per input element of a sample.
        self.bias = self.add_weight('bias',
                                    shape=input_shape[1:],
                                    initializer='zeros',
                                    trainable=True)

    def call(self, x):
        # The bias broadcasts over the batch axis.
        return x + self.bias
# Symbolic input with 5 features per sample.
input_layer = tf.keras.Input(shape=[5])
# Output is the input plus the layer's trainable bias.
x = BiasLayer()(input_layer)
model = tf.keras.Model(inputs=[input_layer], outputs=[x])
TensorFlow: 2.1.0
Model: "model_2"
Layer (type) Output Shape Param #
input_7 (InputLayer) [(None, 5)] 0
bias_layer_3 (BiasLayer) (None, 5) 5
Total params: 5
Trainable params: 5
Non-trainable params: 0

How to add a trainable hadamard product layer in keras?

I am trying to introduce sparsity in the training samples. My data matrix has a size of (say) NxP, and I want to pass it through a Keras layer whose weights have the same size as the input. That is, the trainable weight matrix W has shape NxP. I want to take the Hadamard product (element-wise multiplication) of W and the input matrix in this layer. How can I get a trainable layer for W in this case?
By the way, thank you so much for the quick reply. However, the Hadamard product I want is between two matrices: one is the input, let's call it X, with shape NxP, and I want the kernel in the Hadamard layer to be the same size as X, so the kernel should have size NxP too. The element-wise multiplication of the two matrices is achieved by the call function.
But the current implementation gives a kernel of size P only. Also, I tried changing the shape of the kernel in build as follows:
self.kernel = self.add_weight(name='kernel',
But it gives me the error below:
TypeError: Failed to convert object of type to Tensor. Contents: (None, 16). Consider casting elements to a supported type.
Here P is 16 and I will get my N during the runtime and N is similar to the number of training samples.
Thank you in advance for the help.
Take the example of the documentation to create a layer, and in the call function just define it to be x * self.kernel.
This is my POC:
from keras import backend as K
from keras.engine.topology import Layer
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np
class Hadamard(Layer):
    """Trainable element-wise (Hadamard) product layer.

    The kernel has the per-sample shape of the input with a leading axis
    of 1, so it is shared across the batch: an input of shape
    ``(None, N, P)`` yields N * P trainable parameters.
    """

    def __init__(self, **kwargs):
        super(Hadamard, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        # The batch axis (unknown at build time) is replaced by 1 so the
        # kernel broadcasts over samples.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(1,) + tuple(input_shape[1:]),
                                      initializer='uniform',
                                      trainable=True)
        super(Hadamard, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x):
        print(x.shape, self.kernel.shape)
        return x * self.kernel

    def compute_output_shape(self, input_shape):
        # Element-wise product: the shape is unchanged.
        return input_shape
N = 10
P = 64
model = Sequential()
model.add(Dense(128, input_shape=(N, P), activation='relu'))
# The layer under test: element-wise product with a trainable kernel.
model.add(Hadamard())
model.add(Activation('relu'))
# Reduce to one value per row so the output matches the (10, N, 1) targets.
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam')
print(model.summary())
# Dummy data: 10 samples of shape (N, P) with targets of shape (N, 1).
model.fit(np.ones((10, N, P)), np.ones((10, N, 1)))
print(model.predict(np.ones((20, N, P))))
If you need to use it as the first layer you should include the input shape parameter:
N = 10
P = 64
model = Sequential()
# As the first layer, Hadamard must be given the per-sample input shape.
model.add(Hadamard(input_shape=(N, P)))
model.compile(loss='mean_squared_error', optimizer='adam')
This results in:
Layer (type) Output Shape Param #
hadamard_1 (Hadamard) (None, 10, 64) 640
Total params: 640
Trainable params: 640
Non-trainable params: 0
