Custom Keras Layers fail

I want to write a custom Keras layer that combines the outputs of two models using different weights, where the weight itself is trainable, as follows:
class WeightedSum(Layer):
    """Blend the outputs of two models as ``w * a + (1 - w) * b``.

    NOTE(review): this is the asker's layer, kept as posted so that the
    traceback quoted after it still applies. The assignment to
    ``self.weights`` in ``build`` is the statement that raises
    ``AttributeError: can't set attribute``.
    """

    def __init__(self, **kwargs):
        super(WeightedSum, self).__init__(**kwargs)

    def build(self, input_shape):
        # This is the line reported in the traceback.
        self.weights = K.variable(np.random.random(1))

    def call(self, two_model_outputs):
        # two_model_outputs is indexed as a pair: [first_output, second_output].
        return self.weights * two_model_outputs[0] + (1 - self.weights) * two_model_outputs[1]

    def compute_output_shape(self, input_shape):
        # The blended output has the shape of the first input.
        return input_shape[0]
But I made a mistake somewhere and don't know how to fix it.
Traceback (most recent call last):
File "", line 182, in <module>
File "/root/anaconda3/lib/python3.7/site-packages/keras/engine/", line 431, in __call__
File "", line 162, in build
self.weights =K.variable(np.random.random(1))
AttributeError: can't set attribute

Maybe Keras is protecting itself from letting you use a word it considers reserved somehow?
Try to add the weights in a standard way and use another variable name:
def build(self, input_shape):
    """Create the single trainable mixing weight and set it to 0.5."""
    # NOTE(review): the shape/initializer/trainable arguments were missing
    # from the posted snippet; the values below are a reconstruction.
    self.kernel = self.add_weight(name='kernel',
                                  shape=(1,),
                                  initializer='zeros',
                                  # I suggest a constraint here, see below
                                  trainable=True)

    # this works as an initializer for the weights
    K.set_value(self.kernel, np.array([0.5]))
    # you can use np.random here, but it seems safer to go with 0.5

    # this tells keras that the layer is in fact built
    # (the posted snippet passed an undefined name `shapes` here)
    super(WeightedSum, self).build(input_shape)
Of course you will need to replace weights with kernel in the call method.
Unrelated:
I suggest you also use a constraint to keep the kernel within 0 and 1.
from keras.constraints import MinMaxNorm

# Inside build(): create the weight with a MinMaxNorm(0, 1) constraint.
# NOTE(review): shape/initializer/trainable were missing from the posted
# snippet and are reconstructed here.
self.kernel = self.add_weight(name='kernel',
                              shape=(1,),
                              initializer='zeros',
                              constraint=MinMaxNorm(0, 1),
                              trainable=True)


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, while using LayerNorm

While using PyTorch version 1.9.0, I'm getting the error saying that my tensors are at two different locations. Also, the error trace leads me to the LayerNorm function which has been assigned to the variable h. But when I check -
it returns true. Therefore, I'm confused regarding what is causing this error and how to solve it.
File "C:/Users/user/AppData/Roaming/JetBrains/PyCharmCE2020.2/scratches/", line 206, in forward
h = nn.LayerNorm(h.shape[1])(h)
File "C:\Users\user\anaconda3\envs\paper_2\lib\site-packages\torch\nn\modules\", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\user\anaconda3\envs\paper_2\lib\site-packages\torch\nn\modules\", line 174, in forward
input, self.normalized_shape, self.weight, self.bias, self.eps)
File "C:\Users\user\anaconda3\envs\paper_2\lib\site-packages\torch\nn\", line 2346, in layer_norm
return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking arugment for argument weight in method wrapper_native_layer_norm)
Update #1:
After following the stack trace, I reached the forward function in normalization.py and checked the variables present there -
def forward(self, input: Tensor) -> Tensor:
    """Apply layer normalisation after printing where each tensor lives.

    The prints are temporary debugging output: they report whether the
    layer's weight, its bias and the incoming tensor are CUDA tensors.
    """
    print("Check if weight is CUDA", self.weight.is_cuda)
    print("Check if bias is CUDA", self.bias.is_cuda)
    print("Check if input is CUDA", input.is_cuda)
    #print("Check if normalized shape is CUDA", self.normalized_shape.is_cuda)
    return F.layer_norm(
        input, self.normalized_shape, self.weight, self.bias, self.eps)
Check if weight is CUDA False
Check if bias is CUDA False
Check if input is CUDA True
Therefore, it is the weight and the biases within the layernorm function that is causing this issue. A quick hack done by me to get the function running was as follows. However, I am not sure whether this technique is appropriate -
# Workaround: move the activations to the CPU (where the freshly created
# LayerNorm's weight and bias live), normalise, then move the result back.
h = h.to('cpu')
h = nn.LayerNorm(h.shape[1])(h)
h = h.to('cuda')
I have added a minimally reproducible example below to better explain my issue. Please note the variables given in the question above and in this example will be different -
import math, random
from sklearn.datasets import load_sample_images
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
# Obtain a sample image and preprocess it into a single 84x84 plane.
dataset = load_sample_images()
# Take the first image of the sample dataset.
first_img_data = dataset.images[0]
# Reinterpret the image buffer as a stack of 427x640 planes.
first_img_data = first_img_data.reshape(-1, 427, 640)
# Keep only the plane at index 1.
first_img_data = first_img_data[1, :, :]
# Crop the top-left 84x84 window and add a leading axis of size 1.
first_img_data = first_img_data[0:84, 0:84].reshape(-1, 84,84)
first_img_data = torch.tensor(first_img_data)
# True when a CUDA device is available to PyTorch.
USE_CUDA = torch.cuda.is_available()
# Helper that builds an autograd.Variable and moves it to the GPU when USE_CUDA is set.
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)
class Cnn(nn.Module):
    """Three-layer convolutional feature extractor with a flattened output."""

    def __init__(self, input_shape):
        """Build the conv stack; ``input_shape[0]`` is the input channel count."""
        super(Cnn, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
        )

    def forward(self, x):
        """Run the conv stack and flatten everything except the batch axis."""
        x = self.features(x)
        x = x.view(x.size(0), -1)
        # If you uncomment the line below, it'll throw an error!
        # NOTE(review): the LayerNorm is created here, inside forward, so its
        # weight and bias are new CPU tensors while x is already on the GPU.
        #x = nn.LayerNorm(x.shape[1])(x)
        return x
# Driver for the reproduction: build the model on the GPU and run one image.
state = first_img_data
Shape = (1,84, 84)
# Move the model's registered parameters to the GPU.
current_model = Cnn(Shape).to('cuda')
# Add a batch axis; the Variable helper moves the tensor to the GPU when available.
state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
q_value = current_model.forward(state)
P.S There is a similar question over here(pytorch running: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu), but I couldn't obtain an answer by following the steps given.

Pytorch DataParallel doesn't work when the model contain tensor operation

If my model contains only nn.Module layers such as nn.Linear, nn.DataParallel works fine.
# Example batch: 100 samples with 10 features each.
x = torch.randn(100,10)


class normal_model(torch.nn.Module):
    """Minimal model made only of registered sub-modules (one Linear layer)."""

    def __init__(self):
        super(normal_model, self).__init__()
        self.layer = torch.nn.Linear(10,1)

    def forward(self, x):
        """Map ``(N, 10)`` inputs to ``(N, 1)`` outputs."""
        return self.layer(x)
# Move the model to the first GPU, then wrap it for multi-GPU execution.
model = normal_model()
model = torch.nn.DataParallel(model.to('cuda:0'))
However, when my model contains a tensor operation such as the following
class custom_model(torch.nn.Module):
    """Linear layer followed by a matrix product with a fixed tensor.

    NOTE(review): asker's code, kept as posted. ``self.weight`` is a plain
    tensor created on ``cuda:0`` rather than a registered parameter or
    buffer; the error quoted after this snippet ("arguments are located on
    different GPUs") is raised from ``forward`` in replica 1.
    """

    def __init__(self):
        super(custom_model, self).__init__()
        self.layer = torch.nn.Linear(10,5)
        self.weight = torch.ones(5,1, device='cuda:0')

    def forward(self, x):
        """Return ``Linear(x) @ weight``, i.e. ``(N, 10) -> (N, 1)``."""
        return self.layer(x) @ self.weight
# Move the model to the first GPU, then wrap it for multi-GPU execution.
model = custom_model()
model = torch.nn.DataParallel(model.to('cuda:0'))
It gives me the following error
RuntimeError: Caught RuntimeError in replica 1 on device 1. Original
Traceback (most recent call last): File
line 60, in _worker
output = module(*input, **kwargs) File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/",
line 541, in call
result = self.forward(*input, **kwargs) File "", line 7, in forward
return self.layer(x) @ self.weight RuntimeError: arguments are located on different GPUs at
How to avoid this error when we have some tensor operations in our model?
I have no experience with DataParallel, but I think it might be because your tensor is not part of the model parameters. You can make it one by writing:
self.weight = torch.nn.Parameter(torch.ones(5,1))
Note that you don't have to move it to the gpu when initializing, because now when you call model.to('cuda:0') this is done automatically.
I can imagine that DataParallel uses the model parameters to move them to the appropriate gpu.
See this answer for more on the difference between a torch tensor and torch.nn.Parameter.
If you don't want the tensor values to be updated by backpropagation during training, you can add requires_grad=False.
Another way that might work is to override the to method, and initialize the tensor in the forward pass:
class custom_model(torch.nn.Module):
    """Linear layer followed by a product with a ones tensor built per call."""

    def __init__(self):
        super(custom_model, self).__init__()
        self.layer = torch.nn.Linear(10,5)

    def forward(self, x):
        """Return ``Linear(x) @ ones(5, 1)``.

        The constant is created on the device (and with the dtype) of the
        activations, so the product never mixes devices and the model also
        works before ``to()`` has been called.
        """
        out = self.layer(x)
        return out @ torch.ones(5, 1, device=out.device, dtype=out.dtype)

    def to(self, device: str):
        """Move the module to ``device`` and remember it as ``self.device``."""
        new_self = super(custom_model, self).to(device)
        new_self.device = device
        return new_self
or something like this:
class custom_model(torch.nn.Module):
    """Linear layer followed by a product with a ones tensor kept on ``device``."""

    def __init__(self, device:str):
        super(custom_model, self).__init__()
        self.layer = torch.nn.Linear(10,5)
        # Remember the device from construction so the attribute exists
        # even if to() is never called.
        self.device = device
        self.weight = torch.ones(5,1, device=device)

    def forward(self, x):
        """Return ``Linear(x) @ weight``, i.e. ``(N, 10) -> (N, 1)``."""
        return self.layer(x) @ self.weight

    def to(self, device: str):
        """Move the module and recreate the plain ``weight`` tensor on ``device``."""
        new_self = super(custom_model, self).to(device)
        new_self.device = device
        # weight is not a registered parameter, so Module.to() does not move it.
        new_self.weight = torch.ones(5,1, device=device)
        return new_self
Adding to the answer from @Elgar de Groot, since OP also wanted to freeze that layer. To do so you can still use torch.nn.Parameter, but then you explicitly set requires_grad to False like this:
# Register the constant as a parameter (so it moves with the module) and
# exclude it from gradient updates. It is stored as `weight`, the name the
# question uses for this tensor; `layer` is the Linear sub-module there.
self.weight = torch.nn.Parameter(torch.ones(5,1))
self.weight.requires_grad = False

Problem using Elmo from tensorflow hub as custom tf.keras layer during prediction

I am trying to use Elmo from tensorflow hub with tf.keras, to perform NER. Training is fine and loss is decreasing, also test set gives good results. But I am unable to predict, as I get following error:
2019-05-02 15:41:42.785946: I tensorflow/stream_executor/] successfully opened CUDA library locally
Traceback (most recent call last):
File "", line 668, in <module>
np.array([['hello', 'world'] + ['--PAD--'] * 18])))
File "/home/ashwanipandey/eva_ml/experimental/eva_brain/venv/lib64/python3.6/site-packages/tensorflow/python/keras/engine/", line 1113, in predict
self, x, batch_size=batch_size, verbose=verbose, steps=steps)
File "/home/ashwanipandey/eva_ml/experimental/eva_brain/venv/lib64/python3.6/site-packages/tensorflow/python/keras/engine/", line 329, in model_iteration
batch_outs = f(ins_batch)
File "/home/ashwanipandey/eva_ml/experimental/eva_brain/venv/lib64/python3.6/site-packages/tensorflow/python/keras/", line 3076, in __call__
File "/home/ashwanipandey/eva_ml/experimental/eva_brain/venv/lib64/python3.6/site-packages/tensorflow/python/client/", line 1439, in __call__
File "/home/ashwanipandey/eva_ml/experimental/eva_brain/venv/lib64/python3.6/site-packages/tensorflow/python/framework/", line 528, in __exit__
tensorflow.python.framework.errors_impl.InvalidArgumentError: len(seq_lens) != input.dims(0), (256 vs. 1)
[[{{node Embed/elmo/elmo_module_apply_tokens/bilm/ReverseSequence}}]]
[[{{node Tag/t_output/transpose_1}}]]
256 is my batch size during training. I am trying to predict just one sentence.
I searched a lot on the internet, but all in vain. Any help is much appreciated.
I can definitely get predictions if I repeat my vector 256 times and set batch_size to 256 during prediction. But as you can see this is highly inefficient workaround.
Here is code for custom layer
class ElmoEmbeddingLayer(keras.layers.Layer):
    """Keras layer that feeds tokenised sentences to the TF-Hub ELMo module.

    NOTE(review): asker's code. Several lines were truncated in the posted
    snippet (closing brackets, the module URL, the ``super().__init__``
    call); they are reconstructed here and should be confirmed against the
    original. The batch-size behaviour the question is about is unchanged.
    """

    def __init__(self, dimensions=1024, batch_size=512, word_size=20, **kwargs):
        # NOTE(review): the constructor arguments are accepted but not used;
        # the stored values come from a literal and from the module-level
        # constants _BATCH_SIZE and _WORD_SIZE.
        self.dimensions = 1024
        self.trainable = True
        self.batch_size = _BATCH_SIZE
        self.word_size = _WORD_SIZE
        super().__init__(**kwargs)

    def build(self, input_shape):
        # NOTE(review): the URL string was empty in the posted snippet;
        # the public ELMo module is assumed - TODO confirm the version.
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))
        super().build(input_shape)

    def call(self, x, mask=None):
        # NOTE(review): sequence_len always has self.batch_size entries, so it
        # only matches the input when the batch has exactly that many rows.
        # This matches the reported "len(seq_lens) != input.dims(0), (256 vs. 1)".
        result = self.elmo(inputs={
            "tokens": K.cast(x, tf.string),
            "sequence_len": K.constant(self.batch_size*[self.word_size], dtype=tf.int32)
        }, as_dict=True, signature="tokens")["elmo"]
        return result

    def compute_mask(self, inputs, mask=None):
        # Mask out positions holding the padding token.
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (None, self.word_size, self.dimensions)

    def get_config(self):
        """Return the base layer config extended with this layer's settings."""
        config = {
            'dimensions': self.dimensions,
            'trainable': self.trainable,
            'batch_size': self.batch_size,
            'word_size': self.word_size
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
Here is my model architecture:
model architecture
I had the same problem as you, working on an RNN ELMo POS-tagger model. In the end I followed the solution of predicting in batches and keeping only the test sample I want:
# Predict on the first split_te test samples in batches of 256 and keep element 0 of the result.
model.predict([X_test[:split_te]], batch_size=256)[0]
For more ideas (like copying weights) look here!
The number of samples (in train and also test set) must be divisible by the batch_size. Otherwise the last batch in keras will break the architecture.
So for example a solution is to use samples until split_tr for training and split_te for predicting:
# Largest sample counts that are exact multiples of BATCH_SIZE.
split_tr = (X_train.shape[0]//BATCH_SIZE)*BATCH_SIZE
split_te = (X_test.shape[0]//BATCH_SIZE)*BATCH_SIZE
# NOTE(review): the start of this call was lost in the posted snippet;
# `model.fit(X_train_text` is reconstructed to mirror validation_data.
model.fit(X_train_text[:split_tr], y_train[:split_tr], batch_size=BATCH_SIZE, epochs=15,
          validation_data=(X_test_text[:split_te], y_test[:split_te]), verbose=1)

Cannot use predict_generator on loaded model

I am trying to load a Keras model for prediction only (i.e. I do not have to compile the model, per Pepslee's post here).
When I try to use model.predict_generator(), I get:
Using TensorFlow backend.
Exception in thread Thread-1:
Traceback (most recent call last):
File "/user/pkgs/anaconda2/lib/python2.7/", line 801, in __bootstrap_inner
File "/user/pkgs/anaconda2/lib/python2.7/", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/keras/utils/", line 559, in _run
sequence = list(range(len(self.sequence)))
ValueError: __len__() should return >= 0
I am working with Tensorflow version 1.12.0, Keras version 2.2.4. I need to use these versions to ensure compatibility with my cuDNN version, which I have no control over.
How can I get around this error?
I was asked for an example. Unfortunately there's too much proprietary info here for me to give much detail, but here are the bare bones (note the model is not actually an LSTM):
class LSTMmodel():
    """Load a fitted Keras model from disk and predict with a test generator."""

    def __init__(self, hid1 = 10, batch_size=32, mode='test'):
        self.hid_dim_1 = hid1
        # read_data() reads self.batch_size, so it has to be stored first.
        self.batch_size = batch_size
        self.t_per_e, self.test_generator = self.read_data()
        #Load the entire fitted model
        model_name = ''.join(glob.glob('*model.h5'))
        self.__model = load_model(model_name, compile=False)

    def read_data(self):
        """Return ``(steps, generator)`` for the ten hard-coded test IDs."""
        test_IDs = range(111, 111+10)
        params = {'list_IDs': test_IDs, 'batch_size': self.batch_size, 'n_vars': 354}
        # NOTE(review): test_IDs is passed positionally and again as
        # params['list_IDs'] - verify against DataGenerator.__init__.
        test_generator = DataGenerator(test_IDs, **params)
        # NOTE(review): with 10 IDs and the default batch_size of 32 this is
        # 10 - 32 + 1 = -21, i.e. a negative step count.
        t_per_e = int(len(test_IDs) - self.batch_size + 1)
        return t_per_e, test_generator

    @staticmethod
    def lstm_model():
        #Model building here. Not needed, since not compiling the model
        return 0

    def lstm_predict(self):
        """Run ``predict_generator`` over the test generator for ``t_per_e`` steps."""
        pred = self.__model.predict_generator(self.test_generator, self.t_per_e)
        return pred
class DataGenerator(keras.utils.Sequence):
    """Batch generator for Keras (only the relevant methods are shown)."""

    #Other methods in here as necessary
    def __len__(self):
        """Denotes the number of batches per epoch.

        NOTE(review): kept as posted. The value is
        ``len(list_IDs) - batch_size + 1``, which is negative whenever
        ``batch_size`` exceeds ``len(list_IDs) + 1``. That is consistent
        with the reported ``ValueError: __len__() should return >= 0``.
        """
        batches_per_epoch = int(np.floor(len(self.list_IDs) - self.batch_size + 1))
        return batches_per_epoch

    def __data_generation(self, other_params_here):
        """Generates data containing batch_size samples."""
        return preprocessed_data
def test_lstm():
    """Build an LSTMmodel instance and return its predictions."""
    test_inst = LSTMmodel(hid1=10) #hid1 is a hyperparameter
    test_prediction = test_inst.lstm_predict()
    return test_prediction


if __name__ == '__main__':
    testvals = test_lstm()
Basically, the workflow is:
1) test_lstm() creates an instance of the LSTMmodel class, and then calls lstm_predict.
2) lstm_predict uses predict_generator which takes in the generator for the test set and the number of examples to generate (steps from here).
3) The generator for the test set is created as an instance of class DataGenerator() in the read_data() method of class LSTMmodel(). Importantly, the test data generator is created in the same way as the training data generator and the validation data generator.
4) self.__model is created by loading a fully trained model in the init method of class LSTMmodel().
How can I get rid of the error?

initialising and accessing an array of weights in a custom keras layer

I am writing a custom keras layer for convolution in a cnn architecture in fourier domain:
class Fourier_Conv2D(Layer):
    """Custom layer holding ``no_of_kernels`` trainable kernels.

    NOTE(review): asker's code, kept as posted apart from restored
    structure. The ``K.dot(x, ...)`` prefix in ``call`` was lost in the
    posted snippet and is reconstructed - TODO confirm.
    """

    def __init__(self, no_of_kernels, **kwargs):
        self.no_of_kernels = no_of_kernels
        super(Fourier_Conv2D, self).__init__(**kwargs)

    def build(self, input_shape):
        # The kernel shape is the full input shape with one extra axis of
        # size no_of_kernels appended.
        self.kernel = self.add_weight(name = 'kernel',
                                      shape = input_shape + (self.no_of_kernels,),
                                      initializer = 'uniform', trainable = True)
        super(Fourier_Conv2D, self).build(input_shape)

    def call(self, x):
        # Only the slice at index 0 of the kernel is used here.
        return K.dot(x, self.kernel[0])
In the call function, I need to do pointwise multiplication of the fft of input with fft of each kernel (according to the convolution theorem) and add the products before passing this sum to activation function. But how can I access each weight separately in the call function, as using array index to do so is giving the following attribute error -
AttributeError Traceback (most recent call last)
<ipython-input-71-9617a8e7ab2e> in <module>()
1 x = Fourier_Conv2D(5)
----> 2 x.call((2,2,1))
<ipython-input-70-02ded53b8f6f> in call(self, x)
12 def call(self, x):
---> 13 return K.dot(x, self.kernel[0])
AttributeError: 'Fourier_Conv2D' object has no attribute 'kernel'
Thanks in advance for any help in solving the error.
You are not using your layer correctly. The line x.call((2,2,1)) makes no sense, since you need to pass a tensor to the layer. You should instead do something like this:
# Create an Input of shape (3, 4), instantiate the layer with 10 kernels,
# and call the layer on that input.
x = Input((3,4))
custom_layer = Fourier_Conv2D(10)
output = custom_layer(x)
Moreover, there are some errors in the definition of your layer. The following should work:
class Fourier_Conv2D(Layer):
    """Custom layer that multiplies its input by a trainable kernel matrix.

    NOTE(review): the ``K.dot(x, ...)`` prefix in ``call`` was lost in the
    posted snippet and is reconstructed - TODO confirm.
    """

    def __init__(self, no_of_kernels, **kwargs):
        self.no_of_kernels = no_of_kernels
        super(Fourier_Conv2D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Note the changes to the shape parameter
        self.kernel = self.add_weight(name = 'kernel',
                                      shape = (int(input_shape[-1]), self.no_of_kernels),
                                      initializer = 'uniform', trainable = True)
        super(Fourier_Conv2D, self).build(input_shape)

    def call(self, x):
        return K.dot(x, self.kernel) # kernel[0] --> kernel
