I'm building a custom model in tensorflow with a custom layer (fasttext embedding layer) via subclassing and I've got the current setup here so far:
(p.s. the simple_preprocess function is simply imported from gensim library -> from gensim.utils import simple_preprocess)
class FastTextEmbedding(tf.keras.layers.Layer):
def __init__(self, trained_ft_model_dir:str) -> None:
super(FastTextEmbedding, self).__init__()
self.trained_ft_model = FastText.load(trained_ft_model_dir)
# INPUT IS A STRING : "Hey my name is Anas"
def call(self, input):
# print(type(input))
# input = tf.constant(input)
# assert type(input) == str
out = np.zeros(shape=(1,60))
sent_tokenized = simple_preprocess(input)
# we'll use mean pooling
for word in sent_tokenized:
out[:,:] += self.trained_ft_model.wv[word]
return tf.convert_to_tensor(out / len(sent_tokenized))
# return tf.convert_to_tensor(out / len(sent_tokenized))
# def call(self, input):
# # print(type(input))
# # input = tf.constant(input)
# # assert type(input) == str
# out = np.zeros(shape=(60,))
# sent_tokenized = simple_preprocess(input)
# # we'll use mean pooling
# for word in sent_tokenized:
# out += self.trained_ft_model.wv[word]
# print(len(sent_tokenized))
# return tf.expand_dims(tf.convert_to_tensor(out / len(sent_tokenized)), axis=1)
# # return tf.convert_to_tensor(out / len(sent_tokenized))
class FastTextModel(tf.keras.Model):
def __init__(self, trained_ft_model_dir, num_classes:int=16) -> None:
super(FastTextModel, self).__init__(name='FT')
self.fasttext_embeddings = FastTextEmbedding(trained_ft_model_dir)
self.relu = tf.keras.layers.Activation('relu')
self.softmax = tf.keras.layers.Activation('softmax')
self.dense1 = tf.keras.layers.Dense(units=60) = tf.keras.layers.Dropout(rate=0.35)
self.dense2 = tf.keras.layers.Dense(units=num_classes) = tf.keras.layers.BatchNormalization()
def call(self, input):
x = self.fasttext_embeddings(input,training=False)
x = self.dense1(x, training=True)
x =, training=True)
x = self.relu(x)
x =, training=False)
x = self.dense2(x, training=True)
return self.softmax(x)
So I've made a custom layer FastTextEmbedding and built that layer into my model FastTextModel
My issue is that when I go to build the model via the following line:
_ ="Hey my name is Anas")
it builds successfully with the current setup I've got. FYI the model is intended to take in string and get the fasttext embeddings for the words, combine via mean pooling and pass that through some layers to make a decision. Now when I go to train my model I get this weird error:
output exceeds the size limit. Open the full output data in a text editor
TypeError Traceback (most recent call last)
Cell In[150], line 1
----> 1 history =,
2 y_train,
3 batch_size=128,
4 validation_data=(X_val, y_val),
5 validation_batch_size=128,
6 epochs=15)
File ~/Desktop/Winterproj/emojify/lib/python3.9/site-packages/keras/utils/, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File /var/folders/07/1yqf9lq93hb3l2f96cqmmv9c0000gn/T/, in outer_factory.<locals>.inner_factory.<locals>.tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
Call arguments received by layer 'FT' (type FastTextModel):
• input=tf.Tensor(shape=(None,), dtype=string)
My training data is of this format:
Training data (X_train, a np array of string tweets):
['chick fil hicksville finally open'
'accidental twinning with my favorites'
'one of my favorite people in this world love you mrs andrews' ...
'of your fav artists takin on hoco' 'welcome to the studio' 'the boys']
shape of training data: (204073,)
Labels for data(categorically encoded, there are 16 total classes):
[[0. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
shape of labels for training: (204073, 16)
I tried expanding the dimensions for my call method in my custom layer, that worked to get past a few problems but still I'm stuck here because I need the model to train as it takes in all the inputs and use batches of like 128 or so to train the model.
At the end I want to be able to take a single line of text such as "Hey my name is Anas" and apply my model to it to output one of the 16 classes (they are emoji labels).
This is my first time building out a model like this, I've been stuck for some time now and would appreciate any suggestions / advice. Thanks


Bi-LSTM with Keras : dimensions must be equal but are 7 and 300

I am creating for the first time a bilstm with keras but I am having difficulties. So that you understand, here are the steps I have done:
I created an embedding matrix with Glove for my x ;
def create_embeddings(fichier,dictionnaire,dictionnaire_tokens):
with open(fichier) as file:
line = file.readline()
max_words = max(dictionnaire_tokens.values())+1 #1032
max_size_dimensions = 300
emb_matrix = np.zeros((max_words,max_size_dimensions))
for item,count in dictionnaire_tokens.items():
vecteur = dictionnaire[item]
if vecteur is not None:
emb_matrix[count]= vecteur
return emb_matrix
I did some one hot encoding with my y's;
def one_hot_encoding(file):
with open(file) as file:
line = file.readline()
liste = []
while line:
tag = line.split(" ")[1]
tag = [tag]
line = file.readline()
one_hot = MultiLabelBinarizer()
array = one_hot.fit_transform(liste)
return array
I compiled my model with keras
from tensorflow.keras.layers import Bidirectional
model = Sequential()
embedding_layer = Embedding(input_dim=1031 + 1,
bilstm_layer = Bidirectional(LSTM(units=300, return_sequences=True))
model.add(Dense(300, activation="relu"))
#crf_layer = CRF(units=len(self.tags), sparse_target=True)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics='acc')
Input of my embedding layer (embedding matrix) :
[[ 0. 0. 0. ... 0. 0. 0. ]
[ 0. 0. 0. ... 0. 0. 0. ]
[ 0. 0. 0. ... 0. 0. 0. ]
[-0.068577 -0.71314 0.3898 ... -0.077923 -1.0469 0.56874 ]
[ 0.32461 0.50463 0.72544 ... 0.17634 -0.28961 0.29007 ]
[-0.33771 -0.24912 -0.032685 ... -0.033254 -0.45513 -0.13319 ]]
I train my model. However when I want to train it, I get the following message: ValueError: Dimensions must be equal, but are 7 and 300 for '{{node binary_crossentropy/mul}} = Mul[T=DT_FLOAT](binary_crossentropy/Cast, binary_crossentropy/Log)' with input shapes: [?,7], [?,300,300].
My embedding matrix was made with glove 300d so it has 300 dimensions. While my labels, I have only 7 labels. So I have to make my x and y have the same dimensions but how? Thank you!!!
from tensorflow.keras.layers import Bidirectional
model = Sequential()
_input = keras.layers.Input(shape=(300,1))
bilstm_layer = Bidirectional(LSTM(units=300, return_sequences=False))
model.add(Dense(7, activation="relu")) #here 7 is the number of classes you have and None is the batch_size
#crf_layer = CRF(units=len(self.tags), sparse_target=True)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics='acc')

Expected Randomness does not occur in Tensorflow Layer

I wrote a custom layer that shuffle the input. When I try to test the layer, said shuffling does not occur. Here is my minimal noise layer below:
class ShuffleLayer(tf.keras.layers.Layer):
def __init__(self, **kwargs):
super(ShuffleLayer, self).__init__(**kwargs)
def call(self, inputs, training=None):
if training:
shuffled = tf.stop_gradient(tf.random.shuffle(inputs))
return shuffled
return inputs
When I test the layer, the layer will not shuffle
SL = ShuffleLayer()
x = tf.reshape(tf.range(0,10, dtype=tf.float32), (5,2))
y = SL(x)
[[0. 1.] [2. 3.] [4. 5.] [6. 7.] [8. 9.]]
[[0. 1.] [2. 3.] [4. 5.] [6. 7.][8. 9.]]
Why will the expected behavior not occur?
Looking at the layer call, we see that the layer does nothing if training is None. When the layer is called as y = SL(x), it sees that training is None and returns the inputs. Getting the shuffled output is done by turning on the training parameter:
y = SL(x, training=True)
[[0. 1.][2. 3.][4. 5.][6. 7.][8. 9.]]
[[0. 1.][6. 7.][2. 3.][8. 9.][4. 5.]]

(pytorch / mse) How can I change the shape of tensor?

Problem definition:
I have to use MSELoss function to define the loss to classification problem. Therefore it keeps saying the error message regarding the shape of tensor.
Entire error message:
torch.Size([32, 10]) torch.Size([32])
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call
last) in
53 output = model.forward(images)
54 print(output.shape, labels.shape)
---> 55 loss = criterion(output, labels)
56 loss.backward()
57 optimizer.step()
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/ in
call(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/ in
forward(self, input, target)
430 def forward(self, input, target):
--> 431 return F.mse_loss(input, target, reduction=self.reduction)
/opt/conda/lib/python3.7/site-packages/torch/nn/ in
mse_loss(input, target, size_average, reduce, reduction) 2213
ret = torch.mean(ret) if reduction == 'mean' else torch.sum(ret)
2214 else:
-> 2215 expanded_input, expanded_target = torch.broadcast_tensors(input, target) 2216 ret =
torch._C._nn.mse_loss(expanded_input, expanded_target,
_Reduction.get_enum(reduction)) 2217 return ret
/opt/conda/lib/python3.7/site-packages/torch/ in
50 [0, 1, 2]])
51 """
---> 52 return torch._C._VariableFunctions.broadcast_tensors(tensors)
> RuntimeError: The size of tensor a (10) must match the size of tensor
b (32) at non-singleton dimension 1
How can I reshape the tensor, and which tensor (output or labels) should I change to calculate the loss?
Entire code is attached below.
import numpy as np
import torch
# Loading the Fashion-MNIST dataset
from torchvision import datasets, transforms
# Get GPU Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))])
# Download and load the training data
trainset = datasets.FashionMNIST('MNIST_data/', download = True, train = True, transform = transform)
testset = datasets.FashionMNIST('MNIST_data/', download = True, train = False, transform = transform)
trainloader =, batch_size = 32, shuffle = True, num_workers=4)
testloader =, batch_size = 32, shuffle = True, num_workers=4)
# Examine a sample
dataiter = iter(trainloader)
images, labels =
# Define the network architecture
from torch import nn, optim
import torch.nn.functional as F
model = nn.Sequential(nn.Linear(784, 128),
nn.Linear(128, 10),
nn.LogSoftmax(dim = 1))
# Define the loss
criterion = nn.MSELoss()
# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr = 0.001)
# Define the epochs
epochs = 5
train_losses, test_losses = [], []
for e in range(epochs):
running_loss = 0
for images, labels in trainloader:
# Flatten Fashion-MNIST images into a 784 long vector
images =
labels =
images = images.view(images.shape[0], -1)
# Training pass
output = model.forward(images)
print(output.shape, labels.shape)
loss = criterion(output, labels)
running_loss += loss.item()
test_loss = 0
accuracy = 0
# Turn off gradients for validation, saves memory and computation
with torch.no_grad():
# Set the model to evaluation mode
# Validation pass
for images, labels in testloader:
images =
labels =
images = images.view(images.shape[0], -1)
ps = model(images)
test_loss += criterion(ps, labels)
top_p, top_class = ps.topk(1, dim = 1)
equals = top_class == labels.view(*top_class.shape)
accuracy += torch.mean(equals.type(torch.FloatTensor))
print("Epoch: {}/{}..".format(e+1, epochs),
"Training loss: {:.3f}..".format(running_loss/len(trainloader)),
"Test loss: {:.3f}..".format(test_loss/len(testloader)),
"Test Accuracy: {:.3f}".format(accuracy/len(testloader)))
From the output you print before it error, torch.Size([32, 10]) torch.Size([32]).
The left one is what the model gives you and the right one is from trainloader, normally you use this for something like nn.CrossEntropyLoss.
And from the full error log, the error is from this line
loss = criterion(output, labels)
The way to make this work is called One-hot Encoding, if it's me for sake of my laziness I'll write it like this.
ones = torch.sparse.torch.eye(10).to(device) # number of class class
labels = ones.index_select(0, labels)
Alternatively, you can change your loss function from nn.MSELoss() to nn.CrossEntropyLoss(). Cross entropy loss is generally preferable to MSE for categorical tasks like this, and in PyTorch's implementation this loss function takes care of a lot of the shape conversion under the hood so you can provide it with a vector of class probabilities and a single class label.
Fundamentally, your model attempts to predict what class the input belongs to by calculating a score (you might call it a 'confidence score') for each possible class. So if you have 10 classes, the model's output will be a 10-dimensional list (in PyTorch, a tensor shape [10]) and the prediction would be the the index of the highest score. Often one would apply the softmax ( function to convert these scores to a probability distribution, so all scores will be between 0 and 1 and the elements all sum to 1.
Then cross entropy is a common choice of loss function for this task: it compares the list of predictions to the one-hot encoded label. E.g. if you have 3 classes, a label would look like [1, 0, 0] to represent the first class. This is also called the "one-hot encoding". Meanwhile a prediction might look like [0.7, 0.1, 0.2]. In PyTorch, nn.CrossEntropyLoss() expects your labels are coming as single value tensors whose value represents the class label, since there's no real need to move long, sparse vectors around memory. So this loss function accomplishes the comparison you want to do and I'm guessing is implemented more efficiently than actually creating one-hot encodings.

Converting spinning up policy gradient to pytorch

I'm trying to learn deep reinforcement learning through OpenAI spinning up. To do this, I want to rewrite some of their code using pytorch instead of tensorflow.
Currently I'm trying to convert the code for basic policy gradient (link with explanations) and this is my code so far:
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax
from torch.distributions import Categorical
import torch.optim as optim
import numpy as np
import gym
from gym.spaces import Discrete, Box
class Policy(nn.Module):
def __init__(self, sizes, activation=nn.Tanh(), output_activation=None):
# Build a feedforward neural network.
super(Policy, self).__init__()
self.layers=nn.ModuleList([nn.Linear(sizes[i],sizes[i+1]) for i in
self.returns=[] # for R(tau) weighting in policy gradient
self.rewards=[] # list for rewards accrued throughout ep
self.logits=[] # for measuring episode logits
def forward(self,x):
for layer in self.layers[:-1]:
if not self.output_activation==None:
return x
# make action selection op (outputs int actions, sampled from policy)
def select_action(logits):
return Categorical(logits=logits).sample()
# make loss function whose gradient, for the right data, is policy gradient
def loss(action_logits,tau_rets):
return torch.sum(,tau_rets))
def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2,
epochs=50, batch_size=5000, render=False):
# make environment, check spaces, get obs / act dims
env = gym.make(env_name)
assert isinstance(env.observation_space, Box), \
"This example only works for envs with continuous state spaces."
assert isinstance(env.action_space, Discrete), \
"This example only works for envs with discrete action spaces."
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
# make core of policy network
policy = Policy(sizes=[obs_dim]+hidden_sizes+[n_acts])
# make train op
train_op = optim.Adam(policy.parameters(), lr=lr)
# for training policy
def train_one_epoch():
# make some empty lists for logging.
batch_returns = [] # for measuring episode returns
batch_lens = [] # for measuring episode lengths
# reset episode-specific variables
obs = torch.from_numpy(env.reset()).type(torch.FloatTensor) # first obs comes from starting distribution
done = False # signal from environment that episode is over
num_obs=0 # to measure the number of observations
# render first episode of each epoch
finished_rendering_this_epoch = False
# collect experience by acting in the environment with current policy
while True:
# rendering
if (not finished_rendering_this_epoch) and render:
# act in the environment
act = select_action(act_logit)
tmp, reward, done, _ = env.step(act.numpy())
# save logit, reward
if done:
# if episode is over, record info about episode
ep_ret, ep_len = sum(policy.rewards), len(policy.rewards)
# the weight for each logprob(a|s) is R(tau)
policy.returns+= [ep_ret] * ep_len
# reset episode-specific variables
tmp, done, policy.rewards = env.reset(), False, []
# won't render again this epoch
finished_rendering_this_epoch = True
# end experience loop if we have enough of it
if num_obs > batch_size:
# take a single policy gradient update step
print (len(policy.returns),len(policy.rewards),len(policy.logits))
batch_loss = loss(torch.tensor(policy.logits),torch.tensor(policy.returns))
return batch_loss, batch_returns, batch_lens
# training loop
for i in range(epochs):
batch_loss, batch_rets, batch_lens = train_one_epoch()
print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
(i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
When I run train(), I get the following error:
RuntimeError Traceback (most recent call last)
<ipython-input-163-2da0ffaf5447> in <module>()
----> 1 train()
<ipython-input-162-560e772be08b> in train(env_name, hidden_sizes, lr, epochs,
batch_size, render)
114 # training loop
115 for i in range(epochs):
--> 116 batch_loss, batch_rets, batch_lens = train_one_epoch()
117 print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
118 (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
<ipython-input-162-560e772be08b> in train_one_epoch()
109 print (len(policy.returns),len(policy.rewards),len(policy.logits))
110 batch_loss = loss(torch.tensor(policy.logits),torch.tensor(policy.returns))
--> 111 batch_loss.backward()
112 return batch_loss, batch_returns, batch_lens
~\Anaconda3\lib\site-packages\torch\ in backward(self, gradient,
retain_graph, create_graph)
91 products. Defaults to ``False``.
92 """
---> 93 torch.autograd.backward(self, gradient, retain_graph, create_graph)
95 def register_hook(self, hook):
~\Anaconda3\lib\site-packages\torch\autograd\ in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
88 Variable._execution_engine.run_backward(
89 tensors, grad_tensors, retain_graph, create_graph,
---> 90 allow_unreachable=True) # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I don't understand why this happens since my code is similar to other rl pytorch code such as this.

pytorch model.cuda() runtime error

I'm building a text classifier using pytorch, and got into some trouble with .cuda() method. I know that .cuda() moves all parameters into gpu so that the training procedure can be faster. However, error occurred in .cuda() method like this:
start_time = time.time()
for model_type in ('lstm',):
hyperparam_combinations = score_util.all_combination(hyperparam_dict[model_type].values())
# for selecting best scoring model
for test_idx, setting in enumerate(hyperparam_combinations):
args = custom_dataset.list_to_args(setting,model_type=model_type)
tsv = "test %d\ttrain_loss\ttrain_acc\ttrain_auc\tval_loss\tval_acc\tval_auc\n"%(test_idx) # tsv record
avg_score = [] # cv_mean score
### 4 fold cross validation
for cv_num,(train_iter,val_iter) in enumerate(cv_splits):
### model initiation
model = model_dict[model_type](args)
if args.emb_type is not None: # word embedding init
emb = emb_dict[args.emb_type]
emb = score_util.embedding_init(emb,tr_text_field,args.emb_type)
RuntimeError Traceback (most recent call last)
<ipython-input-20-ff6cfce73c10> in <module>()
---> 25 model.cuda()
27 optimizer= torch.optim.Adam(model.parameters(),
~\Anaconda3\lib\site-packages\torch\nn\modules\ in cuda(self, device_id)
145 copied to that device
146 """
--> 147 return self._apply(lambda t: t.cuda(device_id))
149 def cpu(self, device_id=None):
~\Anaconda3\lib\site-packages\torch\nn\modules\ in _apply(self, fn)
116 def _apply(self, fn):
117 for module in self.children():
--> 118 module._apply(fn)
120 for param in self._parameters.values():
~\Anaconda3\lib\site-packages\torch\nn\modules\ in _apply(self, fn)
122 # Variables stored in modules are graph leaves, and we don't
123 # want to create copy nodes, so we have to unpack the data.
--> 124 = fn(
125 if param._grad is not None:
126 = fn(
RuntimeError: Variable data has to be a tensor, but got torch.cuda.FloatTensor
These are error traceback and I can't see why this happens.
This code worked very well before I set epoch parameter to 1 to run some tests. I set epoch to 1000 again, but the problem lingers on.
Aren't torch.cuda.FloatTensor object also Tensors? Any help would be much appreciated.
my model looks like this :
class TR_LSTM(nn.Module):
def __init__(self,args,
pretrained_emb = None):
# arguments
self.emb_dim = args.embed_dim
self.emb_num = args.embed_num
self.num_hidden_unit = args.hidden_state_dim
self.num_lstm_layer = args.num_lstm_layer
self.use_hidden_average = use_hidden_average
self.batch_size = args.batch_size
# layers
self.embed = nn.Embedding(self.emb_num, self.emb_dim)
if pretrained_emb is not None:
self.lstm_layer = nn.LSTM(self.emb_dim, self.num_hidden_unit, self.num_lstm_layer, batch_first = True)
self.fc_layer = nn.Sequential(nn.Linear(self.num_hidden_unit,self.num_hidden_unit),
def forward(self,x):
x = self.embed(x) # batch * max_seq_len * emb_dim
h_0,c_0 = self.init_hidden(x.size(0))
x, (_, _) = self.lstm_layer(x, (h_0,c_0)) # batch * seq_len * hidden_unit_num
if not self.use_hidden_average:
x = x[:,x.size(1)-1,:]
x = x.squeeze(1)
x = x.mean(1).squeeze(1)
x = self.fc_layer(x)
return x
def init_hidden(self,batch_size):
h_0, c_0 = torch.zeros(self.num_lstm_layer,batch_size , self.num_hidden_unit),\
torch.zeros(self.num_lstm_layer,batch_size , self.num_hidden_unit)
h_0, c_0 = h_0.cuda(), c_0.cuda()
h_0_param, c_0_param = torch.nn.Parameter(h_0), torch.nn.Parameter(c_0)
return h_0_param, c_0_param
model.cuda() is called inside your training/test loop, which is the problem. As the error message suggests, you repeatedly convert parameters(tensors) in your model to cuda, which is not the right way to convert model into cuda tensor.
model object should be created and cuda-ize outside the loop. Only training/test instances shall be convert to cuda tensor every time you feed your model. I also suggest you read examples code from pytorch document site.
