Graph Disconnected when implementing custom LSTM - python-3.x

I've been trying to write my own LSTM so I can customize it. However, an error occurs when I try to build the model with Keras. The error says the graph is disconnected at c_prev, but c_prev is the tensor I pass in to initialize the LSTM's cell state. So I'm not sure whether something is wrong with my code or with the way I call the model. Any help is appreciated.
My environment:
Python 3.7.6
Tensorflow 2.1.0 (installed via pip)
Mac Mojave
class EtienneLSTM(tf.keras.layers.Layer):
    def __init__(self, units, activation='tanh', recurrent_activation='sigmoid',
                 kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros',
                 use_bias=True, unit_forget_bias=True,
                 kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None,
                 kernel_constraint=None, recurrent_constraint=None, bias_constraint=None,
                 # dropout=0.0, recurrent_dropout=0.0,
                 return_sequences=False, return_state=False, go_backwards=False, use_batchnorm=False):
        super(EtienneLSTM, self).__init__()
        self.units = units #
        self.activation = tf.keras.layers.Activation(activation) #
        self.recurrent_activation = tf.keras.layers.Activation(recurrent_activation) #
        self.use_bias = use_bias #
        self.kernel_initializer = kernel_initializer #
        self.recurrent_initializer = recurrent_initializer #
        self.bias_initializer = bias_initializer #
        self.unit_forget_bias = unit_forget_bias #
        if self.unit_forget_bias:
            self.bias_initializer = 'zeros'
        self.kernel_regularizer = kernel_regularizer #
        self.recurrent_regularizer = recurrent_regularizer #
        self.bias_regularizer = bias_regularizer #
        self.activity_regularizer = activity_regularizer
        self.kernel_constraint = kernel_constraint #
        self.recurrent_constraint = recurrent_constraint #
        self.bias_constraint = bias_constraint #
        # self.dropout = dropout
        # self.recurrent_dropout = recurrent_dropout
        self.return_sequences = return_sequences #
        self.return_state = return_state #
        self.go_backwards = go_backwards #
        self.use_batchnorm = use_batchnorm
        if self.use_batchnorm:
            self.batchnorm_f = tf.keras.layers.BatchNormalization()
            self.batchnorm_i = tf.keras.layers.BatchNormalization()
            self.batchnorm_o = tf.keras.layers.BatchNormalization()
            self.batchnorm_c = tf.keras.layers.BatchNormalization()

    def build(self, input_shape):
        # forget gate
        self.Wf = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Uf = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.unit_forget_bias:
            self.bf = self.add_weight(shape=(self.units,), initializer='ones', regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)
        else:
            self.bf = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, trainable=True)
        # input gate
        self.Wi = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Ui = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.use_bias:
            self.bi = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)
        # output gate
        self.Wo = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Uo = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.use_bias:
            self.bo = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)
        # context
        self.Wc = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Uc = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.use_bias:
            self.bc = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)

    def _inp_gate(self, x, hidden):
        return self.recurrent_activation(tf.matmul(x, self.Wi) + tf.matmul(hidden, self.Ui) + self.bi)

    def _new_mem(self, x, hidden):
        return self.activation(tf.matmul(x, self.Wc) + tf.matmul(hidden, self.Uc) + self.bc)

    def _forget_gate(self, x, hidden):
        return self.recurrent_activation(tf.matmul(x, self.Wf) + tf.matmul(hidden, self.Uf) + self.bf)

    def _update_cell(self, c_prev, c_tilde, f_t, i_t):
        return (f_t * c_prev) + (i_t * c_tilde)

    def _out_gate(self, x, hidden, ct):
        ot = self.recurrent_activation(tf.matmul(x, self.Wo) + tf.matmul(hidden, self.Uo) + self.bo)
        return ot * self.activation(ct)

    def call(self, x, hidden, c_prev):
        if self.go_backwards: x = x[:,:,::-1]
        f_t = self._forget_gate(x, hidden)
        i_t = self._inp_gate(x, hidden)
        c_tilde = self._new_mem(x, hidden)
        c_t = self._update_cell(c_prev, c_tilde, f_t, i_t)
        h_t = self._out_gate(x, hidden, c_t)
        # if self.return_state:
        #     return h_t, c_t
        # if self.return_sequences:
        #     return h_t
        return h_t

tf.keras.backend.clear_session()

def get_LSTM():
    inp = tf.keras.layers.Input(shape=(200, 40))
    out = tf.keras.layers.LSTM(32)(inp)
    return tf.keras.Model(inp, out)

def get_EtienneLSTM():
    inp = tf.keras.layers.Input(shape=(200, 40))
    h0 = tf.keras.layers.Input(shape=(32,), name='h0')
    c0 = tf.keras.layers.Input(shape=(32,), name='c0')
    out = EtienneLSTM(32)(inp, h0, c0)
    return tf.keras.Model(inp, out)

model_tf = get_LSTM()
model_etienne = get_EtienneLSTM()
Here is my error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in
14
15 model_tf = get_LSTM()
---> 16 model_etienne = get_EtienneLSTM()
in get_EtienneLSTM()
11 c0 = tf.keras.layers.Input(shape=(32,), name='c0')
12 out = EtienneLSTM(32)(inp, h0, c0)
---> 13 return tf.keras.Model(inp, out)
14
15 model_tf = get_LSTM()
~/.env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in __init__(self, *args, **kwargs)
144
145 def __init__(self, *args, **kwargs):
--> 146 super(Model, self).__init__(*args, **kwargs)
147 _keras_api_gauge.get_cell('model').set(True)
148 # initializing _distribution_strategy here since it is possible to call
~/.env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/network.py in __init__(self, *args, **kwargs)
167 'inputs' in kwargs and 'outputs' in kwargs):
168 # Graph network
--> 169 self._init_graph_network(*args, **kwargs)
170 else:
171 # Subclassed network
~/.env/lib/python3.7/site-packages/tensorflow_core/python/training/tracking/base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
~/.env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/network.py in _init_graph_network(self, inputs, outputs, name, **kwargs)
322 # Keep track of the network's nodes and layers.
323 nodes, nodes_by_depth, layers, _ = _map_graph_network(
--> 324 self.inputs, self.outputs)
325 self._network_nodes = nodes
326 self._nodes_by_depth = nodes_by_depth
~/.env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/network.py in _map_graph_network(inputs, outputs)
1674 'The following previous layers '
1675 'were accessed without issue: ' +
-> 1676 str(layers_with_complete_input))
1677 for x in nest.flatten(node.output_tensors):
1678 computable_tensors.add(id(x))
ValueError: Graph disconnected: cannot obtain value for tensor Tensor("c0:0", shape=(None, 32), dtype=float32) at layer "c0". The following previous layers were accessed without issue: ['input_2']
Thank you for your help.

Resolved: it turns out I was implementing the LSTM the wrong way. A correct way to implement it is as follows:
class EtienneLSTM(tf.keras.layers.Layer):
    def __init__(self, units, activation='tanh', recurrent_activation='sigmoid',
                 kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros',
                 use_bias=True, unit_forget_bias=True,
                 kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None,
                 kernel_constraint=None, recurrent_constraint=None, bias_constraint=None,
                 # dropout=0.0, recurrent_dropout=0.0,
                 return_sequences=False, return_state=False, go_backwards=False, use_batchnorm=False):
        super(EtienneLSTM, self).__init__()
        self.units = units #
        self.activation = tf.keras.layers.Activation(activation) #
        self.recurrent_activation = tf.keras.layers.Activation(recurrent_activation) #
        self.use_bias = use_bias #
        self.kernel_initializer = kernel_initializer #
        self.recurrent_initializer = recurrent_initializer #
        self.bias_initializer = bias_initializer #
        self.unit_forget_bias = unit_forget_bias #
        if self.unit_forget_bias:
            self.bias_initializer = 'zeros'
        self.kernel_regularizer = kernel_regularizer #
        self.recurrent_regularizer = recurrent_regularizer #
        self.bias_regularizer = bias_regularizer #
        self.activity_regularizer = activity_regularizer
        self.kernel_constraint = kernel_constraint #
        self.recurrent_constraint = recurrent_constraint #
        self.bias_constraint = bias_constraint #
        # self.dropout = dropout
        # self.recurrent_dropout = recurrent_dropout
        self.return_sequences = return_sequences #
        self.return_state = return_state #
        self.go_backwards = go_backwards #
        self.use_batchnorm = use_batchnorm
        if self.use_batchnorm:
            self.batchnorm_f = tf.keras.layers.BatchNormalization()
            self.batchnorm_i = tf.keras.layers.BatchNormalization()
            self.batchnorm_o = tf.keras.layers.BatchNormalization()
            self.batchnorm_c = tf.keras.layers.BatchNormalization()

    def build(self, input_shape):
        # forget gate
        self.Wf = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Uf = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.unit_forget_bias:
            self.bf = self.add_weight(shape=(self.units,), initializer='ones', regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)
        else:
            self.bf = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, trainable=True)
        # input gate
        self.Wi = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Ui = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.use_bias:
            self.bi = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)
        # output gate
        self.Wo = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Uo = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.use_bias:
            self.bo = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)
        # context
        self.Wc = self.add_weight(shape=(input_shape[-1], self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint, trainable=True)
        self.Uc = self.add_weight(shape=(self.units, self.units), initializer=self.recurrent_initializer, regularizer=self.recurrent_regularizer, constraint=self.recurrent_constraint, trainable=True)
        if self.use_bias:
            self.bc = self.add_weight(shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, constraint=self.bias_constraint, trainable=True)

    def _inp_gate(self, x, hidden):
        return self.recurrent_activation(tf.matmul(x, self.Wi) + tf.matmul(hidden, self.Ui) + self.bi)

    def _new_mem(self, x, hidden):
        return self.activation(tf.matmul(x, self.Wc) + tf.matmul(hidden, self.Uc) + self.bc)

    def _forget_gate(self, x, hidden):
        return self.recurrent_activation(tf.matmul(x, self.Wf) + tf.matmul(hidden, self.Uf) + self.bf)

    def _update_cell(self, c_prev, c_tilde, f_t, i_t):
        return (f_t * c_prev) + (i_t * c_tilde)

    def _out_gate(self, x, hidden, ct):
        ot = self.recurrent_activation(tf.matmul(x, self.Wo) + tf.matmul(hidden, self.Uo) + self.bo)
        return ot * self.activation(ct)

    def step_function(self, x_t, states):
        h_t, c_t = states
        f_t = self._forget_gate(x_t, h_t)
        i_t = self._inp_gate(x_t, h_t)
        c_tilde = self._new_mem(x_t, h_t)
        c_t = self._update_cell(c_t, c_tilde, f_t, i_t)
        h_t = self._out_gate(x_t, h_t, c_t)
        return h_t, [h_t, c_t]

    def call(self, x):
        if self.go_backwards: x = x[:, ::-1, :]  # reverse along the time axis (axis 1)
        h_init = tf.zeros((tf.shape(x)[0], self.units))
        c_init = tf.zeros((tf.shape(x)[0], self.units))
        h, H, c = tf.keras.backend.rnn(self.step_function, x, (h_init, c_init))
        if self.return_state:
            return h, c
        if self.return_sequences:
            return H
        return h
This is in reference to this question. The key point is that tf.keras.backend.rnn needs to be used to unroll the recurrence over time.
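For completeness, a minimal sketch (not part of the original resolution) of how the fixed layer can be wired into a functional model; since the layer now creates its own zero initial states inside call, only the sequence input is needed and the graph stays connected:

def get_EtienneLSTM_fixed():
    # same shapes as in the question: 200 timesteps, 40 features, 32 units
    inp = tf.keras.layers.Input(shape=(200, 40))
    out = EtienneLSTM(32)(inp)   # no separate h0/c0 Input tensors anymore
    return tf.keras.Model(inp, out)

model_etienne = get_EtienneLSTM_fixed()
model_etienne.summary()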

Related

Pytorch, how to get the parameters of my network

I have a question about getting all the parameters of a network. My network is defined as follows:
activation = nn.ReLU()
class OneInputBasis(nn.Module):
    def __init__(self):
        super().__init__()
        bo_b = True
        bo_last = False
        self.l1 = nn.Linear(200, 100, bias = bo_b).to(device)
        self.l4 = nn.Linear(100, 100, bias = bo_last).to(device)

    def forward(self, v):
        v = activation ( self.l1(v) )
        v = ( self.l4(v) )
        return v
and
class node(nn.Module):
    def __init__(self):
        super().__init__()
        bo_b = True
        bo_last = False
        self.set_lay = []
        for jj in range(dim_output_space_basis):
            self.set_lay.append(OneInputBasis())

    def forward(self, v):
        w = self.set_lay[0](v)
        for ii in range(dim_output_space_basis-1):
            w = torch.cat((w, self.set_lay[ii+1](v)), dim = 1 )
        return w
and
class mesh(nn.Module):
    def __init__(self):
        super().__init__()
        bo_b = True
        bo_last = False
        self.l3 = nn.Linear(2, 100, bias = bo_b).to(device)
        self.l4 = nn.Linear(100, 100, bias = bo_b).to(device)
        self.l7 = nn.Linear(100, 10, bias = bo_last).to(device)

    def forward(self, w):
        w = activation ( self.l3(w) )
        w = activation ( self.l4(w) )
        w = ( self.l7(w) )
        return w
finally, I have
activation = nn.ReLU()
class Test(nn.Module):
    def __init__(self):
        super().__init__()
        bo_b = True
        bo_last = False
        self.top = node()
        self.bottom = mesh()

    def forward(self, v, w, y):
        v = self.top(v)
        w = self.bottom(w)
        e = torch.bmm(w, torch.bmm(v, y))
        return e[:, :, 0]
Now I define the network:
fnn_adam = Test()
When I print the parameters of the network, as
for p in fnn_adam.parameters():
    print(p)
I can only see the parameters associated with fnn_adam.bottom. How can I print out the parameters associated with fnn_adam.top? Are the parameters associated with .top trainable? Thank you!
Calling self.set_lay.append(OneInputBasis()) during the instantiation of node does not register the fully-connected layers

self.l1 = nn.Linear(200, 100, bias = bo_b).to(device)
self.l4 = nn.Linear(100, 100, bias = bo_last).to(device)

with the instance fnn_adam of class Test. This is why the respective parameters do not show up in your code above.
Without loss of generality, I chose
import torch
import torch.nn as nn
import torch.nn.functional as F
dim_output_space_basis = 2
device ='cpu'
and modified the __init__ method of class node. The remainder of your code is perfectly fine. Please see below:
class node(nn.Module):
    def __init__(self):
        super().__init__()
        bo_b = True
        bo_last = False
        # self.set_lay = [] # Legacy
        attributeNames = ['l_btm{}'.format(i) for i in range(dim_output_space_basis)]
        for jj_index, jj in enumerate(range(dim_output_space_basis)):
            # self.set_lay.append(OneInputBasis()) # Legacy
            setattr(self, attributeNames[jj_index], OneInputBasis())
Now the parameters are registered, as evidenced by running fnn_adam._modules and observing its output:
OrderedDict([('top',
              node(
                (l_btm0): OneInputBasis(
                  (l1): Linear(in_features=200, out_features=100, bias=True)
                  (l4): Linear(in_features=100, out_features=100, bias=False)
                )
                (l_btm1): OneInputBasis(
                  (l1): Linear(in_features=200, out_features=100, bias=True)
                  (l4): Linear(in_features=100, out_features=100, bias=False)
                )
              )),
             ('bottom',
              mesh(
                (l3): Linear(in_features=2, out_features=100, bias=True)
                (l4): Linear(in_features=100, out_features=100, bias=True)
                (l7): Linear(in_features=100, out_features=10, bias=False)
              ))])
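As a side note (not part of the original answer), the same effect can be achieved with nn.ModuleList, which registers every contained module automatically and keeps the list-style access used in forward:

class node(nn.Module):
    def __init__(self):
        super().__init__()
        # nn.ModuleList registers the sub-modules, so their parameters
        # appear in node().parameters() and in fnn_adam.parameters()
        self.set_lay = nn.ModuleList(
            [OneInputBasis() for _ in range(dim_output_space_basis)]
        )

    def forward(self, v):
        w = self.set_lay[0](v)
        for ii in range(dim_output_space_basis - 1):
            w = torch.cat((w, self.set_lay[ii + 1](v)), dim=1)
        return w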

The error that appears is NotImplementedError in __getitem__

I tried to implement a custom data generator using albumentations. I am getting raise NotImplementedError from __getitem__:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, images, label, augmentations, input_dim, batch_size=32,
                 shuffle=True):
        self.images = images
        self.label = label
        self.augment = augmentations
        self.batch_size = batch_size
        self.input_size = input_size
        self.model_name = model_name
        self.shuffle = shuffle

    def __len__(self):
        return int(np.ceil(len(self.images) / self.batch_size))

    def _getitem__(self, index):
        indexes = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]
        batch_y = np.array([self.label[k] for k in indexes])
        batch_x = [cv2.cvtColor(cv2.imread(self.images[k]), cv2.COLOR_RGB2BGR) for k in indexes]
        return np.stack([self.augment(image=x)["image"] for x in batch_x], axis=0), np.array(batch_y)
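The NotImplementedError comes from the tf.keras.utils.Sequence base class, whose __getitem__ simply raises it unless overridden; the method above is spelled _getitem__ (one leading underscore), so the base implementation is still the one being called, and self.indexes is never defined either. A sketch of the corrected pieces (the on_epoch_end helper and the attribute names are assumptions beyond the original post):

class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, images, label, augmentations, input_dim, batch_size=32, shuffle=True):
        self.images = images
        self.label = label
        self.augment = augmentations
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()  # build self.indexes before the first epoch

    def __len__(self):
        return int(np.ceil(len(self.images) / self.batch_size))

    def on_epoch_end(self):
        # index array used by __getitem__, reshuffled after every epoch
        self.indexes = np.arange(len(self.images))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):  # double underscores on both sides
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_y = np.array([self.label[k] for k in indexes])
        # cv2.imread returns BGR, so convert to RGB before augmenting
        batch_x = [cv2.cvtColor(cv2.imread(self.images[k]), cv2.COLOR_BGR2RGB) for k in indexes]
        return np.stack([self.augment(image=x)["image"] for x in batch_x], axis=0), batch_y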

Tensorflow 2 different batch size when doing inference

I have made a custom model in TensorFlow 2, which uses eager execution.
The model is trained using the inherited .fit() function; about 600k training samples are used over a 10-epoch cycle with a batch size of 128 (batches of up to 8k have also been tried). After training, the model is saved in the SavedModel format and then used from C++ via the cppflow library. However, this setup forces inference to use the same batch size as training, while I only need to run inference on a single sample at a time. The application needs to be fast, and padding a feature-vector array with 127 dummy vectors slows everything down.
The batch size is also used in the NormalizeLayer at the end, which currently uses a hardcoded units value to initialize a matrix.
I have searched for a way to use variable batch sizes in TensorFlow 2 custom models, but the only things that come remotely close are TF1 examples, which are so outdated they are unusable.
My model:
class IndividualFeaturesLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(IndividualFeaturesLayer, self).__init__()

    def build(self, input_shape):
        stddev = 2 / np.sqrt(input_shape[-1] + input_shape[-1])
        self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], input_shape[-1]), dtype='float64'), trainable=True)
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(initial_value=b_init(shape=(input_shape[-1]), dtype='float64'), trainable=True)

    def call(self, input):
        returnVar = tf.math.add(tf.matmul(input, self.w), self.b)
        return returnVar

class FullFeatureLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(FullFeatureLayer, self).__init__()
        self.globalFeatures = IndividualFeaturesLayer()
        self.pieceFeatures = IndividualFeaturesLayer()
        self.squareFeatures = IndividualFeaturesLayer()

    def call(self, input):
        globalFeature = input[:, :17]
        pieceFeature = input[:, 17:225]
        squareFeature = input[:, 225:353]
        x = self.globalFeatures(globalFeature)
        y = self.pieceFeatures(pieceFeature)
        z = self.squareFeatures(squareFeature)
        returnVar = tf.concat([x, y, z], 1)
        return tf.nn.relu(returnVar)

class FullFullyConnectedFeatureLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(FullFullyConnectedFeatureLayer, self).__init__()

    def build(self, input_shape):
        stddev = 2 / np.sqrt(input_shape[-1] + input_shape[-1])
        self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], input_shape[-1]), dtype='float64'), trainable=True)
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(initial_value=b_init(shape=(input_shape[-1]), dtype='float64'), trainable=True)

    def call(self, input):
        return tf.nn.relu(tf.math.add(tf.matmul(input, self.w), self.b))

class FullFullyConnectedOutputLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(FullFullyConnectedOutputLayer, self).__init__()

    def build(self, input_shape):
        stddev = 2 / np.sqrt(input_shape[-1] + 1)
        self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], 1), dtype='float64'), trainable=True)
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(initial_value=b_init(shape=(1), dtype='float64'), trainable=True)

    def call(self, input):
        return tf.matmul(input, self.w) + self.b

class NormalizeLayer(tf.keras.layers.Layer):
    def __init__(self, units=128):
        super(NormalizeLayer, self).__init__()
        self.units = units

    def build(self, input_shape):
        self.divideTensor = tf.fill((self.units, 1), tf.constant(1500, dtype='float64'))
        self.minTensor = tf.fill((self.units, 1), tf.constant(-1, dtype='float64'))
        self.maxTensor = tf.fill((self.units, 1), tf.constant(1, dtype='float64'))

    def call(self, input):
        dividedTensor = tf.divide(input, self.divideTensor)
        minimizedTensor = tf.math.minimum(dividedTensor, self.maxTensor)
        maximizedTensor = tf.math.maximum(minimizedTensor, self.minTensor)
        return maximizedTensor

class FullNetwork(tf.keras.Model):
    def __init__(self, batch_size):
        super(FullNetwork, self).__init__(name='')
        self.inputLayer = FullFeatureLayer()
        self.hiddenLayer1 = FullFeatureLayer()
        self.hiddenLayer2 = FullFullyConnectedFeatureLayer()
        self.outputLayer = FullFullyConnectedOutputLayer()
        self.normalizeLayer = NormalizeLayer()

    def call(self, input, batch_size):
        print(batch_size)
        x = self.inputLayer(input)
        x = self.hiddenLayer1(x)
        x = self.hiddenLayer2(x)
        x = self.outputLayer(x)
        x = self.normalizeLayer(x)
        return x

tf.keras.backend.set_floatx('float64')
fullNetwork = FullNetwork()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
fullNetwork.compile(optimizer, loss=tf.keras.losses.MeanSquaredError(), metrics=["MeanAbsoluteError"], run_eagerly=True)
fullNetwork.fit(training_feature_array, training_score_array, epochs=10, batch_size=128)
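One way to remove the batch-size dependency (a sketch, not the original author's code) is to let NormalizeLayer work with scalar constants that broadcast against whatever batch dimension arrives, e.g. via tf.clip_by_value; the same SavedModel can then be called with a batch of 128 during training and a batch of 1 from C++:

class NormalizeLayer(tf.keras.layers.Layer):
    def __init__(self, divisor=1500.0, **kwargs):
        super(NormalizeLayer, self).__init__(**kwargs)
        self.divisor = divisor

    def call(self, inputs):
        # scalar constants broadcast over any (batch, 1) input,
        # so nothing here is tied to a fixed batch size
        divisor = tf.constant(self.divisor, dtype=inputs.dtype)
        one = tf.constant(1.0, dtype=inputs.dtype)
        return tf.clip_by_value(inputs / divisor, -one, one)

Dropping the explicit batch_size argument from FullNetwork.call would be the matching change on the model side.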

Eager Execution, tf.GradientTape only returns None

I'm trying to calculate the gradient with tf.GradientTape. When I do it using the loss and Model.trainable_weights (tf.keras.Model) as inputs, the result it returns is an array of None. What am I doing wrong? The TensorFlow version I use is 1.13.0.
The implemented algorithm is an on-policy DQN (not the usual DQN), so I don't use a target network (which is used as the behavioural network in conventional DQN code). So I wanted to differentiate the error, which is defined as the minibatch MSE of Y (which is R + gamma * max_a Q(s', a')) and Q(s,a) in the code below.
import gym
import numpy as np
import tensorflow as tf
from collections import deque

# ==== import below from my repo ====
from common.wrappers import MyWrapper   # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters    # params for training
from common.memory import ReplayBuffer  # Experience Replay Buffer

tf.enable_eager_execution()

class Model(tf.keras.Model):
    def __init__(self, num_action):
        super(Model, self).__init__()
        self.dense1 = tf.keras.layers.Dense(16, activation='relu')
        self.dense2 = tf.keras.layers.Dense(16, activation='relu')
        self.dense3 = tf.keras.layers.Dense(16, activation='relu')
        self.pred = tf.keras.layers.Dense(num_action, activation='softmax')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        pred = self.pred(x)
        return pred

class DQN:
    """
    On policy DQN
    """
    def __init__(self, num_action):
        self.num_action = num_action
        self.model = Model(num_action)
        self.optimizer = tf.train.AdamOptimizer()

    def predict(self, state):
        return self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32)).numpy()[0]

    def update(self, state, action, target):
        # target: R + gamma * Q(s',a')
        # calculate Q(s,a)
        q_values = self.predict(state)
        actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
        action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
        # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
        loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
        return loss

if __name__ == '__main__':
    reward_buffer = deque(maxlen=5)
    env = MyWrapper(gym.make("CartPole-v0"))
    replay_buffer = ReplayBuffer(5000)
    params = Parameters(mode="CartPole")
    agent = DQN(env.action_space.n)

    for i in range(2000):
        state = env.reset()
        total_reward = 0
        for t in range(210):
            # env.render()
            action = np.argmax(agent.predict(state))  # behave greedily
            next_state, reward, done, info = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

            if done:
                print("Episode {0} finished after {1} timesteps".format(i, t + 1))

                if i > 10:
                    print("Update")
                    with tf.GradientTape() as tape:
                        states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
                        next_Q = agent.predict(next_states)
                        Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
                        loss = agent.update(states, actions, Y)
                        print(loss)

                    grads = tape.gradient(loss, agent.model.trainable_weights)

                    # ==== THIS RETURNS ONLY NONE ====
                    print(grads)

                    agent.optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))

                break

        # store the episode reward
        reward_buffer.append(total_reward)

        # check the stopping condition
        if np.mean(reward_buffer) > 195:
            print("GAME OVER!!")
            break

    env.close()
Try to change your update function to:
def update(self, state, action, target):
    # target: R + gamma * Q(s',a')
    # calculate Q(s,a)
    q_values = self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32))
    actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
    action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
    # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
    loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
    return loss
I think that with the .numpy() call in the predict function the tape loses the reference to the weights. (I've not tested my answer.)
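A minimal sketch (assuming TF 1.13 with eager execution enabled, as in the question) illustrating the point: once a value is pulled out with .numpy(), subsequent operations happen in NumPy/Python and the tape can no longer trace them back to the model's weights.

import tensorflow as tf
tf.enable_eager_execution()  # not needed in TF 2.x

w = tf.Variable(3.0)

with tf.GradientTape(persistent=True) as tape:
    y_tensor = w * 2.0             # stays on the tape
    y_numpy = (w * 2.0).numpy()    # leaves the graph: now a plain Python float
    loss_ok = y_tensor ** 2
    loss_broken = tf.convert_to_tensor(y_numpy ** 2)

print(tape.gradient(loss_ok, w))       # a real gradient (24.0)
print(tape.gradient(loss_broken, w))   # None: the tape never saw the NumPy math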

Implementing Luong Attention in PyTorch

I am trying to implement the attention described in Luong et al. 2015 in PyTorch myself, but I couldn't get it to work. Below is my code; I am only interested in the "general" attention case for now. I wonder if I am missing any obvious error. It runs, but doesn't seem to learn.
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.embedding = nn.Embedding(
            num_embeddings=self.output_size,
            embedding_dim=self.hidden_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size, self.hidden_size)
        # hc: [hidden, context]
        self.Whc = nn.Linear(self.hidden_size * 2, self.hidden_size)
        # s: softmax
        self.Ws = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        gru_out, hidden = self.gru(embedded, hidden)
        # [0] removes the dimension of directions x layers for now
        attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
        attn_weights = F.softmax(attn_prod, dim=1)  # eq. 7/8
        context = torch.mm(attn_weights, encoder_outputs)
        # hc: [hidden: context]
        out_hc = F.tanh(self.Whc(torch.cat([hidden[0], context], dim=1)))  # eq. 5
        output = F.log_softmax(self.Ws(out_hc), dim=1)  # eq. 6
        return output, hidden, attn_weights
I have studied the attention implemented in
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
and
https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
The first one isn't the exact attention mechanism I am looking for. A major disadvantage is that its attention depends on the sequence length (self.attn = nn.Linear(self.hidden_size * 2, self.max_length)), which could be expensive for long sequences.
The second one is more similar to what's described in the paper, but still not the same, as there is no tanh. Besides, it is really slow after updating it to the latest version of PyTorch (ref). Also, I don't know why it takes the last context (ref).
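For reference, the "general" score in Luong et al. (2015) is score(h_t, h_s) = h_t^T W_a h_s over every source state h_s; in the code above this corresponds to the self.attn projection (W_a) followed by the matrix product with encoder_outputs, computed for all source positions at once:

# given hidden[0] of shape (1, hidden_size) and encoder_outputs of shape (src_len, hidden_size),
# this yields the general scores for all source positions, shape (1, src_len)
attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())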
This version works and closely follows the definition of Luong attention (general). The main difference from the code in the question is the separation of embedding_size and hidden_size, which appears to be important for training after some experimentation. Previously I made both of them the same size (256), which created trouble for learning; it seemed the network could only learn half the sequence.
class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size,
                 num_layers=1, bidirectional=False, batch_size=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.batch_size = batch_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers,
                          bidirectional=bidirectional)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        directions = 2 if self.bidirectional else 1
        return torch.zeros(
            self.num_layers * directions,
            self.batch_size,
            self.hidden_size,
            device=DEVICE
        )

class AttnDecoderRNN(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, dropout_p=0):
        super(AttnDecoderRNN, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.embedding = nn.Embedding(
            num_embeddings=output_size,
            embedding_dim=embedding_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(embedding_size, hidden_size)
        self.attn = nn.Linear(hidden_size, hidden_size)
        # hc: [hidden, context]
        self.Whc = nn.Linear(hidden_size * 2, hidden_size)
        # s: softmax
        self.Ws = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        gru_out, hidden = self.gru(embedded, hidden)
        attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
        attn_weights = F.softmax(attn_prod, dim=1)
        context = torch.mm(attn_weights, encoder_outputs)
        # hc: [hidden: context]
        hc = torch.cat([hidden[0], context], dim=1)
        out_hc = F.tanh(self.Whc(hc))
        output = F.log_softmax(self.Ws(out_hc), dim=1)
        return output, hidden, attn_weights
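A short usage sketch (the vocabulary sizes and hyper-parameters are placeholders, not from the answer; DEVICE is assumed to be defined as in the answer's code) showing how the two modules fit together for a single decoding step:

import torch

DEVICE = torch.device('cpu')

input_vocab, output_vocab = 1000, 1200          # hypothetical vocabulary sizes
embedding_size, hidden_size, seq_len = 128, 256, 10

encoder = EncoderRNN(input_vocab, embedding_size, hidden_size)
decoder = AttnDecoderRNN(embedding_size, hidden_size, output_vocab)

# encode a toy source sentence one token at a time
enc_hidden = encoder.initHidden()
encoder_outputs = torch.zeros(seq_len, hidden_size, device=DEVICE)
src = torch.randint(0, input_vocab, (seq_len,), device=DEVICE)
for t in range(seq_len):
    enc_out, enc_hidden = encoder(src[t], enc_hidden)
    encoder_outputs[t] = enc_out[0, 0]

# one decoder step attends over all encoder outputs
dec_input = torch.tensor([0], device=DEVICE)    # e.g. an SOS token id
output, dec_hidden, attn = decoder(dec_input, enc_hidden, encoder_outputs)
print(output.shape, attn.shape)  # (1, output_vocab) and (1, seq_len)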
