Implementing Luong Attention in PyTorch - pytorch

I am trying to implement the attention described in Luong et al. 2015 in PyTorch myself, but I couldn't get it work. Below is my code, I am only interested in the "general" attention case for now. I wonder if I am missing any obvious error. It runs, but doesn't seem to learn.
class AttnDecoderRNN(nn.Module):
def __init__(self, hidden_size, output_size, dropout_p=0.1):
super(AttnDecoderRNN, self).__init__()
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_p = dropout_p
self.embedding = nn.Embedding(
num_embeddings=self.output_size,
embedding_dim=self.hidden_size
)
self.dropout = nn.Dropout(self.dropout_p)
self.gru = nn.GRU(self.hidden_size, self.hidden_size)
self.attn = nn.Linear(self.hidden_size, self.hidden_size)
# hc: [hidden, context]
self.Whc = nn.Linear(self.hidden_size * 2, self.hidden_size)
# s: softmax
self.Ws = nn.Linear(self.hidden_size, self.output_size)
def forward(self, input, hidden, encoder_outputs):
embedded = self.embedding(input).view(1, 1, -1)
embedded = self.dropout(embedded)
gru_out, hidden = self.gru(embedded, hidden)
# [0] remove the dimension of directions x layers for now
attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
attn_weights = F.softmax(attn_prod, dim=1) # eq. 7/8
context = torch.mm(attn_weights, encoder_outputs)
# hc: [hidden: context]
out_hc = F.tanh(self.Whc(torch.cat([hidden[0], context], dim=1)) # eq.5
output = F.log_softmax(self.Ws(out_hc), dim=1) eq. 6
return output, hidden, attn_weights
I have studied the attention implemented in
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
and
https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
The first one isn't the exact attention mechanism I am looking for. A major disadvantage is that its attention depends on the sequence length (self.attn = nn.Linear(self.hidden_size * 2, self.max_length)), which could be expensive for long sequences.
The second one is more similar to what's described in the paper, but still not the same as there is not tanh. Besides, it is really slow after updating it to latest version of pytorch (ref). Also I don't know why it takes the last context (ref).

This version works, and it follows the definition of Luong Attention (general), closely. The main difference from that in the question is the separation of embedding_size and hidden_size, which appears to be important for training after experimentation. Previously, I made both of them the same size (256), which creates trouble for learning, and it seems that the network could only learn half the sequence.
class EncoderRNN(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size,
num_layers=1, bidirectional=False, batch_size=1):
super(EncoderRNN, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bidirectional = bidirectional
self.batch_size = batch_size
self.embedding = nn.Embedding(input_size, embedding_size)
self.gru = nn.GRU(embedding_size, hidden_size, num_layers,
bidirectional=bidirectional)
def forward(self, input, hidden):
embedded = self.embedding(input).view(1, 1, -1)
output, hidden = self.gru(embedded, hidden)
return output, hidden
def initHidden(self):
directions = 2 if self.bidirectional else 1
return torch.zeros(
self.num_layers * directions,
self.batch_size,
self.hidden_size,
device=DEVICE
)
class AttnDecoderRNN(nn.Module):
def __init__(self, embedding_size, hidden_size, output_size, dropout_p=0):
super(AttnDecoderRNN, self).__init__()
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_p = dropout_p
self.embedding = nn.Embedding(
num_embeddings=output_size,
embedding_dim=embedding_size
)
self.dropout = nn.Dropout(self.dropout_p)
self.gru = nn.GRU(embedding_size, hidden_size)
self.attn = nn.Linear(hidden_size, hidden_size)
# hc: [hidden, context]
self.Whc = nn.Linear(hidden_size * 2, hidden_size)
# s: softmax
self.Ws = nn.Linear(hidden_size, output_size)
def forward(self, input, hidden, encoder_outputs):
embedded = self.embedding(input).view(1, 1, -1)
embedded = self.dropout(embedded)
gru_out, hidden = self.gru(embedded, hidden)
attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
attn_weights = F.softmax(attn_prod, dim=1)
context = torch.mm(attn_weights, encoder_outputs)
# hc: [hidden: context]
hc = torch.cat([hidden[0], context], dim=1)
out_hc = F.tanh(self.Whc(hc))
output = F.log_softmax(self.Ws(out_hc), dim=1)
return output, hidden, attn_weights

Related

How can I call random of data with dataloader

When I am training a model, I should use only 10% of data for trainer.fit(model,datamodule)
so I should call DataModule just for 10% of data
Part of DataModule is:
class DataModule(pl.LightningDataModule):
def __init__(self, train_dataset, val_dataset, batch_size = 1):
super(DataModule, self).__init__()
self.train_dataset = train_dataset
self.val_dataset = val_dataset
self.batch_size = batch_size
def train_dataloader(self):
return DataLoader(self.train_dataset, batch_size = self.batch_size,
collate_fn = collate_fn, shuffle = True, num_workers = 2, pin_memory = True)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size = self.batch_size,
collate_fn = collate_fn, shuffle = False, num_workers = 2, pin_memory = True)
So I use a for loop
datamodule = DataModule(train_ds, val_ds)
for i,data in enumerate(datamodule.train_dataloader()):
print( datamodule.train_dataloader(i,data))
But it doesn't work. How can I change it?

Pytorch, how to get the parameters of my network

I have a question about getting all parameters of the network. My network is defined as follow:
activation = nn.ReLU()
class OneInputBasis(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.l1 = nn.Linear(200, 100, bias = bo_b).to(device)
self.l4 = nn.Linear(100, 100, bias = bo_last).to(device)
def forward(self, v):
v = activation ( self.l1(v) )
v = ( self.l4(v) )
return v
and
class node(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.set_lay = []
for jj in range(dim_output_space_basis):
self.set_lay.append(OneInputBasis())
def forward(self, v):
w = self.set_lay[0](v)
for ii in range(dim_output_space_basis-1):
w = torch.cat((w, self.set_lay[ii+1](v)), dim = 1 )
return w
and
class mesh(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.l3 = nn.Linear(2, 100, bias = bo_b).to(device)
self.l4 = nn.Linear(100, 100, bias = bo_b).to(device)
self.l7 = nn.Linear(100,10, bias = bo_last).to(device)
def forward(self, w):
w = activation ( self.l3(w) )
w = activation ( self.l4(w) )
w = ( self.l7(w) )
return w
finally, I have
activation = nn.ReLU()
class Test(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.top = node()
self.bottom = mesh()
def forward(self, v, w, y):
v = self.top(v)
w = self.bottom(w)
e = torch.bmm(w ,torch.bmm(v, y))
return e[:, :, 0]
Now I define the network:
fnn_adam = Test()
When I print the parameters of the network, as
for p in fnn_adam.parameters():
print(p)
I can only see the parameters associated with fnn_adam.bottom, how can I print out the parameters associated with fnn_adam.top? Are the parameters associated with .top trainable? Thank you!
Calling self.set_lay.append(OneInputBasis()) with the instantiation of node does not register the fully-connected layers
self.l1 = nn.Linear(200, 100, bias = bo_b).to(device)
self.l4 = nn.Linear(100, 100, bias = bo_last).to(device)
to the instance fnn_adam of class Test. This is why the respective parameters do not show up in your code above.
Without loss of generality, I chose
import torch
import torch.nn as nn
import torch.nn.functional as F
dim_output_space_basis = 2
device ='cpu'
and modified the init method of class node. The remainder of your code is perfectly fine. Please see below:
class node(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
# self.set_lay = [] # Legacy
attributeNames = ['l_btm{}'.format(i) for i in range(dim_output_space_basis)]
for jj_index, jj in enumerate(range(dim_output_space_basis)):
# self.set_lay.append(OneInputBasis()) # Legacy
setattr(self, attributeNames[jj_index], OneInputBasis())
Now, the parameters register as evidenced by running fnn_adam._modules and observing its output
OrderedDict([('top',
node(
(l_btm0): OneInputBasis(
(l1): Linear(in_features=200, out_features=100, bias=True)
(l4): Linear(in_features=100, out_features=100, bias=False)
)
(l_btm1): OneInputBasis(
(l1): Linear(in_features=200, out_features=100, bias=True)
(l4): Linear(in_features=100, out_features=100, bias=False)
)
)),
('bottom',
mesh(
(l3): Linear(in_features=2, out_features=100, bias=True)
(l4): Linear(in_features=100, out_features=100, bias=True)
(l7): Linear(in_features=100, out_features=10, bias=False)
))])

the error that appears is not implemented error in getitem

#I tried to implement custom data generator using albumentations.I an getting getitem
raise NotImplementedError
class DataGenerator(tf.keras.utils.Sequence):
def __init__(self, images, label, augmentations, input_dim, batch_size=32,
shuffle=True):
self.images = images
self.label = label
self.augment = augmentations
self.batch_size = batch_size
self.input_size = input_size
self.model_name = model_name
self.shuffle = shuffle
def __len__(self):
return int(np.ceil(len(self.images) / self.batch_size))
def _getitem__(self, index):
indexes = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]
batch_y = np.array([self.label[k] for k in indexes])
batch_x = [cv2.cvtColor(cv2.imread(self.images[k]), cv2.COLOR_RGB2BGR) for k in indexes]
return np.stack([self.augment(image=x)["image"] for x in batch_x], axis=0), np.array(batch_y)

Tensorflow 2 different batch size when doing inference

I have made a custom model in tensorflow 2, which uses eager execution.
The model is trained using the inherited .fit() function, about 600k training samples are used in a 10 epoch cycle with a batch size of 128 (up to 8k batch has been done). After training the model is saved as a SavedModel format. This is then used in C++ by using the cppflow library. However, this process requires the inference to use the same batch size as the training of the model, while only requiring to do inference on a single sample at a time. The application requires that things are fast and padding a feature vector array with 127 dummy vectors is slowing everyting down.
The batch size is also used in the NormalizeLayer at the end, which is using a hardcoded units value at the moment to initialize a matrix.
I have searched for a way to use variable batch sizes in Tensorflow 2 custom models, but the only thing that is remotely close are TF1 examples; which are so outdated they are unusable.
My model:
class IndividualFeaturesLayer(tf.keras.layers.Layer):
def __init__(self):
super(IndividualFeaturesLayer, self).__init__()
def build(self, input_shape):
stddev = 2 / np.sqrt(input_shape[-1] + input_shape[-1])
self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], input_shape[-1]), dtype='float64'), trainable=True)
b_init = tf.zeros_initializer()
self.b = tf.Variable(initial_value=b_init(shape=(input_shape[-1]), dtype='float64'), trainable=True)
def call(self, input):
returnVar = tf.math.add(tf.matmul(input, self.w), self.b)
return returnVar
class FullFeatureLayer(tf.keras.layers.Layer):
def __init__(self):
super(FullFeatureLayer, self).__init__()
self.globalFeatures = IndividualFeaturesLayer()
self.pieceFeatures = IndividualFeaturesLayer()
self.squareFeatures = IndividualFeaturesLayer()
def call(self, input):
globalFeature = input[:, :17]
pieceFeature = input[:, 17:225]
squareFeature = input[:, 225:353]
x = self.globalFeatures(globalFeature)
y = self.pieceFeatures(pieceFeature)
z = self.squareFeatures(squareFeature)
returnVar = tf.concat([x, y, z], 1)
return tf.nn.relu(returnVar)
class FullFullyConnectedFeatureLayer(tf.keras.layers.Layer):
def __init__(self):
super(FullFullyConnectedFeatureLayer, self).__init__()
def build(self, input_shape):
stddev = 2 / np.sqrt(input_shape[-1] + input_shape[-1])
self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], input_shape[-1]), dtype='float64'), trainable=True)
b_init = tf.zeros_initializer()
self.b = tf.Variable(initial_value=b_init(shape=(input_shape[-1]), dtype='float64'), trainable=True)
def call(self, input):
return tf.nn.relu(tf.math.add(tf.matmul(input, self.w), self.b))
class FullFullyConnectedOutputLayer(tf.keras.layers.Layer):
def __init__(self):
super(FullFullyConnectedOutputLayer, self).__init__()
def build(self, input_shape):
stddev = 2 / np.sqrt(input_shape[-1] + 1)
self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], 1), dtype='float64'), trainable=True)
b_init = tf.zeros_initializer()
self.b = tf.Variable(initial_value=b_init(shape=(1), dtype='float64'), trainable=True)
def call(self, input):
return tf.matmul(input, self.w) + self.b
class NormalizeLayer(tf.keras.layers.Layer):
def __init__(self, units=128):
super(NormalizeLayer, self).__init__()
self.units = units
def build(self, input_shape):
self.divideTensor = tf.fill((self.units, 1), tf.constant(1500, dtype='float64'))
self.minTensor = tf.fill((self.units, 1), tf.constant(-1, dtype='float64'))
self.maxTensor = tf.fill((self.units, 1), tf.constant(1, dtype='float64'))
def call(self, input):
dividedTensor = tf.divide(input, self.divideTensor)
minimizedTensor = tf.math.minimum(dividedTensor, self.maxTensor)
maximizedTensor = tf.math.maximum(minimizedTensor, self.minTensor)
return maximizedTensor
class FullNetwork(tf.keras.Model):
def __init__(self, batch_size):
super(FullNetwork, self).__init__(name='')
self.inputLayer = FullFeatureLayer()
self.hiddenLayer1 = FullFeatureLayer()
self.hiddenLayer2 = FullFullyConnectedFeatureLayer()
self.outputLayer = FullFullyConnectedOutputLayer()
self.normalizeLayer = NormalizeLayer()
def call(self, input, batch_size):
print(batch_size)
x = self.inputLayer(input)
x = self.hiddenLayer1(x)
x = self.hiddenLayer2(x)
x = self.outputLayer(x)
x = self.normalizeLayer(x)
return x
tf.keras.backend.set_floatx('float64')
fullNetwork = FullNetwork()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
fullNetwork.compile(optimizer, loss=tf.keras.losses.MeanSquaredError(), metrics=["MeanAbsoluteError"], run_eagerly=True)
fullNetwork.fit(training_feature_array, training_score_array, epochs=10, batch_size=128)

Eager Execution, tf.GradientTape only returns None

I'm trying to calculate the gradient with tf.GradientTape. When I try to do it using as inputs the loss and Model.trainable_weights (tf.keras.Model) the result that returns me in an array of None. what am I doing wrong? The tensorflow version I use is 1.13.0.
The implemneted algorithm is a OnPolicy DQN(Not usual DQN) so that I don't use a target network(whihch is used as behavioural network in conventional DQN code). So, I wanted to differentiate the Error, which is defined as a minibatch MSE of Y(which is R + gamma * max_a Q(s', a')) and Q(s,a) in the code below.
import gym
import numpy as np
import tensorflow as tf
from collections import deque
# ==== import below from my repo ====
from common.wrappers import MyWrapper # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters # params for training
from common.memory import ReplayBuffer # Experience Replay Buffer
tf.enable_eager_execution()
class Model(tf.keras.Model):
def __init__(self, num_action):
super(Model, self).__init__()
self.dense1 = tf.keras.layers.Dense(16, activation='relu')
self.dense2 = tf.keras.layers.Dense(16, activation='relu')
self.dense3 = tf.keras.layers.Dense(16, activation='relu')
self.pred = tf.keras.layers.Dense(num_action, activation='softmax')
def call(self, inputs):
x = self.dense1(inputs)
x = self.dense2(x)
x = self.dense3(x)
pred = self.pred(x)
return pred
class DQN:
"""
On policy DQN
"""
def __init__(self, num_action):
self.num_action = num_action
self.model = Model(num_action)
self.optimizer = tf.train.AdamOptimizer()
def predict(self, state):
return self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32)).numpy()[0]
def update(self, state, action, target):
# target: R + gamma * Q(s',a')
# calculate Q(s,a)
q_values = self.predict(state)
actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
# Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
return loss
if __name__ == '__main__':
reward_buffer = deque(maxlen=5)
env = MyWrapper(gym.make("CartPole-v0"))
replay_buffer = ReplayBuffer(5000)
params = Parameters(mode="CartPole")
agent = DQN(env.action_space.n)
for i in range(2000):
state = env.reset()
total_reward = 0
for t in range(210):
# env.render()
action = np.argmax(agent.predict(state)) # behave greedily
next_state, reward, done, info = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
total_reward += reward
state = next_state
if done:
print("Episode {0} finished after {1} timesteps".format(i, t + 1))
if i > 10:
print("Update")
with tf.GradientTape() as tape:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = agent.predict(next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = agent.update(states, actions, Y)
print(loss)
grads = tape.gradient(loss, agent.model.trainable_weights)
# ==== THIS RETURNS ONLY NONE ====
print(grads)
agent.optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))
break
# store the episode reward
reward_buffer.append(total_reward)
# check the stopping condition
if np.mean(reward_buffer) > 195:
print("GAME OVER!!")
break
env.close()
import gym
import numpy as np
import tensorflow as tf
from collections import deque
# ==== import below from my repo ====
from common.wrappers import MyWrapper # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters # params for training
from common.memory import ReplayBuffer # Experience Replay Buffer
tf.enable_eager_execution()
class Model(tf.keras.Model):
def __init__(self, num_action):
super(Model, self).__init__()
self.dense1 = tf.keras.layers.Dense(16, activation='relu')
self.dense2 = tf.keras.layers.Dense(16, activation='relu')
self.dense3 = tf.keras.layers.Dense(16, activation='relu')
self.pred = tf.keras.layers.Dense(num_action, activation='softmax')
def call(self, inputs):
x = self.dense1(inputs)
x = self.dense2(x)
x = self.dense3(x)
pred = self.pred(x)
return pred
class DQN:
"""
On policy DQN
"""
def __init__(self, num_action):
self.num_action = num_action
self.model = Model(num_action)
self.optimizer = tf.train.AdamOptimizer()
def predict(self, state):
return self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32)).numpy()[0]
def update(self, state, action, target):
# target: R + gamma * Q(s',a')
# calculate Q(s,a)
q_values = self.predict(state)
actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
# Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
return loss
if __name__ == '__main__':
reward_buffer = deque(maxlen=5)
env = MyWrapper(gym.make("CartPole-v0"))
replay_buffer = ReplayBuffer(5000)
params = Parameters(mode="CartPole")
agent = DQN(env.action_space.n)
for i in range(2000):
state = env.reset()
total_reward = 0
for t in range(210):
# env.render()
action = np.argmax(agent.predict(state)) # behave greedily
next_state, reward, done, info = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
total_reward += reward
state = next_state
if done:
print("Episode {0} finished after {1} timesteps".format(i, t + 1))
if i > 10:
print("Update")
with tf.GradientTape() as tape:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = agent.predict(next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = agent.update(states, actions, Y)
print(loss)
grads = tape.gradient(loss, agent.model.trainable_weights)
# ==== THIS RETURNS ONLY NONE ====
print(grads)
agent.optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))
break
# store the episode reward
reward_buffer.append(total_reward)
# check the stopping condition
if np.mean(reward_buffer) > 195:
print("GAME OVER!!")
break
env.close()
import gym
import numpy as np
import tensorflow as tf
from collections import deque
# ==== import below from my repo ====
from common.wrappers import MyWrapper # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters # params for training
from common.memory import ReplayBuffer # Experience Replay Buffer
tf.enable_eager_execution()
class Model(tf.keras.Model):
def __init__(self, num_action):
super(Model, self).__init__()
self.dense1 = tf.keras.layers.Dense(16, activation='relu')
self.dense2 = tf.keras.layers.Dense(16, activation='relu')
self.dense3 = tf.keras.layers.Dense(16, activation='relu')
self.pred = tf.keras.layers.Dense(num_action, activation='softmax')
def call(self, inputs):
x = self.dense1(inputs)
x = self.dense2(x)
x = self.dense3(x)
pred = self.pred(x)
return pred
class DQN:
"""
On policy DQN
"""
def __init__(self, num_action):
self.num_action = num_action
self.model = Model(num_action)
self.optimizer = tf.train.AdamOptimizer()
def predict(self, state):
return self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32)).numpy()[0]
def update(self, state, action, target):
# target: R + gamma * Q(s',a')
# calculate Q(s,a)
q_values = self.predict(state)
actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
# Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
return loss
if __name__ == '__main__':
reward_buffer = deque(maxlen=5)
env = MyWrapper(gym.make("CartPole-v0"))
replay_buffer = ReplayBuffer(5000)
params = Parameters(mode="CartPole")
agent = DQN(env.action_space.n)
for i in range(2000):
state = env.reset()
total_reward = 0
for t in range(210):
# env.render()
action = np.argmax(agent.predict(state)) # behave greedily
next_state, reward, done, info = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
total_reward += reward
state = next_state
if done:
print("Episode {0} finished after {1} timesteps".format(i, t + 1))
if i > 10:
print("Update")
with tf.GradientTape() as tape:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = agent.predict(next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = agent.update(states, actions, Y)
print(loss)
grads = tape.gradient(loss, agent.model.trainable_weights)
# ==== THIS RETURNS ONLY NONE ====
print(grads)
agent.optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))
break
# store the episode reward
reward_buffer.append(total_reward)
# check the stopping condition
if np.mean(reward_buffer) > 195:
print("GAME OVER!!")
break
env.close()
import gym
import numpy as np
import tensorflow as tf
from collections import deque
# ==== import below from my repo ====
from common.wrappers import MyWrapper # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters # params for training
from common.memory import ReplayBuffer # Experience Replay Buffer
tf.enable_eager_execution()
class Model(tf.keras.Model):
def __init__(self, num_action):
super(Model, self).__init__()
self.dense1 = tf.keras.layers.Dense(16, activation='relu')
self.dense2 = tf.keras.layers.Dense(16, activation='relu')
self.dense3 = tf.keras.layers.Dense(16, activation='relu')
self.pred = tf.keras.layers.Dense(num_action, activation='softmax')
def call(self, inputs):
x = self.dense1(inputs)
x = self.dense2(x)
x = self.dense3(x)
pred = self.pred(x)
return pred
class DQN:
"""
On policy DQN
"""
def __init__(self, num_action):
self.num_action = num_action
self.model = Model(num_action)
self.optimizer = tf.train.AdamOptimizer()
def predict(self, state):
return self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32)).numpy()[0]
def update(self, state, action, target):
# target: R + gamma * Q(s',a')
# calculate Q(s,a)
q_values = self.predict(state)
actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
# Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
return loss
if __name__ == '__main__':
reward_buffer = deque(maxlen=5)
env = MyWrapper(gym.make("CartPole-v0"))
replay_buffer = ReplayBuffer(5000)
params = Parameters(mode="CartPole")
agent = DQN(env.action_space.n)
for i in range(2000):
state = env.reset()
total_reward = 0
for t in range(210):
# env.render()
action = np.argmax(agent.predict(state)) # behave greedily
next_state, reward, done, info = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
total_reward += reward
state = next_state
if done:
print("Episode {0} finished after {1} timesteps".format(i, t + 1))
if i > 10:
print("Update")
with tf.GradientTape() as tape:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = agent.predict(next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = agent.update(states, actions, Y)
print(loss)
grads = tape.gradient(loss, agent.model.trainable_weights)
# ==== THIS RETURNS ONLY NONE ====
print(grads)
agent.optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))
break
# store the episode reward
reward_buffer.append(total_reward)
# check the stopping condition
if np.mean(reward_buffer) > 195:
print("GAME OVER!!")
break
env.close()
Try to change your update function to:
def update(self, state, action, target):
# target: R + gamma * Q(s',a')
# calculate Q(s,a)
q_values = self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32))
actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
# Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
return loss
I think with the .numpy() call in the predict function the tape loses the refererence to the weights. (I've not tested my answer)

Resources