When training a model, I want to use only 10% of the data in trainer.fit(model, datamodule),
so the DataModule should only serve 10% of the data.
Part of the DataModule is:
class DataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, val_dataset, batch_size=1):
        super(DataModule, self).__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size,
                          collate_fn=collate_fn, shuffle=True, num_workers=2, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size,
                          collate_fn=collate_fn, shuffle=False, num_workers=2, pin_memory=True)
So I tried a for loop:

datamodule = DataModule(train_ds, val_ds)
for i, data in enumerate(datamodule.train_dataloader()):
    print(datamodule.train_dataloader(i, data))
But it doesn't work. How can I change it?
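For reference, one common way to do this in PyTorch Lightning is sketched below. This is an untested sketch, assuming the model, train_ds, val_ds and DataModule from the snippet above, and a Lightning version whose Trainer accepts limit_train_batches (recent releases do); adapt names as needed.

import torch
from torch.utils.data import Subset
import pytorch_lightning as pl

# Option 1: keep the DataModule as-is and let the Trainer cap the
# fraction of training batches used per epoch.
trainer = pl.Trainer(limit_train_batches=0.1)
trainer.fit(model, datamodule=DataModule(train_ds, val_ds))

# Option 2 (alternative): hand the DataModule a random 10% subset
# of the training dataset itself.
n = len(train_ds)
keep = torch.randperm(n)[: max(1, int(0.1 * n))].tolist()
small_dm = DataModule(Subset(train_ds, keep), val_ds)
pl.Trainer().fit(model, datamodule=small_dm)

With option 1 the validation set is untouched; with option 2 only the training split is shrunk, while val_ds is passed through unchanged.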
I'm trying to calculate gradients with tf.GradientTape. When I pass it the loss and Model.trainable_weights (tf.keras.Model) as inputs, the result it returns is an array of None. What am I doing wrong? The TensorFlow version I use is 1.13.0.
The implemented algorithm is an on-policy DQN (not the usual DQN), so I don't use a target network (which serves as the behavioural network in conventional DQN code). I want to differentiate the error, defined as the minibatch MSE between Y (which is R + gamma * max_a Q(s', a')) and Q(s, a), shown in the code below.
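Written out, the quantity being differentiated with respect to the network weights $\theta$ is

$$L(\theta) = \frac{1}{B} \sum_{i=1}^{B} \Big( r_i + \gamma \max_{a'} Q_\theta(s'_i, a') - Q_\theta(s_i, a_i) \Big)^2,$$

where both Q-terms come from the same network, since there is no separate target network.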
import gym
import numpy as np
import tensorflow as tf
from collections import deque

# ==== import below from my repo ====
from common.wrappers import MyWrapper   # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters    # params for training
from common.memory import ReplayBuffer  # Experience Replay Buffer

tf.enable_eager_execution()


class Model(tf.keras.Model):
    def __init__(self, num_action):
        super(Model, self).__init__()
        self.dense1 = tf.keras.layers.Dense(16, activation='relu')
        self.dense2 = tf.keras.layers.Dense(16, activation='relu')
        self.dense3 = tf.keras.layers.Dense(16, activation='relu')
        self.pred = tf.keras.layers.Dense(num_action, activation='softmax')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        pred = self.pred(x)
        return pred


class DQN:
    """
    On-policy DQN
    """

    def __init__(self, num_action):
        self.num_action = num_action
        self.model = Model(num_action)
        self.optimizer = tf.train.AdamOptimizer()

    def predict(self, state):
        return self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32)).numpy()[0]

    def update(self, state, action, target):
        # target: R + gamma * Q(s',a')
        # calculate Q(s,a)
        q_values = self.predict(state)
        actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
        action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
        # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
        loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
        return loss


if __name__ == '__main__':
    reward_buffer = deque(maxlen=5)
    env = MyWrapper(gym.make("CartPole-v0"))
    replay_buffer = ReplayBuffer(5000)
    params = Parameters(mode="CartPole")
    agent = DQN(env.action_space.n)

    for i in range(2000):
        state = env.reset()
        total_reward = 0
        for t in range(210):
            # env.render()
            action = np.argmax(agent.predict(state))  # behave greedily
            next_state, reward, done, info = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

            if done:
                print("Episode {0} finished after {1} timesteps".format(i, t + 1))

                if i > 10:
                    print("Update")
                    with tf.GradientTape() as tape:
                        states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
                        next_Q = agent.predict(next_states)
                        Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
                        loss = agent.update(states, actions, Y)
                        print(loss)

                    grads = tape.gradient(loss, agent.model.trainable_weights)

                    # ==== THIS RETURNS ONLY NONE ====
                    print(grads)

                    agent.optimizer.apply_gradients(zip(grads, agent.model.trainable_weights))

                break

        # store the episode reward
        reward_buffer.append(total_reward)

        # check the stopping condition
        if np.mean(reward_buffer) > 195:
            print("GAME OVER!!")
            break

    env.close()
Try to change your update function to:
def update(self, state, action, target):
    # target: R + gamma * Q(s',a')
    # calculate Q(s,a)
    q_values = self.model(tf.convert_to_tensor(state[None, :], dtype=tf.float32))
    actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
    action_probs = tf.reduce_sum(actions_one_hot * q_values, reduction_indices=-1)
    # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
    loss = tf.reduce_mean(tf.squared_difference(target, action_probs))
    return loss
I think the .numpy() call in the predict function makes the tape lose the reference to the weights. (I have not tested my answer.)
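For intuition, here is a minimal standalone sketch of the underlying issue: once a value leaves the tensor world as a NumPy array inside the tape, the tape has no recorded ops connecting the result back to the variables, so the gradient is None.

import tensorflow as tf

tf.enable_eager_execution()  # TF 1.x only; omit on TF 2.x, where eager is the default

x = tf.Variable(3.0)

# Everything stays a tensor: the tape records the op and can differentiate it.
with tf.GradientTape() as tape:
    y = x * x
print(tape.gradient(y, x))   # tf.Tensor(6.0, ...)

# The intermediate result is pulled out as a NumPy array: the chain is broken.
with tf.GradientTape() as tape:
    y = tf.convert_to_tensor((x * x).numpy())
print(tape.gradient(y, x))   # None

The same thing happens in the original update(): predict() returns .numpy()[0], so the q_values used in the loss are plain NumPy values, which is why tape.gradient returns only None.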
I am trying to implement the attention described in Luong et al. 2015 in PyTorch myself, but I couldn't get it to work. Below is my code; I am only interested in the "general" attention case for now. I wonder if I am missing any obvious error. It runs, but doesn't seem to learn.
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(
            num_embeddings=self.output_size,
            embedding_dim=self.hidden_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size, self.hidden_size)
        # hc: [hidden, context]
        self.Whc = nn.Linear(self.hidden_size * 2, self.hidden_size)
        # s: softmax
        self.Ws = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        gru_out, hidden = self.gru(embedded, hidden)

        # [0] removes the directions x layers dimension for now
        attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
        attn_weights = F.softmax(attn_prod, dim=1)  # eq. 7/8
        context = torch.mm(attn_weights, encoder_outputs)

        # hc: [hidden, context]
        out_hc = F.tanh(self.Whc(torch.cat([hidden[0], context], dim=1)))  # eq. 5
        output = F.log_softmax(self.Ws(out_hc), dim=1)  # eq. 6

        return output, hidden, attn_weights
I have studied the attention implemented in
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
and
https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
The first one isn't the exact attention mechanism I am looking for. A major disadvantage is that its attention depends on the sequence length (self.attn = nn.Linear(self.hidden_size * 2, self.max_length)), which could be expensive for long sequences.
The second one is more similar to what's described in the paper, but it is still not the same, as there is no tanh. Besides, it is really slow after updating it to the latest version of PyTorch (ref). Also, I don't know why it takes the last context (ref).
This version works, and it follows the definition of Luong attention (general) closely. The main difference from the code in the question is the separation of embedding_size and hidden_size, which, after some experimentation, appears to be important for training. Previously I made both of them the same size (256), which caused trouble for learning; the network seemed able to learn only half the sequence.
class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size,
                 num_layers=1, bidirectional=False, batch_size=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.batch_size = batch_size

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers,
                          bidirectional=bidirectional)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        directions = 2 if self.bidirectional else 1
        return torch.zeros(
            self.num_layers * directions,
            self.batch_size,
            self.hidden_size,
            device=DEVICE
        )
class AttnDecoderRNN(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, dropout_p=0):
        super(AttnDecoderRNN, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(
            num_embeddings=output_size,
            embedding_dim=embedding_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(embedding_size, hidden_size)
        self.attn = nn.Linear(hidden_size, hidden_size)
        # hc: [hidden, context]
        self.Whc = nn.Linear(hidden_size * 2, hidden_size)
        # s: softmax
        self.Ws = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        gru_out, hidden = self.gru(embedded, hidden)

        attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
        attn_weights = F.softmax(attn_prod, dim=1)
        context = torch.mm(attn_weights, encoder_outputs)

        # hc: [hidden, context]
        hc = torch.cat([hidden[0], context], dim=1)
        out_hc = F.tanh(self.Whc(hc))
        output = F.log_softmax(self.Ws(out_hc), dim=1)

        return output, hidden, attn_weights
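As a quick sanity check on the shapes for a single decoding step, the following sketch (sizes are arbitrary, and DEVICE plus the nn/F imports are assumed to be set up as for the classes above) runs the decoder once and prints what comes out:

import torch

EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, SEQ_LEN = 128, 256, 1000, 12

decoder = AttnDecoderRNN(EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
token = torch.tensor([[0]])                          # current target token id, shape (1, 1)
hidden = torch.zeros(1, 1, HIDDEN_SIZE)              # (layers, batch, hidden)
encoder_outputs = torch.zeros(SEQ_LEN, HIDDEN_SIZE)  # one encoder state per source position

output, hidden, attn_weights = decoder(token, hidden, encoder_outputs)
print(output.shape)        # torch.Size([1, 1000]) -- log-probs over the target vocabulary
print(attn_weights.shape)  # torch.Size([1, 12])   -- one weight per source position

Because the score is computed as hidden @ W_a @ encoder_outputs.T, the attention weights always have one entry per source position, regardless of sequence length, which is exactly the property the first tutorial's fixed-size attn layer lacks.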