DDPG not converging for VRep maze environment - pytorch

I'm sorry for the lengthy post, just wanted to provide with implementation details in advance.
Also, sorry for the code, I know it's a mess.
I've been using a combination of PyTorch and VRep to setup environment for the differential drive robot to learn maneuvering the maze. The robot only has 6 IC proximity sensors along its rim that measure distances from 10-80cm. As a first simple version, I've created VRep environment containing just a narrow corridor (width of the corridor is ~2*robot_width). Starting position for the robot is somewhere in the middle of the corridor, with the goal point being somewhere at the end of the corridor, with goal velocities being ~0 m/s. Idea is that my learned agent handles both navigation and low level robot control at the same time.
Now, I've been using preexisting DDPG implementation which works on Gym environments, but it does not seem to converge for my case. I thought that using only 6 proximity sensors is the part of the problem, so I've introduced to the state vector position and velocity readings, together with errors for the desired state (as some papers suggest).
I would appreciate any help.
ddpg_agent
class Agent():
"""Interacts with and learns from the environment."""
def __init__(self, state_size, action_size, random_seed):
"""Initialize an Agent object.
Params
======
state_size (int): dimension of each state
action_size (int): dimension of each action
random_seed (int): random seed
"""
self.state_size = state_size
self.action_size = action_size
self.seed = random.seed(random_seed)
# Actor Network (w/ Target Network)
self.actor_local = Actor(state_size, action_size, random_seed).to(device)
self.actor_target = Actor(state_size, action_size, random_seed).to(device)
self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
# Critic Network (w/ Target Network)
self.critic_local = Critic(state_size, action_size, random_seed).to(device)
self.critic_target = Critic(state_size, action_size, random_seed).to(device)
self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
# Noise process
self.noise = OUNoise(action_size, random_seed)
# Replay memory
self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def step(self, state, action, reward, next_state, done):
"""Save experience in replay memory, and use random sample from buffer to learn."""
# Save experience / reward
self.memory.add(state, action, reward, next_state, done)
def act(self, state, add_noise=True):
"""Returns actions for given state as per current policy."""
state = torch.from_numpy(state).float().to(device)
self.actor_local.eval()
with torch.no_grad():
action = self.actor_local(state).cpu().data.numpy()
self.actor_local.train()
if add_noise:
action += self.noise.sample()
return np.clip(action, -2, 2)
def reset(self):
self.noise.reset()
def start_learn(self):
if len(self.memory) > MIN_BUFFER_SIZE:
experiences = self.memory.sample()
self.learn(experiences, GAMMA)
def learn(self, experiences, gamma):
"""Update policy and value parameters using given batch of experience tuples.
Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
where:
actor_target(state) -> action
critic_target(state, action) -> Q-value
Params
======
experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
gamma (float): discount factor
"""
states, actions, rewards, next_states, dones = experiences
# ---------------------------- update critic ---------------------------- #
# Get predicted next-state actions and Q values from target models
actions_next = self.actor_target(next_states)
Q_targets_next = self.critic_target(next_states, actions_next)
# Compute Q targets for current states (y_i)
Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)).detach()
# Compute critic loss
Q_expected = self.critic_local(states, actions)
critic_loss = F.mse_loss(Q_expected, Q_targets)
# Minimize the loss
self.critic_optimizer.zero_grad()
critic_loss.backward()
# torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
self.critic_optimizer.step()
# ---------------------------- update actor ---------------------------- #
# Compute actor loss
actions_pred = self.actor_local(states)
actor_loss = -self.critic_local(states, actions_pred).mean()
# Minimize the loss
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# ----------------------- update target networks ----------------------- #
self.soft_update(self.critic_local, self.critic_target, TAU)
self.soft_update(self.actor_local, self.actor_target, TAU)
def soft_update(self, local_model, target_model, tau):
"""Soft update model parameters.
θ_target = τ*θ_local + (1 - τ)*θ_target
Params
======
local_model: PyTorch model (weights will be copied from)
target_model: PyTorch model (weights will be copied to)
tau (float): interpolation parameter
"""
for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
def save_samples(self):
fileSamples = open('samples.obj', 'w')
pickle.dump(self.memory, fileSamples)
def load_samples(self):
fileSamples = open('samples.obj', 'r')
return pickle.load(fileSamples)
class OUNoise:
"""Ornstein-Uhlenbeck process."""
def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
"""Initialize parameters and noise process."""
self.mu = mu * np.ones(size)
self.theta = theta
self.sigma = sigma
self.seed = random.seed(seed)
self.reset()
def reset(self):
"""Reset the internal state (= noise) to mean (mu)."""
self.state = copy.copy(self.mu)
def sample(self):
"""Update internal state and return it as a noise sample."""
x = self.state
dx = self.theta * (self.mu - x) + self.sigma * np.array([np.random.randn() for i in range(len(x))])
self.state = x + dx
return self.state
class ReplayBuffer:
"""Fixed-size buffer to store experience tuples."""
def __init__(self, action_size, buffer_size, batch_size, seed):
"""Initialize a ReplayBuffer object.
Params
======
buffer_size (int): maximum size of buffer
batch_size (int): size of each training batch
"""
self.action_size = action_size
self.memory = deque(maxlen=buffer_size) # internal memory (deque)
self.batch_size = batch_size
self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
self.seed = random.seed(seed)
def add(self, state, action, reward, next_state, done):
"""Add a new experience to memory."""
e = self.experience(state, action, reward, next_state, done)
self.memory.append(e)
def sample(self):
"""Randomly sample a batch of experiences from memory."""
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
return (states, actions, rewards, next_states, dones)
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
Model:
class Actor(nn.Module):
"""Actor (Policy) Model."""
def __init__(self, state_size, action_size, seed, fc1_units=600, fc2_units=400, fc3_units=300):
"""Initialize parameters and build model.
Params
======
state_size (int): Dimension of each state
action_size (int): Dimension of each action
seed (int): Random seed
fc1_units (int): Number of nodes in first hidden layer
fc2_units (int): Number of nodes in second hidden layer
"""
super(Actor, self).__init__()
self.seed = torch.manual_seed(seed)
self.fc1 = nn.Linear(state_size, fc1_units)
self.bn1 = nn.BatchNorm1d(fc1_units)
self.fc2 = nn.Linear(fc1_units, fc2_units)
self.fc3 = nn.Linear(fc2_units, fc3_units)
self.fc4 = nn.Linear(fc3_units, action_size)
self.reset_parameters()
def reset_parameters(self):
self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
self.fc4.weight.data.uniform_(-3e-3, 3e-3)
def forward(self, state):
"""Build an actor (policy) network that maps states -> actions."""
# x = F.relu(self.bn1(self.fc1(state.unsqueeze(0))))
x = F.relu(self.fc1(state))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
return torch.tanh(self.fc4(x))
class Critic(nn.Module):
"""Critic (Value) Model."""
def __init__(self, state_size, action_size, seed, fcs1_units=600, fc2_units=400, fc3_units=300):
"""Initialize parameters and build model.
Params
======
state_size (int): Dimension of each state
action_size (int): Dimension of each action
seed (int): Random seed
fcs1_units (int): Number of nodes in the first hidden layer
fc2_units (int): Number of nodes in the second hidden layer
"""
super(Critic, self).__init__()
self.seed = torch.manual_seed(seed)
self.fcs1 = nn.Linear(state_size, fcs1_units)
self.bn1 = nn.BatchNorm1d(fcs1_units)
self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units)
self.fc3 = nn.Linear(fc2_units, fc3_units)
self.fc4 = nn.Linear(fc3_units, 1)
self.reset_parameters()
def reset_parameters(self):
self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
self.fc4.weight.data.uniform_(-3e-3, 3e-3)
def forward(self, state, action):
"""Build a critic (value) network that maps (state, action) pairs -> Q-values."""
# xs = F.relu(self.bn1(self.fcs1(state.unsqueeze(0))))
xs = F.relu(self.fcs1(state))
x = torch.cat((xs, action), dim=1)
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
return self.fc4(x)
Main:
def main():
vrepHeadlessMode = True
state_dim = 18 # x, y, yaw, vx, vy, v_yaw, e_x, e_y, e_yaw, e_vx, e_vy, e_v_yaw, prox 0 ... prox5
action_dim = 2
action_space = np.array([[-2, 2], [-2, 2]])
action_lim = [-2.0, 2.0] # 2 o/sec is the max angular speed of each motor, max. linear velocity is 0.5 m/s
learn_every = 1 # number of steps after which the network update occurs [20]
num_learn = 1 # number of network updates done in a row [10]
episodes = 10000
steps = 500
desiredState = [-1.4, 0.3, -np.pi, 0.0, 0.0, 0.0] # x, y, yawAngle, vx, vy, yawVelocity
mobRob = MobRob(['MobRob'],
['leftMotor', 'rightMotor'],
['proximitySensor0', 'proximitySensor1', 'proximitySensor2', 'proximitySensor3', 'proximitySensor4',
'proximitySensor5'])
env = LabEnv(mobRob, vrepHeadlessMode)
random_seed = 7
mobRob = Agent(state_dim, action_dim, random_seed)
total_num_of_steps = 0
actions = np.zeros((episodes, steps+1, action_dim), dtype=np.float)
total_rewards = []
save_rewards = []
durations = []
for episode in range(episodes):
cur_state = env.restart(desiredState)
mobRob.reset()
start_time = time.time()
reason = ''
episode_rewards = []
for step in range(steps+1):
total_num_of_steps += 1
action = mobRob.act(cur_state)
actions[episode][step] = action
# print(action)
new_state, reward, done = env.step(action, desiredState)
mobRob.step(cur_state, action, reward, new_state, done)
cur_state = new_state
episode_rewards.append(reward)
if step % learn_every == 0:
for _ in range(num_learn):
mobRob.start_learn()
if step < steps and done and ~env.collision:
reason = 'COMPLETED'
break
if step == steps: # time budget for episode was overstepped
reason = 'TIMEOUT '
break
if env.collision:
reason = 'COLLISION'
break
mean_score = np.mean(episode_rewards)
min_score = np.min(episode_rewards)
max_score = np.max(episode_rewards)
total_rewards.append(mean_score)
duration = time.time() - start_time
durations.append(duration)
save_rewards.append([total_rewards[episode], episode])
eta = np.mean(durations)*(episodes-episode) / 60 / 60
if eta < 1.0:
etaString = str(np.round(eta * 60, 2)) + " min"
else:
etaString = str(np.round(eta, 2)) + " h"
print(
'\rEpisode {}\t{}\tMean episode reward: {:.2f}\tMin: {:.2f}\tMax: {:.2f}\tDuration: {:.2f}\tETA: {}'
.format(episode, reason, mean_score, min_score, max_score, duration, etaString))
gc.collect()
torch.save(mobRob.actor_local.state_dict(), './actor.pth')
torch.save(mobRob.critic_local.state_dict(), './critic.pth')
np.save('mean_episode_rewards', save_rewards)
if __name__ == "__main__":
main()
Typical output
Episode 3179 TIMEOUT Mean episode reward: 0.36 Min: 0.00 Max: 0.69 Duration: 36.86 ETA: 134.16 h
Episode 3180 TIMEOUT Mean episode reward: 0.22 Min: 0.00 Max: 0.72 Duration: 37.39 ETA: 134.12 h
Episode 3181 COLLISION Mean episode reward: 0.26 Min: -49.50 Max: 0.54 Duration: 29.11 ETA: 134.08 h
Episode 3182 COLLISION Mean episode reward: -0.39 Min: -50.00 Max: 0.21 Duration: 9.50 ETA: 134.02 h
Episode 3183 COLLISION Mean episode reward: -0.06 Min: -50.00 Max: 0.32 Duration: 27.10 ETA: 133.98 h
Episode 3184 COLLISION Mean episode reward: 0.38 Min: -49.32 Max: 0.69 Duration: 37.90 ETA: 133.94 h
Episode 3185 COLLISION Mean episode reward: -0.52 Min: -50.00 Max: 0.21 Duration: 7.28 ETA: 133.88 h
Position reached!
Speed reached!
Episode 3186 COMPLETED Mean episode reward: 0.39 Min: 0.00 Max: 80.72 Duration: 37.73 ETA: 133.84 h
Episode 3187 COLLISION Mean episode reward: 0.36 Min: -49.36 Max: 0.68 Duration: 34.68 ETA: 133.8 h
Episode 3188 COLLISION Mean episode reward: 0.32 Min: -49.43 Max: 0.62 Duration: 35.35 ETA: 133.76 h
Episode 3189 COLLISION Mean episode reward: -0.23 Min: -50.00 Max: 0.21 Duration: 16.59 ETA: 133.71 h
Episode 3190 TIMEOUT Mean episode reward: 0.39 Min: 0.00 Max: 0.65 Duration: 38.15 ETA: 133.67 h
Episode 3191 TIMEOUT Mean episode reward: 0.35 Min: 0.00 Max: 0.67 Duration: 37.76 ETA: 133.63 h
Position reached!
Episode 3192 COMPLETED Mean episode reward: 0.41 Min: 0.00 Max: 30.71 Duration: 36.36 ETA: 133.59 h
Episode 3193 TIMEOUT Mean episode reward: 0.35 Min: 0.00 Max: 0.72 Duration: 36.98 ETA: 133.55 h
Even after a high number of episodes, I cannot see any apparent rise in rewards.

Related

multi-agent DQN learn single model for all agents

I'm trying to run a DQN for a multi-agent system, so there is one DNN for each agent.
It takes input=state [batch, state size, #time steps, #nodes], while for simplicity we assume #time steps=1. #nodes is number of agents. And output=Q-values for each agent.
The problem is that I test various stuff with this network, but it return not so consistent results. I suspect it has to do with me running separately DQN for each agent, but learning it via the same model. I sum the losses for all agents into one loss, and then it divide by their amount.
I'm not sure it is correct. I'd be grateful for any help.
Here's my code:
class DQN(nn.Module):
def __init__(self, args): #node_size, inputs, outputs, layers=[128, 64, 16]):
# state_size, n_actions = inputs, outputs
super(DQN, self).__init__()
self.model_type = args.model_type
if args.model_type == "seperate_state_DNN":
out_size = args.num_of_actions
self.shared_model = nn.Sequential()
h_sizes = [args.input_state_size] + args.layers
for k in range(len(h_sizes) - 1):
self.shared_model.add_module('k1'+str(k), nn.Linear(h_sizes[k], h_sizes[k + 1]))
self.shared_model.add_module('k2'+str(k), args.activations[args.layers_nl[k]])
self.shared_model.add_module('final', nn.Linear(h_sizes[-1], out_size))
def forward(self, input, i=None):
# input state dimension: [batch, state size, #time steps, #nodes]
if self.model_type == "seperate_state_DNN":
if i is None:
final_output = torch.zeros_like(input)
else:
final_output = self.shared_model(input) # [:, :, :, i].unsqueeze(3))
return final_output
And here is the calling function:
def select_action(self, state, edge_state):
#self.policy_net.eval()
sample = random.random()
if self.configuration == 2:
self.eps_threshold = 0.0 # no exploration at all, only optimal values!
else:
self.eps_threshold = self.decay_functionn()
self.steps_done += 1
if sample > self.eps_threshold:
self.last_exploration = False
with torch.no_grad():
# t.max(1) will return largest column value of each row.
# second column on max result is index of where max element was
# found, so we pick action with the larger expected reward.
state = state.to(self.device)# torch.from_numpy(state).float().to(self.device) # Convert to tensor.
state = state.unsqueeze(0) # Add batch dimension (also to action below): [batch=1, #time steps, #nodes, state size]
final_output = []
x1 = self.policy_net(state, None)#.detach()
for i in range(self.node_size):
final_output.append(self.policy_net(x1[:, :, -1, i]+state[:, :, -1, i], i).max(1)[1].detach().cpu().view(state.shape[0], -1))
# .to(self.device) # action dimension: [batch=1, #nodes]
return torch.cat(final_output, dim=1)
else:
self.last_exploration = True
return torch.randint(0, self.n_actions, (1, self.node_size))
And this is the main RL training loop:
for epi in range(self.episodes):
print("### Starting Episode: ", epi, ' ### in index=', self.run_index)
state = env.reset(self, heatup=self.sim_heatup) # single step state
done = False
while not done:
action = agent.select_action(state) # .to(device)
next_state1, reward, done = env.do_step(action)
agent.add_to_memory(state, action, next_state, reward)
agent.optimize_model()
state = next_state
agent.curr_episode += 1
# Plot and dump statistics and learning curves.
agent.dump_data_on_episode_end(plot=True)
env.capture_episode()
env.close()
Finally, this is the optimization, executed in "agent.optimize_model()" above, including the functions it uses:
def optimize_model(self):
if len(self.memory) < self.batch_size:
return
transitions = self.memory.sample(self.batch_size)
# This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = Transition(*zip(*transitions))
next_states_batch = torch.stack(batch.next_state).to(self.device)
state_batch = torch.stack(batch.state).to(self.device)
action_batch = torch.cat(batch.action).view(self.batch_size, -1).to(self.device) #torch.stack(batch.action, dim=0).to(self.device)
reward_batch = torch.cat(batch.reward).view(self.batch_size, -1).to(self.device)
# dims: states=[batch, steps, nodes, state size]; action=[batch, nodes]; reward=[batch, nodes]
loss = torch.tensor(0., device=self.device)
self.policy_net.train() # IM NOT SURE IF IT SHOULD BE HERE...
x1 = self.policy_net(state_batch, None)
x2 = self.policy_net(next_states_batch, None)
for i in range(self.node_size):
action_batch1 = action_batch[:,i].unsqueeze(1).reshape(-1, 1) # action=[batchXnodes, 1]
reward_batch1 = reward_batch[:,i].unsqueeze(1).view(-1, 1) # reward=[batchXnodes, 1]
# Compute loss
loss += self._compute_loss(i, x1[:, :, -1, i]+state_batch[:, :, -1, i], edge_state_batch, action_batch1,
x2[:, :, -1, i]+next_states_batch[:, :, -1, i], next_edge_state_batch, reward_batch1)
# Optimize the model
loss.div_(self.node_size)
self.optimizer.zero_grad()
loss.backward()
# clip grad
if self.grad_clip is not None:
for param in self.policy_net.parameters():
param.grad.data.clamp_(-self.grad_clip, self.grad_clip)
# update Policy net weights
self.optimizer.step()
#del loss
self.losses.append(loss.detach().cpu().numpy())
# update Target net weights
self._update_target()
def _compute_loss(self, i, state_batch, edge_state_batch, action_batch, next_states_batch, next_edge_state_batch, reward_batch):
# Q{policy net}(s, a): [batchXnodes, actions] ---gather---> [batchXnodes, 1=q_values according to this policy]
state_action_q_values = self.policy_net(state_batch, i).gather(1, action_batch)
# argmax{a} Q{policy net}(s', a'): [batchXnodes, actions] ---argmax---> [batchXnodes] ---unsqueeze---> [batchXnodes, 1]
next_state_actions = torch.argmax(self.policy_net(next_states_batch, i), dim=1).unsqueeze(1)
# Q{ploicy net}(s', argmax{a} Q{target net}(s', a') ): [batchXnodes, actions] --gather--> [batchXnodes, 1=q_values according to this policy]
next_state_q_values = self.target_net(next_states_batch, i).gather(1, next_state_actions)
# Q* = Disount * Q(s', argmax(..)) + R: [batchXnodes, 1]
expected_state_action_values = (next_state_q_values.detach() * self.discount) + reward_batch
loss = F.smooth_l1_loss(state_action_q_values, expected_state_action_values)
return loss
def _update_target(self):
if self.target_net is None:
# There is nothing to update.
return
# Update the target network, copying all weights and biases in DQN
if self.target_update > 1:
# Hard copy of weights.
if self.steps_done % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
return
elif self.target_update < 1 and self.target_update > 0:
# polyak averaging:
tau = self.target_update
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(tau * param + (1 - tau) * target_param)
return
else:
raise NotImplementedError
Sorry for the large question, I just wanted to supply all the necessary information.
If more information is needed I'd be happy to give it.
Any suggestion is much appreciated.
Thanks,
Shimon

DQN not converging

I am trying to implement DQN in openai-gym's "lunar lander" environment.
It shows no sign of converging after 3000 episodes for training. (for comparison, a very simple policy gradient method converges after 2000 episodes)
I went through my code for several times but can't find where's wrong. I hope if someone here can point out where the problem is. Below is my code:
I use a simple fully-connected network:
class Net(nn.Module):
def __init__(self) -> None:
super().__init__()
self.main = nn.Sequential(
nn.Linear(8, 16),
nn.ReLU(),
nn.Linear(16, 16),
nn.ReLU(),
nn.Linear(16, 4)
)
def forward(self, state):
return self.main(state)
I use epsilon greedy when choosing actions, and the epsilon(start from 0.5) decreases exponentially overtime:
def sample_action(self, state):
self.epsilon = self.epsilon * 0.99
action_probs = self.network_train(state)
random_number = random.random()
if random_number < (1-self.epsilon):
action = torch.argmax(action_probs, dim=-1).item()
else:
action = random.choice([0, 1, 2, 3])
return action
When training, I use a replay buffer, batch size of 64, and gradient clipping:
def learn(self):
if len(self.buffer) >= BATCH_SIZE:
self.learn_counter += 1
transitions = self.buffer.sample(BATCH_SIZE)
batch = Transition(*zip(*transitions))
state = torch.from_numpy(np.concatenate(batch.state)).reshape(-1, 8)
action = torch.tensor(batch.action).reshape(-1, 1)
reward = torch.tensor(batch.reward).reshape(-1, 1)
state_value = self.network_train(state).gather(1, action)
next_state = torch.from_numpy(np.concatenate(batch.next_state)).reshape(-1, 8)
next_state_value = self.network_target(next_state).max(1)[0].reshape(-1, 1).detach()
loss = F.mse_loss(state_value.float(), (self.DISCOUNT_FACTOR*next_state_value + reward).float())
self.optim.zero_grad()
loss.backward()
for param in self.network_train.parameters():
param.grad.data.clamp_(-1, 1)
self.optim.step()
I also use a target network, its parameters are updated every 100 timesteps:
def update_network_target(self):
if (self.learn_counter % 100) == 0:
self.network_target.load_state_dict(self.network_train.state_dict())
BTW, I use a Adam optimizer and LR of 1e-3.
Solved. Apparently the freq of updating target network is too high. I set it to every 10 episodes and fixed the problem.

Problem with reinforcement learning in keras not learning

My name is Andy and I am new to stackoverflow and this is my first question.
I started learning python 40ish days ago thanks to covid19 and jumped into machine learning/qlearning about 3 weeks ago and got stuck there since.
Goal:
have the computer play Rad Racer 2 (NES racing game) using reinforcement learning.
Plans to make this work:
after various tutorials/sites, I decided to use a double network to train/learn.
2x 256 convolution network using keras since I have watched a few tutorial vids on keras basic
3 actions(hold down accelerate(J), accelerate Left(JA), accelerate Right(JD)
I am using directinput keys codes I found online to send inputs to game as sending regular keys does not work.
I know ppl uses retro gym for these type of games but I wanted to see the inner working of reward/observation and such so I used yolov5 to detect lines/objects. Based on the result from yolov5, I calculate the reward for the step.
My input is a series of grayscale images(4) to represent motion using deque then stacked with numpy.
Once I have gather enough experiences/replay memory(1500) I started the training at the end of each of episode instead of each step. I found that it lag out a lot training after each step.
Problem:
My biggest problem currently is the model does not seem to learn properly. I seem to be slightly okay around episode 20-30 then after that it get worst and worst. It get to a point where it only does one action for hours.
I have tried playing around with the learning rate(0.1 - 0.00001), different inputs(1 bgr layer, grayscale layer, 4 layer..etc), different epsilon decay rate. I commented most of the reward stuffs, only basic reward for now.
most codes beside the yolo stuffs, had to removed a few lines due to # character limitation
# parameters
training = True
learning_rate = 0.0001
DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000 # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1500 # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 1000 # How many steps (samples) to use for training
batch_size = 32
UPDATE_TARGET_EVERY = 0 # Terminal states (end of episodes)
MODEL_NAME = 'RC'
MIN_REWARD = 0 # For model save
save_every = 5 # save every x episodes
EPISODES = 2_000
# Exploration settings
if training is True:
epsilon = 1 # not a constant, going to be decayed
else:
epsilon = 0
MIN_EPSILON = 0.01
START_EPISODE_DECAY = 0
END_EPISODE_DECAY = 20
if epsilon > MIN_EPSILON:
EPS_DECAY = -(epsilon/((END_EPISODE_DECAY-START_EPISODE_DECAY)/epsilon))
else:
EPS_DECAY = 0
# Agent class
class DQNAgent:
def __init__(self):
# Main model
self.model = self.create_model()
# self.model = self.load_model()
# Target network
self.target_model = self.create_model()
self.target_model.set_weights(self.model.get_weights())
# An array with last n steps for training
self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
# Used to count when to update target network with main network's weights
self.target_update_counter = 0
def create_model(self):
dropout = 0.1
model = Sequential()
model.add(Conv2D(256, (2, 2), input_shape=(int(height/resize_ratio), int(width/resize_ratio), img_channels)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(dropout))
model.add(Conv2D(256, (2, 2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(dropout))
model.add(Flatten())
model.add(Dense(64))
model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear')) # ACTION_SPACE_SIZE = how many choices (9)
model.compile(loss="mse", optimizer=Adam(lr=learning_rate), metrics=['accuracy'])
return model
# Trains main network at end of episode
def train(self, terminal_state):
# Start training only if certain number of samples is already saved
if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
return
minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
current_states = np.array([transition[0] for transition in minibatch])
# from (MINIBATCH_SIZE, 1, h, w, 4) > (MINIBATCH_SIZE, h, w, 4)
current_states = current_states.reshape(current_states.shape[0], current_states.shape[2],
current_states.shape[3], current_states.shape[4])
current_qs_list = self.model.predict(current_states)
new_current_states = np.array([transition[3] for transition in minibatch])
new_current_states = new_current_states.reshape(new_current_states.shape[0], new_current_states.shape[2],
new_current_states.shape[3], new_current_states.shape[4])
# new_current_states = np.expand_dims(new_current_states, axis=-1)
future_qs_list = self.target_model.predict(new_current_states)
X = []
y = []
for index, (current_state_img, current_action, current_reward, new_current_img, current_done) in enumerate(minibatch):
if not current_done:
max_future_q = np.max(future_qs_list[index])
new_q = current_reward + (DISCOUNT * max_future_q)
else:
new_q = 0.0
current_qs = current_qs_list[index]
current_qs[current_action] = new_q
X.append(np.squeeze(current_state_img, axis=0))
y.append(current_qs)
X = np.array(X)
# X = np.expand_dims(X, axis=-1)
# X = X.reshape(X.shape[0], X.shape[2], X.shape[3], X.shape[4])
y = np.array(y)
self.model.fit(X, y, batch_size=batch_size, verbose=0, shuffle=False)
# self.model.train_on_batch(X, y)
if terminal_state:
self.target_update_counter += 1
# If counter reaches set value, update target network with weights of main network
if self.target_update_counter > UPDATE_TARGET_EVERY:
self.target_model.set_weights(self.model.get_weights())
self.target_update_counter = 0
print('target_model trained!')
# Queries main network for Q values given current observation space (environment state)
def get_qs(self, state):
result = agent.model.predict(state)
result = result[0]
return result
agent = DQNAgent()
current_img_stack = deque(maxlen=4)
# make the game active
game = gw.getWindowsWithTitle('Mesen')[0]
game.activate()
time.sleep(1)
release_all()
# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
episode_reward = 0
step = 1
if episode <= START_EPISODE_DECAY - 1:
start_epsilon = False
elif episode >= END_EPISODE_DECAY + 1:
start_epsilon = False
else:
start_epsilon = True
# Reset environment and get initial state
# blackscreens followed by the 1st screen starting out
current_state = env.reset()
blackscreen = np.zeros_like(current_state)
current_img_stack.append(blackscreen)
current_img_stack.append(blackscreen)
current_img_stack.append(blackscreen)
current_img_stack.append(current_state)
stacked_state = np.stack(current_img_stack, axis=2)
stacked_state = np.ascontiguousarray(stacked_state, dtype=np.float32) / 255
stacked_state = np.transpose(stacked_state, (1, 0, 2))
stacked_state = np.expand_dims(stacked_state, axis=0)
start_time = time.time()
# Reset flag and start iterating until episode ends
done = False
while not done:
if np.random.random() > epsilon:
action = np.argmax(agent.get_qs(stacked_state))
else:
action = np.random.randint(0, env.ACTION_SPACE_SIZE)
new_state, reward, done, prediction, preview = env.step(action)
if done is False:
next_img_stack = current_img_stack
next_img_stack.append(new_state)
next_stack = np.stack(next_img_stack, axis=2)
next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
next_stack = np.transpose(next_stack, (1, 0, 2))
next_stack = np.expand_dims(next_stack, axis=0)
# current_state = new_state
current_img_stack = next_img_stack
stacked_state = next_stack
else:
next_img_stack = current_img_stack
next_img_stack.append(blackscreen)
next_stack = np.stack(next_img_stack, axis=2)
next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
next_stack = np.transpose(next_stack, (1, 0, 2))
next_stack = np.expand_dims(next_stack, axis=0)
step += 1
episode_reward += reward
ep_rewards.append(episode_reward)
if SHOW_PREVIEW:
env.render(preview, prediction)
if training is True:
agent.update_replay_memory((stacked_state, action, reward, next_stack, done))
# print(episode_reward)
if done is True:
ep_reward_final.append(episode_reward)
print(' Epsilon(' + str(epsilon) + ') EPtimes(' + str(time.time() - start_time) + ') done('
+ str(done) + ') step(' + str(step) + ') EPreward(' + str(episode_reward) +
') best_reward_this_session(' + str(max(ep_reward_final)) + ') fps(' +
str(step/(time.time() - start_time)) + ')')
# plot(ep_reward_final)
if training is True:
agent.train(done)
# Decay epsilon
if show_info is False and epsilon <= MIN_EPSILON:
print(f"\nEPS_DECAY ended on episode {episode} - epsilon {epsilon}")
epsilon = MIN_EPSILON
show_info = True
elif start_epsilon is True:
epsilon += EPS_DECAY

RuntimeError: the derivative for 'indices' is not implemented

I am following this online tutorial for coding a DQN,https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/DeepQLearning/torch_deep_q_model.py
, however I am running into this Runtime Error that I am unsure of how to debug or modify to prevent this error. Thanks!
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-196-00975d66fd2d> in <module>
28 agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
29 obs= obs_
---> 30 agent.learn(batch_size)
31 lastAction = action
32 scores.append(score)
<ipython-input-191-f6b163cc3a8a> in learn(self, batch_size)
72 Qtarget = Qpred.clone()
73 print(Qnext[1])
---> 74 Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])
75 # epsilon decay action
76 if self.steps > 2000:
RuntimeError: the derivative for 'indices' is not implemented
These are my code blocks in my jupyter notebook
class DeepQNetwork(nn.Module):
def __init__(self,Alpha):
super(DeepQNetwork,self).__init__()
self.conv1 = nn.Conv2d(1,32,8,stride=4, padding=1)
self.conv2 = nn.Conv2d(32,64,4,stride=2)
self.conv3 = nn.Conv2d(64,128,3)
self.fc1 = nn.Linear(128* 21* 12,512)
self.fc2 = nn.Linear(512,6)
self.optimizer = optim.RMSprop(self.parameters(), lr = Alpha)
self.loss = nn.MSELoss()
self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
def forward(self,obs):
'''Passing in a sequence of arrays'''
obs = torch.Tensor(obs).to(self.device) # send to the GPU
''' Feed forward the Network Parameters'''
obs = obs.view(-1, 1,200,125)
#print(obs.shape)
obs = F.relu(self.conv1(obs))
#print(obs.shape)
obs = F.relu(self.conv2(obs))
#print(obs.shape)
obs = F.relu(self.conv3(obs))
#print(obs.shape)
obs = obs.view(-1,128* 21* 12)
obs = F.relu(self.fc1(obs))
# 4 Rows and 6 columns
actions = self.fc2(obs)
return actions
This is the Agent Code, and it contains the error causing line of code
class DQNAgent(object):
def __init__(self, gamma, epsilon, alpha, maxMemory,
epsEnd = 0.05, replace =10000, actionSpace = [0,1,2,3,4,5]):
'''
Gamma -> discount factor of valuing current reward over future reward
Epsilon -> for trade off between exploration-exploitation
alpha -> learn rate
maxMemory -> max size of Memory buffer
epsEnd -> smallest value of Exploration
repace -> how often to replace target network
'''
self.GAMMA = gamma
self.EPSILON = epsilon
self.EPS_END = epsEnd
self.actionSpace = actionSpace
self.maxMemory = maxMemory
self.steps = 0
self.learn_step_counter = 0
self.memory = []
self.memCount = 0
self.replace_tgt_count = replace
self.Q_eval = DeepQNetwork(alpha)
self.Q_next = DeepQNetwork(alpha)
def storeTransition(self, state, action, reward, state_):
'''Stores Transition states'''
if self.memCount < self.maxMemory:
self.memory.append([state,action,reward,state_])
else:
self.memory[self.memCount%self.maxMemory] = [state,action,reward,state_]
self.memCount +=1
def chooseAction(self,obs):
'''
Exploration if np.random > epsilon
else take epsilon greedy action
'''
rand = np.random.random()
# Get the value for all actions for the current set of states
# Forward pass the stack of frames to get value of each action given subset of staes in obs
actions = self.Q_eval.forward(obs)
if rand<1-self.EPSILON:
action = torch.argmax(actions[1]).item()
else:
action = np.random.choice(self.actionSpace)
self.steps += 1
return action
def learn(self, batch_size):
self.Q_eval.optimizer.zero_grad()
#0 gradient to do batch optimisation
if self.replace_tgt_count is not None and self.learn_step_counter % self.replace_tgt_count==0:
self.Q_next.load_state_dict(self.Q_eval.state_dict())
# memory subsampling
if self.memCount + batch_size < self.maxMemory:
memStart = int(np.random.choice(range(self.memCount)))
else:
memStart = int(np.random.choice(range(self.maxMemory-batch_size-1)))
miniBatch = self.memory[memStart:memStart+batch_size]
memory = np.array(miniBatch)
#feed forward current state and successor state conv to list as memory is array of numpy objects
Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device)
Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device)
maxA = torch.argmax(Qnext,dim = 1).to(self.Q_eval.device)
#calculate rewards
rewards = torch.Tensor(list(memory[:,2])).to(self.Q_eval.device)
# loss for every action except max action to be 0
Qtarget = Qpred.clone()
print(Qnext.shape)
Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])# PROBLEMATIC LINE
# epsilon decay action
if self.steps > 2000:
if self.EPSILON-1e-4 >self.EPS_END:
self.EPSILON-= 1e-4
else:
self.EPSILON = self.EPS_END
loss = self.Q_eval.loss(Qtarget,Qpred).to(self.Q_eval.device)
loss.backward()
self.Q_eval.optimizer.step()
self.learn_step_counter +=1
env = gym.make("Invader-v0")
agent = DQNAgent(gamma=0.95,epsilon = 1.0,alpha = 0.003, maxMemory = 5000,replace = None)
while agent.memCount < agent.maxMemory:
obs = env.reset()
done = False
lives = 3
while not done:
action = env.action_space.sample()
obs_ , reward, done, info = env.step(action)
if done and info['lives']<lives:
lives = info['lives']
reward -= 200
agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
obs= obs_
initialised = True
scores = []
epsHistory = []
numGames = 50
batch_size = 16
for i in range(numGames):
print(f'starting game {i+1}, epsilon = {agent.EPSILON}')
epsHistory.append(agent.EPSILON)
done = False
obs = env.reset()
frames = [np.sum(obs)]
score = 0
lastAction = 0
lives = 3
while not done:
if len(frames) == 4:
action = agent.chooseAction(frames)
frames = []
else:
action = lastAction
obs_, reward, done, info = env.step(action)
score += score-reward
frames.append(preprocess(obs_))
if done and info['lives'] < lives:
reward -=200
agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
obs= obs_
agent.learn(batch_size)
lastAction = action
scores.append(score)
print('score: ', score)
x = [i+1 for i in range(numGames)]
You have to do use .detach() for :
Qnext = self.Q_next.forward(list(memory[:,3][:])).detach().to(self.Q_eval.device)

How do optimisers adapt to custom learning rates?

I'm having difficulty in understanding how an Optimiser uses custom learning rates defined by a LearningRateScheduler. Looking at the source for SGD, the parameters seem to be affected by its own parameters only (e.g., self.lr) only:
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
# momentum
shapes = [K.int_shape(p) for p in params]
moments = [K.zeros(shape) for shape in shapes]
self.weights = [self.iterations] + moments
for p, g, m in zip(params, grads, moments):
v = self.momentum * m - lr * g # velocity
self.updates.append(K.update(m, v))
if self.nesterov:
new_p = p + self.momentum * v - lr * g
else:
new_p = p + v
...
https://github.com/keras-team/keras/blob/master/keras/optimizers.py#L172
If fit_generator was called using:
def step_decay(epoch):
initial_lrate = 0.01
drop = 0.5
epochs_drop = 10.0
lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
return lrate
lr_scheduler = LearningRateScheduler(step_decay)
model.fit_generator(
...
callbacks=[lr_scheduler],
...)
is the original learning rate used for initialising the optimiser overridden somehow? It seems to be to a certain extent. I've tried the following printer:
class LearningRatePrinter(Callback):
def init(self):
super(LearningRatePrinter, self).init()
def on_epoch_begin(self, epoch, logs={}):
print('lr:', sess.run(optimizer.lr))
lr_printer = LearningRatePrinter()
and call fit_generator() with callbacks=[lr_scheduler, lr_printer,]. What I get is lr: 0.01 for each epoch. Sure, lr_printer prints the member variable lr, but how is the lr_scheduler custom learning rate value used at all? There's no sign that it's actually being used.

Resources