Solving MountainCar-v0 by predicting future velocity? Why does this work? - openai-gym

I solved the MountainCar-v0 OpenAI Gym challenge by predicting the future velocity that results from the current action, using the absolute value of velocity as the reward function.
My question is: why does this work?
My hypothesis is that this behaves similarly to Q-learning: we are effectively building a map/table from states and actions to their predicted results, and the action picker simply chooses the action with the highest predicted velocity. This works because velocity evolves roughly linearly, so picking the highest future velocity will very likely help you reach the next highest velocity.
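To make the hypothesis concrete, here is a minimal sketch (separate from the training code below) that plugs the published MountainCar-v0 dynamics (force 0.001, gravity 0.0025) into the same rule of picking the action with the largest next-step |velocity|. Under that rule the greedy choice reduces to the classic energy-pumping heuristic of pushing in the direction you are already moving:

import numpy as np

FORCE, GRAVITY = 0.001, 0.0025  # constants from the MountainCar-v0 source

def next_velocity(position, velocity, action):
    # action: 0 = push left, 1 = no push, 2 = push right
    return np.clip(velocity + (action - 1) * FORCE - np.cos(3 * position) * GRAVITY, -0.07, 0.07)

def greedy_velocity_action(position, velocity):
    # pick the action whose predicted next-step speed is largest
    return int(np.argmax([abs(next_velocity(position, velocity, a)) for a in range(3)]))

print(greedy_velocity_action(-0.9, -0.02))  # 0: moving left -> keep pushing left
print(greedy_velocity_action(-0.3, 0.02))   # 2: moving right -> keep pushing right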
Here is the code. It should take less than 3 minutes to reach its goal.
import numpy as np
import gym
from collections import deque, namedtuple
import random
import torch
import torch.nn as nn
import torch.optim as optim

EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

env = gym.make("MountainCar-v0")

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(2, 128)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(128, 3)

    def forward(self, x):
        l1 = self.relu1(self.layer1(x))
        output = self.layer2(l1)
        return output

model = Model()
target = Model()
target.load_state_dict(model.state_dict())

mse = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

actionPoint = namedtuple("actionPoint", ["currentState", "action", "reward", "observation", "done"])
actionMap = deque()

epsilon = 1
for epoch in range(100000):
    observation, info = env.reset(return_info=True)
    currentState = observation
    totalReward = 0
    maxSpeed = 0
    done = False
    while not done:
        # epsilon-greedy selection over the predicted future velocities
        if np.random.random() > epsilon:
            action = model(torch.from_numpy(currentState))
            action = torch.argmax(action).item()
        else:
            action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        totalReward += reward
        maxSpeed = max(maxSpeed, abs(observation[1]))
        if done and totalReward != -200:
            print("reached goal", totalReward)
        actionMap.append(actionPoint(currentState, action, reward, observation, done))
        currentState = observation
        if len(actionMap) > 128:
            # train on a random minibatch; the target for the chosen action is the
            # absolute velocity actually observed after taking that action
            samples = random.sample(actionMap, 128)
            sampleReward = torch.FloatTensor([abs(s.observation[1]) for s in samples])
            sampleCurrentState = torch.from_numpy(np.array([s.currentState for s in samples]))
            futurePrediction = target(sampleCurrentState)
            for i, s in enumerate(samples):
                futurePrediction[i][s.action] = sampleReward[i]
            currentPrediction = model(sampleCurrentState)
            optimizer.zero_grad()
            loss = mse(currentPrediction, futurePrediction.detach())
            loss.backward()
            optimizer.step()
    print(maxSpeed)
    target.load_state_dict(model.state_dict())
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(epsilon, MIN_EPSILON)
env.close()

Related

REINFORCE for Cartpole: Training Unstable

I am implementing REINFORCE for CartPole-v0. However, the training process is very unstable. I have not implemented early stopping for the environment and allow training to continue for a fixed (high) number of episodes. After a few thousand iterations, the training reward seems to go down again. Is this due to overfitting, making early stopping essential, or have I implemented something incorrectly?
Here is my code:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os

def running_average(x, n):
    N = n
    kernel = np.ones(N)
    conv_len = x.shape[0] - N
    y = np.zeros(conv_len)
    for i in range(conv_len):
        y[i] = kernel @ x[i:i + N]  # matrix multiplication operator
        y[i] /= N
    return y

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, n_actions),
            nn.Softmax(dim=1)
        ).float()

    def forward(self, X):
        return self.model(X)

def train_reinforce_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate=0.003):
    model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    EPISODE_LENGTH = episode_length
    MAX_EPISODES = max_episodes
    GAMMA = gamma
    VISUALIZE_STEP = max(1, visualize_step)
    score = []
    for episode in range(MAX_EPISODES):
        curr_state = env.reset()
        done = False
        all_episode_t = []
        score_episode = 0
        for t in range(EPISODE_LENGTH):
            act_prob = model(torch.from_numpy(curr_state).unsqueeze(0).float())
            action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob.squeeze(0).data.numpy())
            prev_state = curr_state
            curr_state, reward, done, info = env.step(action)
            score_episode += reward
            e_t = {'state': prev_state, 'action': action, 'reward': reward, 'returns': 0}
            all_episode_t.append(e_t)
            if done:
                break
        score.append(score_episode)
        G = 0
        max_G = 0
        for t in range(len(all_episode_t) - 1, -1, -1):
            G = GAMMA * G + all_episode_t[t]['reward']
            all_episode_t[t]['returns'] = G
            if G > max_G:
                max_G = G
        episode_returns = np.array([all_episode_t[t]['returns'] for t in range(len(all_episode_t))])
        # normalize the returns
        for t in range(len(all_episode_t)):
            all_episode_t[t]['returns'] = (all_episode_t[t]['returns'] - np.mean(episode_returns)) / (max_G + 10**(-6))
        episode_returns = torch.FloatTensor(episode_returns)
        state_batch = torch.Tensor(np.array([all_episode_t[t]['state'] for t in range(len(all_episode_t))]))
        action_batch = torch.Tensor(np.array([all_episode_t[t]['action'] for t in range(len(all_episode_t))]))
        pred_batch = model(state_batch)
        prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()
        loss_tensor = torch.log(prob_batch) * episode_returns
        loss = -torch.sum(loss_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if episode % VISUALIZE_STEP == 0 and episode > 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))
        # # EARLY-STOPPING: if the average score across last 100 episodes is greater than 195, game is solved
        # if np.mean(score[-100:-1]) > 195:
        #     break

    # Training plot
    score = np.array(score)
    avg_score = running_average(score, visualize_step)
    plt.figure(figsize=(15, 7))
    plt.ylabel("Episodic Reward", fontsize=12)
    plt.xlabel("Training Episodes", fontsize=12)
    plt.plot(score, color='gray', linewidth=1)
    plt.plot(avg_score, color='blue', linewidth=3)
    plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
    plt.savefig("cartpole_reinforce_training_plot.pdf")

def main():
    env = gym.make('CartPole-v0')
    episode_length = 300
    n_episodes = 5000
    gamma = 0.99
    vis_steps = 100
    learning_rate = 0.003
    train_reinforce_agent(env, episode_length, n_episodes, gamma, vis_steps, learning_rate=learning_rate)

if __name__ == "__main__":
    main()
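For reference, a commonly used alternative to the scaling above (subtracting the mean and dividing by max_G) is to standardize the discounted returns by their standard deviation before forming the loss. A minimal sketch with hypothetical names, not part of the original code:

import numpy as np
import torch

def standardized_returns(rewards, gamma):
    # discounted returns computed backwards over one episode, then standardized
    G, returns = 0.0, []
    for r in reversed(rewards):
        G = gamma * G + r
        returns.append(G)
    returns = np.array(returns[::-1], dtype=np.float32)
    return torch.from_numpy((returns - returns.mean()) / (returns.std() + 1e-6))

# usage with the episode buffer above: standardized_returns([e['reward'] for e in all_episode_t], GAMMA)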

Deep Q learning not performing well for algo-trading

I have implemented deep Q-learning in Python using the Keras framework to reproduce a paper's results. However, it is not working. Here is some info:
the training is done over 10000 steps for the agent, with 2 possible actions
the input vector's shape is 117
Here is the code for the algorithm (inspired by various GitHub repos, so maybe I implemented the algorithm incorrectly):
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, BatchNormalization
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
# ReplayBuffer is a separate experience-replay helper class (not shown here)

def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims, CLIP_GRADIENT=1):
    # set_seed(42)
    model = Sequential([
        Dense(fc1_dims, input_shape=(input_dims,)),  # bias_regularizer=regularizers.l2(1e-4), activity_regularizer=regularizers.l2(1e-5)
        Activation('relu'),
        BatchNormalization(),
        Dense(fc2_dims),
        Activation('relu'),
        BatchNormalization(),
        Dense(n_actions)])
    model.compile(optimizer=Adam(lr=lr, clipvalue=CLIP_GRADIENT), loss='mse')
    return model

class Agent(object):
    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=0.996, epsilon_end=0.01,
                 mem_size=1000000, fname='dqn_model.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
                                   discrete=True)
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 64, 32)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        state = state[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
        return action

    def learn(self):
        if self.memory.mem_cntr > self.batch_size:
            state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)
            action_values = np.array(self.action_space, dtype=np.int8)
            action_indices = np.dot(action, action_values)
            q_eval = self.q_eval.predict(state)
            q_next = self.q_eval.predict(new_state)
            q_target = q_eval.copy()
            batch_index = np.arange(self.batch_size, dtype=np.int32)
            q_target[batch_index, action_indices] = reward + \
                self.gamma * np.max(q_next, axis=1) * done
            _ = self.q_eval.fit(state, q_target, verbose=0)
            self.epsilon = self.epsilon * self.epsilon_dec if self.epsilon > \
                self.epsilon_min else self.epsilon_min

    def processState(self, state):
        n = len(state)
        relative_diff_matrix, prev_position = state[:n - 1], state[n - 1]
        relative_diff_matrix = relative_diff_matrix.reshape((int(n / 30), 30))
        relative_diff_matrix = np.diff(relative_diff_matrix) / relative_diff_matrix[:, :-1]
        relative_diff_matrix = StandardScaler().fit_transform(relative_diff_matrix.T).T
        processed_state = relative_diff_matrix.flatten()
        processed_state = np.append(processed_state, prev_position)
        return processed_state

    def processReward(self, reward, rewardClipping=1):
        return np.clip(reward, -rewardClipping, rewardClipping)

    def train_model(self, trainingEnv, n_episodes=1, verbose=0):
        scores = []
        eps_history = []
        for i in range(n_episodes):
            done = False
            score = 0
            observation = trainingEnv.reset()
            observation = self.processState(observation)
            while not done:
                action = self.choose_action(observation)
                observation_, reward, done, info = trainingEnv.step(action)
                # Remembering episode
                reward = self.processReward(reward)
                observation_ = self.processState(observation_)
                score += reward
                self.remember(observation_, action, reward, observation_, int(done))
                # Remembering episode for other action => Better exploration
                otherAction = int(not bool(action))
                otherReward = self.processReward(info['Reward'])
                otherNextState = self.processState(info['State'])
                otherDone = info['Done']
                self.remember(observation_, otherAction, otherReward, otherNextState, otherDone)
                observation = observation_
                # learning
                self.learn()
            if verbose:
                eps_history.append(self.epsilon)
                scores.append(score)
                avg_score = np.mean(scores[max(0, i - 100):(i + 1)])
                print('episode: ', i, 'score: %.2f' % score,
                      ' average score %.2f' % avg_score)
                trainingEnv.render()

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
I start with $100 of capital and finish with slightly more or less over a horizon of 20 years (about 10000 steps). I tried tuning the parameters but nothing worked.
Here is the main:
env = TradingEnv(marketSymbol="GOOGL", period=PERIOD_DEFAULT, startingDate=START_DEFAULT,
                 endingDate=END_DEFAULT, columns=COLUMNS, money=100, transactionCosts=0)
lr = 0.0005
agent = Agent(gamma=1, epsilon=0.00, alpha=lr, input_dims=117,
              n_actions=2, mem_size=1000000, batch_size=32, epsilon_end=0.0)
agent.train_model(env)
I think I have managed to solve the problem: the number of episodes needs to be set sufficiently high (not 1); in my case about thirty was enough. However, I don't know how to efficiently backtest the deep Q trading agent!
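In case it helps as a starting point, here is a rough sketch of an out-of-sample backtest that assumes the same TradingEnv constructor and step() interface used above; TEST_START and TEST_END are hypothetical placeholders for a held-out date range. The idea is simply to run the trained agent greedily (no exploration) on data it was not trained on and record the cumulative reward:

def backtest(agent, test_env):
    agent.epsilon = 0.0  # act greedily, no exploration during evaluation
    observation = agent.processState(test_env.reset())
    done, total_reward = False, 0.0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = test_env.step(action)
        total_reward += reward
        observation = agent.processState(observation_)
    return total_reward

# test_env = TradingEnv(marketSymbol="GOOGL", period=PERIOD_DEFAULT, startingDate=TEST_START,
#                       endingDate=TEST_END, columns=COLUMNS, money=100, transactionCosts=0)
# print(backtest(agent, test_env))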

Torch throws a RuntimeError: element 0 of tensors does not require grad... but can't find where computational graph is severed

I am getting the following error:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I looked this up and it looks like the computational graph is not connected for some reason. However, I cannot find the location where the graph is severed.
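For reference, here is a minimal standalone snippet (not taken from the code below) that reproduces this class of error: calling backward() on a tensor that was never attached to a computational graph.

import torch

x = torch.ones(3)            # requires_grad defaults to False
y = (x * 2.0).sum()          # no graph is recorded, so y.grad_fn is None
try:
    y.backward()
except RuntimeError as e:
    print(e)                 # element 0 of tensors does not require grad ...

x = torch.ones(3, requires_grad=True)
(x * 2.0).sum().backward()   # with a leaf that requires grad, backward succeeds
print(x.grad)                # tensor([2., 2., 2.])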
My code is a reproduction of Arjovsky's WGAN: https://github.com/martinarjovsky/WassersteinGAN
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import json
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable

class MLP_G(nn.Module):
    def __init__(self, isize, nz, ngf, ngpu):
        super(MLP_G, self).__init__()
        self.ngpu = ngpu
        main = nn.Sequential(
            # Z goes into a linear of size: ngf
            nn.Linear(nz, ngf),
            nn.ReLU(True),
            nn.Linear(ngf, ngf),
            nn.ReLU(True),
            nn.Linear(ngf, ngf),
            nn.ReLU(True),
            nn.Linear(ngf, isize),
        )
        self.main = main
        self.isize = isize
        self.nz = nz

    def forward(self, input):
        input = input.view(input.size(0), input.size(1))
        if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
        else:
            output = self.main(input)
        return output.view(output.size(0), self.isize)

class MLP_D(nn.Module):
    def __init__(self, isize, nz, ndf, ngpu):
        super(MLP_D, self).__init__()
        self.ngpu = ngpu
        main = nn.Sequential(
            # the sample goes into a linear of size: ndf
            nn.Linear(isize, ndf),
            nn.ReLU(True),
            nn.Linear(ndf, ndf),
            nn.ReLU(True),
            nn.Linear(ndf, ndf),
            nn.ReLU(True),
            nn.Linear(ndf, 1),
        )
        self.main = main
        self.isize = isize
        self.nz = nz

    def forward(self, input):
        input = input.view(input.size(0), input.size(1))
        if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
        else:
            output = self.main(input)
        output = output.mean(0)
        return output.view(1)

netG = None  # path to saved generator
netD = None  # discriminator path
batchSize = 1000  # size of batch (which is size of data)
cuda = False
lrD = lrG = .00005
beta1 = .5
niter = 25
experiment = '/content/drive/MyDrive/savefolder'
clamp_upper = .01
clamp_lower = -clamp_upper

manualSeed = random.randint(1, 10000)  # fix seed
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)
cudnn.benchmark = True

dataset = torch.tensor(np.stack([x, y, instrument], axis=1)).float().reshape(-1, 3)

ngpu = 1
nz = 4  # three latents and the instrument
ngf = 128
ndf = 128

# custom weights initialization called on netG and netD
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

netG = MLP_G(2, nz, ngf, ngpu)
netG.apply(weights_init)
print(netG)

netD = MLP_D(3, nz, ndf, ngpu)
print(netD)

input = torch.FloatTensor(batchSize, 2)
noise = torch.FloatTensor(batchSize, nz - 1)
fixed_noise = torch.FloatTensor(batchSize, nz - 1).normal_(0, 1)
one = torch.FloatTensor([1])
mone = one * -1

# setup optimizer
optimizerD = optim.Adam(netD.parameters(), lr=lrD, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lrG, betas=(beta1, 0.999))

real_cpu = data = dataset
gen_iterations = 0
for epoch in range(niter):
    # data_iter = iter(dataloader)
    ############################
    # (1) Update D network
    ############################
    for p in netD.parameters():  # reset requires_grad
        p.requires_grad = True  # they are set to False below in netG update
    # train the discriminator Diters times
    if gen_iterations < 25 or gen_iterations % 500 == 0:
        Diters = 100
    else:
        Diters = 5
    j = 0
    while j < Diters:
        j += 1
        # clamp parameters to a cube
        for p in netD.parameters():
            p.data.clamp_(clamp_lower, clamp_upper)
        # train with real
        netD.zero_grad()
        if cuda:
            real_cpu = real_cpu.cuda()
        input.resize_as_(real_cpu).copy_(real_cpu)
        inputv = Variable(input, requires_grad=False)
        errD_real = netD(inputv)
        errD_real.backward(one)  # Error occurs here
        # train with fake
        noise.resize_(batchSize, nz - 1).normal_(0, 1)
        noisev = torch.cat([Variable(noise, requires_grad=False), dataset[:, 2].reshape(-1, 1)], 1)  # totally freeze netG
        fake = torch.cat([Variable(netG(noisev).data), dataset[:, 2].view(-1, 1)], 1)
        inputv = fake
        errD_fake = netD(inputv)
        errD_fake.backward(mone)
        errD = errD_real - errD_fake
        optimizerD.step()
    ############################
    # (2) Update G network
    ############################
    for p in netD.parameters():
        p.requires_grad = False  # to avoid computation
    netG.zero_grad()
    # in case our last batch was the tail batch of the dataloader,
    # make sure we feed a full batch of noise
    noise.resize_(batchSize, nz - 1).normal_(0, 1)
    noisev = torch.cat([Variable(noise), dataset[:, 2].view(-1, 1)], 1)
    fake = torch.cat([netG(noisev), dataset[:, 2].view(-1, 1)], 1)
    errG = netD(fake)
    errG.backward(one)
    optimizerG.step()
    gen_iterations += 1

    i = 0
    print('[%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f'
          % (epoch, niter, gen_iterations,
             errD.data[0], errG.data[0], errD_real.data[0], errD_fake.data[0]))
    # if gen_iterations % 500 == 0:
    #     real_cpu = real_cpu.mul(0.5).add(0.5)
    #     vutils.save_image(real_cpu, '{0}/real_samples.png'.format(opt.experiment))
    #     fake = netG(Variable(fixed_noise, volatile=True))
    #     fake.data = fake.data.mul(0.5).add(0.5)
    #     vutils.save_image(fake.data, '{0}/fake_samples_{1}.png'.format(opt.experiment, gen_iterations))

    # do checkpointing
    torch.save(netG.state_dict(), '{0}/netG_epoch_{1}.pth'.format(experiment, epoch))
    torch.save(netD.state_dict(), '{0}/netD_epoch_{1}.pth'.format(experiment, epoch))
The error occurs on the line errD_real.backward(one). It might be something to do with zeroing out the computational graph, as the code runs for one iteration and then throws the error. Thanks for your help.
You most certainly need to add requires_grad=True on one. You could define it as:
one = torch.tensor([1], dtype=torch.float16, requires_grad=True)

How do I reduce memory usage for deep reinforcement learning algorithms?

I wrote a DQN script to play BreakoutDeterministic and ran it on my school's GPU server. However, the code seems to take up 97% of the total RAM (more than 100 GB)!
I would like to know which part of the script demands this much RAM. I used memory-profiler for 3 episodes, and the memory requirement seems to increase linearly with each time step on my laptop.
I wrote the script in PyCharm with Python 3.6. My laptop has 12 GB of RAM and no GPU, but the school server runs Ubuntu with a P100 GPU.
import gym
import numpy as np
import random
from collections import deque
from keras.layers import Dense, Input, Lambda, convolutional, core
from keras.models import Model
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import os
import time as dt

plt.switch_backend('agg')

def preprocess(state):
    process_state = np.mean(state, axis=2).astype(np.uint8)
    process_state = process_state[::2, ::2]
    process_state_size = list(process_state.shape)
    process_state_size.append(1)
    process_state = np.reshape(process_state, process_state_size)
    return process_state

class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.action_size = env.action_space.n
        self.state_size = self.select_state_size()
        self.memory = deque(maxlen=1000000)  # specify memory size
        self.gamma = 0.99
        self.eps = 1.0
        self.eps_min = 0.01
        self.decay = 0.95
        self.lr = 0.00025
        self.start_life = 5  # get from environment
        self.tau = 0.125  # special since 2 models to be trained
        self.model = self.create_cnnmodel()
        self.target_model = self.create_cnnmodel()

    def select_state_size(self):
        process_state = preprocess(self.env.reset())
        state_size = process_state.shape
        return state_size

    def create_cnnmodel(self):
        data_input = Input(shape=self.state_size, name='data_input', dtype='int32')
        normalized = Lambda(lambda x: x / 255)(data_input)
        conv1 = convolutional.Convolution2D(32, 8, strides=(4, 4), activation='relu')(normalized)
        conv2 = convolutional.Convolution2D(64, 4, strides=(2, 2), activation='relu')(conv1)
        conv3 = convolutional.Convolution2D(64, 3, strides=(1, 1), activation='relu')(conv2)
        conv_flatten = core.Flatten()(conv3)  # flatten to feed cnn to fc
        h4 = Dense(512, activation='relu')(conv_flatten)
        prediction_output = Dense(self.action_size, name='prediction_output', activation='linear')(h4)
        model = Model(inputs=data_input, outputs=prediction_output)
        model.compile(optimizer=Adam(lr=self.lr),
                      loss='mean_squared_error')  # or keras.losses.logcosh(y_true, y_pred)
        return model

    def remember(self, state, action, reward, new_state, done):  # store past experience as a pre-defined table
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size):
        if batch_size > len(self.memory):
            return
        all_states = []
        all_targets = []
        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                target[0][action] = reward + self.gamma * np.max(self.target_model.predict(new_state)[0])
            all_states.append(state)
            all_targets.append(target)
        history = self.model.fit(np.vstack(all_states), np.vstack(all_targets), epochs=1, verbose=0)
        return history

    def act(self, state):
        self.eps *= self.decay
        self.eps = max(self.eps_min, self.eps)
        if np.random.random() < self.eps:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def train_target(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = (1 - self.tau) * target_weights[i] + self.tau * weights[i]
        self.target_model.set_weights(target_weights)

def main(episodes):
    env = gym.make('BreakoutDeterministic-v4')
    agent = DQNAgent(env)
    time = env._max_episode_steps
    batch_size = 32
    save_model = 'y'
    rend_env = 'n'  # set to 'y' to render the environment while training
    filepath = os.getcwd()
    date = dt.strftime('%d%m%Y')
    clock = dt.strftime('%H.%M.%S')
    print('++ Training started on {} at {} ++'.format(date, clock))
    start_time = dt.time()
    tot_r = []
    tot_loss = []
    it_r = []
    it_loss = []
    tot_frames = 0
    for e in range(episodes):
        r = []
        loss = []
        state = env.reset()
        state = preprocess(state)
        state = state[None, :]
        current_life = agent.start_life
        for t in range(time):
            if rend_env == 'y':
                env.render()
            action = agent.act(state)
            new_state, reward, terminal_life, life = env.step(action)
            new_state = preprocess(new_state)
            new_state = new_state[None, :]
            if life['ale.lives'] < current_life:
                reward = -1
                current_life = life['ale.lives']
            agent.remember(state, action, reward, new_state, terminal_life)
            hist = agent.replay(batch_size)
            agent.train_target()
            state = new_state
            r.append(reward)
            tot_frames += 1
            if hist is None:
                loss.append(0.0)
            else:
                loss.append(hist.history['loss'][0])
            if t % 20 == 0:
                print('Frame : {}, Cum Reward = {}, Avg Loss = {}, Curr Life: {}'.format(t,
                      np.sum(r),
                      round(np.mean(loss[-20:-1]), 3),
                      current_life))
                agent.model.save('{}/Mod_Fig/DQN_BO_model_{}.h5'.format(filepath, date))
                agent.model.save_weights('{}/Mod_Fig/DQN_BO_weights_{}.h5'.format(filepath, date))
            if current_life == 0 or terminal_life:
                print('Episode {} of {}, Cum Reward = {}, Avg Loss = {}'.format(e, episodes, np.sum(r), np.mean(loss)))
                break
        tot_r.append(np.sum(r))
        tot_loss.append(np.mean(loss))
        it_r.append(r)
        it_loss.append(loss)
    print('Training ended on {} at {}'.format(date, clock))
    run_time = dt.time() - start_time
    print('Total Training time: %d Hrs %d Mins %d s' % (run_time // 3600, (run_time % 3600) // 60,
                                                        (run_time % 3600) % 60 // 1))
    if save_model == 'y':
        agent.model.save('{}/Mod_Fig/DQN_BO_finalmodel_{}_{}.h5'.format(filepath, date, clock))
        agent.model.save_weights('{}/Mod_Fig/DQN_BO_finalweights_{}_{}.h5'.format(filepath, date, clock))
        agent.model.summary()
    return tot_r, tot_loss, it_r, it_loss, tot_frames

if __name__ == '__main__':
    episodes = 3
    total_reward, total_loss, rewards_iter, loss_iter, frames_epi = main(episodes=episodes)
I would really appreciate your comments and help on writing memory- and speed-efficient deep RL code! I hope to train my DQN on Breakout for 5000 episodes, but the remote server only allows a maximum of 48 hours of training. Thanks in advance!
It sounds like you have a memory leak.
This line
agent.remember(state, action, reward, new_state, terminal_life)
gets called 5000 * env._max_episode_steps times, and each state is a (210, 160, 3) array. The first thing to try would be to reduce the maxlen in self.memory = deque(maxlen=1000000) to verify that this is the sole cause.
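To put rough numbers on that (a back-of-the-envelope sketch, not a measurement):

raw_frame   = 210 * 160 * 3   # bytes per raw uint8 Atari frame
processed   = 105 * 80        # bytes per frame after the script's preprocess()
transitions = 1_000_000       # deque(maxlen=1000000)
print(2 * raw_frame * transitions / 1e9)   # ~201.6 GB if raw frames were stored
print(2 * processed * transitions / 1e9)   # ~16.8 GB even with preprocessing, before Python object overhead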
If you really believe you need that much capacity, you should dump self.memory to disk and keep only a small subsample in memory.
Additionally: subsampling from a deque is very slow; deque is implemented as a linked list, so each subsample is O(N*M). You should consider implementing your own ring buffer for self.memory.
Alternatively: you might consider a probabilistic buffer (I don't know the proper name), where each time you would append to a full buffer, you instead remove an element at random and append the new element. This means any (state, action, reward, ...) tuple that is encountered has a nonzero probability of being contained in the buffer, with recent tuples being more likely than older ones.
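A minimal sketch of such a buffer as described above (the class name is made up, not a standard library type):

import random

class RandomReplacementBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []

    def add(self, transition):
        if len(self.data) < self.capacity:
            self.data.append(transition)
        else:
            # buffer is full: overwrite a randomly chosen old entry
            self.data[random.randrange(self.capacity)] = transition

    def sample(self, batch_size):
        return random.sample(self.data, batch_size)  # cheap on a list, unlike a long deque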
I had similar problems with memory and I still do.
The main cause of the large memory consumption is the states. Here's what I did to make it better:
Step 1: Resize them to 84 x 84 using OpenCV. Some people instead downsample the images to 84 x 84. This results in each state having the shape (84, 84, 3).
Step 2: Convert these frames to grayscale (basically, black and white). This changes the shape to (84, 84, 1).
Step 3: Use dtype=np.uint8 for storing states. It consumes minimal memory and is a perfect fit for pixel intensity values in the 0-255 range. A short sketch of these three steps follows.
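A minimal sketch of steps 1-3, assuming OpenCV (cv2) is installed and frames arrive as RGB uint8 arrays (the function name is just illustrative):

import cv2
import numpy as np

def preprocess_frame(frame):
    small = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)  # step 1: (84, 84, 3)
    gray = cv2.cvtColor(small, cv2.COLOR_RGB2GRAY)                     # step 2: (84, 84)
    return gray.astype(np.uint8)[..., None]                            # step 3: (84, 84, 1), uint8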
Additional Info
I run my code on free Google Colab notebooks (Tesla K80 GPU and 13 GB RAM), periodically saving the replay buffer to my Drive.
For steps 1 and 2, consider using the OpenAI baselines Atari wrappers, as there is no point in reinventing the wheel.
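Possible usage, assuming the openai/baselines package is installed (the exact import path can differ between versions):

from baselines.common.atari_wrappers import make_atari, wrap_deepmind

env = make_atari('BreakoutNoFrameskip-v4')
env = wrap_deepmind(env, frame_stack=True, scale=False)  # 84x84 grayscale uint8 frames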
You could also use this snippet to check the amount of RAM used by your program at each step, like I did:
import os
import psutil

def show_RAM_usage():
    py = psutil.Process(os.getpid())
    print('RAM usage: {} GB'.format(py.memory_info()[0] / 2. ** 30))
This snippet is adapted from the original answer for use in my own program.

Input shape in Keras

I am creating a deep neural network using Keras, with images from OpenAI's Gym library.
I tried to reshape the images using the following code:
def reshape_dimensions(observation):
    processed = np.mean(observation, 2, keepdims=False)
    cropped = processed[35:195]
    result = cropped[::2, ::2]
    return result
This gives me an image of shape (80, 80), but every time I try to use that shape as the input of the first layer of the Keras network, it doesn't work.
What shape should I use so I can further develop the network?
Attached is the whole code:
PART I retrieves the training data
import gym
import random
import numpy as np
from statistics import mean, median
from collections import Counter

### GAME VARIABLE SETTINGS ###
env = gym.make('MsPacman-v0')
env.reset()
goal_steps = 2000
score_requirement = 250
initial_games = 200
print('Options to play: ', env.unwrapped.get_action_meanings())

### DEFINE FUNCTIONS ####
def reshape_dimensions(observation):
    processed = np.mean(observation, 2, keepdims=False)
    cropped = processed[35:195]
    result = cropped[::2, ::2]
    return result

def initial_population():
    training_data = []
    scores = []
    accepted_scores = []
    for _ in range(initial_games):
        score = 0
        game_memory = []
        prev_obvservation = []
        for _ in range(goal_steps):
            # env.render()
            action = env.action_space.sample()  # Take random action in the env
            observation, reward, done, info = env.step(action)
            reshape_observation = reshape_dimensions(observation)
            if len(prev_obvservation) > 0:
                game_memory.append([prev_obvservation, action])
            prev_obvservation = reshape_observation
            score = score + reward
            if done:
                break
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                if data[1] == 0:
                    output = [1,0,0,0,0,0,0,0,0]
                elif data[1] == 1:
                    output = [0,1,0,0,0,0,0,0,0]
                elif data[1] == 2:
                    output = [0,0,1,0,0,0,0,0,0]
                elif data[1] == 3:
                    output = [0,0,0,1,0,0,0,0,0]
                elif data[1] == 4:
                    output = [0,0,0,0,1,0,0,0,0]
                elif data[1] == 5:
                    output = [0,0,0,0,0,1,0,0,0]
                elif data[1] == 6:
                    output = [0,0,0,0,0,0,1,0,0]
                elif data[1] == 7:
                    output = [0,0,0,0,0,0,0,1,0]
                elif data[1] == 8:
                    output = [0,0,0,0,0,0,0,0,1]
                training_data.append([data[0], output])
        env.reset()
        scores.append(score)
    print('Average accepted scores:', mean(accepted_scores))
    print('Median accepted scores:', median(accepted_scores))
    print(Counter(accepted_scores))
    return training_data

### RUN CODE ###
training_data = initial_population()
np.save('data_for_training_200.npy', training_data)
PART II trains the model
import gym
import random
import numpy as np
import keras
from statistics import mean, median
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

### LOAD DATA ###
raw_training_data = np.load("data_for_training_200.npy")
training_data = [i[0:2] for i in raw_training_data]
print(np.shape(training_data))

### DEFINE FUNCTIONS ###
def neural_network_model():
    network = Sequential()
    network.add(Dense(100, activation='relu', input_shape=(80, 80)))
    network.add(Dense(9, activation='softmax'))
    optimizer = Adam(lr=0.001)
    network.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return network

def train_model(training_data):
    X = [i[0] for i in training_data]
    y = [i[1] for i in training_data]
    # X = np.array([i[0] for i in training_data])
    # y = np.array([i[1] for i in training_data])
    print('shape of X: ', np.shape(X))
    print('shape of y: ', np.shape(y))
    early_stopping_monitor = EarlyStopping(patience=3)
    model = neural_network_model()
    model.fit(X, y, epochs=20, callbacks=[early_stopping_monitor])
    return model

train_model(training_data=training_data)
It seems like you are pre-processing the individual images correctly but putting them inside a Python list instead of a single input tensor. From the error message, you have a list of 36859 arrays of shape (80, 80), while you want a single array of shape (36859, 80, 80). You already have the code that does this commented out, X = np.array([i[0] for i in training_data]); you just have to ensure that every i[0] has the same shape (80, 80) for this to work.
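A small sketch of that suggestion, with dummy data standing in for training_data:

import numpy as np

training_data = [[np.zeros((80, 80)), np.eye(9)[0]] for _ in range(5)]  # dummy stand-in
X = np.array([i[0] for i in training_data])  # one array of shape (5, 80, 80), not a list of arrays
y = np.array([i[1] for i in training_data])  # (5, 9)
print(X.shape, y.shape)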
