I have implemented deep Q-learning in Python using the Keras framework to reproduce a paper's results. However, it is not working. Here is some info:
the agent is trained for 10000 steps with 2 possible actions
the input vector's shape is 117
Here is the code for the algorithm (inspired by various GitHub repos, so maybe I implemented the algorithm wrong):
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims, CLIP_GRADIENT=1):
    #set_seed(42)
    model = Sequential([
        Dense(fc1_dims, input_shape=(input_dims,)),  # bias_regularizer=regularizers.l2(1e-4), activity_regularizer=regularizers.l2(1e-5)
        Activation('relu'),
        BatchNormalization(),
        Dense(fc2_dims),
        Activation('relu'),
        BatchNormalization(),
        Dense(n_actions)])
    model.compile(optimizer=Adam(lr=lr, clipvalue=CLIP_GRADIENT), loss='mse')
    return model


class Agent(object):
    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=0.996, epsilon_end=0.01,
                 mem_size=1000000, fname='dqn_model.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
                                   discrete=True)
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 64, 32)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        state = state[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
        return action

    def learn(self):
        if self.memory.mem_cntr > self.batch_size:
            state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)
            action_values = np.array(self.action_space, dtype=np.int8)
            action_indices = np.dot(action, action_values)
            q_eval = self.q_eval.predict(state)
            q_next = self.q_eval.predict(new_state)
            q_target = q_eval.copy()
            batch_index = np.arange(self.batch_size, dtype=np.int32)
            q_target[batch_index, action_indices] = reward + \
                self.gamma*np.max(q_next, axis=1)*done
            _ = self.q_eval.fit(state, q_target, verbose=0)
            self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > \
                self.epsilon_min else self.epsilon_min

    def processState(self, state):
        n = len(state)
        relative_diff_matrix, prev_posiion = state[:n-1], state[n-1]
        relative_diff_matrix = relative_diff_matrix.reshape((int(n/30), 30))
        relative_diff_matrix = np.diff(relative_diff_matrix) / relative_diff_matrix[:, :-1]
        relative_diff_matrix = StandardScaler().fit_transform(relative_diff_matrix.T).T
        processed_state = relative_diff_matrix.flatten()
        processed_state = np.append(processed_state, prev_posiion)
        return processed_state

    def processReward(self, reward, rewardClipping=1):
        return np.clip(reward, -rewardClipping, rewardClipping)

    def train_model(self, trainingEnv, n_episodes=1, verbose=0):
        scores = []
        eps_history = []
        for i in range(n_episodes):
            done = False
            score = 0
            observation = env.reset()
            observation = self.processState(observation)
            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = trainingEnv.step(action)
                # Remembering episode
                reward = self.processReward(reward)
                observation_ = self.processState(observation_)
                score += reward
                self.remember(observation_, action, reward, observation_, int(done))
                # Remembering episode for other action => Better exploration
                otherAction = int(not bool(action))
                otherReward = self.processReward(info['Reward'])
                otherNextState = self.processState(info['State'])
                otherDone = info['Done']
                self.remember(observation_, otherAction, otherReward, otherNextState, otherDone)
                observation = observation_
                # learning
                self.learn()
            if verbose:
                eps_history.append(agent.epsilon)
                scores.append(score)
                avg_score = np.mean(scores[max(0, i-100):(i+1)])
                print('episode: ', i, 'score: %.2f' % score,
                      ' average score %.2f' % avg_score)
        trainingEnv.render()

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
I start with $100 of capital and finish with slightly more or less over a horizon of 20 years (about 10000 steps). I tried tuning the parameters, but nothing worked.
Here is the main:
env = TradingEnv(marketSymbol="GOOGL", period=PERIOD_DEFAULT, startingDate=START_DEFAULT, endingDate=END_DEFAULT, columns=COLUMNS, money=100,transactionCosts=0)
lr = 0.0005
agent = Agent(gamma=1, epsilon=0.00, alpha=lr, input_dims=117,
              n_actions=2, mem_size=1000000, batch_size=32, epsilon_end=0.0)
agent.train_model(env)
I think I have managed to solve the problem: the number of episodes needs to be set sufficiently high (not 1); in my case about thirty was enough. However, I don't know how to efficiently backtest the deep Q trading agent!
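For reference, here is a minimal backtest sketch built from the same API as the training code above (TEST_START and TEST_END are placeholder dates, and backtest_model is a made-up name; none of this comes from the paper): create a second TradingEnv over a held-out date range, act greedily with exploration and learning switched off, and track the cumulative reward and final capital.

def backtest_model(agent, testingEnv):
    agent.epsilon = 0.0                      # greedy policy, no exploration
    observation = agent.processState(testingEnv.reset())
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = testingEnv.step(action)
        score += reward
        observation = agent.processState(observation_)   # no remember()/learn() calls
    testingEnv.render()                      # plot the resulting trades / capital curve
    return score

testingEnv = TradingEnv(marketSymbol="GOOGL", period=PERIOD_DEFAULT,
                        startingDate=TEST_START, endingDate=TEST_END,
                        columns=COLUMNS, money=100, transactionCosts=0)
backtest_model(agent, testingEnv)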
I solved the MountainCar-v0 OpenAI gym challenge by predicting future velocity based on current action. This system uses the absolute value of velocity as the reward function.
My question is why does this work?
My hypothesis is that this is working similar to q-learning, where we are creating a map/table of all actions and their results, then the action picker just picks the highest velocity. This works since velocity is linear, so picking the highest future velocity will very likely help you reach the next highest velocity.
Here is the code. It should take less than 3 minutes to reach its goal.
import numpy as np
import gym
from collections import deque, namedtuple
import random
import torch
import torch.nn as nn
import torch.optim as optim

EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

env = gym.make("MountainCar-v0")


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(2, 128)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(128, 3)

    def forward(self, x):
        l1 = self.relu1(self.layer1(x))
        output = self.layer2(l1)
        return output


model = Model()
target = Model()
target.load_state_dict(model.state_dict())

mse = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

actionPoint = namedtuple("actionPoint", ["currentState", "action", "reward", "observation", "done"])
actionMap = deque()

epsilon = 1
for epoch in range(100000):
    observation, info = env.reset(return_info=True)
    currentState = observation
    totalReward = 0
    maxSpeed = 0
    done = False
    while not done:
        if np.random.random() > epsilon:
            action = model(torch.from_numpy(currentState))
            action = torch.argmax(action).item()
        else:
            action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        totalReward += reward
        maxSpeed = max(maxSpeed, abs(observation[1]))
        if done and totalReward != -200:
            print("reached goal", totalReward)
        actionMap.append(actionPoint(currentState, action, reward, observation, done))
        currentState = observation
        if len(actionMap) > 128:
            samples = random.sample(actionMap, 128)
            sampleReward = torch.FloatTensor([abs(s.observation[1]) for s in samples])
            sampleCurrentState = torch.from_numpy(np.array([s.currentState for s in samples]))
            futurePrediction = target(sampleCurrentState)
            for i, s in enumerate(samples):
                futurePrediction[i][s.action] = sampleReward[i]
            currentPrediction = model(sampleCurrentState)
            optimizer.zero_grad()
            loss = mse(currentPrediction, futurePrediction.detach())
            loss.backward()
            optimizer.step()
    print(maxSpeed)
    target.load_state_dict(model.state_dict())
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(epsilon, MIN_EPSILON)
env.close()
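For comparison with a standard DQN (this contrast is an illustration, not part of the original post): the sampling block above regresses the taken action's Q-value directly onto the absolute next-state velocity, whereas a textbook DQN target bootstraps from the target network. A rough sketch of the bootstrapped alternative, written against the same samples/target names and assuming some discount factor GAMMA (which the post does not define):

# Target used above:    y_i = |s'_i[1]|                                   (sampleReward)
# Textbook DQN target:  y_i = r_i + GAMMA * max_a Q_target(s'_i, a) * (0 if done_i else 1)
sampleNextState = torch.from_numpy(np.array([s.observation for s in samples]))
notDone = torch.FloatTensor([0.0 if s.done else 1.0 for s in samples])
bootstrapTarget = (torch.FloatTensor([s.reward for s in samples])
                   + GAMMA * target(sampleNextState).max(dim=1).values * notDone)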
I am implementing REINFORCE for CartPole-v0. However, the training process is very unstable. I have not implemented early stopping for the environment and allow training to continue for a fixed (high) number of episodes. After a few thousand iterations, the training reward seems to go down again. Is this due to overfitting, making early stopping essential, or have I implemented something incorrectly?
Here is my code:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os
def running_average(x, n):
    N = n
    kernel = np.ones(N)
    conv_len = x.shape[0]-N
    y = np.zeros(conv_len)
    for i in range(conv_len):
        y[i] = kernel @ x[i:i+N]  # matrix multiplication operator (np.matmul)
        y[i] /= N
    return y
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, n_actions),
            nn.Softmax(dim=1)
        ).float()

    def forward(self, X):
        return self.model(X)


def train_reinforce_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate=0.003):
    model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    EPISODE_LENGTH = episode_length
    MAX_EPISODES = max_episodes
    GAMMA = gamma
    VISUALIZE_STEP = max(1, visualize_step)
    score = []

    for episode in range(MAX_EPISODES):
        curr_state = env.reset()
        done = False
        all_episode_t = []
        score_episode = 0
        for t in range(EPISODE_LENGTH):
            act_prob = model(torch.from_numpy(curr_state).unsqueeze(0).float())
            action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob.squeeze(0).data.numpy())
            prev_state = curr_state
            curr_state, reward, done, info = env.step(action)
            score_episode += reward
            e_t = {'state': prev_state, 'action': action, 'reward': reward, 'returns': 0}
            all_episode_t.append(e_t)
            if done:
                break
        score.append(score_episode)

        G = 0
        max_G = 0
        for t in range(len(all_episode_t)-1, -1, -1):
            G = GAMMA*G + all_episode_t[t]['reward']
            all_episode_t[t]['returns'] = G
            if G > max_G:
                max_G = G

        episode_returns = np.array([all_episode_t[t]['returns'] for t in range(len(all_episode_t))])
        # normalize the returns
        for t in range(len(all_episode_t)):
            all_episode_t[t]['returns'] = (all_episode_t[t]['returns'] - np.mean(episode_returns))/(max_G + 10**(-6))
        episode_returns = torch.FloatTensor(episode_returns)

        state_batch = torch.Tensor(np.array([all_episode_t[t]['state'] for t in range(len(all_episode_t))]))
        action_batch = torch.Tensor(np.array([all_episode_t[t]['action'] for t in range(len(all_episode_t))]))

        pred_batch = model(state_batch)
        prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()

        loss_tensor = torch.log(prob_batch) * episode_returns
        loss = -torch.sum(loss_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % VISUALIZE_STEP == 0 and episode > 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))
            # # EARLY-STOPPING: if the average score across last 100 episodes is greater than 195, game is solved
            # if np.mean(score[-100:-1]) > 195:
            #     break

    # Training plot
    score = np.array(score)
    avg_score = running_average(score, visualize_step)
    plt.figure(figsize=(15, 7))
    plt.ylabel("Episodic Reward", fontsize=12)
    plt.xlabel("Training Episodes", fontsize=12)
    plt.plot(score, color='gray', linewidth=1)
    plt.plot(avg_score, color='blue', linewidth=3)
    plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
    plt.savefig("cartpole_reinforce_training_plot.pdf")


def main():
    env = gym.make('CartPole-v0')
    episode_length = 300
    n_episodes = 5000
    gamma = 0.99
    vis_steps = 100
    learning_rate = 0.003
    train_reinforce_agent(env, episode_length, n_episodes, gamma, vis_steps, learning_rate=learning_rate)


if __name__ == "__main__":
    main()
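For comparison (an aside, not part of the question): REINFORCE implementations commonly standardize the returns by subtracting the mean and dividing by the standard deviation, and then use those standardized values directly in the loss. A minimal sketch against the all_episode_t list built above:

returns = np.array([e['returns'] for e in all_episode_t], dtype=np.float32)
returns = (returns - returns.mean()) / (returns.std() + 1e-6)   # standardized returns
episode_returns = torch.FloatTensor(returns)                    # what loss_tensor multiplies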
My name is Andy, I am new to Stack Overflow, and this is my first question.
I started learning Python about 40 days ago thanks to COVID-19, jumped into machine learning/Q-learning about 3 weeks ago, and have been stuck there since.
Goal:
have the computer play Rad Racer 2 (NES racing game) using reinforcement learning.
Plans to make this work:
After following various tutorials/sites, I decided to use a double network to train/learn.
2x 256 convolution networks using Keras, since I have watched a few tutorial videos on Keras basics.
3 actions: hold accelerate (J), accelerate left (JA), accelerate right (JD).
I am using DirectInput key codes I found online to send inputs to the game, since sending regular key presses does not work.
I know people use Gym Retro for this type of game, but I wanted to see the inner workings of reward/observation and such, so I used YOLOv5 to detect lines/objects. Based on the YOLOv5 results, I calculate the reward for each step.
My input is a series of 4 grayscale images (to represent motion), kept in a deque and then stacked with NumPy; a quick sketch of that step follows this list.
Once I have gathered enough experience/replay memory (1500), I start training at the end of each episode instead of at each step; training after every step lagged too much.
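A minimal sketch of that stacking step as described (the frame size here is a placeholder, not the project's real capture size):

from collections import deque
import numpy as np

H, W = 120, 128                          # placeholder frame size
frame_stack = deque(maxlen=4)            # holds the 4 most recent grayscale frames
for _ in range(4):                       # in the real loop, each captured frame is appended
    frame_stack.append(np.zeros((H, W), dtype=np.float32))

obs = np.stack(frame_stack, axis=2)      # -> (H, W, 4): one "motion" observation
obs = np.expand_dims(obs, axis=0)        # -> (1, H, W, 4) batch for model.predict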
Problem:
My biggest problem currently is that the model does not seem to learn properly. It looks slightly okay around episodes 20-30, then it gets worse and worse, to the point where it performs only one action for hours.
I have tried playing around with the learning rate (0.1 down to 0.00001), different inputs (1 BGR layer, a grayscale layer, 4 layers, etc.), and different epsilon decay rates. I commented out most of the reward logic and am using only a basic reward for now.
Most of the code is below, apart from the YOLO parts; I had to remove a few lines due to the character limit.
# parameters
training = True
learning_rate = 0.0001
DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1500  # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 1000  # How many steps (samples) to use for training
batch_size = 32
UPDATE_TARGET_EVERY = 0  # Terminal states (end of episodes)
MODEL_NAME = 'RC'
MIN_REWARD = 0  # For model save
save_every = 5  # save every x episodes
EPISODES = 2_000

# Exploration settings
if training is True:
    epsilon = 1  # not a constant, going to be decayed
else:
    epsilon = 0
MIN_EPSILON = 0.01
START_EPISODE_DECAY = 0
END_EPISODE_DECAY = 20
if epsilon > MIN_EPSILON:
    EPS_DECAY = -(epsilon/((END_EPISODE_DECAY-START_EPISODE_DECAY)/epsilon))
else:
    EPS_DECAY = 0


# Agent class
class DQNAgent:
    def __init__(self):
        # Main model
        self.model = self.create_model()
        # self.model = self.load_model()

        # Target network
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

    def create_model(self):
        dropout = 0.1

        model = Sequential()
        model.add(Conv2D(256, (2, 2), input_shape=(int(height/resize_ratio), int(width/resize_ratio), img_channels)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(dropout))

        model.add(Conv2D(256, (2, 2)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(dropout))

        model.add(Flatten())
        model.add(Dense(64))

        model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear'))  # ACTION_SPACE_SIZE = how many choices (9)
        model.compile(loss="mse", optimizer=Adam(lr=learning_rate), metrics=['accuracy'])
        return model

    # Trains main network at end of episode
    def train(self, terminal_state):
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch])
        # from (MINIBATCH_SIZE, 1, h, w, 4) > (MINIBATCH_SIZE, h, w, 4)
        current_states = current_states.reshape(current_states.shape[0], current_states.shape[2],
                                                current_states.shape[3], current_states.shape[4])
        current_qs_list = self.model.predict(current_states)

        new_current_states = np.array([transition[3] for transition in minibatch])
        new_current_states = new_current_states.reshape(new_current_states.shape[0], new_current_states.shape[2],
                                                        new_current_states.shape[3], new_current_states.shape[4])
        # new_current_states = np.expand_dims(new_current_states, axis=-1)
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        for index, (current_state_img, current_action, current_reward, new_current_img, current_done) in enumerate(minibatch):
            if not current_done:
                max_future_q = np.max(future_qs_list[index])
                new_q = current_reward + (DISCOUNT * max_future_q)
            else:
                new_q = 0.0

            current_qs = current_qs_list[index]
            current_qs[current_action] = new_q

            X.append(np.squeeze(current_state_img, axis=0))
            y.append(current_qs)

        X = np.array(X)
        # X = np.expand_dims(X, axis=-1)
        # X = X.reshape(X.shape[0], X.shape[2], X.shape[3], X.shape[4])
        y = np.array(y)

        self.model.fit(X, y, batch_size=batch_size, verbose=0, shuffle=False)
        # self.model.train_on_batch(X, y)

        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
            print('target_model trained!')

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        result = agent.model.predict(state)
        result = result[0]
        return result


agent = DQNAgent()

current_img_stack = deque(maxlen=4)

# make the game active
game = gw.getWindowsWithTitle('Mesen')[0]
game.activate()
time.sleep(1)
release_all()

# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    episode_reward = 0
    step = 1

    if episode <= START_EPISODE_DECAY - 1:
        start_epsilon = False
    elif episode >= END_EPISODE_DECAY + 1:
        start_epsilon = False
    else:
        start_epsilon = True

    # Reset environment and get initial state
    # blackscreens followed by the 1st screen starting out
    current_state = env.reset()
    blackscreen = np.zeros_like(current_state)
    current_img_stack.append(blackscreen)
    current_img_stack.append(blackscreen)
    current_img_stack.append(blackscreen)
    current_img_stack.append(current_state)
    stacked_state = np.stack(current_img_stack, axis=2)
    stacked_state = np.ascontiguousarray(stacked_state, dtype=np.float32) / 255
    stacked_state = np.transpose(stacked_state, (1, 0, 2))
    stacked_state = np.expand_dims(stacked_state, axis=0)
    start_time = time.time()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(stacked_state))
        else:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done, prediction, preview = env.step(action)

        if done is False:
            next_img_stack = current_img_stack
            next_img_stack.append(new_state)
            next_stack = np.stack(next_img_stack, axis=2)
            next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
            next_stack = np.transpose(next_stack, (1, 0, 2))
            next_stack = np.expand_dims(next_stack, axis=0)
            # current_state = new_state
            current_img_stack = next_img_stack
            stacked_state = next_stack
        else:
            next_img_stack = current_img_stack
            next_img_stack.append(blackscreen)
            next_stack = np.stack(next_img_stack, axis=2)
            next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
            next_stack = np.transpose(next_stack, (1, 0, 2))
            next_stack = np.expand_dims(next_stack, axis=0)

        step += 1
        episode_reward += reward
        ep_rewards.append(episode_reward)

        if SHOW_PREVIEW:
            env.render(preview, prediction)

        if training is True:
            agent.update_replay_memory((stacked_state, action, reward, next_stack, done))
        # print(episode_reward)

        if done is True:
            ep_reward_final.append(episode_reward)
            print(' Epsilon(' + str(epsilon) + ') EPtimes(' + str(time.time() - start_time) + ') done('
                  + str(done) + ') step(' + str(step) + ') EPreward(' + str(episode_reward) +
                  ') best_reward_this_session(' + str(max(ep_reward_final)) + ') fps(' +
                  str(step/(time.time() - start_time)) + ')')
            # plot(ep_reward_final)
            if training is True:
                agent.train(done)

    # Decay epsilon
    if show_info is False and epsilon <= MIN_EPSILON:
        print(f"\nEPS_DECAY ended on episode {episode} - epsilon {epsilon}")
        epsilon = MIN_EPSILON
        show_info = True
    elif start_epsilon is True:
        epsilon += EPS_DECAY
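As a point of comparison for the decay schedule above (a generic pattern, not a claim about what this game needs): many DQN examples simply multiply epsilon by a fixed factor each episode and clamp it at a floor, which is easier to reason about than a piecewise linear schedule. A minimal sketch, with an arbitrary decay factor:

epsilon = 1.0
EPSILON_DECAY_FACTOR = 0.995   # arbitrary placeholder value
MIN_EPSILON = 0.01

for episode in range(2_000):
    # ... run one episode with epsilon-greedy action selection ...
    epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY_FACTOR)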
I am following this online tutorial for coding a DQN: https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/DeepQLearning/torch_deep_q_model.py
However, I am running into the RuntimeError below, and I am unsure how to debug it or modify the code to prevent the error. Thanks!
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-196-00975d66fd2d> in <module>
28 agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
29 obs= obs_
---> 30 agent.learn(batch_size)
31 lastAction = action
32 scores.append(score)
<ipython-input-191-f6b163cc3a8a> in learn(self, batch_size)
72 Qtarget = Qpred.clone()
73 print(Qnext[1])
---> 74 Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])
75 # epsilon decay action
76 if self.steps > 2000:
RuntimeError: the derivative for 'indices' is not implemented
These are the code blocks in my Jupyter notebook.
class DeepQNetwork(nn.Module):
    def __init__(self, Alpha):
        super(DeepQNetwork, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 8, stride=4, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 128, 3)
        self.fc1 = nn.Linear(128 * 21 * 12, 512)
        self.fc2 = nn.Linear(512, 6)
        self.optimizer = optim.RMSprop(self.parameters(), lr=Alpha)
        self.loss = nn.MSELoss()
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, obs):
        '''Passing in a sequence of arrays'''
        obs = torch.Tensor(obs).to(self.device)  # send to the GPU
        ''' Feed forward the Network Parameters'''
        obs = obs.view(-1, 1, 200, 125)
        #print(obs.shape)
        obs = F.relu(self.conv1(obs))
        #print(obs.shape)
        obs = F.relu(self.conv2(obs))
        #print(obs.shape)
        obs = F.relu(self.conv3(obs))
        #print(obs.shape)
        obs = obs.view(-1, 128 * 21 * 12)
        obs = F.relu(self.fc1(obs))
        # 4 Rows and 6 columns
        actions = self.fc2(obs)
        return actions
This is the agent code, and it contains the line that causes the error:
class DQNAgent(object):
    def __init__(self, gamma, epsilon, alpha, maxMemory,
                 epsEnd=0.05, replace=10000, actionSpace=[0, 1, 2, 3, 4, 5]):
        '''
        Gamma -> discount factor of valuing current reward over future reward
        Epsilon -> for trade off between exploration-exploitation
        alpha -> learn rate
        maxMemory -> max size of Memory buffer
        epsEnd -> smallest value of Exploration
        replace -> how often to replace target network
        '''
        self.GAMMA = gamma
        self.EPSILON = epsilon
        self.EPS_END = epsEnd
        self.actionSpace = actionSpace
        self.maxMemory = maxMemory
        self.steps = 0
        self.learn_step_counter = 0
        self.memory = []
        self.memCount = 0
        self.replace_tgt_count = replace
        self.Q_eval = DeepQNetwork(alpha)
        self.Q_next = DeepQNetwork(alpha)

    def storeTransition(self, state, action, reward, state_):
        '''Stores Transition states'''
        if self.memCount < self.maxMemory:
            self.memory.append([state, action, reward, state_])
        else:
            self.memory[self.memCount % self.maxMemory] = [state, action, reward, state_]
        self.memCount += 1

    def chooseAction(self, obs):
        '''
        Exploration if np.random > epsilon
        else take epsilon greedy action
        '''
        rand = np.random.random()
        # Get the value for all actions for the current set of states
        # Forward pass the stack of frames to get value of each action given subset of staes in obs
        actions = self.Q_eval.forward(obs)
        if rand < 1-self.EPSILON:
            action = torch.argmax(actions[1]).item()
        else:
            action = np.random.choice(self.actionSpace)
        self.steps += 1
        return action

    def learn(self, batch_size):
        self.Q_eval.optimizer.zero_grad()
        # 0 gradient to do batch optimisation
        if self.replace_tgt_count is not None and self.learn_step_counter % self.replace_tgt_count == 0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())

        # memory subsampling
        if self.memCount + batch_size < self.maxMemory:
            memStart = int(np.random.choice(range(self.memCount)))
        else:
            memStart = int(np.random.choice(range(self.maxMemory-batch_size-1)))
        miniBatch = self.memory[memStart:memStart+batch_size]
        memory = np.array(miniBatch)

        # feed forward current state and successor state conv to list as memory is array of numpy objects
        Qpred = self.Q_eval.forward(list(memory[:, 0][:])).to(self.Q_eval.device)
        Qnext = self.Q_next.forward(list(memory[:, 3][:])).to(self.Q_eval.device)

        maxA = torch.argmax(Qnext, dim=1).to(self.Q_eval.device)
        # calculate rewards
        rewards = torch.Tensor(list(memory[:, 2])).to(self.Q_eval.device)
        # loss for every action except max action to be 0
        Qtarget = Qpred.clone()
        print(Qnext.shape)
        Qtarget[:, maxA] = rewards + self.GAMMA*torch.max(Qnext[1])  # PROBLEMATIC LINE

        # epsilon decay action
        if self.steps > 2000:
            if self.EPSILON - 1e-4 > self.EPS_END:
                self.EPSILON -= 1e-4
            else:
                self.EPSILON = self.EPS_END

        loss = self.Q_eval.loss(Qtarget, Qpred).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()
        self.learn_step_counter += 1


env = gym.make("Invader-v0")
agent = DQNAgent(gamma=0.95, epsilon=1.0, alpha=0.003, maxMemory=5000, replace=None)
while agent.memCount < agent.maxMemory:
    obs = env.reset()
    done = False
    lives = 3
    while not done:
        action = env.action_space.sample()
        obs_, reward, done, info = env.step(action)
        if done and info['lives'] < lives:
            lives = info['lives']
            reward -= 200
        agent.storeTransition(preprocess(obs), action, reward, preprocess(obs_))
        obs = obs_
initialised = True

scores = []
epsHistory = []
numGames = 50
batch_size = 16

for i in range(numGames):
    print(f'starting game {i+1}, epsilon = {agent.EPSILON}')
    epsHistory.append(agent.EPSILON)
    done = False
    obs = env.reset()
    frames = [np.sum(obs)]
    score = 0
    lastAction = 0
    lives = 3
    while not done:
        if len(frames) == 4:
            action = agent.chooseAction(frames)
            frames = []
        else:
            action = lastAction
        obs_, reward, done, info = env.step(action)
        score += score-reward
        frames.append(preprocess(obs_))
        if done and info['lives'] < lives:
            reward -= 200
        agent.storeTransition(preprocess(obs), action, reward, preprocess(obs_))
        obs = obs_
        agent.learn(batch_size)
        lastAction = action
    scores.append(score)
    print('score: ', score)
x = [i+1 for i in range(numGames)]
You have to use .detach() for:
Qnext = self.Q_next.forward(list(memory[:,3][:])).detach().to(self.Q_eval.device)
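In context, the relevant part of learn() from the question would look roughly like this with that change applied (a sketch; everything else in the method stays the same):

# Detaching Qnext means the target network's output is treated as a constant
# when it is written into Qtarget, which avoids the RuntimeError above.
Qpred = self.Q_eval.forward(list(memory[:, 0][:])).to(self.Q_eval.device)
Qnext = self.Q_next.forward(list(memory[:, 3][:])).detach().to(self.Q_eval.device)

maxA = torch.argmax(Qnext, dim=1).to(self.Q_eval.device)
rewards = torch.Tensor(list(memory[:, 2])).to(self.Q_eval.device)

Qtarget = Qpred.clone()
Qtarget[:, maxA] = rewards + self.GAMMA * torch.max(Qnext[1])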
I have a problem with my neural network. I'm trying to implement what's called a DMN (Dynamic Memory Network) for the bAbI data set. The DMN paper can be found here: http://arxiv.org/abs/1506.07285 and another write-up about DMNs is here: https://yerevann.github.io/2016/02/05/implementing-dynamic-memory-networks/
Here's my problem (by the way, I'm using PyTorch).
I split the data into training, testing, and validation parts: 1000 for training, 500 for testing, and 500 for validation. I can train successfully, but when I go to the validation step I never get a score above 50% accuracy. With the bAbI data set it is documented that you should be able to get 100% accuracy on the first test set (there are 20 test sets in all). I can get 100% accuracy during training, but only 50% in validation. My question is: what part of the program could be responsible for this kind of behavior? In other words, can you tell me why I'm always getting 50%? Thanks for your time. I'm limiting my experiments to the first bAbI test set for now.
I thought I had this all figured out, but the problem has cropped up again and I really don't have a clue what it is. Here is a link to the code; if you could take a look I would be most grateful: https://github.com/radiodee1/awesome-chatbot/blob/master/model/babi_iv.py
Some code is included below.
class WrapMemRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, n_layers, dropout=0.3, do_babi=True, bad_token_lst=[], freeze_embedding=False, embedding=None, print_to_screen=False):
        super(WrapMemRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.do_babi = do_babi
        self.print_to_screen = print_to_screen
        self.bad_token_lst = bad_token_lst
        self.embedding = embedding
        self.freeze_embedding = freeze_embedding
        self.teacher_forcing_ratio = hparams['teacher_forcing_ratio']
        gru_dropout = dropout * 0

        self.model_1_enc = Encoder(vocab_size, embed_dim, hidden_size, n_layers, dropout=dropout, embedding=embedding, bidirectional=False)
        self.model_2_enc = Encoder(vocab_size, embed_dim, hidden_size, n_layers, dropout=gru_dropout, embedding=embedding, bidirectional=False)
        self.model_3_mem_a = MemRNN(hidden_size, dropout=gru_dropout)
        self.model_3_mem_b = MemRNN(hidden_size, dropout=gru_dropout)
        self.model_4_att = EpisodicAttn(hidden_size, dropout=gru_dropout)
        self.model_5_ans = AnswerModule(vocab_size, hidden_size, dropout=dropout)

        self.input_var = None  # for input
        self.q_var = None  # for question
        self.answer_var = None  # for answer
        self.q_q = None  # extra question
        self.inp_c = None  # extra input
        self.inp_c_seq = None
        self.all_mem = None
        self.last_mem = None  # output of mem unit
        self.prediction = None  # final single word prediction
        self.memory_hops = hparams['babi_memory_hops']

        self.reset_parameters()

        if self.freeze_embedding or self.embedding is not None:
            self.new_freeze_embedding()
        #self.criterion = nn.CrossEntropyLoss()
        pass

    def reset_parameters(self):
        #print('reset')
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            #print('here...')
            weight.data.uniform_(-stdv, stdv)
            if len(weight.size()) > 1:
                init.xavier_normal_(weight)

    def forward(self, input_variable, question_variable, target_variable, criterion=None):
        self.new_input_module(input_variable, question_variable)
        self.new_episodic_module()
        outputs, ans = self.new_answer_module_simple()
        return outputs, None, ans, None

    def new_freeze_embedding(self):
        self.model_1_enc.embed.weight.requires_grad = False
        self.model_2_enc.embed.weight.requires_grad = False
        print('freeze embedding')
        pass

    def new_input_module(self, input_variable, question_variable):
        prev_h1 = []
        for ii in input_variable:
            ii = self.prune_tensor(ii, 2)
            out1, hidden1 = self.model_1_enc(ii, None)
            prev_h1.append(hidden1)

        self.inp_c_seq = prev_h1
        self.inp_c = prev_h1[-1]

        prev_h2 = []
        for ii in question_variable:
            ii = self.prune_tensor(ii, 2)
            out2, hidden2 = self.model_2_enc(ii, None)
            prev_h2.append(hidden2)

        self.q_q = hidden2[:, -1, :]
        return

    def new_episodic_module(self):
        if True:
            mem_list = []
            sequences = self.inp_c_seq

            for i in range(len(sequences)):
                m_list = [self.q_q.clone()]
                #print(sequences[i].size(),'seq')
                for iter in range(self.memory_hops):
                    x = self.new_attention_step(sequences[i], None, m_list[iter], self.q_q)

                    if self.print_to_screen and not self.training:
                        print(x, 'x -- after', len(x), sequences[i].size())

                    e, _ = self.new_episode_small_step(sequences[i], x.permute(1, 0), None)
                    assert len(sequences[i].size()) == 3
                    #print(e.size(),'e')
                    ee = e[:, 0, -1]  #.permute(2,1,0)
                    _, out = self.model_3_mem_a(ee.unsqueeze(0), self.prune_tensor(m_list[iter], 3))
                    m_list.append(out)

                mem_list.append(m_list[self.memory_hops])

            mm_list = torch.cat(mem_list, dim=1)
            self.last_mem = mm_list
            #print(self.last_mem.size(),'lm')
        return None

    def new_episode_small_step(self, ct, g, prev_h):
        assert len(ct.size()) == 3
        bat, sen, emb = ct.size()
        #print(ct.size(),'ct')
        #print(sen,'sen', g.size())

        last = [prev_h]
        ep = []
        for iii in range(sen):
            c = ct[0, iii, :].unsqueeze(0)

            if prev_h is not None:
                prev_h = self.prune_tensor(prev_h, 3)

            out, gru = self.model_3_mem_b(c, last[iii])
            last.append(out)

            g = g.squeeze(0)
            gru = gru.squeeze(0).permute(1, 0)

            #if not self.training: print(g.size(),'g', iii)
            #ggg = g[:, iii]
            ggg = g[iii]

            h = torch.mul(ggg, gru)  # + torch.mul((1 - g[iii]) , prev_h.permute(1,0))
            index = -1  #-1 # -2
            if last[iii + index] is not None:
                #print(last[iii].size(),'last -',ggg.size(), ggg, sen)
                if False: h = h + torch.mul((1 - ggg), last[iii + index])

            #print(h.size(),'hsize')
            if iii == sen - 1:
                ep.append(h.unsqueeze(1))

        h = torch.cat(ep, dim=1)
        #print(h.size(),ep[0].size(),'h',sen, gru.size())
        return h, gru

    def new_attention_step(self, ct, prev_g, mem, q_q):
        q_q = self.prune_tensor(q_q, 3)
        mem = self.prune_tensor(mem, 3)

        assert len(ct.size()) == 3
        bat, sen, emb = ct.size()
        #print(sen,'len sen')

        att = []
        for iii in range(sen):
            c = ct[0, iii, :]

            concat_list = [
                c.unsqueeze(0),
                mem.squeeze(0),
                q_q.squeeze(0),
                (c * q_q).squeeze(0),
                (c * mem).squeeze(0),
                (torch.abs(c - q_q)).squeeze(0),
                (torch.abs(c - mem)).squeeze(0)
            ]
            #for ii in concat_list: print(ii.size())
            #print(sen,'sen')
            #exit()

            #z = F.sigmoid(z)
            concat_list = torch.cat(concat_list, dim=1)
            #print(concat_list.size(),'cl')
            att.append(concat_list)

        att = torch.cat(att, dim=0)

        #z = torch.cat(att, dim=0)
        z = self.model_4_att(att)
        z = F.sigmoid(z)
        #z = F.softmax(z, dim=1) #F.sigmoid(z)
        #print(z.size(),'z')

        return z

    def prune_tensor(self, input, size):
        if len(input.size()) < size:
            input = input.unsqueeze(0)
        if len(input.size()) > size:
            input = input.squeeze(0)
        return input

    def new_answer_module_simple(self):
        #outputs
        ansx = self.model_5_ans(self.last_mem, None)
        #ansx = F.softmax(ansx, dim=0)

        if self.print_to_screen:
            print(ansx, 'ansx printed')
            print(ansx.size(), 'ansx')
            vocab, sen = ansx.size()
            aa = torch.argmax(ansx, dim=0)
            print(aa.size(), 'aa')
            for i in range(sen):
                zz = aa[i]
                z = ansx[:, i]
                a = torch.argmax(z, dim=0)
                print(a.item(), zz.item())
            print('----')
        #ans = torch.argmax(ansx,dim=1)#[0]

        return [None], ansx

        pass
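Not an answer to the 50% itself, but one generic thing worth double-checking whenever training accuracy is far above validation accuracy in a PyTorch model that uses dropout (as the encoders above do): make sure the network is switched to eval mode, with gradients disabled, before scoring the validation split. A self-contained illustration of the pattern, with a toy network standing in for the full model:

import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Dropout(0.3), nn.Linear(8, 2))

net.train()                   # dropout active while fitting
# ... training loop ...

net.eval()                    # dropout disabled for validation
with torch.no_grad():         # no autograd bookkeeping while scoring
    preds = net(torch.randn(4, 8)).argmax(dim=1)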