I am following this online tutorial for coding a DQN: https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/DeepQLearning/torch_deep_q_model.py
However, I am running into a RuntimeError that I am unsure how to debug or work around. Thanks!
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-196-00975d66fd2d> in <module>
28 agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
29 obs= obs_
---> 30 agent.learn(batch_size)
31 lastAction = action
32 scores.append(score)
<ipython-input-191-f6b163cc3a8a> in learn(self, batch_size)
72 Qtarget = Qpred.clone()
73 print(Qnext[1])
---> 74 Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])
75 # epsilon decay action
76 if self.steps > 2000:
RuntimeError: the derivative for 'indices' is not implemented
These are the code blocks in my Jupyter notebook:
class DeepQNetwork(nn.Module):
def __init__(self,Alpha):
super(DeepQNetwork,self).__init__()
self.conv1 = nn.Conv2d(1,32,8,stride=4, padding=1)
self.conv2 = nn.Conv2d(32,64,4,stride=2)
self.conv3 = nn.Conv2d(64,128,3)
self.fc1 = nn.Linear(128* 21* 12,512)
self.fc2 = nn.Linear(512,6)
self.optimizer = optim.RMSprop(self.parameters(), lr = Alpha)
self.loss = nn.MSELoss()
self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
def forward(self,obs):
'''Passing in a sequence of arrays'''
obs = torch.Tensor(obs).to(self.device) # send to the GPU
''' Feed forward the Network Parameters'''
obs = obs.view(-1, 1,200,125)
#print(obs.shape)
obs = F.relu(self.conv1(obs))
#print(obs.shape)
obs = F.relu(self.conv2(obs))
#print(obs.shape)
obs = F.relu(self.conv3(obs))
#print(obs.shape)
obs = obs.view(-1,128* 21* 12)
obs = F.relu(self.fc1(obs))
# 4 Rows and 6 columns
actions = self.fc2(obs)
return actions
This is the agent code; it contains the line that causes the error:
class DQNAgent(object):
def __init__(self, gamma, epsilon, alpha, maxMemory,
epsEnd = 0.05, replace =10000, actionSpace = [0,1,2,3,4,5]):
'''
Gamma -> discount factor of valuing current reward over future reward
Epsilon -> for trade off between exploration-exploitation
alpha -> learn rate
maxMemory -> max size of Memory buffer
epsEnd -> smallest value of Exploration
replace -> how often to replace the target network
'''
self.GAMMA = gamma
self.EPSILON = epsilon
self.EPS_END = epsEnd
self.actionSpace = actionSpace
self.maxMemory = maxMemory
self.steps = 0
self.learn_step_counter = 0
self.memory = []
self.memCount = 0
self.replace_tgt_count = replace
self.Q_eval = DeepQNetwork(alpha)
self.Q_next = DeepQNetwork(alpha)
def storeTransition(self, state, action, reward, state_):
'''Stores Transition states'''
if self.memCount < self.maxMemory:
self.memory.append([state,action,reward,state_])
else:
self.memory[self.memCount%self.maxMemory] = [state,action,reward,state_]
self.memCount +=1
def chooseAction(self,obs):
'''
Exploration if np.random > epsilon
else take epsilon greedy action
'''
rand = np.random.random()
# Get the value for all actions for the current set of states
# Forward pass the stack of frames to get the value of each action given the subset of states in obs
actions = self.Q_eval.forward(obs)
if rand<1-self.EPSILON:
action = torch.argmax(actions[1]).item()
else:
action = np.random.choice(self.actionSpace)
self.steps += 1
return action
def learn(self, batch_size):
self.Q_eval.optimizer.zero_grad()
# zero the gradients before the batch optimisation step
if self.replace_tgt_count is not None and self.learn_step_counter % self.replace_tgt_count==0:
self.Q_next.load_state_dict(self.Q_eval.state_dict())
# memory subsampling
if self.memCount + batch_size < self.maxMemory:
memStart = int(np.random.choice(range(self.memCount)))
else:
memStart = int(np.random.choice(range(self.maxMemory-batch_size-1)))
miniBatch = self.memory[memStart:memStart+batch_size]
memory = np.array(miniBatch)
# feed forward current state and successor state; convert to list since memory is an array of numpy objects
Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device)
Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device)
maxA = torch.argmax(Qnext,dim = 1).to(self.Q_eval.device)
#calculate rewards
rewards = torch.Tensor(list(memory[:,2])).to(self.Q_eval.device)
# loss for every action except max action to be 0
Qtarget = Qpred.clone()
print(Qnext.shape)
Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])# PROBLEMATIC LINE
# epsilon decay action
if self.steps > 2000:
if self.EPSILON-1e-4 >self.EPS_END:
self.EPSILON-= 1e-4
else:
self.EPSILON = self.EPS_END
loss = self.Q_eval.loss(Qtarget,Qpred).to(self.Q_eval.device)
loss.backward()
self.Q_eval.optimizer.step()
self.learn_step_counter +=1
env = gym.make("Invader-v0")
agent = DQNAgent(gamma=0.95,epsilon = 1.0,alpha = 0.003, maxMemory = 5000,replace = None)
while agent.memCount < agent.maxMemory:
obs = env.reset()
done = False
lives = 3
while not done:
action = env.action_space.sample()
obs_ , reward, done, info = env.step(action)
if done and info['lives']<lives:
lives = info['lives']
reward -= 200
agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
obs= obs_
initialised = True
scores = []
epsHistory = []
numGames = 50
batch_size = 16
for i in range(numGames):
print(f'starting game {i+1}, epsilon = {agent.EPSILON}')
epsHistory.append(agent.EPSILON)
done = False
obs = env.reset()
frames = [np.sum(obs)]
score = 0
lastAction = 0
lives = 3
while not done:
if len(frames) == 4:
action = agent.chooseAction(frames)
frames = []
else:
action = lastAction
obs_, reward, done, info = env.step(action)
score += score-reward
frames.append(preprocess(obs_))
if done and info['lives'] < lives:
reward -=200
agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
obs= obs_
agent.learn(batch_size)
lastAction = action
scores.append(score)
print('score: ', score)
x = [i+1 for i in range(numGames)]
You have to use .detach() here:
Qnext = self.Q_next.forward(list(memory[:,3][:])).detach().to(self.Q_eval.device)
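The error happens because Qnext comes straight out of a forward pass and is therefore still attached to the autograd graph; maxA is derived from it, and when those indices are used to write into Qtarget, autograd tries to track a derivative for 'indices', which is not implemented. Detaching the target network's output takes the whole target computation out of the graph, which is also what DQN prescribes: no gradient should flow through the target values. Below is a minimal sketch of the target computation with the detach applied, reusing the tensor names from the question; the per-row indexing with torch.arange is my own adjustment for illustration, not the tutorial's exact line.
import torch

def dqn_target(Qpred, Qnext, rewards, gamma):
    # Sketch: build a DQN target tensor that carries no gradient.
    # Qpred/Qnext are (batch, n_actions) tensors, rewards is a (batch,) tensor.
    Qnext = Qnext.detach()                      # no gradient through the target net
    Qtarget = Qpred.clone().detach()            # start from the online net's values
    maxA = torch.argmax(Qnext, dim=1)           # greedy next action per sample
    rows = torch.arange(Qpred.size(0), device=Qpred.device)
    Qtarget[rows, maxA] = rewards + gamma * Qnext[rows, maxA]
    return Qtarget
Inside learn() this would slot in as Qtarget = dqn_target(Qpred, Qnext, rewards, self.GAMMA), followed by loss = self.Q_eval.loss(Qtarget, Qpred) as before.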
I'm trying to implement a DQN model for the Pong game with torch. However, I ran into two problems during execution. First, the game never gets done. Second, the loss does not change at all during training. My code is below.
I defined a CNN network with an input of size (batch=32, channels=4, height=84, width=84). Nothing goes wrong at this step:
class CNN(nn.Module):
def __init__(self, s_channels, a_space):
super(CNN, self).__init__()
self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
self.conv1 = nn.Conv2d(s_channels,out_channels=32,kernel_size=8,stride=4)
self.conv2 = nn.Conv2d(32,64,4,2)
self.conv3 = nn.Conv2d(64,64,3,1)
self.fc1 = nn.Linear(64*4*4,1024)
self.fc2 = nn.Linear(1024,512)
self.fc3 = nn.Linear(512,a_space)
def forward(self,input):
output = self.pool(F.relu(self.conv1(input)))
output = self.pool(F.relu(self.conv2(output)))
output = self.pool(F.relu(self.conv3(output)))
output = output.view(-1,64*4*4)
output = F.relu(self.fc1(output))
output = F.relu(self.fc2(output))
output = F.relu(self.fc3(output))
return output
For the agent class, I defined a back-propagation function to update the CNN weights from replayed experience, along with the data pre-processing functions:
# Agent
class Agent():
def __init__(self, s_space, a_space) -> None:
# define parameters
self.epsilon = 1.0
self.min_epsilon = 0.01
self.dr = 0.995
self.lr = 0.001
self.gamma = 0.9
# define models
self.evl_net = CNN(s_space, a_space)
self.tgt_net = CNN(s_space, a_space)
self.cert = nn.SmoothL1Loss()
self.optimal = th.optim.Adam(self.evl_net.parameters(),lr=self.lr)
# define memory store
self.memory = deque(maxlen=2000)
# self.img_stack = deque(maxlen=4)
# pre-processing frame images: transform the images into tensors
# def bsl_image_pre_process(self,env):
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def gym_image_pre_process(self,env):
#Atari preprocessing
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
#create frame stack
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
return env,channels
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def data_pre_process(self,batch_size):
s_v = []
a_v = []
next_s_v = []
r_v = []
dones = []
materials = random.sample(self.memory,batch_size)
for t in materials:
s_v.append(t[0])
a_v.append(t[1])
next_s_v.append(t[2])
r_v.append(t[3])
dones.append(t[4])
# print(th.FloatTensor(r_v))
# print(th.FloatTensor(r_v).size())
# print(s_v)
s_v = th.Tensor(s_v) # size: [32,3,210,160]
a_v = th.LongTensor(a_v).unsqueeze(1) # size: [32,1]
next_s_v = th.Tensor(next_s_v) # size: [32,3,210,160]
r_v = th.FloatTensor(r_v) # size: [32]
return s_v, a_v, next_s_v, r_v, dones
# remember the transformed images
def record(self,tpl):
self.memory.append(tpl)
# select actions according to the states (input images with 4 channels)
def select(self,state,a_space):
actions = self.evl_net(state).data.tolist()
if(random.random() <= self.epsilon):
action = random.randint(0,a_space-1)
else:
action = actions.index(max(actions))
return action
# DQN training progression
def train(self,state,batch_size):
s_v,a_v,next_s_v,r_v,dones = self.data_pre_process(batch_size)
self.tgt_net.load_state_dict(self.evl_net.state_dict())
evl_Q_value = self.evl_net(s_v).gather(0,a_v) # size: [32,6].gather() -> [32,1]
tgt = self.tgt_net(next_s_v).max(1)[0].detach() # size [32,1]
tgt_Q_value = (r_v + self.gamma * tgt)
for index in range(len(dones)):
if(dones[index]==True):
tgt[index][0] = -1
# print(tgt_Q_value)
tgt_Q_value = tgt_Q_value.reshape(batch_size,1) # size: [32, 1] cannot be back propagated
# print(tgt_Q_value)
self.optimal.zero_grad()
loss = self.cert(evl_Q_value, tgt_Q_value)
print(loss)
loss.backward()
for pr in self.evl_net.parameters():
pr.grad.data.clamp_(-1, 1)
self.optimal.step()
if(self.epsilon > self.min_epsilon):
self.epsilon *= self.dr
At the training stage I hit the first problem: the done flag is always false in every episode. With gym.wrappers I've pre-processed the observations into 4x84x84 tensors and set up the environment with only one life, but the issue still appears:
# main test
_display = Display(visible=0, size=(900,1400))
_display.start()
# set episode step and batch_size
episodes = 5000
batch_size = 32
env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n
agent = Agent(channels, a_space)
# env.render()
# testing:
for e in range(episodes):
# step 1: reset the agent at the beginning
s = np.array(env.reset())
for run in range(100):
score = 0
# display.clear_output(wait=True)
# display.display(Image.fromarray(env.render(mode='rgb_array')))
# env.render("rgb_array")
img = plt.imshow(env.render('rgb_array'))
# step 2: create state space tensor
# step 3: iterate actions
a = agent.select(th.Tensor(s).unsqueeze(0),a_space)
next_s, reward, done, _ = env.step(a)
if(done==True):
next_s = None
next_s = np.array(next_s) # done is never true. Why?
# step 4: record the data into buffer
dataset = (s,a,next_s,reward,done)
agent.record(dataset)
# step 5: update state steps
s = next_s
score += reward
if(done==True or run == 99):
print("episodes:",e,"score:",score,"epsilon: {:.2}".format(agent.epsilon))
break
# step 6: training and update CNN
if(len(agent.memory) > batch_size):
agent.train(channels,batch_size)
While looking into this, I also noticed that the loss value never really decreases (it mostly fluctuates around 1.2). I rechecked the input and output tensors but found nothing else wrong. I hope to get some help fixing these two problems. Many thanks!
I'm trying to run code I found on GitHub, but it keeps crashing in one part with a TypeError: only size-1 arrays can be converted to Python scalars. I've been trying to solve it for two days now. If I understand the problem correctly, I have incompatible tensor dtypes, yet I have no idea how to fix this. Every time I try to change the tensor type to, say, DoubleTensor or float64, I get other errors. I simply have no clue which part needs to be changed, or how.
This is my model definition:
class Policy(nn.Module):
def __init__(self):
super(Policy, self).__init__()
self.input_layer = nn.Linear(11, 128)
self.hidden_1 = nn.Linear(128, 128)
self.hidden_2 = nn.Linear(32,31)
self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()
self.rnn = nn.GRU(128, 32, 2)
self.action_head = nn.Linear(31, 5)
self.value_head = nn.Linear(31, 1)
self.saved_actions = []
self.rewards = []
def reset_hidden(self):
self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()
def forward(self, x):
x = torch.tensor(x).cuda()
x = torch.sigmoid(self.input_layer(x))
x = torch.tanh(self.hidden_1(x))
x, self.hidden_state = self.rnn(x.view(1,-1,128), self.hidden_state.data)
x = F.relu(self.hidden_2(x.squeeze()))
action_scores = self.action_head(x)
state_values = self.value_head(x)
return F.softmax(action_scores, dim=-1), state_values
def forward(self, x):
conv_out = self.conv(x).view(x.size()[0], -1)
val = self.fc_val(conv_out)
adv = self.fc_adv(conv_out)
return val + (adv - adv.mean(dim=1, keepdim=True))
def act(self, state):
probs, state_value = self.forward(state)
m = Categorical(probs)
action = m.sample()
if action == 1 and env.state[0] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
if action == 4 and env.state[1] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
if action == 6 and env.state[2] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
self.saved_actions.append((m.log_prob(action), state_value))
return action.item()
This is where it crashes
env.reset()
# In case you're running this a second time with the same model, delete the gradients
del model.rewards[:]
del model.saved_actions[:]
gamma = 0.9
log_interval = 60
def finish_episode():
R = 0
saved_actions = model.saved_actions
policy_losses = []
value_losses = []
rewards = []
for r in model.rewards[::-1]:
R = r + (gamma * R)
rewards.insert(0, R)
rewards = torch.tensor(rewards)
epsilon = (torch.rand(1) / 1e4) - 5e-5
# With different architectures, I found the following standardization step sometimes
# helpful, sometimes unhelpful.
#
rewards = (rewards - rewards.mean()) / (rewards.std(unbiased=False) + epsilon)
# Alternatively, comment it out and use the following line instead:
rewards += epsilon
for (log_prob, value), r in zip(saved_actions, rewards):
reward = torch.tensor(r - value.item()).cuda()
policy_losses.append(-log_prob * reward)
value_losses.append(F.smooth_l1_loss(value, torch.tensor([r]).cuda()))
optimizer.zero_grad()
loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
loss = torch.clamp(loss, -1e-5, 1e5)
loss.backward()
optimizer.step()
del model.rewards[:]
del model.saved_actions[:]
running_reward = 0
for episode in range(0, 4000):
state = env.reset()
state = state.type(torch.float32)
reward = 0
done = False
msg = None
while not done:
action = model.act(state)
state, reward, done, msg = env.step(action)
model.rewards.append(reward)
if done:
break
running_reward = running_reward * (1 - 1/log_interval) + reward * (1/log_interval)
finish_episode()
# Resetting the hidden state seems unnecessary - it's effectively random from the previous
# episode anyway, more random than a bunch of zeros.
# model.reset_hidden()
if msg["msg"] == "done" and env.portfolio_value() > env.starting_portfolio_value * 1.1 and running_reward > 500:
print("Early Stopping: " + str(int(reward)))
break
if episode % log_interval == 0:
print("""Episode {}: started at {:.1f}, finished at {:.1f} because {} # t={}, \
last reward {:.1f}, running reward {:.1f}""".format(episode, env.starting_portfolio_value, \
env.portfolio_value(), msg["msg"], env.cur_timestep, reward, running_reward))
This is the error I get
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-74-21b617d2e36f> in <module>()
46 msg = None
47 while not done:
---> 48 action = model.act(state)
49 state, reward, done, msg = env.step(action)
50 model.rewards.append(reward)
4 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1751 if has_torch_function_variadic(input, weight):
1752 return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
-> 1753 return torch._C._nn.linear(input, weight, bias)
1754
1755
RuntimeError: expected scalar type Double but found Float
This is the GitHub repo I'm using:
https://github.com/tomgrek/RL-stocktrading/blob/master/Finance%20final.ipynb
It was suggested to me that the problem is that when I pass "state" as a parameter, torch throws an error because it expects a numeric type, but I cannot cast state to float, because then I get another error saying a list can't be converted to float32.
I'd be really grateful if you could show me, step by step, what I'm doing wrong.
First, are you using the same environment "env" and/or dataset?
Second, you added the line state = state.type(torch.float32) and it didn't throw an error, so I assume state was already a tensor (which is a bit odd). If you had to cast it to float32 there, then inside the while loop you may have forgotten to cast it again:
while not done:
action = model.act(state)
state, reward, done, msg = env.step(action)
state = state.float() # To add as I think env.step(action) returns a long tensor for some reason
model.rewards.append(reward)
if done:
break
Good luck.
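For what it's worth, the "expected scalar type Double but found Float" mismatch usually means the observations are float64 (the NumPy default) while the nn.Linear parameters are float32. A minimal sketch of a defensive cast you could apply to every observation; the to_float32 helper is hypothetical, not something from the linked notebook:
import numpy as np
import torch

def to_float32(state, device="cpu"):
    # Coerce an observation (list, ndarray or tensor) into the float32
    # dtype that nn.Linear layers expect by default.
    if torch.is_tensor(state):
        return state.to(device=device, dtype=torch.float32)
    return torch.as_tensor(np.asarray(state, dtype=np.float32), device=device)
Applied to the loop above, you would call state = to_float32(state) right after env.reset() and again after every env.step(action).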
I have implemented deep Q-learning in Python using the Keras framework to reproduce a paper's results. However, it is not working. Here is some info:
the agent is trained for 10000 steps with 2 possible actions
the input vector's shape is 117
Here is the code for the algorithm (inspired by various GitHub repos; maybe I implemented the algorithm wrong):
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims,CLIP_GRADIENT=1):
#set_seed(42)
model = Sequential([
Dense(fc1_dims, input_shape=(input_dims,)), # bias_regularizer=regularizers.l2(1e-4),activity_regularizer=regularizers.l2(1e-5)
Activation('relu'),
BatchNormalization(),
Dense(fc2_dims),
Activation('relu'),
BatchNormalization(),
Dense(n_actions)])
model.compile(optimizer=Adam(lr=lr, clipvalue=CLIP_GRADIENT), loss='mse')
return model
class Agent(object):
def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
input_dims, epsilon_dec=0.996, epsilon_end=0.01,
mem_size=1000000, fname='dqn_model.h5'):
self.action_space = [i for i in range(n_actions)]
self.gamma = gamma
self.epsilon = epsilon
self.epsilon_dec = epsilon_dec
self.epsilon_min = epsilon_end
self.batch_size = batch_size
self.model_file = fname
self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
discrete=True)
self.q_eval = build_dqn(alpha, n_actions, input_dims, 64, 32)
def remember(self, state, action, reward, new_state, done):
self.memory.store_transition(state, action, reward, new_state, done)
def choose_action(self, state):
state = state[np.newaxis, :]
rand = np.random.random()
if rand < self.epsilon:
action = np.random.choice(self.action_space)
else:
actions = self.q_eval.predict(state)
action = np.argmax(actions)
return action
def learn(self):
if self.memory.mem_cntr > self.batch_size:
state, action, reward, new_state, done = \
self.memory.sample_buffer(self.batch_size)
action_values = np.array(self.action_space, dtype=np.int8)
action_indices = np.dot(action, action_values)
q_eval = self.q_eval.predict(state)
q_next = self.q_eval.predict(new_state)
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
q_target[batch_index, action_indices] = reward + \
self.gamma*np.max(q_next, axis=1)*done
_ = self.q_eval.fit(state, q_target, verbose=0)
self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > \
self.epsilon_min else self.epsilon_min
def processState(self, state):
n = len(state)
relative_diff_matrix,prev_posiion = state[:n-1],state[n-1]
relative_diff_matrix = relative_diff_matrix.reshape((int(n/30),30))
relative_diff_matrix = np.diff(relative_diff_matrix) / relative_diff_matrix[:,:-1]
relative_diff_matrix = StandardScaler().fit_transform(relative_diff_matrix.T).T
processed_state = relative_diff_matrix.flatten()
processed_state = np.append(processed_state,prev_posiion)
return processed_state
def processReward(self, reward,rewardClipping=1):
return np.clip(reward, -rewardClipping, rewardClipping)
def train_model(self,trainingEnv, n_episodes = 1,verbose=0):
scores = []
eps_history = []
for i in range(n_episodes):
done = False
score = 0
observation = env.reset()
observation = self.processState(observation)
#observation = self.processState(observation)
while not done:
action = agent.choose_action(observation)
observation_, reward, done, info = trainingEnv.step(action)
# Remembering episode
reward = self.processReward(reward)
observation_ = self.processState(observation_)
score += reward
self.remember(observation_, action, reward, observation_, int(done))
# Remembering episode for other action => Better exploration
otherAction = int(not bool(action))
otherReward = self.processReward(info['Reward'])
otherNextState = self.processState(info['State'])
otherDone = info['Done']
self.remember(observation_, otherAction, otherReward, otherNextState, otherDone)
observation = observation_
# learning
self.learn()
if verbose :
eps_history.append(agent.epsilon)
scores.append(score)
avg_score = np.mean(scores[max(0, i-100):(i+1)])
print('episode: ', i,'score: %.2f' % score,
' average score %.2f' % avg_score)
trainingEnv.render()
def save_model(self):
self.q_eval.save(self.model_file)
def load_model(self):
self.q_eval = load_model(self.model_file)
I start with $100 of capital and finish with slightly more or less over a horizon of 20 years (about 10000 steps). I tried tuning the parameters but nothing worked.
Here is the main:
env = TradingEnv(marketSymbol="GOOGL", period=PERIOD_DEFAULT, startingDate=START_DEFAULT, endingDate=END_DEFAULT, columns=COLUMNS, money=100,transactionCosts=0)
lr = 0.0005
agent = Agent(gamma=1, epsilon=0.00, alpha=lr, input_dims=117,
n_actions=2, mem_size=1000000, batch_size=32, epsilon_end=0.0)
agent.train_model(env)
I think I have managed to solve the problem. The number of episodes needs to be set sufficiently high (not 1); in my case thirty was enough. However, I still don't know how to efficiently backtest the deep Q trading agent!
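Concretely, that just means changing the training call in the main block above, something along these lines (TradingEnv, Agent and the *_DEFAULT constants are the ones defined in the question, so treat this as a sketch of the change rather than tested code):
env = TradingEnv(marketSymbol="GOOGL", period=PERIOD_DEFAULT, startingDate=START_DEFAULT,
                 endingDate=END_DEFAULT, columns=COLUMNS, money=100, transactionCosts=0)
agent = Agent(gamma=1, epsilon=0.00, alpha=0.0005, input_dims=117,
              n_actions=2, mem_size=1000000, batch_size=32, epsilon_end=0.0)
# train over 30 episodes instead of the default n_episodes=1
agent.train_model(env, n_episodes=30, verbose=1)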
I have a problem with my neural network. I'm trying to implement what's called a DMN (Dynamic Memory Network) for the babi data set. A paper about the DMN model can be found here: http://arxiv.org/abs/1506.07285 Another write-up about DMNs can be found here: https://yerevann.github.io/2016/02/05/implementing-dynamic-memory-networks/
Here's my problem (I'm using PyTorch, by the way).
I split the data into training, testing, and validation sets: 1000 examples for training, 500 for testing, and 500 for validation. Training works, but at the validation step I never get a score above 50% accuracy. For the babi data set it is documented that you should be able to reach 100% accuracy on the first task (there are 20 tasks in all). I can get 100% accuracy during training, but only 50% in validation. What part of the program could be responsible for this kind of behavior? In other words, can you tell me why I'm always getting 50%? Thanks for your time. I'm limiting my experiments to the first babi task for now.
I thought I had this all figured out, but the problem has cropped up again and I really don't have a clue what it is. Here is a link to the code; if you could take a look I would be most grateful: https://github.com/radiodee1/awesome-chatbot/blob/master/model/babi_iv.py
Some code is included below.
class WrapMemRNN(nn.Module):
def __init__(self,vocab_size, embed_dim, hidden_size, n_layers, dropout=0.3, do_babi=True, bad_token_lst=[], freeze_embedding=False, embedding=None, print_to_screen=False):
super(WrapMemRNN, self).__init__()
self.hidden_size = hidden_size
self.n_layers = n_layers
self.do_babi = do_babi
self.print_to_screen = print_to_screen
self.bad_token_lst = bad_token_lst
self.embedding = embedding
self.freeze_embedding = freeze_embedding
self.teacher_forcing_ratio = hparams['teacher_forcing_ratio']
gru_dropout = dropout * 0
self.model_1_enc = Encoder(vocab_size, embed_dim, hidden_size, n_layers, dropout=dropout,embedding=embedding, bidirectional=False)
self.model_2_enc = Encoder(vocab_size, embed_dim, hidden_size, n_layers, dropout=gru_dropout, embedding=embedding, bidirectional=False)
self.model_3_mem_a = MemRNN(hidden_size, dropout=gru_dropout)
self.model_3_mem_b = MemRNN(hidden_size, dropout=gru_dropout)
self.model_4_att = EpisodicAttn(hidden_size, dropout=gru_dropout)
self.model_5_ans = AnswerModule(vocab_size, hidden_size,dropout=dropout)
self.input_var = None # for input
self.q_var = None # for question
self.answer_var = None # for answer
self.q_q = None # extra question
self.inp_c = None # extra input
self.inp_c_seq = None
self.all_mem = None
self.last_mem = None # output of mem unit
self.prediction = None # final single word prediction
self.memory_hops = hparams['babi_memory_hops']
self.reset_parameters()
if self.freeze_embedding or self.embedding is not None:
self.new_freeze_embedding()
#self.criterion = nn.CrossEntropyLoss()
pass
def reset_parameters(self):
#print('reset')
stdv = 1.0 / math.sqrt(self.hidden_size)
for weight in self.parameters():
#print('here...')
weight.data.uniform_(-stdv, stdv)
if len(weight.size()) > 1:
init.xavier_normal_(weight)
def forward(self, input_variable, question_variable, target_variable, criterion=None):
self.new_input_module(input_variable, question_variable)
self.new_episodic_module()
outputs, ans = self.new_answer_module_simple()
return outputs, None, ans, None
def new_freeze_embedding(self):
self.model_1_enc.embed.weight.requires_grad = False
self.model_2_enc.embed.weight.requires_grad = False
print('freeze embedding')
pass
def new_input_module(self, input_variable, question_variable):
prev_h1 = []
for ii in input_variable:
ii = self.prune_tensor(ii, 2)
out1, hidden1 = self.model_1_enc(ii, None)
prev_h1.append(hidden1)
self.inp_c_seq = prev_h1
self.inp_c = prev_h1[-1]
prev_h2 = []
for ii in question_variable:
ii = self.prune_tensor(ii, 2)
out2, hidden2 = self.model_2_enc(ii, None)
prev_h2.append(hidden2)
self.q_q = hidden2[:,-1,:]
return
def new_episodic_module(self):
if True:
mem_list = []
sequences = self.inp_c_seq
for i in range(len(sequences)):
m_list = [self.q_q.clone()]
#print(sequences[i].size(),'seq')
for iter in range(self.memory_hops):
x = self.new_attention_step(sequences[i], None, m_list[iter], self.q_q)
if self.print_to_screen and not self.training:
print(x,'x -- after', len(x), sequences[i].size())
e, _ = self.new_episode_small_step(sequences[i], x.permute(1,0), None)
assert len(sequences[i].size()) == 3
#print(e.size(),'e')
ee = e[:, 0, -1]#.permute(2,1,0)
_, out = self.model_3_mem_a(ee.unsqueeze(0), self.prune_tensor(m_list[iter], 3))
m_list.append(out)
mem_list.append(m_list[self.memory_hops])
mm_list = torch.cat(mem_list, dim=1)
self.last_mem = mm_list
#print(self.last_mem.size(),'lm')
return None
def new_episode_small_step(self, ct, g, prev_h):
assert len(ct.size()) == 3
bat, sen, emb = ct.size()
#print(ct.size(),'ct')
#print(sen,'sen', g.size())
last = [prev_h]
ep = []
for iii in range(sen):
c = ct[0,iii,:].unsqueeze(0)
if prev_h is not None:
prev_h = self.prune_tensor(prev_h, 3)
out, gru = self.model_3_mem_b(c, last[iii] )
last.append(out)
g = g.squeeze(0)
gru = gru.squeeze(0).permute(1,0)
#if not self.training: print(g.size(),'g', iii)
#ggg = g[:, iii]
ggg = g[iii]
h = torch.mul(ggg , gru)# + torch.mul((1 - g[iii]) , prev_h.permute(1,0))
index = -1 #-1 # -2
if last[iii + index] is not None:
#print(last[iii].size(),'last -',ggg.size(), ggg, sen)
if False: h = h + torch.mul((1 - ggg), last[iii + index])
#print(h.size(),'hsize')
if iii == sen - 1 : ep.append(h.unsqueeze(1))
h = torch.cat(ep, dim=1)
#print(h.size(),ep[0].size(),'h',sen, gru.size())
return h, gru
def new_attention_step(self, ct, prev_g, mem, q_q):
q_q = self.prune_tensor(q_q,3)
mem = self.prune_tensor(mem,3)
assert len(ct.size()) == 3
bat, sen, emb = ct.size()
#print(sen,'len sen')
att = []
for iii in range(sen):
c = ct[0,iii,:]
concat_list = [
c.unsqueeze(0),
mem.squeeze(0),
q_q.squeeze(0),
(c * q_q).squeeze(0),
(c * mem).squeeze(0),
(torch.abs(c - q_q) ).squeeze(0),
(torch.abs(c - mem) ).squeeze(0)
]
#for ii in concat_list: print(ii.size())
#print(sen,'sen')
#exit()
#z = F.sigmoid(z)
concat_list = torch.cat(concat_list, dim=1)
#print(concat_list.size(),'cl')
att.append(concat_list)
att = torch.cat(att, dim=0)
#z = torch.cat(att, dim=0)
z = self.model_4_att(att)
z = F.sigmoid(z)
#z = F.softmax(z, dim=1) #F.sigmoid(z)
#print(z.size(),'z')
return z
def prune_tensor(self, input, size):
if len(input.size()) < size:
input = input.unsqueeze(0)
if len(input.size()) > size:
input = input.squeeze(0)
return input
def new_answer_module_simple(self):
#outputs
ansx = self.model_5_ans(self.last_mem, None)
#ansx = F.softmax(ansx, dim=0)
if self.print_to_screen:
print(ansx, 'ansx printed')
print(ansx.size(), 'ansx')
vocab, sen = ansx.size()
aa = torch.argmax(ansx, dim=0)
print(aa.size(),'aa')
for i in range(sen):
zz = aa[i]
z = ansx[:, i]
a = torch.argmax(z, dim=0)
print(a.item(), zz.item())
print('----')
#ans = torch.argmax(ansx,dim=1)#[0]
return [None], ansx
pass
I am currently getting into TensorFlow and have just started to grasp its graph-based concept. I tried to implement a neural network trained with gradient descent (the Adam optimizer) to solve the CartPole environment. I start by randomly initializing my weights and then take actions sampled from the policy (accounting for the current weights) during training. When testing I always take the action with the maximum probability. However, I always get a score that hovers around 10, with a variance of around 0.8. Always. It doesn't change in any notable way, which makes it look like the agent takes purely random actions at every step and never learns anything. As I said, it seems the weights are never updated correctly. Where and how do I need to do that?
Here's my code:
import tensorflow as tf
import numpy as np
from gym.envs.classic_control import CartPoleEnv
env = CartPoleEnv()
learning_rate = 10**(-3)
gamma = 0.9999
n_train_trials = 10**3
n_test_trials = 10**2
n_actions = env.action_space.n
n_obs = env.observation_space.high.__len__()
goal_steps = 200
should_render = False
print_per_episode = 100
state_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_obs), name='symbolic_state')
actions_one_hot_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_actions),
name='symbolic_actions_one_hot_holder')
discounted_rewards_holder = tf.placeholder(dtype=tf.float32, shape=None, name='symbolic_reward')
# initialize neurons list dynamically
def get_neurons_list():
i = n_obs
n_neurons_list = [i]
while i < (n_obs * n_actions) // (n_actions // 2):
i *= 2
n_neurons_list.append(i)
while i // 2 > n_actions:
i = i // 2
n_neurons_list.append(i)
n_neurons_list.append(n_actions)
# print(n_neurons_list)
return n_neurons_list
with tf.name_scope('nonlinear_policy'):
# create list of layers with sizes
n_neurons_list = get_neurons_list()
network = None
for i in range((len(n_neurons_list) - 1)):
theta = tf.Variable(tf.random_normal([n_neurons_list[i], n_neurons_list[i+1]]))
bias = tf.Variable(tf.random_normal([n_neurons_list[i+1]]))
if network is None:
network = tf.matmul(state_holder, theta) + bias
else:
network = tf.matmul(network, theta) + bias
if i < len(n_neurons_list) - 1:
network = tf.nn.relu(network)
action_probabilities = tf.nn.softmax(network)
testing_action_choice = tf.argmax(action_probabilities, dimension=1, name='testing_action_choice')
with tf.name_scope('loss'):
actually_chosen_probability = action_probabilities * actions_one_hot_holder
L_theta = -1 * (tf.reduce_sum(tf.log(actually_chosen_probability)) * tf.reduce_sum(discounted_rewards_holder))
with tf.name_scope('train'):
# We define the optimizer to use the ADAM optimizer, and ask it to minimize our loss
gd_opt = tf.train.AdamOptimizer(learning_rate).minimize(L_theta)
sess = tf.Session() # FOR NOW everything is symbolic, this object has to be called to compute each value of Q
# Start
sess.run(tf.global_variables_initializer())
observation = env.reset()
batch_rewards = []
states = []
action_one_hots = []
episode_rewards = []
episode_rewards_list = []
episode_steps_list = []
step = 0
episode_no = 0
while episode_no <= n_train_trials:
if should_render: env.render()
step += 1
action_probability_values = sess.run(action_probabilities,
feed_dict={state_holder: [observation]})
# Choose the action using the action probabilities output by the policy implemented in tensorflow.
action = np.random.choice(np.arange(n_actions), p=action_probability_values.ravel())
# Calculating the one-hot action array for use by tensorflow
action_arr = np.zeros(n_actions)
action_arr[action] = 1.
action_one_hots.append(action_arr)
# Record states
states.append(observation)
observation, reward, done, info = env.step(action)
# We don't want to go above 200 steps
if step >= goal_steps:
done = True
batch_rewards.append(reward)
episode_rewards.append(reward)
# If the episode is done, and it contained at least one step, do the gradient updates
if len(batch_rewards) > 0 and done:
# First calculate the discounted rewards for each step
batch_reward_length = len(batch_rewards)
discounted_batch_rewards = batch_rewards.copy()
for i in range(batch_reward_length):
discounted_batch_rewards[i] *= (gamma ** (batch_reward_length - i - 1))
# Next run the gradient descent step
# Note that each of action_one_hots, states, discounted_batch_rewards has the first dimension as the length
# of the current trajectory
gradients = sess.run(gd_opt, feed_dict={actions_one_hot_holder: action_one_hots, state_holder: states,
discounted_rewards_holder: discounted_batch_rewards})
action_one_hots = []
states = []
batch_rewards = []
if done:
# Done with episode. Reset stuff.
episode_no += 1
episode_rewards_list.append(np.sum(episode_rewards))
episode_steps_list.append(step)
episode_rewards = []
step = 0
observation = env.reset()
if episode_no % print_per_episode == 0:
print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
)
observation = env.reset()
episode_rewards_list = []
episode_rewards = []
episode_steps_list = []
step = 0
episode_no = 0
print("Testing")
while episode_no <= n_test_trials:
env.render()
step += 1
# For testing, we choose the action using an argmax.
test_action, = sess.run([testing_action_choice],
feed_dict={state_holder: [observation]})
observation, reward, done, info = env.step(test_action[0])
if step >= 200:
done = True
episode_rewards.append(reward)
if done:
episode_no += 1
episode_rewards_list.append(np.sum(episode_rewards))
episode_steps_list.append(step)
episode_rewards = []
step = 0
observation = env.reset()
if episode_no % print_per_episode == 0:
print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
)
Here is an example TensorFlow program that uses Q-learning to learn the CartPole OpenAI Gym environment.
It is able to quickly learn to stay upright for 80 steps.
Here is the code:
import math
import numpy as np
import sys
import random
sys.path.append("../gym")
from gym.envs.classic_control import CartPoleEnv
env = CartPoleEnv()
discount = 0.5
learning_rate = 0.5
gradient = .001
regularizaiton_factor = .1
import tensorflow as tf
tf_state = tf.placeholder( dtype=tf.float32 , shape=[4] )
tf_state_2d = tf.reshape( tf_state , [1,4] )
tf_action = tf.placeholder( dtype=tf.int32 )
tf_action_1hot = tf.reshape( tf.one_hot( tf_action , 2 ) , [1,2] )
tf_delta_reward = tf.placeholder( dtype=tf.float32 )
tf_value = tf.placeholder( dtype=tf.float32 )
tf_matrix1 = tf.Variable( tf.random_uniform([4,7], -.001, .001) )
tf_matrix2 = tf.Variable( tf.random_uniform([7,2], -.001, .001) )
tf_logits = tf.matmul( tf_state_2d , tf_matrix1 )
tf_logits = tf.matmul( tf_logits , tf_matrix2 )
tf_loss = -1 * learning_rate * ( tf_delta_reward + discount * tf_value - tf_logits ) * tf_action_1hot
tf_regularize = tf.reduce_mean( tf.square( tf_matrix1 )) + tf.reduce_mean( tf.square( tf_matrix2 ))
tf_train = tf.train.GradientDescentOptimizer(gradient).minimize( tf_loss + tf_regularize * regularizaiton_factor )
sess = tf.Session()
sess.run( tf.global_variables_initializer() )
def max_Q( state ) :
actions = sess.run( tf_logits, feed_dict={ tf_state:state } )
actions = actions[0]
value = actions.max()
action = 0 if actions[0] == value else 1
return action , value
avg_age = 0
for trial in range(1,101) :
# initialize state
previous_state = env.reset()
# initialize action and the value of the expected reward
action , value = max_Q(previous_state)
previous_reward = 0
for age in range(1,301) :
if trial % 100 == 0 :
env.render()
new_state, new_reward, done, info = env.step(action)
new_state = new_state
action, value = max_Q(new_state)
# The cart-pole gym doesn't return a reward of Zero when done.
if done :
new_reward = 0
delta_reward = new_reward - previous_reward
# learning phase
sess.run(tf_train, feed_dict={ tf_state:previous_state, tf_action:action, tf_delta_reward:delta_reward, tf_value:value })
previous_state = new_state
previous_reward = new_reward
if done :
break
avg_age = avg_age * 0.95 + age * .05
if trial % 50 == 0 :
print "Average age =",int(round(avg_age))," , trial",trial," , discount",discount," , learning_rate",learning_rate," , gradient",gradient
elif trial % 10 == 0 :
print(int(round(avg_age)), end=" ")
Here is the output:
6 18 23 30 Average age = 36 , trial 50 , discount 0.5 , learning_rate 0.5 , gradient 0.001
38 47 50 53 Average age = 55 , trial 100 , discount 0.5 , learning_rate 0.5 , gradient 0.001
Summary
I wasn't able to get Q-learning with a simple neural net to fully solve the CartPole problem, but have fun experimenting with different NN sizes and depths!
Hope you enjoy this code,
cheers