I have a problem with my neural network. I'm trying to implement what's called a DMN (Dynamic Memory Network) for the bAbI data set. A paper about the DMN model can be found here: http://arxiv.org/abs/1506.07285 A blog post about implementing DMNs can be found here: https://yerevann.github.io/2016/02/05/implementing-dynamic-memory-networks/
Here's my problem. btw I'm using PyTorch.
I split the training and testing data into parts for training, testing, and validation. I use 1000 parts for training, 500 parts for testing and 500 parts for validation. I run into a problem. I can train successfully but when I go to the validation step I never get a score above 50% accuracy. With the babi data set it is documented that you should be able to get 100% accuracy with the first test set. (There are 20 test sets in all). I can get 100% accuracy during training, but only 50% in validation. My question to you is what part of the program would be responsible for this kind of behavior? In other words, can you tell me why I'm always getting 50% ?? Thanks for your time. I'm limiting my experiments to the first babi test for now.
I thought I had this all figured out but my problem has cropped up again. I really don't have a clue what it is. Here is a link to the code. If you could take a look I would be most grateful. https://github.com/radiodee1/awesome-chatbot/blob/master/model/babi_iv.py
Some code is included below.
class WrapMemRNN(nn.Module):
def __init__(self,vocab_size, embed_dim, hidden_size, n_layers, dropout=0.3, do_babi=True, bad_token_lst=[], freeze_embedding=False, embedding=None, print_to_screen=False):
super(WrapMemRNN, self).__init__()
self.hidden_size = hidden_size
self.n_layers = n_layers
self.do_babi = do_babi
self.print_to_screen = print_to_screen
self.bad_token_lst = bad_token_lst
self.embedding = embedding
self.freeze_embedding = freeze_embedding
self.teacher_forcing_ratio = hparams['teacher_forcing_ratio']
gru_dropout = dropout * 0
self.model_1_enc = Encoder(vocab_size, embed_dim, hidden_size, n_layers, dropout=dropout,embedding=embedding, bidirectional=False)
self.model_2_enc = Encoder(vocab_size, embed_dim, hidden_size, n_layers, dropout=gru_dropout, embedding=embedding, bidirectional=False)
self.model_3_mem_a = MemRNN(hidden_size, dropout=gru_dropout)
self.model_3_mem_b = MemRNN(hidden_size, dropout=gru_dropout)
self.model_4_att = EpisodicAttn(hidden_size, dropout=gru_dropout)
self.model_5_ans = AnswerModule(vocab_size, hidden_size,dropout=dropout)
self.input_var = None # for input
self.q_var = None # for question
self.answer_var = None # for answer
self.q_q = None # extra question
self.inp_c = None # extra input
self.inp_c_seq = None
self.all_mem = None
self.last_mem = None # output of mem unit
self.prediction = None # final single word prediction
self.memory_hops = hparams['babi_memory_hops']
if self.freeze_embedding or self.embedding is not None:
#self.criterion = nn.CrossEntropyLoss()
def reset_parameters(self):
stdv = 1.0 / math.sqrt(self.hidden_size)
for weight in self.parameters():
weight.data.uniform_(-stdv, stdv)
if len(weight.size()) > 1:
def forward(self, input_variable, question_variable, target_variable, criterion=None):
self.new_input_module(input_variable, question_variable)
outputs, ans = self.new_answer_module_simple()
return outputs, None, ans, None
def new_freeze_embedding(self):
    """Disable gradient tracking on the embedding weights of both encoders.

    Sets ``requires_grad = False`` on ``embed.weight`` of ``model_1_enc``
    and ``model_2_enc``, then prints a confirmation message.
    """
    for encoder in (self.model_1_enc, self.model_2_enc):
        encoder.embed.weight.requires_grad = False
    print('freeze embedding')
def new_input_module(self, input_variable, question_variable):
prev_h1 = []
for ii in input_variable:
ii = self.prune_tensor(ii, 2)
out1, hidden1 = self.model_1_enc(ii, None)
self.inp_c_seq = prev_h1
self.inp_c = prev_h1[-1]
prev_h2 = []
for ii in question_variable:
ii = self.prune_tensor(ii, 2)
out2, hidden2 = self.model_2_enc(ii, None)
self.q_q = hidden2[:,-1,:]
def new_episodic_module(self):
if True:
mem_list = []
sequences = self.inp_c_seq
for i in range(len(sequences)):
m_list = [self.q_q.clone()]
for iter in range(self.memory_hops):
x = self.new_attention_step(sequences[i], None, m_list[iter], self.q_q)
if self.print_to_screen and not self.training:
print(x,'x -- after', len(x), sequences[i].size())
e, _ = self.new_episode_small_step(sequences[i], x.permute(1,0), None)
assert len(sequences[i].size()) == 3
ee = e[:, 0, -1]#.permute(2,1,0)
_, out = self.model_3_mem_a(ee.unsqueeze(0), self.prune_tensor(m_list[iter], 3))
mm_list = torch.cat(mem_list, dim=1)
self.last_mem = mm_list
return None
def new_episode_small_step(self, ct, g, prev_h):
assert len(ct.size()) == 3
bat, sen, emb = ct.size()
#print(sen,'sen', g.size())
last = [prev_h]
ep = []
for iii in range(sen):
c = ct[0,iii,:].unsqueeze(0)
if prev_h is not None:
prev_h = self.prune_tensor(prev_h, 3)
out, gru = self.model_3_mem_b(c, last[iii] )
g = g.squeeze(0)
gru = gru.squeeze(0).permute(1,0)
#if not self.training: print(g.size(),'g', iii)
#ggg = g[:, iii]
ggg = g[iii]
h = torch.mul(ggg , gru)# + torch.mul((1 - g[iii]) , prev_h.permute(1,0))
index = -1 #-1 # -2
if last[iii + index] is not None:
#print(last[iii].size(),'last -',ggg.size(), ggg, sen)
if False: h = h + torch.mul((1 - ggg), last[iii + index])
if iii == sen - 1 : ep.append(h.unsqueeze(1))
h = torch.cat(ep, dim=1)
#print(h.size(),ep[0].size(),'h',sen, gru.size())
return h, gru
def new_attention_step(self, ct, prev_g, mem, q_q):
q_q = self.prune_tensor(q_q,3)
mem = self.prune_tensor(mem,3)
assert len(ct.size()) == 3
bat, sen, emb = ct.size()
#print(sen,'len sen')
att = []
for iii in range(sen):
c = ct[0,iii,:]
concat_list = [
(c * q_q).squeeze(0),
(c * mem).squeeze(0),
(torch.abs(c - q_q) ).squeeze(0),
(torch.abs(c - mem) ).squeeze(0)
#for ii in concat_list: print(ii.size())
#z = F.sigmoid(z)
concat_list = torch.cat(concat_list, dim=1)
att = torch.cat(att, dim=0)
#z = torch.cat(att, dim=0)
z = self.model_4_att(att)
z = F.sigmoid(z)
#z = F.softmax(z, dim=1) #F.sigmoid(z)
return z
def prune_tensor(self, input, size):
    """Bring ``input`` to ``size`` dimensions by editing its leading axes.

    Args:
        input: tensor to adjust.
        size: desired number of dimensions.

    Returns:
        The tensor with leading singleton axes added (when it has too few
        dimensions) or removed (when it has too many). A leading axis whose
        length is not 1 cannot be squeezed away, so in that case the tensor
        is returned with more than ``size`` dimensions.
    """
    # Too few dimensions: keep prepending singleton axes until the rank
    # matches (a single unsqueeze would fall short when the gap exceeds one).
    while input.dim() < size:
        input = input.unsqueeze(0)
    # Too many dimensions: drop leading axes, but only while they are
    # singletons -- squeeze(0) is a no-op on any other axis length.
    while input.dim() > size and input.size(0) == 1:
        input = input.squeeze(0)
    return input
def new_answer_module_simple(self):
ansx = self.model_5_ans(self.last_mem, None)
#ansx = F.softmax(ansx, dim=0)
if self.print_to_screen:
print(ansx, 'ansx printed')
print(ansx.size(), 'ansx')
vocab, sen = ansx.size()
aa = torch.argmax(ansx, dim=0)
for i in range(sen):
zz = aa[i]
z = ansx[:, i]
a = torch.argmax(z, dim=0)
print(a.item(), zz.item())
#ans = torch.argmax(ansx,dim=1)#[0]
return [None], ansx
I am creating my first multivariate multistep encoder-decoder LSTM to forecast revenues.
As you can see, the values move towards a value and then stop at that value. The aim is to create a forecast for a longer period, but there is no deviation at all from this standard value after the first week.
What is wrong and what can I do? To me it doesn't look like it is working at all.
class ModelTrainer:
def __init__(self, prediction_length=30, offset=1):
self.prediction_length = prediction_length
self.offset = offset
self.use_scaling = True
def _setup_values(self):
# Model configuration
self.additional_metrics = ['accuracy']
self.embedding_output_dims = 15
self.max_sequence_length = 300
self.num_distinct_words = 5000
self.verbosity_mode = 1
self.BATCH_SIZE = 128
self.DROPOUT = 0.3
self.NODES_PER_LAYER = 256
self.LEARNING_RATE = 0.001
self.OPTIMIZER = Adam(learning_rate=self.LEARNING_RATE)
self.TEST_SIZE = 0.1
self.RANDOM_STATE = 123
self.LOSS_FUNCTION = MeanSquaredError()
def __import_data(self):
self.series = DataOrganizer().df
def __prepare_data(self):
self.scaler = preprocessing.MinMaxScaler()
data_scaled = self.scaler.fit_transform(self.series)
self.features, self.target = self._create_feature_target_values_window(
def _create_feature_target_values_window(self, data):
self.number_of_output_columns = 4
feature_data = data
target_data = data[:, :self.number_of_output_columns]
features, target = list(), list()
in_start = 0
for _ in range(len(data)):
in_end = in_start + self.WINDOW_LENGTH
out_end = in_end + self.prediction_length
if out_end <= len(data):
features.append(feature_data[in_start:in_end, :])
target_data[in_end:out_end, 0:self.number_of_output_columns])
in_start += 1
return np.array(features), np.array(target)
def __create_LSTM_model(self):
num_feature_columns = self.features.shape[2]
num_output_columns = self.target.shape[2]
model = Sequential()
model.add(LSTM(self.NODES_PER_LAYER, input_shape=(
self.WINDOW_LENGTH, num_feature_columns)))
model.add(LSTM(self.NODES_PER_LAYER, return_sequences=True))
return model
def train_model(self, callbacks=[]):
model = self.__create_LSTM_model()
metrics=['accuracy', MeanAbsoluteError()]
self.model = model
def create_forecast(self):
prediction = self.model.predict(self.features[-1:])
# prediction = self.model.predict(self.features[-30:-29]) # Show forecast from a month old
test_X = self.features.copy()
test_X = test_X[:self.prediction_length,
:1, self.number_of_output_columns:]
test_X = test_X.reshape(
self.prediction_length, self.series.shape[1] - self.number_of_output_columns)
prediction = prediction.reshape(self.prediction_length,
inv_yhat = np.concatenate((prediction, test_X), axis=1)
inv_yhat = self.scaler.inverse_transform(inv_yhat)
prediction_df = pd.DataFrame(
inv_yhat, columns=self.scaler.feature_names_in_)
first_date = self.series.last_valid_index() + timedelta(days=1)
last_date = first_date + timedelta(days=self.prediction_length-1)
days = pd.date_range(first_date, last_date, freq='D')
prediction_df.set_index(days, inplace=True)
prediction_df = prediction_df[self.series.columns[0:4]]
(I know the x-axis description is incorrect. Don't worry about it)
I would like to implement a GRU that encodes a sequence of vectors into a single vector (many-to-one), and then another GRU that decodes a vector into a sequence of vectors (one-to-many). The size of the vectors would not change. I would like an opinion on what I implemented.
Here is the code:
class AEGRU(nn.Module):
    """GRU auto-encoder: sequence of 3-vectors -> latent vector -> sequence.

    The encoder GRU reads the input sequence and its last output is passed
    through a linear layer to form the latent vector. The decoder GRU is fed
    that latent vector at every one of ``self.length`` time steps and its
    outputs are passed through a final linear layer.
    """

    def __init__(self, opt):
        """Build the encoder/decoder layers.

        Args:
            opt: accepted for interface compatibility; not read here.
        """
        super(AEGRU, self).__init__()
        self.length = 256
        self.latent_space = 256
        self.num_layers = 1
        self.GRU_enc = nn.GRU(input_size=3, hidden_size=self.latent_space, num_layers=self.num_layers, batch_first=True)
        self.fc_enc = nn.Linear(self.latent_space, self.latent_space)
        self.GRU_dec = nn.GRU(input_size=self.latent_space, hidden_size=3, num_layers=self.num_layers, batch_first=True)
        self.fc_dec = nn.Linear(3, 3)

    def enc(self, x):
        """Encode ``x`` (batch x sequence x 3) into (batch x latent_space)."""
        # Allocate the initial state on the input's device/dtype instead of
        # forcing CUDA, so the module also runs on CPU-only hosts.
        h0 = x.new_zeros(self.num_layers, x.shape[0], self.latent_space)
        out, _ = self.GRU_enc(x, h0)
        # Many-to-one: keep only the output of the last time step.
        out = out[:, -1, :]
        return self.fc_enc(out)

    def dec(self, x):
        """Decode ``x`` (batch x latent_space) into (batch x length x 3)."""
        x = x[:, None, :]
        h = x.new_zeros(self.num_layers, x.shape[0], 3)
        # One-to-many: present the same latent vector at every time step and
        # run the GRU once over the repeated sequence.
        # NOTE: an equivalent alternative is to call the GRU ``self.length``
        # times on the single-step input while carrying ``h`` forward and
        # collecting each step's output.
        x = x.repeat(1, self.length, 1)
        outputs, _ = self.GRU_dec(x, h)
        # linear layer
        return self.fc_dec(outputs)

    def forward(self, x):
        """Encode then decode ``x``; also resets ``self.indices`` to ``[]``."""
        self.indices = []
        latent = self.enc(x)
        return self.dec(latent)
I am not sure whether this is a good way to build a one-to-many GRU. Could I have some opinions about this?
Thanks for reading!
I am following this online tutorial for coding a DQN,https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/DeepQLearning/torch_deep_q_model.py
, however I am running into this Runtime Error that I am unsure of how to debug or modify to prevent this error. Thanks!
RuntimeError Traceback (most recent call last)
<ipython-input-196-00975d66fd2d> in <module>
28 agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
29 obs= obs_
---> 30 agent.learn(batch_size)
31 lastAction = action
32 scores.append(score)
<ipython-input-191-f6b163cc3a8a> in learn(self, batch_size)
72 Qtarget = Qpred.clone()
73 print(Qnext[1])
---> 74 Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])
75 # epsilon decay action
76 if self.steps > 2000:
RuntimeError: the derivative for 'indices' is not implemented
These are my code blocks in my jupyter notebook
class DeepQNetwork(nn.Module):
    """Convolutional Q-network mapping 200x125 single-channel frames to 6 action values."""

    def __init__(self, Alpha):
        """Create the layers, loss and optimizer.

        Args:
            Alpha: learning rate passed to the RMSprop optimizer.
        """
        # nn.Module must be initialised before any sub-module is assigned,
        # otherwise the attribute assignments below raise.
        super(DeepQNetwork, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 8, stride=4, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 128, 3)
        self.fc1 = nn.Linear(128 * 21 * 12, 512)
        self.fc2 = nn.Linear(512, 6)
        self.loss = nn.MSELoss()
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # forward() moves its inputs to self.device, so the parameters have to
        # live there too; move them before the optimizer captures them.
        self.to(self.device)
        self.optimizer = optim.RMSprop(self.parameters(), lr=Alpha)

    def forward(self, obs):
        """Return the action values for a sequence of frames.

        Args:
            obs: array-like of frames; it is reshaped to (-1, 1, 200, 125).

        Returns:
            Tensor with one row of 6 action values per frame.
        """
        # Passing in a sequence of arrays
        obs = torch.Tensor(obs).to(self.device)  # send to the GPU when available
        # Feed forward through the network
        obs = obs.view(-1, 1, 200, 125)
        obs = F.relu(self.conv1(obs))
        obs = F.relu(self.conv2(obs))
        obs = F.relu(self.conv3(obs))
        # Flatten the conv features (128 channels of 21x12) for the dense layers.
        obs = obs.view(-1, 128 * 21 * 12)
        obs = F.relu(self.fc1(obs))
        # One row per frame, 6 columns (one per action)
        actions = self.fc2(obs)
        return actions
This is the Agent Code, and it contains the error causing line of code
class DQNAgent(object):
def __init__(self, gamma, epsilon, alpha, maxMemory,
epsEnd = 0.05, replace =10000, actionSpace = [0,1,2,3,4,5]):
Gamma -> discount factor of valuing current reward over future reward
Epsilon -> for trade off between exploration-exploitation
alpha -> learn rate
maxMemory -> max size of Memory buffer
epsEnd -> smallest value of Exploration
repace -> how often to replace target network
self.GAMMA = gamma
self.EPSILON = epsilon
self.EPS_END = epsEnd
self.actionSpace = actionSpace
self.maxMemory = maxMemory
self.steps = 0
self.learn_step_counter = 0
self.memory = []
self.memCount = 0
self.replace_tgt_count = replace
self.Q_eval = DeepQNetwork(alpha)
self.Q_next = DeepQNetwork(alpha)
def storeTransition(self, state, action, reward, state_):
'''Stores Transition states'''
if self.memCount < self.maxMemory:
self.memory[self.memCount%self.maxMemory] = [state,action,reward,state_]
self.memCount +=1
def chooseAction(self,obs):
Exploration if np.random > epsilon
else take epsilon greedy action
rand = np.random.random()
# Get the value for all actions for the current set of states
# Forward pass the stack of frames to get value of each action given subset of staes in obs
actions = self.Q_eval.forward(obs)
if rand<1-self.EPSILON:
action = torch.argmax(actions[1]).item()
action = np.random.choice(self.actionSpace)
self.steps += 1
return action
def learn(self, batch_size):
#0 gradient to do batch optimisation
if self.replace_tgt_count is not None and self.learn_step_counter % self.replace_tgt_count==0:
# memory subsampling
if self.memCount + batch_size < self.maxMemory:
memStart = int(np.random.choice(range(self.memCount)))
memStart = int(np.random.choice(range(self.maxMemory-batch_size-1)))
miniBatch = self.memory[memStart:memStart+batch_size]
memory = np.array(miniBatch)
#feed forward current state and successor state conv to list as memory is array of numpy objects
Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device)
Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device)
maxA = torch.argmax(Qnext,dim = 1).to(self.Q_eval.device)
#calculate rewards
rewards = torch.Tensor(list(memory[:,2])).to(self.Q_eval.device)
# loss for every action except max action to be 0
Qtarget = Qpred.clone()
Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])# PROBLEMATIC LINE
# epsilon decay action
if self.steps > 2000:
if self.EPSILON-1e-4 >self.EPS_END:
self.EPSILON-= 1e-4
self.EPSILON = self.EPS_END
loss = self.Q_eval.loss(Qtarget,Qpred).to(self.Q_eval.device)
self.learn_step_counter +=1
env = gym.make("Invader-v0")
agent = DQNAgent(gamma=0.95,epsilon = 1.0,alpha = 0.003, maxMemory = 5000,replace = None)
while agent.memCount < agent.maxMemory:
obs = env.reset()
done = False
lives = 3
while not done:
action = env.action_space.sample()
obs_ , reward, done, info = env.step(action)
if done and info['lives']<lives:
lives = info['lives']
reward -= 200
obs= obs_
initialised = True
scores = []
epsHistory = []
numGames = 50
batch_size = 16
for i in range(numGames):
print(f'starting game {i+1}, epsilon = {agent.EPSILON}')
done = False
obs = env.reset()
frames = [np.sum(obs)]
score = 0
lastAction = 0
lives = 3
while not done:
if len(frames) == 4:
action = agent.chooseAction(frames)
frames = []
action = lastAction
obs_, reward, done, info = env.step(action)
score += score-reward
if done and info['lives'] < lives:
reward -=200
obs= obs_
lastAction = action
print('score: ', score)
x = [i+1 for i in range(numGames)]
You have to call .detach() on the target network's output, so that no gradient is tracked through Qnext:
Qnext = self.Q_next.forward(list(memory[:,3][:])).detach().to(self.Q_eval.device)
def _complex_block(real, imag):
    """Return the batched block matrix ``[[real, imag], [-imag, real]]``."""
    upper = t.cat((real, imag), dim=-1)
    lower = t.cat((-imag, real), dim=-1)
    return t.cat((upper, lower), dim=-2)


def _strict_upper(values):
    """Place (batch, 6) values in the strict upper triangle of (batch, 4, 4) zeros, row by row."""
    rows, cols = t.triu_indices(4, 4, offset=1, device=values.device)
    out = values.new_zeros(values.shape[0], 4, 4)
    out[:, rows, cols] = values
    return out


def _mean_log10_quadratic_form(x, INPUT):
    """Differentiable torch version of the per-sample loss, averaged over the batch."""
    batch_size = x.shape[0]
    INPUT = INPUT[:batch_size]

    # Columns 0:8 / 8:16 are the (real | imaginary) halves of two 1x8 rows;
    # columns 16:32 / 32:48 are two flattened 4x4 matrices.
    R = INPUT[:, 0:8].unsqueeze(1)
    H = INPUT[:, 8:16].unsqueeze(1)
    T = _complex_block(INPUT[:, 16:32].reshape(batch_size, 4, 4),
                       INPUT[:, 32:48].reshape(batch_size, 4, 4))

    # phi: diagonal blocks built from x[:, 0:4] and 1 - x[:, 0:4] ** 2.
    head = x[:, 0:4]
    phi = _complex_block(t.diag_embed(head), t.diag_embed(1 - head ** 2))
    H_hat = H + R @ phi @ T

    # Q_r is symmetric (off-diagonal from x[:, 4:10], diagonal from
    # x[:, 10:14]); Q_i is antisymmetric (from x[:, 14:20]).
    upper_r = _strict_upper(x[:, 4:10])
    Q_r = upper_r + upper_r.transpose(1, 2) + t.diag_embed(x[:, 10:14])
    upper_i = _strict_upper(x[:, 14:20])
    Q_i = upper_i - upper_i.transpose(1, 2)
    Q = _complex_block(Q_r, Q_i)

    # Only the [0][0] entry of the final product is used, which is the
    # quadratic form H_hat @ Q @ H_hat^T.
    quad = (H_hat @ Q @ H_hat.transpose(1, 2)).reshape(batch_size)
    return t.log10(1 + quad).sum() / batch_size


class loss(Function):
    """Custom autograd function: batch mean of ``log10(1 + H_hat Q H_hat^T)``.

    ``x`` supplies 20 values per sample (used to build ``phi`` and ``Q``) and
    ``INPUT`` supplies 48 values per sample (used to build ``R``, ``H`` and
    ``T``). The result is a 0-dim float64 tensor.
    """

    @staticmethod
    def forward(ctx, x, INPUT):
        """Compute the loss value and stash both inputs for ``backward``."""
        ctx.save_for_backward(x, INPUT)
        # Computed in float64 so the value has the same dtype as before.
        return _mean_log10_quadratic_form(x.detach().to(t.float64),
                                          INPUT.detach().to(t.float64))

    @staticmethod
    def backward(ctx, grad_output):
        """Return one gradient per ``forward`` input: ``(grad_x, None)``.

        The gradient with respect to ``x`` is obtained by re-evaluating the
        forward expression under autograd; ``INPUT`` gets no gradient.
        """
        x, INPUT = ctx.saved_tensors
        with t.enable_grad():
            x64 = x.detach().to(t.float64).requires_grad_(True)
            value = _mean_log10_quadratic_form(x64, INPUT.detach().to(t.float64))
            (grad_x,) = t.autograd.grad(value, x64)
        # Chain rule with the incoming gradient, cast back to x's dtype.
        return (grad_output * grad_x).to(x.dtype), None
def criterion(output, input):
    """Evaluate the custom ``loss`` autograd function on ``(output, input)``."""
    value = loss.apply(output, input)
    return value
This is my loss function, but it raises the following error:
Traceback (most recent call last):
File "/Users/mrfang/channel_capacity/training.py", line 24, in
loss.backward() File "/Users/mrfang/anaconda3/lib/python3.6/site-packages/torch/tensor.py",
line 150, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph) File
line 99, in backward
allow_unreachable=True) # allow_unreachable flag RuntimeError: function lossBackward returned an incorrect number of gradients
(expected 2, got 1)
How can I fix it? Thanks very much.
Your forward(ctx,x,INPUT) takes two inputs, x and INPUT, thus backward should output two gradients as well, grad_x and grad_INPUT.
In addition, in your snippet, you're not really computing a custom gradient, so you could compute that with Pytorch's autograd, without having to define a special Function.
If this is working code and you're going to define the custom loss, here's a quick boilerplate of what backward should comprise:
def forward(ctx, x, INPUT):
# this is required so they're available during the backwards call
ctx.save_for_backward(x, INPUT)
# custom forward
def backward(ctx, grad_output):
x, INPUT = ctx.saved_tensors
grad_x = grad_INPUT = None
# compute grad here
return grad_x, grad_INPUT
You don't need to return gradients for inputs that don't require it, thus you can return None for them.
More info here and here.
I'm currently a bit confused. I implemented an Actor-Critic network, and depending on the setup it either begins to converge a little but produces values that are far from correct, or it produces nearly the same loss values over and over again but, right from the start, produces values that are roughly correct.
I really have no clue how that is possible.
This is my current Model which produces values but does not converge:
def create_actor_model(self):
    """Build and compile the actor network.

    Returns:
        ``(state_input, model)``: the Keras input tensor and the compiled
        model mapping a state to ``self.action_space_shape[0]`` tanh outputs.
    """
    state_input = Input(shape=self.observation_space_shape)
    h1 = Dense(18, activation='linear')(state_input)
    h1l = LeakyReLU(alpha=0.01)(h1)
    h3 = Dense(18, activation='tanh')(h1l)
    h3n = BatchNormalization()(h3)
    output = Dense(self.action_space_shape[0], activation='tanh')(h3n)
    model = Model(input=state_input, output=output)
    # The learning rate must be the scalar hyper-parameter, not the shape
    # tuple of the action space.
    adam = Adam(lr=self.learning_rate)
    model.compile(loss="mse", optimizer=adam)
    return state_input, model
def create_critic_model(self):
    """Build and compile the critic network.

    The state and action branches are each projected to 36 units, summed,
    and reduced to a single tanh output.

    Returns:
        ``(state_input, action_input, model)``: both Keras input tensors and
        the compiled model.
    """
    state_input = Input(shape=self.observation_space_shape)
    state_h1 = Dense(18, activation='relu')(state_input)
    state_h2 = Dense(36)(state_h1)
    action_input = Input(shape=self.action_space_shape)
    action_h1 = Dense(36)(action_input)
    merged = Add()([state_h2, action_h1])
    l_h1 = LeakyReLU(alpha=0.01)(merged)
    merged_h1 = Dense(18, activation='tanh')(l_h1)
    h1n = BatchNormalization()(merged_h1)
    output = Dense(1, activation='tanh')(h1n)
    model = Model(input=[state_input, action_input], output=output)
    # The learning rate must be the scalar hyper-parameter, not the shape
    # tuple of the action space.
    adam = Adam(lr=self.learning_rate)
    model.compile(loss="mse", optimizer=adam, metrics=['mae', 'mse', 'msle'])
    return state_input, action_input, model
def _train_actor_batch(self, batch_size, s_batch, a_batch, r_batch, s2_batch):
predicted_action = self.actor_model.predict_on_batch(s_batch)
grads = self.sess.run(self.critic_grads, feed_dict={
self.critic_state_input: s_batch,
self.critic_action_input: predicted_action
self.sess.run(self.optimize, feed_dict={
self.actor_state_input: s_batch,
self.actor_critic_grad: grads[0]
def _train_critic_batch(self, batch_size, s_batch, a_batch, r_batch, s2_batch):
    """Run one critic training step on a batch and return its result.

    The training target for each sample is its reward plus ``self.gamma``
    times the target critic's prediction for the next state paired with the
    target actor's action for that state.
    """
    next_actions = self.target_actor_model.predict_on_batch(s2_batch)
    next_values = self.target_critic_model.predict_on_batch([s2_batch, next_actions])
    # With a single-sample batch the whole prediction is used un-indexed.
    index_prediction = batch_size > 1
    targets = [
        r_batch[idx] + self.gamma * (next_values[idx] if index_prediction else next_values)
        for idx in range(batch_size)
    ]
    return self.critic_model.train_on_batch([s_batch, a_batch], np.reshape(targets, batch_size))
def replay(self, batch_size):
    """Train critic and actor on a random sample of the replay memory.

    Args:
        batch_size: maximum number of transitions to sample.

    Returns:
        The value returned by the critic training step.
    """
    # Never ask for more transitions than are stored: random.sample raises
    # when the sample is larger than the population.
    sample_count = min(batch_size, len(self.memory))
    samples = random.sample(self.memory, sample_count)
    # Each stored transition is (cur_state, action, reward, new_state); the
    # first element of every field is what gets batched.
    s_batch = np.array([cur_state[0] for cur_state, _, _, _ in samples])
    a_batch = np.array([float(action[0]) for _, action, _, _ in samples])
    r_batch = np.array([reward[0] for _, _, reward, _ in samples])
    s2_batch = np.array([new_state[0] for _, _, _, new_state in samples])
    critic_loss = self._train_critic_batch(len(s_batch), s_batch, a_batch, r_batch, s2_batch)
    self._train_actor_batch(len(s_batch), s_batch, a_batch, r_batch, s2_batch)
    return critic_loss
def _update_actor_target(self):
    """Soft-update the target actor toward the online actor.

    Each target weight becomes ``tau * online + (1 - tau) * target``.
    """
    online_weights = self.actor_model.get_weights()
    target_weights = self.target_actor_model.get_weights()
    blended = [
        online * self.tau + target * (1 - self.tau)
        for online, target in zip(online_weights, target_weights)
    ]
    # get_weights() returns copies, so the blend has to be written back
    # explicitly or the target network never changes.
    self.target_actor_model.set_weights(blended)
def _update_critic_target(self):
    """Soft-update the target critic toward the online critic.

    Each target weight becomes ``tau * online + (1 - tau) * target``.
    """
    online_weights = self.critic_model.get_weights()
    target_weights = self.target_critic_model.get_weights()
    blended = [
        online * self.tau + target * (1 - self.tau)
        for online, target in zip(online_weights, target_weights)
    ]
    # get_weights() returns copies, so the blend has to be written back
    # explicitly or the target network never changes.
    self.target_critic_model.set_weights(blended)
def update_target(self):
def __init__(self):
self.memory = deque(maxlen=2000)
self.actor_state_input, self.actor_model = self.create_actor_model()
_, self.target_actor_model = self.create_actor_model()
self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_space_shape[0]])
actor_model_weights = self.actor_model.trainable_weights
self.actor_grads = tf.gradients(self.actor_model.output,
actor_model_weights, -self.actor_critic_grad)
grads = zip(self.actor_grads, actor_model_weights)
self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)
self.critic_state_input, self.critic_action_input, \
self.critic_model = self.create_critic_model()
_, _, self.target_critic_model = self.create_critic_model()
self.critic_grads = tf.gradients(self.critic_model.output,
I then train with the following method, which is called once per epoch (the memory that is cleared at the end is the experience-replay memory):
def train(self, states, epoch, env, is_new_epoch):
train_size = int(len(states) * 0.70)
train = dict(list(states.items())[0:train_size])
test = dict(list(states.items())[train_size:len(states)])
with warnings.catch_warnings():
working_states = copy(train)
critic_eval = list()
rewards = dict()
for last_day, (last_state_vec, _, last_action) in working_states.items():
this_day = last_day + timedelta(days=1)
if this_day in working_states:
(new_state_vec, _, _) = working_states.get(this_day)
rewards[last_day] = env.get_reward_by_states(last_state_vec, new_state_vec)
amt = len(working_states)
i = 0
for last_day, (last_state_vec, _, last_action) in working_states.items():
i+= 1
this_day = last_day + timedelta(days=1)
if this_day in working_states:
(new_state_vec, _, _) = working_states.get(this_day)
reward = np.reshape(rewards[last_day], [1, ])
self.remember(last_state_vec, [last_action], reward, new_state_vec)
new_eval = self.replay(env.batch_size)
These are the loss values I got over 15 epochs:
One Sample as it comes from the memory:
[8 79 48246 53607 29 34 37 Decimal('1.0000000000') 6]
next state
[9 79 48074 57869 27 28 32 Decimal('1.0000000000') 0]