Related
I'm trying to run a DQN for a multi-agent system, so there is one DNN for each agent.
It takes input=state [batch, state size, #time steps, #nodes], while for simplicity we assume #time steps=1. #nodes is number of agents. And output=Q-values for each agent.
The problem is that I test various stuff with this network, but it return not so consistent results. I suspect it has to do with me running separately DQN for each agent, but learning it via the same model. I sum the losses for all agents into one loss, and then it divide by their amount.
I'm not sure it is correct. I'd be grateful for any help.
Here's my code:
class DQN(nn.Module):
def __init__(self, args): #node_size, inputs, outputs, layers=[128, 64, 16]):
# state_size, n_actions = inputs, outputs
super(DQN, self).__init__()
self.model_type = args.model_type
if args.model_type == "seperate_state_DNN":
out_size = args.num_of_actions
self.shared_model = nn.Sequential()
h_sizes = [args.input_state_size] + args.layers
for k in range(len(h_sizes) - 1):
self.shared_model.add_module('k1'+str(k), nn.Linear(h_sizes[k], h_sizes[k + 1]))
self.shared_model.add_module('k2'+str(k), args.activations[args.layers_nl[k]])
self.shared_model.add_module('final', nn.Linear(h_sizes[-1], out_size))
def forward(self, input, i=None):
# input state dimension: [batch, state size, #time steps, #nodes]
if self.model_type == "seperate_state_DNN":
if i is None:
final_output = torch.zeros_like(input)
else:
final_output = self.shared_model(input) # [:, :, :, i].unsqueeze(3))
return final_output
And here is the calling function:
def select_action(self, state, edge_state):
#self.policy_net.eval()
sample = random.random()
if self.configuration == 2:
self.eps_threshold = 0.0 # no exploration at all, only optimal values!
else:
self.eps_threshold = self.decay_functionn()
self.steps_done += 1
if sample > self.eps_threshold:
self.last_exploration = False
with torch.no_grad():
# t.max(1) will return largest column value of each row.
# second column on max result is index of where max element was
# found, so we pick action with the larger expected reward.
state = state.to(self.device)# torch.from_numpy(state).float().to(self.device) # Convert to tensor.
state = state.unsqueeze(0) # Add batch dimension (also to action below): [batch=1, #time steps, #nodes, state size]
final_output = []
x1 = self.policy_net(state, None)#.detach()
for i in range(self.node_size):
final_output.append(self.policy_net(x1[:, :, -1, i]+state[:, :, -1, i], i).max(1)[1].detach().cpu().view(state.shape[0], -1))
# .to(self.device) # action dimension: [batch=1, #nodes]
return torch.cat(final_output, dim=1)
else:
self.last_exploration = True
return torch.randint(0, self.n_actions, (1, self.node_size))
And this is the main RL training loop:
for epi in range(self.episodes):
print("### Starting Episode: ", epi, ' ### in index=', self.run_index)
state = env.reset(self, heatup=self.sim_heatup) # single step state
done = False
while not done:
action = agent.select_action(state) # .to(device)
next_state1, reward, done = env.do_step(action)
agent.add_to_memory(state, action, next_state, reward)
agent.optimize_model()
state = next_state
agent.curr_episode += 1
# Plot and dump statistics and learning curves.
agent.dump_data_on_episode_end(plot=True)
env.capture_episode()
env.close()
Finally, this is the optimization, executed in "agent.optimize_model()" above, including the functions it uses:
def optimize_model(self):
if len(self.memory) < self.batch_size:
return
transitions = self.memory.sample(self.batch_size)
# This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = Transition(*zip(*transitions))
next_states_batch = torch.stack(batch.next_state).to(self.device)
state_batch = torch.stack(batch.state).to(self.device)
action_batch = torch.cat(batch.action).view(self.batch_size, -1).to(self.device) #torch.stack(batch.action, dim=0).to(self.device)
reward_batch = torch.cat(batch.reward).view(self.batch_size, -1).to(self.device)
# dims: states=[batch, steps, nodes, state size]; action=[batch, nodes]; reward=[batch, nodes]
loss = torch.tensor(0., device=self.device)
self.policy_net.train() # IM NOT SURE IF IT SHOULD BE HERE...
x1 = self.policy_net(state_batch, None)
x2 = self.policy_net(next_states_batch, None)
for i in range(self.node_size):
action_batch1 = action_batch[:,i].unsqueeze(1).reshape(-1, 1) # action=[batchXnodes, 1]
reward_batch1 = reward_batch[:,i].unsqueeze(1).view(-1, 1) # reward=[batchXnodes, 1]
# Compute loss
loss += self._compute_loss(i, x1[:, :, -1, i]+state_batch[:, :, -1, i], edge_state_batch, action_batch1,
x2[:, :, -1, i]+next_states_batch[:, :, -1, i], next_edge_state_batch, reward_batch1)
# Optimize the model
loss.div_(self.node_size)
self.optimizer.zero_grad()
loss.backward()
# clip grad
if self.grad_clip is not None:
for param in self.policy_net.parameters():
param.grad.data.clamp_(-self.grad_clip, self.grad_clip)
# update Policy net weights
self.optimizer.step()
#del loss
self.losses.append(loss.detach().cpu().numpy())
# update Target net weights
self._update_target()
def _compute_loss(self, i, state_batch, edge_state_batch, action_batch, next_states_batch, next_edge_state_batch, reward_batch):
# Q{policy net}(s, a): [batchXnodes, actions] ---gather---> [batchXnodes, 1=q_values according to this policy]
state_action_q_values = self.policy_net(state_batch, i).gather(1, action_batch)
# argmax{a} Q{policy net}(s', a'): [batchXnodes, actions] ---argmax---> [batchXnodes] ---unsqueeze---> [batchXnodes, 1]
next_state_actions = torch.argmax(self.policy_net(next_states_batch, i), dim=1).unsqueeze(1)
# Q{ploicy net}(s', argmax{a} Q{target net}(s', a') ): [batchXnodes, actions] --gather--> [batchXnodes, 1=q_values according to this policy]
next_state_q_values = self.target_net(next_states_batch, i).gather(1, next_state_actions)
# Q* = Disount * Q(s', argmax(..)) + R: [batchXnodes, 1]
expected_state_action_values = (next_state_q_values.detach() * self.discount) + reward_batch
loss = F.smooth_l1_loss(state_action_q_values, expected_state_action_values)
return loss
def _update_target(self):
if self.target_net is None:
# There is nothing to update.
return
# Update the target network, copying all weights and biases in DQN
if self.target_update > 1:
# Hard copy of weights.
if self.steps_done % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
return
elif self.target_update < 1 and self.target_update > 0:
# polyak averaging:
tau = self.target_update
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(tau * param + (1 - tau) * target_param)
return
else:
raise NotImplementedError
Sorry for the large question, I just wanted to supply all the necessary information.
If more information is needed I'd be happy to give it.
Any suggestion is much appreciated.
Thanks,
Shimon
I'm trying to implement an Pong game with DQN model by torch. However I got two problems during the execution. Firstly, I found that the game never get done. Secondly, I found the loss function does not have any change in the trainning. This is my code below:
I defined a CNN network with the input of the size (batch=32, channels=4, height=84, weight=84). By this step there's nothing wrong happened:
class CNN(nn.Module):
def __init__(self, s_channels, a_space):
super(CNN, self).__init__()
self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
self.conv1 = nn.Conv2d(s_channels,out_channels=32,kernel_size=8,stride=4)
self.conv2 = nn.Conv2d(32,64,4,2)
self.conv3 = nn.Conv2d(64,64,3,1)
self.fc1 = nn.Linear(64*4*4,1024)
self.fc2 = nn.Linear(1024,512)
self.fc3 = nn.Linear(512,a_space)
def forward(self,input):
output = self.pool(F.relu(self.conv1(input)))
output = self.pool(F.relu(self.conv2(output)))
output = self.pool(F.relu(self.conv3(output)))
output = output.view(-1,64*4*4)
output = F.relu(self.fc1(output))
output = F.relu(self.fc2(output))
output = F.relu(self.fc3(output))
return output
For the agent class, I defined a back propagation function to replay the weight in CNN and the data pre-processing function:
# Agent
class Agent():
def __init__(self, s_space, a_space) -> None:
# define parameters
self.epsilon = 1.0
self.min_epsilon = 0.01
self.dr = 0.995
self.lr = 0.001
self.gamma = 0.9
# define models
self.evl_net = CNN(s_space, a_space)
self.tgt_net = CNN(s_space, a_space)
self.cert = nn.SmoothL1Loss()
self.optimal = th.optim.Adam(self.evl_net.parameters(),lr=self.lr)
# define memory store
self.memory = deque(maxlen=2000)
# self.img_stack = deque(maxlen=4)
# pre-processing frame images: transform the imaages into tensors
# def bsl_image_pre_process(self,env):
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def gym_image_pre_process(self,env):
#Atari preprocessing
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
#create frame stack
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
return env,channels
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def data_pre_process(self,batch_size):
s_v = []
a_v = []
next_s_v = []
r_v = []
dones = []
materials = random.sample(self.memory,batch_size)
for t in materials:
s_v.append(t[0])
a_v.append(t[1])
next_s_v.append(t[2])
r_v.append(t[3])
dones.append(t[4])
# print(th.FloatTensor(r_v))
# print(th.FloatTensor(r_v).size())
# print(s_v)
s_v = th.Tensor(s_v) # size: [32,3,210,160]
a_v = th.LongTensor(a_v).unsqueeze(1) # size: [32,1]
next_s_v = th.Tensor(next_s_v) # size: [32,3,210,160]
r_v = th.FloatTensor(r_v) # size: [32]
return s_v, a_v, next_s_v, r_v, dones
# remember the transformed images
def record(self,tpl):
self.memory.append(tpl)
# select actions according to the states (input images with 4 channels)
def select(self,state,a_space):
actions = self.evl_net(state).data.tolist()
if(random.random() <= self.epsilon):
action = random.randint(0,a_space-1)
else:
action = actions.index(max(actions))
return action
# DQN trainning progression
def train(self,state,batch_size):
s_v,a_v,next_s_v,r_v,dones = self.data_pre_process(batch_size)
self.tgt_net.load_state_dict(self.evl_net.state_dict())
evl_Q_value = self.evl_net(s_v).gather(0,a_v) # size: [32,6].gather() -> [32,1]
tgt = self.tgt_net(next_s_v).max(1)[0].detach() # size [32,1]
tgt_Q_value = (r_v + self.gamma * tgt)
for index in range(len(dones)):
if(dones[index]==True):
tgt[index][0] = -1
# print(tgt_Q_value)
tgt_Q_value = tgt_Q_value.reshape(batch_size,1) # size: [32, 1] cannot be back propagated
# print(tgt_Q_value)
self.optimal.zero_grad()
loss = self.cert(evl_Q_value, tgt_Q_value)
print(loss)
loss.backward()
for pr in self.evl_net.parameters():
pr.grad.data.clamp_(-1, 1)
self.optimal.step()
if(self.epsilon > self.min_epsilon):
self.epsilon *= self.dr
At the training stage, I found the first question. the condition of done in each episode is always false. With gym.wrappers I've pre-processed the image tensor into 48484 and the environment with only one life. But it still appears:
# main test
_display = Display(visible=0, size=(900,1400))
_display.start()
# set episode step and batch_size
episodes = 5000
batch_size = 32
env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n
agent = Agent(channels, a_space)
# env.render()
# testing:
for e in range(episodes):
# step 1: reset the agent at the beginning
s = np.array(env.reset())
for run in range(100):
score = 0
# display.clear_output(wait=True)
# display.display(Image.fromarray(env.render(mode='rgb_array')))
# env.render("rgb_array")
img = plt.imshow(env.render('rgb_array'))
# step 2: create state space tensor
# step 3: iterate actions
a = agent.select(th.Tensor(s).unsqueeze(0),a_space)
next_s, reward, done, _ = env.step(a)
if(done==True):
next_s = None
next_s = np.array(next_s) # done is never true. Why?
# step 4: record the data into buffer
dataset = (s,a,next_s,reward,done)
agent.record(dataset)
# step 5: update state steps
s = next_s
score += reward
if(done==True or run == 99):
print("episodes:",e,"score:",score,"epsilon: {:.2}".format(agent.epsilon))
break
# step 6: training and update CNN
if(len(agent.memory) > batch_size):
agent.train(channels,batch_size)
As I tried to find this problem, I detected that the loss value never even roughly decreases(at most fluctuate around 1.2). I rechecked the input and output tensor but found nothing else. I hope to get some help for how to fix these two problems. Many thanks!
I'm trying to understand how to use #tf.function properly in a A2C problem.
I constantly get the following error:
Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.
The agent is built as follows:
class Agent():
learning_rate = 0.0001
CLIP_EDGE = 1e-8
entropy = 0.0001
critic_weight = 0.95
def __init__(self,state_shape,action_size,hidden_neurons,memory,learning_rate = learning_rate, CLIP_EDGE = CLIP_EDGE, entropy = entropy,
critic_weight = critic_weight, actor_name = "actor",critic_name = "critic", policy_name = "policy",main_folder = "main_folder"):
self.state_shape = state_shape
self.action_size = action_size
self.hidden_neurons = hidden_neurons
self.memory = memory
self.learning_rate = learning_rate
self.CLIP_EDGE = CLIP_EDGE
self.entropy = entropy
self.critic_weight = critic_weight
self.actor_name = actor_name
self.critic_name = critic_name
self.policy_name = policy_name
self.main_folder = main_folder
self.actor, self.critic, self.policy = self.build_networks()
def act(self, state):
"""Selects an action for the agent to take given a game state.
Args:
state (list of numbers): The state of the environment to act on.
traning (bool): True if the agent is training.
Returns:
(int) The index of the action to take.
"""
# If not acting randomly, take action with highest predicted value.
state_batch = np.expand_dims(state, axis=0)
probabilities = self.policy.predict(state_batch)[0]
action = np.random.choice(self.action_size, p=probabilities)
return action
def learn(self, print_variables=False):
"""Trains the Deep Q Network based on stored experiences."""
gamma = self.memory.gamma
experiences = self.memory.sample()
state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
# One hot enocde actions
actions = np.zeros([len(action_mb), self.action_size])
actions[np.arange(len(action_mb)), action_mb] = 1
#Apply TD(0)
discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
state_values = self.critic.predict([state_mb])
advantages = discount_mb - np.squeeze(state_values)
if print_variables:
print("discount_mb", discount_mb)
print("next_value", next_value)
print("state_values", state_values)
print("advantages", advantages)
else:
self.actor.train_on_batch(
[state_mb, advantages], [actions, discount_mb])
def build_networks(self):
"""Creates Actor Critic Neural Networks.
Creates a hidden-layer Policy Gradient Neural Network. The loss
function is altered to be a log-likelihood function weighted
by an action's advantage.
"""
state_input = Input(shape=self.state_shape, name='frames')
advantages = Input((1,), name='advantages') # PG, A instead of G
# PG
actor_1 = Dense(units=self.hidden_neurons, activation="relu",name='actor1')(state_input)
actor_3 = Dense(units=int(self.hidden_neurons), activation="relu",name='actor3')(actor_1)
adrop_1 = Dropout(0.2,name='actor_drop_1')(actor_3)
actor_4 = Dense(units = self.hidden_neurons, activation="relu")(adrop_1)
probabilities = Dense(self.action_size, activation='softmax',name='actor_output')(actor_4)
# DQN
critic_1 = Dense(units = self.hidden_neurons,activation="relu",name='critic1')(state_input)
critic_3 = Dense(units = int(self.hidden_neurons), activation="relu",name='critic3')(critic_1)
cdrop_1 = Dropout(0.2,name='critic_drop_1')(critic_3)
critic_4 = Dense(units = self.hidden_neurons, activation="relu")(cdrop_1) #activation era relu por error... se cambio a elu, MONITOREAR
values = Dense(1, activation='linear',name='critic_output')(critic_4)
def actor_loss(y_true, y_pred): # PG
y_pred_clipped = K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE)
log_lik = y_true*K.log(y_pred_clipped)
entropy_loss = y_pred * K.log(K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE)) # New
return K.sum(-log_lik * advantages) - (self.entropy * K.sum(entropy_loss))
# Train both actor and critic at the same time.
actor = Model(
inputs=[state_input, advantages], outputs=[probabilities, values])
actor.compile(
loss=[actor_loss, 'mean_squared_error'], # [PG, DQN]
loss_weights=[1, self.critic_weight], # [PG, DQN]
optimizer=Adam(learning_rate=self.learning_rate))#,clipnorm=1.0))
critic = Model(inputs=[state_input], outputs=[values])
policy = Model(inputs=[state_input], outputs=[probabilities])
tf.keras.utils.plot_model(actor,f"{self.main_folder}/Agents/{self.actor_name}.png",show_shapes=True)
tf.keras.utils.plot_model(critic,f"{self.main_folder}/Agents/{self.critic_name}.png",show_shapes=True)
tf.keras.utils.plot_model(policy,f"{self.main_folder}/Agents/{self.policy_name}.png",show_shapes=True)
return actor, critic, policy
The loop where the agent interacts with the environment is this:
with tf.Graph().as_default():
agent = Agent()
environment = Environment()
state = environment.reset()
done = False
while not done:
acion = agent.act(state)
state,reward,done,info = environment.step(action)
next_value = agent.critic.predict([[state]])
agent.memory.add((state,action,reward,done,next_value))
if agent.memory.full():
agent.learn()
This works fine. My problem comes when I try to switch to use #tf.function because it seems (afaik) that increases training speed (also did a little of benchmark in a jupyter notebook and it is actually faster).
The "refactored" code is this:
The main loop:
agent = Agent()
environment = Environment()
state = environment.reset()
done = False
while not done:
acion = agent.act(state)
state,reward,done,info = environment.step(action)
next_value = agent.model_predict(agent.critic,[[state]]).numpy() #REMOVED .predict FROM MODEL
agent.memory.add((state,action,reward,done,next_value))
if agent.memory.full():
agent.learn()
The modified functions in the Agent class:
#tf.function #NEW FUNCTION ADDED USING #tf.function
def model_predict(self,model,x):
return model(x)
def act(self, state): #MODIFIED FUNCTION, NOW USES self.model_predict
"""Selects an action for the agent to take given a game state.
Args:
state (list of numbers): The state of the environment to act on.
traning (bool): True if the agent is training.
Returns:
(int) The index of the action to take.
"""
# If not acting randomly, take action with highest predicted value.
state_batch = np.expand_dims(state, axis=0)
probabilities = self.model_predict(self.policy,state_batch).numpy()[0]
action = np.random.choice(self.action_size, p=probabilities)
return action
def learn(self, print_variables=False): #MODIFIED FUNCTION, NOW USES self.model_predict
"""Trains the Deep Q Network based on stored experiences."""
gamma = self.memory.gamma
experiences = self.memory.sample()
state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
# One hot enocde actions
actions = np.zeros([len(action_mb), self.action_size])
actions[np.arange(len(action_mb)), action_mb] = 1
#Apply TD(0)
discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
state_values = self.model_predict(self.critic,[state_mb]).numpy()
advantages = discount_mb - np.squeeze(state_values)
if print_variables:
print("discount_mb", discount_mb)
print("next_value", next_value)
print("state_values", state_values)
print("advantages", advantages)
else:
self.actor.train_on_batch(
[state_mb, advantages], [actions, discount_mb])
The error is triggered when self.actor.train_on_batch is executed, giving me the error mentioned above. Why this happens and what I'm doing wrong?
I have created a custom environment in open ai gym and i am facing error while loading the weights Could some one help me to resolve the issue . I am training a TD3 network in a custom environment and i have trained successfully but while inferencing i am facing this issue
class Actor(nn.Module):
def __init__(self, state_dim, action_dim, max_action):
super(Actor, self).__init__()
self.layer_1 = nn.Linear(state_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, action_dim)
self.max_action = max_action
def forward(self, x):
x = F.relu(self.layer_1(x))
x = F.relu(self.layer_2(x))
x = self.max_action * torch.tanh(self.layer_3(x))
return x
class Critic(nn.Module):
def __init__(self, state_dim, action_dim):
super(Critic, self).__init__()
# Defining the first Critic neural network
self.layer_1 = nn.Linear(state_dim + action_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, 1)
# Defining the second Critic neural network
self.layer_4 = nn.Linear(state_dim + action_dim, 400)
self.layer_5 = nn.Linear(400, 300)
self.layer_6 = nn.Linear(300, 1)
def forward(self, x, u):
xu = torch.cat([x, u], 1)
# Forward-Propagation on the first Critic Neural Network
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
# Forward-Propagation on the second Critic Neural Network
x2 = F.relu(self.layer_4(xu))
x2 = F.relu(self.layer_5(x2))
x2 = self.layer_6(x2)
return x1, x2
def Q1(self, x, u):
xu = torch.cat([x, u], 1)
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
return x1
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Building the whole Training Process into a class
class TD3(object):
def __init__(self, state_dim, action_dim, max_action):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target.load_state_dict(self.actor.state_dict())
self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
self.max_action = max_action
def select_action(self, state):
state = torch.Tensor(state.reshape(1, -1)).to(device)
return self.actor(state).cpu().data.numpy().flatten()
def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
for it in range(iterations):
# Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
state = torch.Tensor(batch_states).to(device)
next_state = torch.Tensor(batch_next_states).to(device)
action = torch.Tensor(batch_actions).to(device)
reward = torch.Tensor(batch_rewards).to(device)
done = torch.Tensor(batch_dones).to(device)
# Step 5: From the next state s’, the Actor target plays the next action a’
next_action = self.actor_target(next_state)
# Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
noise = noise.clamp(-noise_clip, noise_clip)
next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
# Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
# Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
target_Q = torch.min(target_Q1, target_Q2)
# Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
target_Q = reward + ((1 - done) * discount * target_Q).detach()
# Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
current_Q1, current_Q2 = self.critic(state, action)
# Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
# Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
# Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
if it % policy_freq == 0:
actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Making a save method to save a trained model
def save(self, filename, directory):
torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
# Making a load method to load a pre-trained model
def load(self, filename, directory):
self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
def evaluate_policy(policy, eval_episodes=10):
avg_reward = 0.
for _ in range(eval_episodes):
obs = env.reset()
done = False
while not done:
action = policy.select_action(np.array(obs))
obs, reward, done, _ = env.step(action)
avg_reward += reward
avg_reward /= eval_episodes
print ("---------------------------------------")
print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
print ("---------------------------------------")
return avg_reward
env_name = "Pygame-v0"
seed = 0
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")
eval_episodes = 10
save_env_vid = True
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
if save_env_vid:
env = wrappers.Monitor(env, monitor_dir, force = True)
env.reset()
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)
#policy.load(file_name, './pytorch_models/')
policy.load(file_name,"/content/gdrive/My Drive/reinforce/gym_game/pytorch_models")
_ = evaluate_policy(policy, eval_episodes=eval_episodes)
Traceback:
I am facing a runtime error while loading the state_dict for actor model .I searched google but couldnt find similar issues .
RuntimeError: Error(s) in loading state_dict for Actor:
Missing key(s) in state_dict: "layer_1.weight", "layer_1.bias", "layer_2.weight", "layer_2.bias", "layer_3.weight", "layer_3.bias".
Unexpected key(s) in state_dict: "encoder.0.weight", "encoder.0.bias", "encoder.2.weight", "encoder.2.bias", "encoder.2.running_mean", "encoder.2.running_var", "encoder.2.num_batches_tracked", "encoder.3.weight", "encoder.3.bias", "encoder.5.weight", "encoder.5.bias", "encoder.5.running_mean", "encoder.5.running_var", "encoder.5.num_batches_tracked", "encoder.6.weight", "encoder.6.bias", "encoder.8.weight", "encoder.8.bias", "encoder.8.running_mean", "encoder.8.running_var", "encoder.8.num_batches_tracked", "encoder.10.weight", "encoder.10.bias", "encoder.12.weight", "encoder.12.bias", "encoder.12.running_mean", "encoder.12.running_var", "encoder.12.num_batches_tracked", "encoder.13.weight", "encoder.13.bias", "encoder.15.weight", "encoder.15.bias", "encoder.15.running_mean", "encoder.15.running_var", "encoder.15.num_batches_tracked", "encoder.16.weight", "encoder.16.bias", "linear.0.weight", "linear.0.bias", "linear.2.weight", "linear.2.bias".
it was answered by #MicaelJungo
The weights you saved were not from the model you are using here. Make sure to load the correct checkpoint, which was created when training this particular model.
I have built an encoder-decoder model with attention for morph inflection generation. I am able to train the model and predict on test data but I am getting wrong predicting after loading a saved model
I am not getting any error during saving or loading but
When I load a saved model its predictions are completely wrong. It looks like some parameters are not getting saved.
I have tried to load and save the model using both techniques
using state_dict() eg. torch.save(encoder.state_dict(),'path')
saving complete model eg.torch.save(encoder,'path')
I have tried to save different classes one by one and also making a superclass that initiates all those class and then saving just superclass
but nothing seems to be working
Encoder class
class Encoder(nn.Module):
def __init__(self,vocab_size,embedding_size, encoder_hid_dem,decoder_hid_dem,bidirectional,dropout):
super().__init__()
self.encoder_hid_dem = encoder_hid_dem
self.encoder_n_direction=1;
self.bias = False
self.dropout=dropout
if(bidirectional==True):
self.encoder_n_direction=2;
self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, padding_idx=0)
self.GRU_layer = nn.GRU(input_size=embedding_size, hidden_size=encoder_hid_dem, batch_first=True, bidirectional=bidirectional)
self.fc = nn.Linear(encoder_hid_dem*self.encoder_n_direction,decoder_hid_dem)
self.dropout = nn.Dropout(dropout)
def forward(self, input_word):
# print(input_word.size())
#[batch_size src_sent_lent]
embed_out = self.embedding_layer(input_word)
#[BATCH_SIZE src_sent_lent embedding_dim]
embed_out = F.relu(embed_out)
embed_out = self.dropout(embed_out)
self.batch = embed_out.size()[0]
# hidden = self.init_hidden()
GRU_out,hidden = self.GRU_layer(embed_out)
# print(GRU_out.size())
# print(hidd.size())
#[BATCH_SIZE sec_sent_len n_direction*hid_dem]
#[n_layer*n_direction batch_size hid_dem]
#where the first hid_dim elements in the third axis are the hidden states from the top layer forward RNN, and the last hid_dim elements are hidden states from the top layer backward RNN
#hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
#hidden [-2, :, : ] is the last of the forwards RNN
#hidden [-1, :, : ] is the last of the backwards RNN
GRU_out = F.relu(GRU_out)
hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)))
# print(GRU_out.size())
# print(hidden.size())
#outputs = [batch_size src sent len, encoder_hid_dim * n_direction]
#hidden = [batch size, dec hid dim]
return GRU_out,hidden
def init_hidden(self):
return (Variable(torch.eye(1, self.encoder_hid_dem)).unsqueeze(1).repeat(2, self.batch, 1).to(self.device))
Attention class
class Attention(nn.Module):
def __init__(self,encoder_hid_dem,decoder_hid_dem,bidirectional):
super().__init__()
self.enc_hid_dim = encoder_hid_dem
self.dec_hid_dim = decoder_hid_dem
self.encoder_n_direction=1;
if(bidirectional==True):
self.encoder_n_direction=2;
self.attn = nn.Linear((encoder_hid_dem * self.encoder_n_direction) + decoder_hid_dem, decoder_hid_dem)
self.v = nn.Parameter(torch.rand(decoder_hid_dem))
def forward(self, hidden, encoder_outputs):
#hidden = [batch size, dec hid dim]
#encoder_outputs = [batch_size ,src sent len, enc hid dim * encoder_n_direction]
batch_size = encoder_outputs.shape[0]
src_len = encoder_outputs.shape[1]
hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
#hidden = [batch size, src sent len, dec hid dim]
#encoder_outputs = [batch size, src sent len, enc hid dim * encoder_n_direction]
energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
#energy = [batch size, src sent len, dec hid dim]
energy = energy.permute(0, 2, 1)
#energy = [batch size, dec hid dim, src sent len]
#v = [dec hid dim]
v = self.v.repeat(batch_size, 1).unsqueeze(1)
#v = [batch size, 1, dec hid dim]
attention = torch.bmm(v, energy).squeeze(1)
#attention= [batch size, src len]
return F.softmax(attention, dim=1)
Decoder class
class Decoder(nn.Module):
def __init__(self, decoder_hid_dem, encoder_hid_dem, vocab_size,embedding_dim,attention,decoder_input_size,linear_input_size,bidirectional,dropout):
super().__init__()
self.encoder_hid_dem=encoder_hid_dem
self.decoder_hid_dem=decoder_hid_dem
self.attention=attention
self.dropout = dropout
self.output_dim = vocab_size
self.decoder_n_direction=1;
if(bidirectional==True):
self.decoder_n_direction=2;
self.GRU_layer_out = nn.GRU(decoder_input_size,decoder_hid_dem)
self.out_layer = nn.Linear(in_features=linear_input_size, out_features=vocab_size)
self.dropout = nn.Dropout(dropout)
#self.GRU_layer_out.bias = torch.nn.Parameter(torch.zeros(decoder_input_size))
def forward(self, feature, hidden,actual_word,encoder_outputs):
feature = feature.unsqueeze(1)
# print('decoder')
# print(feature.size())
#[batch_size src_sent_lent=1 feat_size=6]
# print(hidden.size())
# [batch_size dec_hid_dim]
# print(actual_word.size())
# [batch_size src_sent_lent=1 embedding_dim]
# print(encoder_outputs.size())
# outputs = [batch_size src sent len, encoder_hid_dim * encoder_n_directional]
a = self.attention(hidden,encoder_outputs)
# print(a.size())
# [batch_size src_sent_len]
a = a.unsqueeze(1)
#a = [batch size, 1, src len]
weighted = torch.bmm(a,encoder_outputs)
# print(weighted.size())
# weighted = [batch size, 1, enc_hid_dim * encoder_n_direction]
# if len(actual_word.size()) != 0:
input_char = torch.cat((actual_word,feature,weighted),2)
# else:
# input_char = torch.cat((feature,weighted),2)
input_char=input_char.permute(1,0,2)
# print(input_char.size())
# [1 BATCH_SIZE decoder_input_size]
hidden = hidden.unsqueeze(0)
# print(hidden.size())
#[1 batch_size decoder_hid_dem]
output, hidden = self.GRU_layer_out(input_char, hidden)
# print(output.size())
# [sent_len=1 batch_size decoder_n_direction*decoder_hid_dem]
# print(hidden.size())
# [n_layer*n_direction BATCH_SIZE hid_dem]
output = F.leaky_relu(output)
output = self.dropout(output)
output = torch.cat((output.squeeze(0),weighted.squeeze(1),actual_word.squeeze(1)),dim=1)
pre_out = self.out_layer(output)
predicted_output = F.log_softmax(pre_out, dim=1)
# print(predicted_output.size())
# [ batch_size vacab_size ]
return predicted_output, hidden.squeeze(0)
def init_hidden(self, batch):
return (Variable(torch.eye(1, self.decoder_hid_dem)).unsqueeze(1).repeat(1, batch, 1).to(self.device),Variable(torch.eye(1, self.decoder_hid_dem)).unsqueeze(1).repeat(1, batch, 1).to(self.device))
seq2seq class
class Seq2Seq(nn.Module):
def __init__(self,encoder,decoder,device):
super().__init__()
self.encoder = encoder
self.decoder = decoder
self.device = device
def forward(self,input_word,output_word,features_word,teaching_forcing_ratio,limit):
#print(input_word)
#print(input_word.size())
input_word = input_word.to(self.device)
output_word = output_word.to(self.device)
features_word = features_word.to(self.device)
batch_size= input_word.size()[0]
if(limit==0):
max_len = input_word.size()[1]
else:
max_len = limit
vocabsize = self.decoder.output_dim
actual_word = self.encoder.embedding_layer(torch.tensor(char_to_index['<sos>']).view(1, -1).to(self.device)).repeat(batch_size, 1, 1)
encoder_outputs,hidden = self.encoder(input_word)
features=features_word[:,:]
predicted_word = torch.zeros(max_len,batch_size,vocabsize).to(self.device)
for t in range(1,max_len):
output,hidden=self.decoder(features, hidden,actual_word,encoder_outputs)
#print(output.size())
predicted_word[t] = output
topv, topi = output.topk(1)
bs = topi.size()[0]
temp2 = torch.zeros(0,1,300).to(self.device)
for row in range(bs):
index = topi[row][0].item()
temp = self.encoder.embedding_layer(torch.tensor(index).view(1, -1).to(self.device))
temp2 = torch.cat((temp2,temp))
teacher_force = random.random() < teaching_forcing_ratio
if teacher_force == 1:
actual_word = self.encoder.embedding_layer(output_word[:,t]).unsqueeze(1)
else:
actual_word = temp2
return predicted_word
and this code is used to save and load model
torch.save(model.state_dict(), 'model.pt')
model.load_state_dict(torch.load('model.pt'))
I want that when I run my model on pre-trained weights, it should predict correctly acc to those weights
Your provided code for saving/loading parameters is wrong. The loading and saving model parameters are pretty straight-forward. In your case, it should be:
# loading
saved_params = torch.load(
filename, map_location=lambda storage, loc: storage
)
s2s.load_state_dict(saved_params)
# saving
params = s2s.state_dict()
torch.save(params, filename)
[Update]
You need to make the Seq2Seq class a derived class of PyTorch's nn.Module just like your encoder/decoder classes. Otherwise, you can't use the state_dict() method. You can assume Seq2Seq class is like a container that contains your whole network, although it does not have any learnable weights itself.