RuntimeError: Error(s) in loading state_dict for Actor - torch.load() - python-3.x

I have created a custom environment in open ai gym and i am facing error while loading the weights Could some one help me to resolve the issue . I am training a TD3 network in a custom environment and i have trained successfully but while inferencing i am facing this issue
class Actor(nn.Module):
def __init__(self, state_dim, action_dim, max_action):
super(Actor, self).__init__()
self.layer_1 = nn.Linear(state_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, action_dim)
self.max_action = max_action
def forward(self, x):
x = F.relu(self.layer_1(x))
x = F.relu(self.layer_2(x))
x = self.max_action * torch.tanh(self.layer_3(x))
return x
class Critic(nn.Module):
def __init__(self, state_dim, action_dim):
super(Critic, self).__init__()
# Defining the first Critic neural network
self.layer_1 = nn.Linear(state_dim + action_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, 1)
# Defining the second Critic neural network
self.layer_4 = nn.Linear(state_dim + action_dim, 400)
self.layer_5 = nn.Linear(400, 300)
self.layer_6 = nn.Linear(300, 1)
def forward(self, x, u):
xu = torch.cat([x, u], 1)
# Forward-Propagation on the first Critic Neural Network
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
# Forward-Propagation on the second Critic Neural Network
x2 = F.relu(self.layer_4(xu))
x2 = F.relu(self.layer_5(x2))
x2 = self.layer_6(x2)
return x1, x2
def Q1(self, x, u):
xu = torch.cat([x, u], 1)
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
return x1
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Building the whole Training Process into a class
class TD3(object):
def __init__(self, state_dim, action_dim, max_action):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target.load_state_dict(self.actor.state_dict())
self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
self.max_action = max_action
def select_action(self, state):
state = torch.Tensor(state.reshape(1, -1)).to(device)
return self.actor(state).cpu().data.numpy().flatten()
def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
for it in range(iterations):
# Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
state = torch.Tensor(batch_states).to(device)
next_state = torch.Tensor(batch_next_states).to(device)
action = torch.Tensor(batch_actions).to(device)
reward = torch.Tensor(batch_rewards).to(device)
done = torch.Tensor(batch_dones).to(device)
# Step 5: From the next state s’, the Actor target plays the next action a’
next_action = self.actor_target(next_state)
# Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
noise = noise.clamp(-noise_clip, noise_clip)
next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
# Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
# Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
target_Q = torch.min(target_Q1, target_Q2)
# Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
target_Q = reward + ((1 - done) * discount * target_Q).detach()
# Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
current_Q1, current_Q2 = self.critic(state, action)
# Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
# Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
# Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
if it % policy_freq == 0:
actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Making a save method to save a trained model
def save(self, filename, directory):
torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
# Making a load method to load a pre-trained model
def load(self, filename, directory):
self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
def evaluate_policy(policy, eval_episodes=10):
avg_reward = 0.
for _ in range(eval_episodes):
obs = env.reset()
done = False
while not done:
action = policy.select_action(np.array(obs))
obs, reward, done, _ = env.step(action)
avg_reward += reward
avg_reward /= eval_episodes
print ("---------------------------------------")
print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
print ("---------------------------------------")
return avg_reward
env_name = "Pygame-v0"
seed = 0
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")
eval_episodes = 10
save_env_vid = True
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
if save_env_vid:
env = wrappers.Monitor(env, monitor_dir, force = True)
env.reset()
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)
#policy.load(file_name, './pytorch_models/')
policy.load(file_name,"/content/gdrive/My Drive/reinforce/gym_game/pytorch_models")
_ = evaluate_policy(policy, eval_episodes=eval_episodes)
Traceback:
I am facing a runtime error while loading the state_dict for actor model .I searched google but couldnt find similar issues .
RuntimeError: Error(s) in loading state_dict for Actor:
Missing key(s) in state_dict: "layer_1.weight", "layer_1.bias", "layer_2.weight", "layer_2.bias", "layer_3.weight", "layer_3.bias".
Unexpected key(s) in state_dict: "encoder.0.weight", "encoder.0.bias", "encoder.2.weight", "encoder.2.bias", "encoder.2.running_mean", "encoder.2.running_var", "encoder.2.num_batches_tracked", "encoder.3.weight", "encoder.3.bias", "encoder.5.weight", "encoder.5.bias", "encoder.5.running_mean", "encoder.5.running_var", "encoder.5.num_batches_tracked", "encoder.6.weight", "encoder.6.bias", "encoder.8.weight", "encoder.8.bias", "encoder.8.running_mean", "encoder.8.running_var", "encoder.8.num_batches_tracked", "encoder.10.weight", "encoder.10.bias", "encoder.12.weight", "encoder.12.bias", "encoder.12.running_mean", "encoder.12.running_var", "encoder.12.num_batches_tracked", "encoder.13.weight", "encoder.13.bias", "encoder.15.weight", "encoder.15.bias", "encoder.15.running_mean", "encoder.15.running_var", "encoder.15.num_batches_tracked", "encoder.16.weight", "encoder.16.bias", "linear.0.weight", "linear.0.bias", "linear.2.weight", "linear.2.bias".

it was answered by #MicaelJungo
The weights you saved were not from the model you are using here. Make sure to load the correct checkpoint, which was created when training this particular model.

Related

multi-agent DQN learn single model for all agents

I'm trying to run a DQN for a multi-agent system, so there is one DNN for each agent.
It takes input=state [batch, state size, #time steps, #nodes], while for simplicity we assume #time steps=1. #nodes is number of agents. And output=Q-values for each agent.
The problem is that I test various stuff with this network, but it return not so consistent results. I suspect it has to do with me running separately DQN for each agent, but learning it via the same model. I sum the losses for all agents into one loss, and then it divide by their amount.
I'm not sure it is correct. I'd be grateful for any help.
Here's my code:
class DQN(nn.Module):
def __init__(self, args): #node_size, inputs, outputs, layers=[128, 64, 16]):
# state_size, n_actions = inputs, outputs
super(DQN, self).__init__()
self.model_type = args.model_type
if args.model_type == "seperate_state_DNN":
out_size = args.num_of_actions
self.shared_model = nn.Sequential()
h_sizes = [args.input_state_size] + args.layers
for k in range(len(h_sizes) - 1):
self.shared_model.add_module('k1'+str(k), nn.Linear(h_sizes[k], h_sizes[k + 1]))
self.shared_model.add_module('k2'+str(k), args.activations[args.layers_nl[k]])
self.shared_model.add_module('final', nn.Linear(h_sizes[-1], out_size))
def forward(self, input, i=None):
# input state dimension: [batch, state size, #time steps, #nodes]
if self.model_type == "seperate_state_DNN":
if i is None:
final_output = torch.zeros_like(input)
else:
final_output = self.shared_model(input) # [:, :, :, i].unsqueeze(3))
return final_output
And here is the calling function:
def select_action(self, state, edge_state):
#self.policy_net.eval()
sample = random.random()
if self.configuration == 2:
self.eps_threshold = 0.0 # no exploration at all, only optimal values!
else:
self.eps_threshold = self.decay_functionn()
self.steps_done += 1
if sample > self.eps_threshold:
self.last_exploration = False
with torch.no_grad():
# t.max(1) will return largest column value of each row.
# second column on max result is index of where max element was
# found, so we pick action with the larger expected reward.
state = state.to(self.device)# torch.from_numpy(state).float().to(self.device) # Convert to tensor.
state = state.unsqueeze(0) # Add batch dimension (also to action below): [batch=1, #time steps, #nodes, state size]
final_output = []
x1 = self.policy_net(state, None)#.detach()
for i in range(self.node_size):
final_output.append(self.policy_net(x1[:, :, -1, i]+state[:, :, -1, i], i).max(1)[1].detach().cpu().view(state.shape[0], -1))
# .to(self.device) # action dimension: [batch=1, #nodes]
return torch.cat(final_output, dim=1)
else:
self.last_exploration = True
return torch.randint(0, self.n_actions, (1, self.node_size))
And this is the main RL training loop:
for epi in range(self.episodes):
print("### Starting Episode: ", epi, ' ### in index=', self.run_index)
state = env.reset(self, heatup=self.sim_heatup) # single step state
done = False
while not done:
action = agent.select_action(state) # .to(device)
next_state1, reward, done = env.do_step(action)
agent.add_to_memory(state, action, next_state, reward)
agent.optimize_model()
state = next_state
agent.curr_episode += 1
# Plot and dump statistics and learning curves.
agent.dump_data_on_episode_end(plot=True)
env.capture_episode()
env.close()
Finally, this is the optimization, executed in "agent.optimize_model()" above, including the functions it uses:
def optimize_model(self):
if len(self.memory) < self.batch_size:
return
transitions = self.memory.sample(self.batch_size)
# This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = Transition(*zip(*transitions))
next_states_batch = torch.stack(batch.next_state).to(self.device)
state_batch = torch.stack(batch.state).to(self.device)
action_batch = torch.cat(batch.action).view(self.batch_size, -1).to(self.device) #torch.stack(batch.action, dim=0).to(self.device)
reward_batch = torch.cat(batch.reward).view(self.batch_size, -1).to(self.device)
# dims: states=[batch, steps, nodes, state size]; action=[batch, nodes]; reward=[batch, nodes]
loss = torch.tensor(0., device=self.device)
self.policy_net.train() # IM NOT SURE IF IT SHOULD BE HERE...
x1 = self.policy_net(state_batch, None)
x2 = self.policy_net(next_states_batch, None)
for i in range(self.node_size):
action_batch1 = action_batch[:,i].unsqueeze(1).reshape(-1, 1) # action=[batchXnodes, 1]
reward_batch1 = reward_batch[:,i].unsqueeze(1).view(-1, 1) # reward=[batchXnodes, 1]
# Compute loss
loss += self._compute_loss(i, x1[:, :, -1, i]+state_batch[:, :, -1, i], edge_state_batch, action_batch1,
x2[:, :, -1, i]+next_states_batch[:, :, -1, i], next_edge_state_batch, reward_batch1)
# Optimize the model
loss.div_(self.node_size)
self.optimizer.zero_grad()
loss.backward()
# clip grad
if self.grad_clip is not None:
for param in self.policy_net.parameters():
param.grad.data.clamp_(-self.grad_clip, self.grad_clip)
# update Policy net weights
self.optimizer.step()
#del loss
self.losses.append(loss.detach().cpu().numpy())
# update Target net weights
self._update_target()
def _compute_loss(self, i, state_batch, edge_state_batch, action_batch, next_states_batch, next_edge_state_batch, reward_batch):
# Q{policy net}(s, a): [batchXnodes, actions] ---gather---> [batchXnodes, 1=q_values according to this policy]
state_action_q_values = self.policy_net(state_batch, i).gather(1, action_batch)
# argmax{a} Q{policy net}(s', a'): [batchXnodes, actions] ---argmax---> [batchXnodes] ---unsqueeze---> [batchXnodes, 1]
next_state_actions = torch.argmax(self.policy_net(next_states_batch, i), dim=1).unsqueeze(1)
# Q{ploicy net}(s', argmax{a} Q{target net}(s', a') ): [batchXnodes, actions] --gather--> [batchXnodes, 1=q_values according to this policy]
next_state_q_values = self.target_net(next_states_batch, i).gather(1, next_state_actions)
# Q* = Disount * Q(s', argmax(..)) + R: [batchXnodes, 1]
expected_state_action_values = (next_state_q_values.detach() * self.discount) + reward_batch
loss = F.smooth_l1_loss(state_action_q_values, expected_state_action_values)
return loss
def _update_target(self):
if self.target_net is None:
# There is nothing to update.
return
# Update the target network, copying all weights and biases in DQN
if self.target_update > 1:
# Hard copy of weights.
if self.steps_done % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
return
elif self.target_update < 1 and self.target_update > 0:
# polyak averaging:
tau = self.target_update
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(tau * param + (1 - tau) * target_param)
return
else:
raise NotImplementedError
Sorry for the large question, I just wanted to supply all the necessary information.
If more information is needed I'd be happy to give it.
Any suggestion is much appreciated.
Thanks,
Shimon

DQN model for Atari game never learns

I'm trying to implement an Pong game with DQN model by torch. However I got two problems during the execution. Firstly, I found that the game never get done. Secondly, I found the loss function does not have any change in the trainning. This is my code below:
I defined a CNN network with the input of the size (batch=32, channels=4, height=84, weight=84). By this step there's nothing wrong happened:
class CNN(nn.Module):
def __init__(self, s_channels, a_space):
super(CNN, self).__init__()
self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
self.conv1 = nn.Conv2d(s_channels,out_channels=32,kernel_size=8,stride=4)
self.conv2 = nn.Conv2d(32,64,4,2)
self.conv3 = nn.Conv2d(64,64,3,1)
self.fc1 = nn.Linear(64*4*4,1024)
self.fc2 = nn.Linear(1024,512)
self.fc3 = nn.Linear(512,a_space)
def forward(self,input):
output = self.pool(F.relu(self.conv1(input)))
output = self.pool(F.relu(self.conv2(output)))
output = self.pool(F.relu(self.conv3(output)))
output = output.view(-1,64*4*4)
output = F.relu(self.fc1(output))
output = F.relu(self.fc2(output))
output = F.relu(self.fc3(output))
return output
For the agent class, I defined a back propagation function to replay the weight in CNN and the data pre-processing function:
# Agent
class Agent():
def __init__(self, s_space, a_space) -> None:
# define parameters
self.epsilon = 1.0
self.min_epsilon = 0.01
self.dr = 0.995
self.lr = 0.001
self.gamma = 0.9
# define models
self.evl_net = CNN(s_space, a_space)
self.tgt_net = CNN(s_space, a_space)
self.cert = nn.SmoothL1Loss()
self.optimal = th.optim.Adam(self.evl_net.parameters(),lr=self.lr)
# define memory store
self.memory = deque(maxlen=2000)
# self.img_stack = deque(maxlen=4)
# pre-processing frame images: transform the imaages into tensors
# def bsl_image_pre_process(self,env):
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def gym_image_pre_process(self,env):
#Atari preprocessing
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
#create frame stack
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
return env,channels
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def data_pre_process(self,batch_size):
s_v = []
a_v = []
next_s_v = []
r_v = []
dones = []
materials = random.sample(self.memory,batch_size)
for t in materials:
s_v.append(t[0])
a_v.append(t[1])
next_s_v.append(t[2])
r_v.append(t[3])
dones.append(t[4])
# print(th.FloatTensor(r_v))
# print(th.FloatTensor(r_v).size())
# print(s_v)
s_v = th.Tensor(s_v) # size: [32,3,210,160]
a_v = th.LongTensor(a_v).unsqueeze(1) # size: [32,1]
next_s_v = th.Tensor(next_s_v) # size: [32,3,210,160]
r_v = th.FloatTensor(r_v) # size: [32]
return s_v, a_v, next_s_v, r_v, dones
# remember the transformed images
def record(self,tpl):
self.memory.append(tpl)
# select actions according to the states (input images with 4 channels)
def select(self,state,a_space):
actions = self.evl_net(state).data.tolist()
if(random.random() <= self.epsilon):
action = random.randint(0,a_space-1)
else:
action = actions.index(max(actions))
return action
# DQN trainning progression
def train(self,state,batch_size):
s_v,a_v,next_s_v,r_v,dones = self.data_pre_process(batch_size)
self.tgt_net.load_state_dict(self.evl_net.state_dict())
evl_Q_value = self.evl_net(s_v).gather(0,a_v) # size: [32,6].gather() -> [32,1]
tgt = self.tgt_net(next_s_v).max(1)[0].detach() # size [32,1]
tgt_Q_value = (r_v + self.gamma * tgt)
for index in range(len(dones)):
if(dones[index]==True):
tgt[index][0] = -1
# print(tgt_Q_value)
tgt_Q_value = tgt_Q_value.reshape(batch_size,1) # size: [32, 1] cannot be back propagated
# print(tgt_Q_value)
self.optimal.zero_grad()
loss = self.cert(evl_Q_value, tgt_Q_value)
print(loss)
loss.backward()
for pr in self.evl_net.parameters():
pr.grad.data.clamp_(-1, 1)
self.optimal.step()
if(self.epsilon > self.min_epsilon):
self.epsilon *= self.dr
At the training stage, I found the first question. the condition of done in each episode is always false. With gym.wrappers I've pre-processed the image tensor into 48484 and the environment with only one life. But it still appears:
# main test
_display = Display(visible=0, size=(900,1400))
_display.start()
# set episode step and batch_size
episodes = 5000
batch_size = 32
env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n
agent = Agent(channels, a_space)
# env.render()
# testing:
for e in range(episodes):
# step 1: reset the agent at the beginning
s = np.array(env.reset())
for run in range(100):
score = 0
# display.clear_output(wait=True)
# display.display(Image.fromarray(env.render(mode='rgb_array')))
# env.render("rgb_array")
img = plt.imshow(env.render('rgb_array'))
# step 2: create state space tensor
# step 3: iterate actions
a = agent.select(th.Tensor(s).unsqueeze(0),a_space)
next_s, reward, done, _ = env.step(a)
if(done==True):
next_s = None
next_s = np.array(next_s) # done is never true. Why?
# step 4: record the data into buffer
dataset = (s,a,next_s,reward,done)
agent.record(dataset)
# step 5: update state steps
s = next_s
score += reward
if(done==True or run == 99):
print("episodes:",e,"score:",score,"epsilon: {:.2}".format(agent.epsilon))
break
# step 6: training and update CNN
if(len(agent.memory) > batch_size):
agent.train(channels,batch_size)
As I tried to find this problem, I detected that the loss value never even roughly decreases(at most fluctuate around 1.2). I rechecked the input and output tensor but found nothing else. I hope to get some help for how to fix these two problems. Many thanks!

problem changing input type in pytorch reinforcement learning

I'm trying to tun a code I found on github, but keep crashing in one part and getting TypeError: only size-1 arrays can be converted to Python scalars error. I've been trying to solve it for two days now. If I understand the problem correctly I have incompatible dtypes of tensors yet I've no idea how to fix this. Everytime I try to change my tensor type to say DoubleTensor, or float64 I get other errors. I simply haven't got a clue now which part needs to be changed and how.
This is my model defined:
class Policy(nn.Module):
def __init__(self):
super(Policy, self).__init__()
self.input_layer = nn.Linear(11, 128)
self.hidden_1 = nn.Linear(128, 128)
self.hidden_2 = nn.Linear(32,31)
self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()
self.rnn = nn.GRU(128, 32, 2)
self.action_head = nn.Linear(31, 5)
self.value_head = nn.Linear(31, 1)
self.saved_actions = []
self.rewards = []
def reset_hidden(self):
self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()
def forward(self, x):
x = torch.tensor(x).cuda()
x = torch.sigmoid(self.input_layer(x))
x = torch.tanh(self.hidden_1(x))
x, self.hidden_state = self.rnn(x.view(1,-1,128), self.hidden_state.data)
x = F.relu(self.hidden_2(x.squeeze()))
action_scores = self.action_head(x)
state_values = self.value_head(x)
return F.softmax(action_scores, dim=-1), state_values
def forward(self, x):
conv_out = self.conv(x).view(x.size()[0], -1)
val = self.fc_val(conv_out)
adv = self.fc_adv(conv_out)
return val + (adv - adv.mean(dim=1, keepdim=True))
def act(self, state):
probs, state_value = self.forward(state)
m = Categorical(probs)
action = m.sample()
if action == 1 and env.state[0] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
if action == 4 and env.state[1] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
if action == 6 and env.state[2] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
self.saved_actions.append((m.log_prob(action), state_value))
return action.item()
This is where it crashes
env.reset()
# In case you're running this a second time with the same model, delete the gradients
del model.rewards[:]
del model.saved_actions[:]
gamma = 0.9
log_interval = 60
def finish_episode():
R = 0
saved_actions = model.saved_actions
policy_losses = []
value_losses = []
rewards = []
for r in model.rewards[::-1]:
R = r + (gamma * R)
rewards.insert(0, R)
rewards = torch.tensor(rewards)
epsilon = (torch.rand(1) / 1e4) - 5e-5
# With different architectures, I found the following standardization step sometimes
# helpful, sometimes unhelpful.
#
rewards = (rewards - rewards.mean()) / (rewards.std(unbiased=False) + epsilon)
# Alternatively, comment it out and use the following line instead:
rewards += epsilon
for (log_prob, value), r in zip(saved_actions, rewards):
reward = torch.tensor(r - value.item()).cuda()
policy_losses.append(-log_prob * reward)
value_losses.append(F.smooth_l1_loss(value, torch.tensor([r]).cuda()))
optimizer.zero_grad()
loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
loss = torch.clamp(loss, -1e-5, 1e5)
loss.backward()
optimizer.step()
del model.rewards[:]
del model.saved_actions[:]
running_reward = 0
for episode in range(0, 4000):
state = env.reset()
state = state.type(torch.float32)
reward = 0
done = False
msg = None
while not done:
action = model.act(state)
state, reward, done, msg = env.step(action)
model.rewards.append(reward)
if done:
break
running_reward = running_reward * (1 - 1/log_interval) + reward * (1/log_interval)
finish_episode()
# Resetting the hidden state seems unnecessary - it's effectively random from the previous
# episode anyway, more random than a bunch of zeros.
# model.reset_hidden()
if msg["msg"] == "done" and env.portfolio_value() > env.starting_portfolio_value * 1.1 and running_reward > 500:
print("Early Stopping: " + str(int(reward)))
break
if episode % log_interval == 0:
print("""Episode {}: started at {:.1f}, finished at {:.1f} because {} # t={}, \
last reward {:.1f}, running reward {:.1f}""".format(episode, env.starting_portfolio_value, \
env.portfolio_value(), msg["msg"], env.cur_timestep, reward, running_reward))
This is the error I get
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-74-21b617d2e36f> in <module>()
46 msg = None
47 while not done:
---> 48 action = model.act(state)
49 state, reward, done, msg = env.step(action)
50 model.rewards.append(reward)
4 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1751 if has_torch_function_variadic(input, weight):
1752 return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
-> 1753 return torch._C._nn.linear(input, weight, bias)
1754
1755
RuntimeError: expected scalar type Double but found Float
This is the github I'm using:
https://github.com/tomgrek/RL-stocktrading/blob/master/Finance%20final.ipynb
I was suggested that teh problem is because when I use "state" as a parameter torch throws an error cause it expects a numeric type, but I cannot change state to any float, because I get another error that list can't be changed to float32.
I'll be super grateful if you could show me idiot-proof what I'm doing wrong.
First are you using the same environment "env" and/or dataset ?
Second, you added this line state = state.type(torch.float32) and it didn't throw an error so I suppose state was already a tensor (which is a bit weird). If you had to change the type to float32 then in the next while loop you may have forgotten to change the type.
while not done:
action = model.act(state)
state, reward, done, msg = env.step(action)
state = state.float() # To add as I think env.step(action) returns a long tensor for some reason
model.rewards.append(reward)
if done:
break
Good luck.

Gradient is equal to 'None'

I have two networks. The output of the first network is the input to the other. In order to calculate the loss for the second network, I use vanilla policy gradient. I want to backpropagate this loss into the first network. After checking if the gradeints has changed, I see that they are all none.
I first load the first network (a pre-trained autoencoer in my network this way):
def load_checkpoint(filepath, model):
checkpoint = torch.load(filepath)
model.load_state_dict(checkpoint['state_dict'])
for parameter in model.parameters():
parameter.requires_grad = True
model.train()
return model
Then I define the optimizers for both networks this way:
class MultipleOptimizer(object):
def __init__(self, *op):
self.optimizers = op
def zero_grad(self):
for op in self.optimizers:
op.zero_grad()
def step(self):
for op in self.optimizers:
op.step()
opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9), Adam(logits_net.parameters(), lr=lr))
the reward function is:
#Reward function
def reward(x, act):
#print('action', act)
#print('x type', type(x))
km = KMeans(act, n_init=20, n_jobs=4)
y_pred = km.fit_predict(x.detach().cpu().numpy())# seems we can only get a centre from batch
#print('k-means output type', type(y_pred))
sil_score = sil(x.detach().cpu().numpy(), y_pred)
#print('sil score', sil_score)
return sil_score
The architecture of the second neural net and an alternative to avoid (logits=logits.mean(0)):
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
# Build a feedforward neural network. outputs are the logits
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
class mlp2(torch.nn.Module):
def __init__(self):
super(mlp2, self).__init__()
self.linear1 = nn.Linear(10,100)
self.relu1 = nn.ReLU(inplace=True)
self.linear2 = torch.nn.Linear(100,100)
self.linear3 = torch.nn.Linear(100,20)
self.linear4 = torch.nn.Linear(2000,100)
self.ident = nn.Identity()
def forward(self, x):
a = self.linear1(x)
a = self.relu1(a)
a = self.linear2(a)
a = self.relu1(a)
a = self.linear3(a)
a = torch.flatten(a)
a = self.linear4(a)
a = self.relu1(a)
a = self.linear3(a)
out = self.ident(a)
return out
Loss is calculated as in the following order:
def get_policy(obs):
logits = logits_net(obs)
return Categorical(logits=logits.mean(0))
def get_action(obs):
return get_policy(obs).sample().item()
def Logp(obs, act):
logp = get_policy(obs).log_prob(act.cuda())
return logp
def compute_loss(logp, weights):
return -(logp * weights).mean()
def train_one_epoch():
# make some empty lists for logging.
batch_obs = [] # for observations
batch_acts = [] # for actions
batch_weights = [] # for R(tau) weighting in policy gradient
batch_logp = []
# reset episode-specific variables
j = 1 # signal from environment that episode is over
ep_rews = [] # list for rewards accrued throughout ep
for i, data in enumerate(train_loader):
#Create the mean image out of those 100 images
x, label = data
x = model(x.cuda())#torch.Size([100, 10])
obs = x.data.cpu().numpy()#[100, 10] - a trajectory with only one state
# Save obs
batch_obs.append(obs.copy())
#act in the environment
#act = get_action(torch.as_tensor(obs, dtype=torch.float32))
act = get_action(x)
print('action type', type(act))
#log probability
#logp = Logp(torch.as_tensor(obs, dtype=torch.float32),act = torch.as_tensor(act, dtype=torch.int32))
logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32))
#rew = reward(obs, act+2)
rew = reward(x, act+2)
# save action, reward
batch_acts.append(act)
batch_weights.append(rew)#episode rewards
batch_logp.append(logp)
opt.zero_grad()
batch_logp = torch.stack(batch_logp, dim=0)
batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32),
weights = torch.as_tensor(batch_weights, dtype=torch.float32))
batch_loss.backward() #does it return anything? gradients? print them!
opt.step()
for name, param in logits_net.named_parameters():
print(name, param.grad)
I applied some changes with the assumption that maybe recreating some of the tensors maybe the issue:
I have the output of the first network, obs, converted like obs = x.data.cpu().numpy() this and then sent to get_action function: act = get_action(torch.as_tensor(obs, dtype=torch.float32)). I changes this to act = get_action(x) so, x is sent directly to this function. Also, change arguments of logp to logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32)).
After these changes, I still get the none value for the gradient. Is there anyway possible to backpropagate the gradient when loss is calculated this way? any changes that I can apply?
any help is appreciated.

Problem with reinforcement learning in keras not learning

My name is Andy and I am new to stackoverflow and this is my first question.
I started learning python 40ish days ago thanks to covid19 and jumped into machine learning/qlearning about 3 weeks ago and got stuck there since.
Goal:
have the computer play Rad Racer 2 (NES racing game) using reinforcement learning.
Plans to make this work:
after various tutorials/sites, I decided to use a double network to train/learn.
2x 256 convolution network using keras since I have watched a few tutorial vids on keras basic
3 actions(hold down accelerate(J), accelerate Left(JA), accelerate Right(JD)
I am using directinput keys codes I found online to send inputs to game as sending regular keys does not work.
I know ppl uses retro gym for these type of games but I wanted to see the inner working of reward/observation and such so I used yolov5 to detect lines/objects. Based on the result from yolov5, I calculate the reward for the step.
My input is a series of grayscale images(4) to represent motion using deque then stacked with numpy.
Once I have gather enough experiences/replay memory(1500) I started the training at the end of each of episode instead of each step. I found that it lag out a lot training after each step.
Problem:
My biggest problem currently is the model does not seem to learn properly. I seem to be slightly okay around episode 20-30 then after that it get worst and worst. It get to a point where it only does one action for hours.
I have tried playing around with the learning rate(0.1 - 0.00001), different inputs(1 bgr layer, grayscale layer, 4 layer..etc), different epsilon decay rate. I commented most of the reward stuffs, only basic reward for now.
most codes beside the yolo stuffs, had to removed a few lines due to # character limitation
# parameters
training = True
learning_rate = 0.0001
DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000 # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1500 # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 1000 # How many steps (samples) to use for training
batch_size = 32
UPDATE_TARGET_EVERY = 0 # Terminal states (end of episodes)
MODEL_NAME = 'RC'
MIN_REWARD = 0 # For model save
save_every = 5 # save every x episodes
EPISODES = 2_000
# Exploration settings
if training is True:
epsilon = 1 # not a constant, going to be decayed
else:
epsilon = 0
MIN_EPSILON = 0.01
START_EPISODE_DECAY = 0
END_EPISODE_DECAY = 20
if epsilon > MIN_EPSILON:
EPS_DECAY = -(epsilon/((END_EPISODE_DECAY-START_EPISODE_DECAY)/epsilon))
else:
EPS_DECAY = 0
# Agent class
class DQNAgent:
def __init__(self):
# Main model
self.model = self.create_model()
# self.model = self.load_model()
# Target network
self.target_model = self.create_model()
self.target_model.set_weights(self.model.get_weights())
# An array with last n steps for training
self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
# Used to count when to update target network with main network's weights
self.target_update_counter = 0
def create_model(self):
dropout = 0.1
model = Sequential()
model.add(Conv2D(256, (2, 2), input_shape=(int(height/resize_ratio), int(width/resize_ratio), img_channels)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(dropout))
model.add(Conv2D(256, (2, 2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(dropout))
model.add(Flatten())
model.add(Dense(64))
model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear')) # ACTION_SPACE_SIZE = how many choices (9)
model.compile(loss="mse", optimizer=Adam(lr=learning_rate), metrics=['accuracy'])
return model
# Trains main network at end of episode
def train(self, terminal_state):
# Start training only if certain number of samples is already saved
if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
return
minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
current_states = np.array([transition[0] for transition in minibatch])
# from (MINIBATCH_SIZE, 1, h, w, 4) > (MINIBATCH_SIZE, h, w, 4)
current_states = current_states.reshape(current_states.shape[0], current_states.shape[2],
current_states.shape[3], current_states.shape[4])
current_qs_list = self.model.predict(current_states)
new_current_states = np.array([transition[3] for transition in minibatch])
new_current_states = new_current_states.reshape(new_current_states.shape[0], new_current_states.shape[2],
new_current_states.shape[3], new_current_states.shape[4])
# new_current_states = np.expand_dims(new_current_states, axis=-1)
future_qs_list = self.target_model.predict(new_current_states)
X = []
y = []
for index, (current_state_img, current_action, current_reward, new_current_img, current_done) in enumerate(minibatch):
if not current_done:
max_future_q = np.max(future_qs_list[index])
new_q = current_reward + (DISCOUNT * max_future_q)
else:
new_q = 0.0
current_qs = current_qs_list[index]
current_qs[current_action] = new_q
X.append(np.squeeze(current_state_img, axis=0))
y.append(current_qs)
X = np.array(X)
# X = np.expand_dims(X, axis=-1)
# X = X.reshape(X.shape[0], X.shape[2], X.shape[3], X.shape[4])
y = np.array(y)
self.model.fit(X, y, batch_size=batch_size, verbose=0, shuffle=False)
# self.model.train_on_batch(X, y)
if terminal_state:
self.target_update_counter += 1
# If counter reaches set value, update target network with weights of main network
if self.target_update_counter > UPDATE_TARGET_EVERY:
self.target_model.set_weights(self.model.get_weights())
self.target_update_counter = 0
print('target_model trained!')
# Queries main network for Q values given current observation space (environment state)
def get_qs(self, state):
result = agent.model.predict(state)
result = result[0]
return result
agent = DQNAgent()
current_img_stack = deque(maxlen=4)
# make the game active
game = gw.getWindowsWithTitle('Mesen')[0]
game.activate()
time.sleep(1)
release_all()
# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
episode_reward = 0
step = 1
if episode <= START_EPISODE_DECAY - 1:
start_epsilon = False
elif episode >= END_EPISODE_DECAY + 1:
start_epsilon = False
else:
start_epsilon = True
# Reset environment and get initial state
# blackscreens followed by the 1st screen starting out
current_state = env.reset()
blackscreen = np.zeros_like(current_state)
current_img_stack.append(blackscreen)
current_img_stack.append(blackscreen)
current_img_stack.append(blackscreen)
current_img_stack.append(current_state)
stacked_state = np.stack(current_img_stack, axis=2)
stacked_state = np.ascontiguousarray(stacked_state, dtype=np.float32) / 255
stacked_state = np.transpose(stacked_state, (1, 0, 2))
stacked_state = np.expand_dims(stacked_state, axis=0)
start_time = time.time()
# Reset flag and start iterating until episode ends
done = False
while not done:
if np.random.random() > epsilon:
action = np.argmax(agent.get_qs(stacked_state))
else:
action = np.random.randint(0, env.ACTION_SPACE_SIZE)
new_state, reward, done, prediction, preview = env.step(action)
if done is False:
next_img_stack = current_img_stack
next_img_stack.append(new_state)
next_stack = np.stack(next_img_stack, axis=2)
next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
next_stack = np.transpose(next_stack, (1, 0, 2))
next_stack = np.expand_dims(next_stack, axis=0)
# current_state = new_state
current_img_stack = next_img_stack
stacked_state = next_stack
else:
next_img_stack = current_img_stack
next_img_stack.append(blackscreen)
next_stack = np.stack(next_img_stack, axis=2)
next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
next_stack = np.transpose(next_stack, (1, 0, 2))
next_stack = np.expand_dims(next_stack, axis=0)
step += 1
episode_reward += reward
ep_rewards.append(episode_reward)
if SHOW_PREVIEW:
env.render(preview, prediction)
if training is True:
agent.update_replay_memory((stacked_state, action, reward, next_stack, done))
# print(episode_reward)
if done is True:
ep_reward_final.append(episode_reward)
print(' Epsilon(' + str(epsilon) + ') EPtimes(' + str(time.time() - start_time) + ') done('
+ str(done) + ') step(' + str(step) + ') EPreward(' + str(episode_reward) +
') best_reward_this_session(' + str(max(ep_reward_final)) + ') fps(' +
str(step/(time.time() - start_time)) + ')')
# plot(ep_reward_final)
if training is True:
agent.train(done)
# Decay epsilon
if show_info is False and epsilon <= MIN_EPSILON:
print(f"\nEPS_DECAY ended on episode {episode} - epsilon {epsilon}")
epsilon = MIN_EPSILON
show_info = True
elif start_epsilon is True:
epsilon += EPS_DECAY

Resources