Cannot use model.train_on_batch when using #tf.function - python-3.x

I'm trying to understand how to use #tf.function properly in a A2C problem.
I constantly get the following error:
Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.
The agent is built as follows:
class Agent():
learning_rate = 0.0001
CLIP_EDGE = 1e-8
entropy = 0.0001
critic_weight = 0.95
def __init__(self,state_shape,action_size,hidden_neurons,memory,learning_rate = learning_rate, CLIP_EDGE = CLIP_EDGE, entropy = entropy,
critic_weight = critic_weight, actor_name = "actor",critic_name = "critic", policy_name = "policy",main_folder = "main_folder"):
self.state_shape = state_shape
self.action_size = action_size
self.hidden_neurons = hidden_neurons
self.memory = memory
self.learning_rate = learning_rate
self.CLIP_EDGE = CLIP_EDGE
self.entropy = entropy
self.critic_weight = critic_weight
self.actor_name = actor_name
self.critic_name = critic_name
self.policy_name = policy_name
self.main_folder = main_folder
self.actor, self.critic, self.policy = self.build_networks()
def act(self, state):
"""Selects an action for the agent to take given a game state.
Args:
state (list of numbers): The state of the environment to act on.
traning (bool): True if the agent is training.
Returns:
(int) The index of the action to take.
"""
# If not acting randomly, take action with highest predicted value.
state_batch = np.expand_dims(state, axis=0)
probabilities = self.policy.predict(state_batch)[0]
action = np.random.choice(self.action_size, p=probabilities)
return action
def learn(self, print_variables=False):
"""Trains the Deep Q Network based on stored experiences."""
gamma = self.memory.gamma
experiences = self.memory.sample()
state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
# One hot enocde actions
actions = np.zeros([len(action_mb), self.action_size])
actions[np.arange(len(action_mb)), action_mb] = 1
#Apply TD(0)
discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
state_values = self.critic.predict([state_mb])
advantages = discount_mb - np.squeeze(state_values)
if print_variables:
print("discount_mb", discount_mb)
print("next_value", next_value)
print("state_values", state_values)
print("advantages", advantages)
else:
self.actor.train_on_batch(
[state_mb, advantages], [actions, discount_mb])
def build_networks(self):
"""Creates Actor Critic Neural Networks.
Creates a hidden-layer Policy Gradient Neural Network. The loss
function is altered to be a log-likelihood function weighted
by an action's advantage.
"""
state_input = Input(shape=self.state_shape, name='frames')
advantages = Input((1,), name='advantages') # PG, A instead of G
# PG
actor_1 = Dense(units=self.hidden_neurons, activation="relu",name='actor1')(state_input)
actor_3 = Dense(units=int(self.hidden_neurons), activation="relu",name='actor3')(actor_1)
adrop_1 = Dropout(0.2,name='actor_drop_1')(actor_3)
actor_4 = Dense(units = self.hidden_neurons, activation="relu")(adrop_1)
probabilities = Dense(self.action_size, activation='softmax',name='actor_output')(actor_4)
# DQN
critic_1 = Dense(units = self.hidden_neurons,activation="relu",name='critic1')(state_input)
critic_3 = Dense(units = int(self.hidden_neurons), activation="relu",name='critic3')(critic_1)
cdrop_1 = Dropout(0.2,name='critic_drop_1')(critic_3)
critic_4 = Dense(units = self.hidden_neurons, activation="relu")(cdrop_1) #activation era relu por error... se cambio a elu, MONITOREAR
values = Dense(1, activation='linear',name='critic_output')(critic_4)
def actor_loss(y_true, y_pred): # PG
y_pred_clipped = K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE)
log_lik = y_true*K.log(y_pred_clipped)
entropy_loss = y_pred * K.log(K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE)) # New
return K.sum(-log_lik * advantages) - (self.entropy * K.sum(entropy_loss))
# Train both actor and critic at the same time.
actor = Model(
inputs=[state_input, advantages], outputs=[probabilities, values])
actor.compile(
loss=[actor_loss, 'mean_squared_error'], # [PG, DQN]
loss_weights=[1, self.critic_weight], # [PG, DQN]
optimizer=Adam(learning_rate=self.learning_rate))#,clipnorm=1.0))
critic = Model(inputs=[state_input], outputs=[values])
policy = Model(inputs=[state_input], outputs=[probabilities])
tf.keras.utils.plot_model(actor,f"{self.main_folder}/Agents/{self.actor_name}.png",show_shapes=True)
tf.keras.utils.plot_model(critic,f"{self.main_folder}/Agents/{self.critic_name}.png",show_shapes=True)
tf.keras.utils.plot_model(policy,f"{self.main_folder}/Agents/{self.policy_name}.png",show_shapes=True)
return actor, critic, policy
The loop where the agent interacts with the environment is this:
with tf.Graph().as_default():
agent = Agent()
environment = Environment()
state = environment.reset()
done = False
while not done:
acion = agent.act(state)
state,reward,done,info = environment.step(action)
next_value = agent.critic.predict([[state]])
agent.memory.add((state,action,reward,done,next_value))
if agent.memory.full():
agent.learn()
This works fine. My problem comes when I try to switch to use #tf.function because it seems (afaik) that increases training speed (also did a little of benchmark in a jupyter notebook and it is actually faster).
The "refactored" code is this:
The main loop:
agent = Agent()
environment = Environment()
state = environment.reset()
done = False
while not done:
acion = agent.act(state)
state,reward,done,info = environment.step(action)
next_value = agent.model_predict(agent.critic,[[state]]).numpy() #REMOVED .predict FROM MODEL
agent.memory.add((state,action,reward,done,next_value))
if agent.memory.full():
agent.learn()
The modified functions in the Agent class:
#tf.function #NEW FUNCTION ADDED USING #tf.function
def model_predict(self,model,x):
return model(x)
def act(self, state): #MODIFIED FUNCTION, NOW USES self.model_predict
"""Selects an action for the agent to take given a game state.
Args:
state (list of numbers): The state of the environment to act on.
traning (bool): True if the agent is training.
Returns:
(int) The index of the action to take.
"""
# If not acting randomly, take action with highest predicted value.
state_batch = np.expand_dims(state, axis=0)
probabilities = self.model_predict(self.policy,state_batch).numpy()[0]
action = np.random.choice(self.action_size, p=probabilities)
return action
def learn(self, print_variables=False): #MODIFIED FUNCTION, NOW USES self.model_predict
"""Trains the Deep Q Network based on stored experiences."""
gamma = self.memory.gamma
experiences = self.memory.sample()
state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
# One hot enocde actions
actions = np.zeros([len(action_mb), self.action_size])
actions[np.arange(len(action_mb)), action_mb] = 1
#Apply TD(0)
discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
state_values = self.model_predict(self.critic,[state_mb]).numpy()
advantages = discount_mb - np.squeeze(state_values)
if print_variables:
print("discount_mb", discount_mb)
print("next_value", next_value)
print("state_values", state_values)
print("advantages", advantages)
else:
self.actor.train_on_batch(
[state_mb, advantages], [actions, discount_mb])
The error is triggered when self.actor.train_on_batch is executed, giving me the error mentioned above. Why this happens and what I'm doing wrong?

Related

DQN model for Atari game never learns

I'm trying to implement an Pong game with DQN model by torch. However I got two problems during the execution. Firstly, I found that the game never get done. Secondly, I found the loss function does not have any change in the trainning. This is my code below:
I defined a CNN network with the input of the size (batch=32, channels=4, height=84, weight=84). By this step there's nothing wrong happened:
class CNN(nn.Module):
def __init__(self, s_channels, a_space):
super(CNN, self).__init__()
self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
self.conv1 = nn.Conv2d(s_channels,out_channels=32,kernel_size=8,stride=4)
self.conv2 = nn.Conv2d(32,64,4,2)
self.conv3 = nn.Conv2d(64,64,3,1)
self.fc1 = nn.Linear(64*4*4,1024)
self.fc2 = nn.Linear(1024,512)
self.fc3 = nn.Linear(512,a_space)
def forward(self,input):
output = self.pool(F.relu(self.conv1(input)))
output = self.pool(F.relu(self.conv2(output)))
output = self.pool(F.relu(self.conv3(output)))
output = output.view(-1,64*4*4)
output = F.relu(self.fc1(output))
output = F.relu(self.fc2(output))
output = F.relu(self.fc3(output))
return output
For the agent class, I defined a back propagation function to replay the weight in CNN and the data pre-processing function:
# Agent
class Agent():
def __init__(self, s_space, a_space) -> None:
# define parameters
self.epsilon = 1.0
self.min_epsilon = 0.01
self.dr = 0.995
self.lr = 0.001
self.gamma = 0.9
# define models
self.evl_net = CNN(s_space, a_space)
self.tgt_net = CNN(s_space, a_space)
self.cert = nn.SmoothL1Loss()
self.optimal = th.optim.Adam(self.evl_net.parameters(),lr=self.lr)
# define memory store
self.memory = deque(maxlen=2000)
# self.img_stack = deque(maxlen=4)
# pre-processing frame images: transform the imaages into tensors
# def bsl_image_pre_process(self,env):
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def gym_image_pre_process(self,env):
#Atari preprocessing
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
#create frame stack
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
return env,channels
# env = aw.AtariWrapper(env,noop_max=30,frame_skip=4,screen_size=84,terminal_on_life_loss=True,clip_reward = True)
# return env
def data_pre_process(self,batch_size):
s_v = []
a_v = []
next_s_v = []
r_v = []
dones = []
materials = random.sample(self.memory,batch_size)
for t in materials:
s_v.append(t[0])
a_v.append(t[1])
next_s_v.append(t[2])
r_v.append(t[3])
dones.append(t[4])
# print(th.FloatTensor(r_v))
# print(th.FloatTensor(r_v).size())
# print(s_v)
s_v = th.Tensor(s_v) # size: [32,3,210,160]
a_v = th.LongTensor(a_v).unsqueeze(1) # size: [32,1]
next_s_v = th.Tensor(next_s_v) # size: [32,3,210,160]
r_v = th.FloatTensor(r_v) # size: [32]
return s_v, a_v, next_s_v, r_v, dones
# remember the transformed images
def record(self,tpl):
self.memory.append(tpl)
# select actions according to the states (input images with 4 channels)
def select(self,state,a_space):
actions = self.evl_net(state).data.tolist()
if(random.random() <= self.epsilon):
action = random.randint(0,a_space-1)
else:
action = actions.index(max(actions))
return action
# DQN trainning progression
def train(self,state,batch_size):
s_v,a_v,next_s_v,r_v,dones = self.data_pre_process(batch_size)
self.tgt_net.load_state_dict(self.evl_net.state_dict())
evl_Q_value = self.evl_net(s_v).gather(0,a_v) # size: [32,6].gather() -> [32,1]
tgt = self.tgt_net(next_s_v).max(1)[0].detach() # size [32,1]
tgt_Q_value = (r_v + self.gamma * tgt)
for index in range(len(dones)):
if(dones[index]==True):
tgt[index][0] = -1
# print(tgt_Q_value)
tgt_Q_value = tgt_Q_value.reshape(batch_size,1) # size: [32, 1] cannot be back propagated
# print(tgt_Q_value)
self.optimal.zero_grad()
loss = self.cert(evl_Q_value, tgt_Q_value)
print(loss)
loss.backward()
for pr in self.evl_net.parameters():
pr.grad.data.clamp_(-1, 1)
self.optimal.step()
if(self.epsilon > self.min_epsilon):
self.epsilon *= self.dr
At the training stage, I found the first question. the condition of done in each episode is always false. With gym.wrappers I've pre-processed the image tensor into 48484 and the environment with only one life. But it still appears:
# main test
_display = Display(visible=0, size=(900,1400))
_display.start()
# set episode step and batch_size
episodes = 5000
batch_size = 32
env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n
agent = Agent(channels, a_space)
# env.render()
# testing:
for e in range(episodes):
# step 1: reset the agent at the beginning
s = np.array(env.reset())
for run in range(100):
score = 0
# display.clear_output(wait=True)
# display.display(Image.fromarray(env.render(mode='rgb_array')))
# env.render("rgb_array")
img = plt.imshow(env.render('rgb_array'))
# step 2: create state space tensor
# step 3: iterate actions
a = agent.select(th.Tensor(s).unsqueeze(0),a_space)
next_s, reward, done, _ = env.step(a)
if(done==True):
next_s = None
next_s = np.array(next_s) # done is never true. Why?
# step 4: record the data into buffer
dataset = (s,a,next_s,reward,done)
agent.record(dataset)
# step 5: update state steps
s = next_s
score += reward
if(done==True or run == 99):
print("episodes:",e,"score:",score,"epsilon: {:.2}".format(agent.epsilon))
break
# step 6: training and update CNN
if(len(agent.memory) > batch_size):
agent.train(channels,batch_size)
As I tried to find this problem, I detected that the loss value never even roughly decreases(at most fluctuate around 1.2). I rechecked the input and output tensor but found nothing else. I hope to get some help for how to fix these two problems. Many thanks!

TensorFlow 2.0 : ValueError - No Gradients Provided (After Modifying DDPG Actor)

Background
I'm currently trying to implement a DDPG framework to control a simple car agent. At first, the car agent would only need to learn how to reach the end of a straight path as quickly as possible by adjusting its acceleration. This task was simple enough, so I decided to introduce an additional steering action as well. I updated my observation and action spaces accordingly.
The lines below are the for loop that runs each episode:
for i in range(episodes):
observation = env.reset()
done = False
score = 0
while not done:
action = agent.choose_action(observation, evaluate)
observation_, reward, done, info = env.step(action)
score += reward
agent.remember(observation, action, reward, observation_, done)
if not load_checkpoint:
agent.learn()
observation = observation_
The lines below are my choose_action and learn functions:
def choose_action(self, observation, evaluate=False):
state = tf.convert_to_tensor([observation], dtype=tf.float32)
actions = self.actor(state)
if not evaluate:
actions += tf.random.normal(shape=[self.n_actions],
mean=0.0, stddev=self.noise)
actions = tf.clip_by_value(actions, self.min_action, self.max_action)
return actions[0]
def learn(self):
if self.memory.mem_cntr < self.batch_size:
return
state, action, reward, new_state, done = \
self.memory.sample_buffer(self.batch_size)
states = tf.convert_to_tensor(state, dtype=tf.float32)
states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
actions = tf.convert_to_tensor(action, dtype=tf.float32)
with tf.GradientTape() as tape:
target_actions = self.target_actor(states_)
critic_value_ = tf.squeeze(self.target_critic(
states_, target_actions), 1)
critic_value = tf.squeeze(self.critic(states, actions), 1)
target = reward + self.gamma*critic_value_*(1-done)
critic_loss = keras.losses.MSE(target, critic_value)
critic_network_gradient = tape.gradient(critic_loss,
self.critic.trainable_variables)
self.critic.optimizer.apply_gradients(zip(
critic_network_gradient, self.critic.trainable_variables))
with tf.GradientTape() as tape:
new_policy_actions = self.actor(states)
actor_loss = -self.critic(states, new_policy_actions)
actor_loss = tf.math.reduce_mean(actor_loss)
actor_network_gradient = tape.gradient(actor_loss,
self.actor.trainable_variables)
self.actor.optimizer.apply_gradients(zip(
actor_network_gradient, self.actor.trainable_variables))
self.update_network_parameters()
And finally, my ActorNetwork is as follows:
class ActorNetwork(keras.Model):
def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=2, name='actor',
chkpt_dir='tmp/ddpg'):
super(ActorNetwork, self).__init__()
self.fc1_dims = fc1_dims
self.fc2_dims = fc2_dims
self.n_actions = n_actions
self.model_name = name
self.checkpoint_dir = chkpt_dir
self.checkpoint_file = os.path.join(self.checkpoint_dir,
self.model_name+'_ddpg.h5')
self.fc1 = Dense(self.fc1_dims, activation='relu')
self.fc2 = Dense(self.fc2_dims, activation='relu')
self.mu = Dense(self.n_actions, activation='tanh')
def call(self, state):
prob = self.fc1(state)
prob = self.fc2(prob)
mu = self.mu(prob) * 3.5
return mu
Note: The code I'm working with is just building off of the code from this tutorial
The Problem
Up until now, I hadn't faced any issues with the code but I did want to adjust the maximum/minimum values of my actions. When I was only considering the acceleration action, I simply multiplied mu by 3.5. However, I wanted the steering actions to exist within a range of -30 to 30 degrees, but I couldn't just multiply mu as I had before. To try to adjust the desired steering range, I made the following (not so elegant) changes to my ActorNetwork
def call(self, state):
prob = self.fc1(state)
prob = self.fc2(prob)
mu = self.mu(prob)# * 3.5
mu_ = []
mu_l = mu.numpy().tolist()
for i, elem1 in enumerate(mu_l):
temp_ = []
for j, elem2 in enumerate(elem1):
if j-1 == 0:
temp_.append(float(elem2 * 3.5))
else:
temp_.append(float(elem2 * math.radians(30)))
mu_.append(temp_)
mu = tf.convert_to_tensor(mu_, dtype=tf.float32)
return mu
The new lines that I added were meant to:
Convert the mu tensor into a list
Iterate through the elements in the mu list (mu_l) and if a value had an index of 0 (acceleration) then multiply by 3.5; otherwise, multiply the value at index=1 (steering) by the radians conversion of 30 degrees.
Append each adjusted value into a new list (mu_)
Set mu to be equal to a tensor conversion of mu_
It was at this point that I ran into the following error:
ValueError: No gradients provided for any variable: ['actor_network/dense/kernel:0', 'actor_network/dense/bias:0', 'actor_network/dense_1/kernel:0', 'actor_network/dense_1/bias:0', 'actor_network/dense_2/kernel:0', 'actor_network/dense_2/bias:0'].
I have tried to find solutions provided within StackOverflow and from outside sources (e.g. including watch, checking to make sure that I am using model() instead of model.predict() while in GradientTape(), making sure I'm not performing calculations outside of the Tape context) but I haven't had any luck resolving the issue. I suspect that my issue is similar to the one presented in this previous post but I'm not sure how to diagnose whether my problem stems from also overwritting mu with a tensor. Does anyone have any insight regarding this problem?
The issue has been resolved thanks to some simple but helpful advice I received on Reddit. I was disrupting the tracking of my variables by making changes using my custom for-loop. I should have used a TensorFlow function instead. The following changes fixed the problem for me:
def call(self, state):
prob = self.fc1(state)
prob = self.fc2(prob)
mu = self.mu(prob)
mult = tf.convert_to_tensor([3.5, math.radians(30)], dtype=tf.float32)
mu = tf.math.multiply(mu, mult)
return mu

Gradient is equal to 'None'

I have two networks. The output of the first network is the input to the other. In order to calculate the loss for the second network, I use vanilla policy gradient. I want to backpropagate this loss into the first network. After checking if the gradeints has changed, I see that they are all none.
I first load the first network (a pre-trained autoencoer in my network this way):
def load_checkpoint(filepath, model):
checkpoint = torch.load(filepath)
model.load_state_dict(checkpoint['state_dict'])
for parameter in model.parameters():
parameter.requires_grad = True
model.train()
return model
Then I define the optimizers for both networks this way:
class MultipleOptimizer(object):
def __init__(self, *op):
self.optimizers = op
def zero_grad(self):
for op in self.optimizers:
op.zero_grad()
def step(self):
for op in self.optimizers:
op.step()
opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9), Adam(logits_net.parameters(), lr=lr))
the reward function is:
#Reward function
def reward(x, act):
#print('action', act)
#print('x type', type(x))
km = KMeans(act, n_init=20, n_jobs=4)
y_pred = km.fit_predict(x.detach().cpu().numpy())# seems we can only get a centre from batch
#print('k-means output type', type(y_pred))
sil_score = sil(x.detach().cpu().numpy(), y_pred)
#print('sil score', sil_score)
return sil_score
The architecture of the second neural net and an alternative to avoid (logits=logits.mean(0)):
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
# Build a feedforward neural network. outputs are the logits
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
class mlp2(torch.nn.Module):
def __init__(self):
super(mlp2, self).__init__()
self.linear1 = nn.Linear(10,100)
self.relu1 = nn.ReLU(inplace=True)
self.linear2 = torch.nn.Linear(100,100)
self.linear3 = torch.nn.Linear(100,20)
self.linear4 = torch.nn.Linear(2000,100)
self.ident = nn.Identity()
def forward(self, x):
a = self.linear1(x)
a = self.relu1(a)
a = self.linear2(a)
a = self.relu1(a)
a = self.linear3(a)
a = torch.flatten(a)
a = self.linear4(a)
a = self.relu1(a)
a = self.linear3(a)
out = self.ident(a)
return out
Loss is calculated as in the following order:
def get_policy(obs):
logits = logits_net(obs)
return Categorical(logits=logits.mean(0))
def get_action(obs):
return get_policy(obs).sample().item()
def Logp(obs, act):
logp = get_policy(obs).log_prob(act.cuda())
return logp
def compute_loss(logp, weights):
return -(logp * weights).mean()
def train_one_epoch():
# make some empty lists for logging.
batch_obs = [] # for observations
batch_acts = [] # for actions
batch_weights = [] # for R(tau) weighting in policy gradient
batch_logp = []
# reset episode-specific variables
j = 1 # signal from environment that episode is over
ep_rews = [] # list for rewards accrued throughout ep
for i, data in enumerate(train_loader):
#Create the mean image out of those 100 images
x, label = data
x = model(x.cuda())#torch.Size([100, 10])
obs = x.data.cpu().numpy()#[100, 10] - a trajectory with only one state
# Save obs
batch_obs.append(obs.copy())
#act in the environment
#act = get_action(torch.as_tensor(obs, dtype=torch.float32))
act = get_action(x)
print('action type', type(act))
#log probability
#logp = Logp(torch.as_tensor(obs, dtype=torch.float32),act = torch.as_tensor(act, dtype=torch.int32))
logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32))
#rew = reward(obs, act+2)
rew = reward(x, act+2)
# save action, reward
batch_acts.append(act)
batch_weights.append(rew)#episode rewards
batch_logp.append(logp)
opt.zero_grad()
batch_logp = torch.stack(batch_logp, dim=0)
batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32),
weights = torch.as_tensor(batch_weights, dtype=torch.float32))
batch_loss.backward() #does it return anything? gradients? print them!
opt.step()
for name, param in logits_net.named_parameters():
print(name, param.grad)
I applied some changes with the assumption that maybe recreating some of the tensors maybe the issue:
I have the output of the first network, obs, converted like obs = x.data.cpu().numpy() this and then sent to get_action function: act = get_action(torch.as_tensor(obs, dtype=torch.float32)). I changes this to act = get_action(x) so, x is sent directly to this function. Also, change arguments of logp to logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32)).
After these changes, I still get the none value for the gradient. Is there anyway possible to backpropagate the gradient when loss is calculated this way? any changes that I can apply?
any help is appreciated.

RuntimeError: Error(s) in loading state_dict for Actor - torch.load()

I have created a custom environment in open ai gym and i am facing error while loading the weights Could some one help me to resolve the issue . I am training a TD3 network in a custom environment and i have trained successfully but while inferencing i am facing this issue
class Actor(nn.Module):
def __init__(self, state_dim, action_dim, max_action):
super(Actor, self).__init__()
self.layer_1 = nn.Linear(state_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, action_dim)
self.max_action = max_action
def forward(self, x):
x = F.relu(self.layer_1(x))
x = F.relu(self.layer_2(x))
x = self.max_action * torch.tanh(self.layer_3(x))
return x
class Critic(nn.Module):
def __init__(self, state_dim, action_dim):
super(Critic, self).__init__()
# Defining the first Critic neural network
self.layer_1 = nn.Linear(state_dim + action_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, 1)
# Defining the second Critic neural network
self.layer_4 = nn.Linear(state_dim + action_dim, 400)
self.layer_5 = nn.Linear(400, 300)
self.layer_6 = nn.Linear(300, 1)
def forward(self, x, u):
xu = torch.cat([x, u], 1)
# Forward-Propagation on the first Critic Neural Network
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
# Forward-Propagation on the second Critic Neural Network
x2 = F.relu(self.layer_4(xu))
x2 = F.relu(self.layer_5(x2))
x2 = self.layer_6(x2)
return x1, x2
def Q1(self, x, u):
xu = torch.cat([x, u], 1)
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
return x1
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Building the whole Training Process into a class
class TD3(object):
def __init__(self, state_dim, action_dim, max_action):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target.load_state_dict(self.actor.state_dict())
self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
self.max_action = max_action
def select_action(self, state):
state = torch.Tensor(state.reshape(1, -1)).to(device)
return self.actor(state).cpu().data.numpy().flatten()
def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
for it in range(iterations):
# Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
state = torch.Tensor(batch_states).to(device)
next_state = torch.Tensor(batch_next_states).to(device)
action = torch.Tensor(batch_actions).to(device)
reward = torch.Tensor(batch_rewards).to(device)
done = torch.Tensor(batch_dones).to(device)
# Step 5: From the next state s’, the Actor target plays the next action a’
next_action = self.actor_target(next_state)
# Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
noise = noise.clamp(-noise_clip, noise_clip)
next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
# Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
# Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
target_Q = torch.min(target_Q1, target_Q2)
# Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
target_Q = reward + ((1 - done) * discount * target_Q).detach()
# Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
current_Q1, current_Q2 = self.critic(state, action)
# Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
# Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
# Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
if it % policy_freq == 0:
actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Making a save method to save a trained model
def save(self, filename, directory):
torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
# Making a load method to load a pre-trained model
def load(self, filename, directory):
self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
def evaluate_policy(policy, eval_episodes=10):
avg_reward = 0.
for _ in range(eval_episodes):
obs = env.reset()
done = False
while not done:
action = policy.select_action(np.array(obs))
obs, reward, done, _ = env.step(action)
avg_reward += reward
avg_reward /= eval_episodes
print ("---------------------------------------")
print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
print ("---------------------------------------")
return avg_reward
env_name = "Pygame-v0"
seed = 0
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")
eval_episodes = 10
save_env_vid = True
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
if save_env_vid:
env = wrappers.Monitor(env, monitor_dir, force = True)
env.reset()
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)
#policy.load(file_name, './pytorch_models/')
policy.load(file_name,"/content/gdrive/My Drive/reinforce/gym_game/pytorch_models")
_ = evaluate_policy(policy, eval_episodes=eval_episodes)
Traceback:
I am facing a runtime error while loading the state_dict for actor model .I searched google but couldnt find similar issues .
RuntimeError: Error(s) in loading state_dict for Actor:
Missing key(s) in state_dict: "layer_1.weight", "layer_1.bias", "layer_2.weight", "layer_2.bias", "layer_3.weight", "layer_3.bias".
Unexpected key(s) in state_dict: "encoder.0.weight", "encoder.0.bias", "encoder.2.weight", "encoder.2.bias", "encoder.2.running_mean", "encoder.2.running_var", "encoder.2.num_batches_tracked", "encoder.3.weight", "encoder.3.bias", "encoder.5.weight", "encoder.5.bias", "encoder.5.running_mean", "encoder.5.running_var", "encoder.5.num_batches_tracked", "encoder.6.weight", "encoder.6.bias", "encoder.8.weight", "encoder.8.bias", "encoder.8.running_mean", "encoder.8.running_var", "encoder.8.num_batches_tracked", "encoder.10.weight", "encoder.10.bias", "encoder.12.weight", "encoder.12.bias", "encoder.12.running_mean", "encoder.12.running_var", "encoder.12.num_batches_tracked", "encoder.13.weight", "encoder.13.bias", "encoder.15.weight", "encoder.15.bias", "encoder.15.running_mean", "encoder.15.running_var", "encoder.15.num_batches_tracked", "encoder.16.weight", "encoder.16.bias", "linear.0.weight", "linear.0.bias", "linear.2.weight", "linear.2.bias".
it was answered by #MicaelJungo
The weights you saved were not from the model you are using here. Make sure to load the correct checkpoint, which was created when training this particular model.

Inference time varies over different GPUs using Torch

I get a bug when running the below inference code. In the function recognize(), it takes 0.4s to finish prediction. It takes another 3s to return the result preds_str to the caller function. I found that if I set gpu_id=0 in file config, it returns instantly. How can I fix this bug? Thanks in advance.
def recognize(imgs, model, demo_loader):
t = time()
model.eval()
with torch.no_grad():
for image_tensors, image_path_list in demo_loader:
batch_size = image_tensors.size(0)
image = image_tensors.to(config.device)
# For max length prediction
length_for_pred = torch.IntTensor([config.batch_max_length] * batch_size).to(config.device)
text_for_pred = torch.LongTensor(batch_size, config.batch_max_length + 1).fill_(0).to(config.device)
preds = model(image, text_for_pred, is_train=False)
_, preds_index = preds.max(2)
preds_str = converter.decode(preds_index, length_for_pred)
print('time elapsed before return:'time()-t) #0.4s
return preds_str
def main():
model = Model()
self.model.cuda(config.device)
model = torch.nn.DataParallel(model, device_ids=[config.device], output_device=[config.device]).to(config.device)
model.load_state_dict(torch.load(config.saved_model, map_location=config.device))
AlignCollate_demo = AlignCollate(imgH=config.imgH, imgW=config.imgW, keep_ratio_with_pad=config.PAD)
imgs_dataset = ImageDataset(imgs)
demo_loader = torch.utils.data.DataLoader(imgs_dataset, batch_size=config.batch_size,shuffle=False,num_workers=int(config.workers),collate_fn=AlignCollate_demo, pin_memory=True)
start_time = time()
# imgs = [img1, img2, ....]
preds_str = recognize(imgs, model, demo_loader)
print('time elapsed after return', time()-start_time) #3.4s
Config file:
class ConfigWordRecognizer:
gpu_id = 1 #troublesome line here
device = torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() else 'cpu')
imgH = 32
imgW = 100
batch_size = 80
workers = 8
batch_max_length = 25
I found the solution from this post.
I set CUDA_VISIBLE_DEVICES=1, gpu_id=0. Then, I remove
model = torch.nn.DataParallel(model, device_ids=[config.device], output_device=[config.device]).to(config.device)
and change
model.load_state_dict(torch.load(config.saved_model, map_location=config.device))
to
model.load_state_dict(self.copyStateDict(torch.load(self.config.saved_model, map_location=self.config.device)))
Copy stateDict function:
def copyStateDict(self, state_dict):
if list(state_dict.keys())[0].startswith("module"):
start_idx = 1
else:
start_idx = 0
new_state_dict = OrderedDict()
for k, v in state_dict.items():
name = ".".join(k.split(".")[start_idx:])
new_state_dict[name] = v
return new_state_dict
The model works well on gpu1. But I still don't understand why if I set 'gpu_id=0', it works well on gpu0 without copyStateDict

Resources