I am currently getting into TensorFlow and have just now started to grasp its graph-like concept. I tried to implement a NN using gradient descent (the Adam optimizer) to solve the CartPole environment. I start by randomly initializing my weights, and during training I sample actions according to the probabilities the network outputs. When testing, I always take the action with the maximum probability. However, the score I get always hovers around 10 with a variance of around 0.8 - always. It doesn't change in any notable fashion, which makes it look as if the agent takes purely random actions at every step and never learns anything at all. As I said, it seems that the weights are never updated correctly. Where and how do I need to do that?
Here's my code:
import tensorflow as tf
import numpy as np
from gym.envs.classic_control import CartPoleEnv
env = CartPoleEnv()
learning_rate = 10**(-3)
gamma = 0.9999
n_train_trials = 10**3
n_test_trials = 10**2
n_actions = env.action_space.n
n_obs = env.observation_space.high.__len__()
goal_steps = 200
should_render = False
print_per_episode = 100
state_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_obs), name='symbolic_state')
actions_one_hot_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_actions),
name='symbolic_actions_one_hot_holder')
discounted_rewards_holder = tf.placeholder(dtype=tf.float32, shape=None, name='symbolic_reward')
# initialize neurons list dynamically
def get_neurons_list():
    i = n_obs
    n_neurons_list = [i]
    while i < (n_obs * n_actions) // (n_actions // 2):
        i *= 2
        n_neurons_list.append(i)
    while i // 2 > n_actions:
        i = i // 2
        n_neurons_list.append(i)
    n_neurons_list.append(n_actions)
    # print(n_neurons_list)
    return n_neurons_list
with tf.name_scope('nonlinear_policy'):
    # create list of layers with sizes
    n_neurons_list = get_neurons_list()
    network = None
    for i in range((len(n_neurons_list) - 1)):
        theta = tf.Variable(tf.random_normal([n_neurons_list[i], n_neurons_list[i+1]]))
        bias = tf.Variable(tf.random_normal([n_neurons_list[i+1]]))
        if network is None:
            network = tf.matmul(state_holder, theta) + bias
        else:
            network = tf.matmul(network, theta) + bias
        if i < len(n_neurons_list) - 1:
            network = tf.nn.relu(network)
action_probabilities = tf.nn.softmax(network)
testing_action_choice = tf.argmax(action_probabilities, dimension=1, name='testing_action_choice')
with tf.name_scope('loss'):
    actually_chosen_probability = action_probabilities * actions_one_hot_holder
    L_theta = -1 * (tf.reduce_sum(tf.log(actually_chosen_probability)) * tf.reduce_sum(discounted_rewards_holder))
with tf.name_scope('train'):
    # We define the optimizer to use the ADAM optimizer, and ask it to minimize our loss
    gd_opt = tf.train.AdamOptimizer(learning_rate).minimize(L_theta)
sess = tf.Session() # FOR NOW everything is symbolic, this object has to be called to compute each value of Q
# Start
sess.run(tf.global_variables_initializer())
observation = env.reset()
batch_rewards = []
states = []
action_one_hots = []
episode_rewards = []
episode_rewards_list = []
episode_steps_list = []
step = 0
episode_no = 0
while episode_no <= n_train_trials:
    if should_render: env.render()
    step += 1
    action_probability_values = sess.run(action_probabilities,
                                         feed_dict={state_holder: [observation]})
    # Choose the action using the action probabilities output by the policy implemented in tensorflow.
    action = np.random.choice(np.arange(n_actions), p=action_probability_values.ravel())
    # Calculating the one-hot action array for use by tensorflow
    action_arr = np.zeros(n_actions)
    action_arr[action] = 1.
    action_one_hots.append(action_arr)
    # Record states
    states.append(observation)
    observation, reward, done, info = env.step(action)
    # We don't want to go above 200 steps
    if step >= goal_steps:
        done = True
    batch_rewards.append(reward)
    episode_rewards.append(reward)
    # If the episode is done, and it contained at least one step, do the gradient updates
    if len(batch_rewards) > 0 and done:
        # First calculate the discounted rewards for each step
        batch_reward_length = len(batch_rewards)
        discounted_batch_rewards = batch_rewards.copy()
        for i in range(batch_reward_length):
            discounted_batch_rewards[i] *= (gamma ** (batch_reward_length - i - 1))
        # Next run the gradient descent step
        # Note that each of action_one_hots, states, discounted_batch_rewards has the first dimension as the length
        # of the current trajectory
        gradients = sess.run(gd_opt, feed_dict={actions_one_hot_holder: action_one_hots, state_holder: states,
                                                discounted_rewards_holder: discounted_batch_rewards})
        action_one_hots = []
        states = []
        batch_rewards = []
    if done:
        # Done with episode. Reset stuff.
        episode_no += 1
        episode_rewards_list.append(np.sum(episode_rewards))
        episode_steps_list.append(step)
        episode_rewards = []
        step = 0
        observation = env.reset()
        if episode_no % print_per_episode == 0:
            print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
                  np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
                  np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
                  )
observation = env.reset()
episode_rewards_list = []
episode_rewards = []
episode_steps_list = []
step = 0
episode_no = 0
print("Testing")
while episode_no <= n_test_trials:
    env.render()
    step += 1
    # For testing, we choose the action using an argmax.
    test_action, = sess.run([testing_action_choice],
                            feed_dict={state_holder: [observation]})
    observation, reward, done, info = env.step(test_action[0])
    if step >= 200:
        done = True
    episode_rewards.append(reward)
    if done:
        episode_no += 1
        episode_rewards_list.append(np.sum(episode_rewards))
        episode_steps_list.append(step)
        episode_rewards = []
        step = 0
        observation = env.reset()
        if episode_no % print_per_episode == 0:
            print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
                  np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
                  np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
                  )
Here is an example TensorFlow program that uses Q-learning to learn the CartPole OpenAI Gym environment.
It is able to quickly learn to stay upright for 80 steps.
Here is the code:
import math
import numpy as np
import sys
import random
sys.path.append("../gym")
from gym.envs.classic_control import CartPoleEnv
env = CartPoleEnv()
discount = 0.5
learning_rate = 0.5
gradient = .001
regularizaiton_factor = .1
import tensorflow as tf
tf_state = tf.placeholder( dtype=tf.float32 , shape=[4] )
tf_state_2d = tf.reshape( tf_state , [1,4] )
tf_action = tf.placeholder( dtype=tf.int32 )
tf_action_1hot = tf.reshape( tf.one_hot( tf_action , 2 ) , [1,2] )
tf_delta_reward = tf.placeholder( dtype=tf.float32 )
tf_value = tf.placeholder( dtype=tf.float32 )
tf_matrix1 = tf.Variable( tf.random_uniform([4,7], -.001, .001) )
tf_matrix2 = tf.Variable( tf.random_uniform([7,2], -.001, .001) )
tf_logits = tf.matmul( tf_state_2d , tf_matrix1 )
tf_logits = tf.matmul( tf_logits , tf_matrix2 )
tf_loss = -1 * learning_rate * ( tf_delta_reward + discount * tf_value - tf_logits ) * tf_action_1hot
tf_regularize = tf.reduce_mean( tf.square( tf_matrix1 )) + tf.reduce_mean( tf.square( tf_matrix2 ))
tf_train = tf.train.GradientDescentOptimizer(gradient).minimize( tf_loss + tf_regularize * regularizaiton_factor )
sess = tf.Session()
sess.run( tf.global_variables_initializer() )
def max_Q( state ) :
    actions = sess.run( tf_logits, feed_dict={ tf_state:state } )
    actions = actions[0]
    value = actions.max()
    action = 0 if actions[0] == value else 1
    return action , value
avg_age = 0
for trial in range(1,101) :
    # initialize state
    previous_state = env.reset()
    # initialize action and the value of the expected reward
    action , value = max_Q(previous_state)
    previous_reward = 0
    for age in range(1,301) :
        if trial % 100 == 0 :
            env.render()
        new_state, new_reward, done, info = env.step(action)
        new_state = new_state
        action, value = max_Q(new_state)
        # The cart-pole gym doesn't return a reward of Zero when done.
        if done :
            new_reward = 0
        delta_reward = new_reward - previous_reward
        # learning phase
        sess.run(tf_train, feed_dict={ tf_state:previous_state, tf_action:action, tf_delta_reward:delta_reward, tf_value:value })
        previous_state = new_state
        previous_reward = new_reward
        if done :
            break
    avg_age = avg_age * 0.95 + age * .05
    if trial % 50 == 0 :
        print("Average age =", int(round(avg_age)), " , trial", trial, " , discount", discount, " , learning_rate", learning_rate, " , gradient", gradient)
    elif trial % 10 == 0 :
        print(int(round(avg_age)), end=' ')
Here is the output:
6 18 23 30 Average age = 36 , trial 50 , discount 0.5 , learning_rate 0.5 , gradient 0.001
38 47 50 53 Average age = 55 , trial 100 , discount 0.5 , learning_rate 0.5 , gradient 0.001
Summary
I wasn't able to get Q-learning with a simple neural net to solve the CartPole problem, but have fun experimenting with different NN sizes and depths!
Hope you enjoy this code,
cheers
I am trying to solve the following problem with Gekko in Python.
I_s is an indicator variable whose value is 1 if theta is positive and 0 if theta is zero.
In contrast to my previous posts, I add some constraints with respect to I, the indicator variable.
If I set N=10, the solution theta is all zeros, which is the result that I want.
But if I set N=100 or 200, no solution can be found, and I cannot understand why this happens.
I want to check whether theta is also all zeros for larger N (200).
Is there any way to solve this issue?
My code is as below.
# Import package
from gekko import GEKKO
import numpy as np
# Define parameters
P_CO = 600 # $/tonCO
beta_CO2 = 1 # no unit
P_CO2 = 80 # $/tonCO2eq
E_ref = 3.1022616 # tonCO2eq/tonCO
E_dir = -1.600570692 # tonCO2eq/tonCO
E_indir_others = 0.3339226804 # tonCO2eq/tonCO
E_indir_elec_cons = 18.46607256 # GJ/tonCO
C1_CAPEX = 285695 # no unit
C2_CAPEX = 188.42 # no unit
C1_FOX = 82282 # no unit
C2_FOX = 24.094 # no unit
C1_ROX = 4471.5 # no unit
C2_ROX = 96.034 # no unit
C1_UOX = 7934.9 # no unit
C2_UOX = 986.9 # no unit
r = 0.08 # discount rate
N = 10 # number of scenarios
T = 30 # total time period
GWP_init = 0.338723235 # 2020 Electricity GWP in EU 27 countries
theta_max = 1600000 # Max capacity
# Function to make GWP_EU matrix (TxN matrix)
def Electricity_GWP(GWP_init, n_years, num_episodes):
    GWP_mean = 0.36258224*np.exp(-0.16395611*np.arange(1, n_years+2)) + 0.03091272
    GWP_mean = GWP_mean.reshape(-1,1)
    GWP_Yearly = np.tile(GWP_mean, num_episodes)
    noise = np.zeros((n_years+1, num_episodes))
    stdev2050 = GWP_mean[-1] * 0.25
    stdev = np.arange(0, stdev2050 * (1 + 1/n_years), stdev2050/n_years)
    for i in range(n_years+1):
        noise[i,:] = np.random.normal(0, stdev[i], num_episodes)
    GWP_forecast = GWP_Yearly + noise
    return GWP_forecast
GWP_EU = Electricity_GWP(GWP_init, T, N) # (T+1)*N matrix
GWP_EU = GWP_EU[1:,:] # T*N matrix
print(np.shape(GWP_EU))
# Build Gekko model
m = GEKKO(remote=False)
theta = m.Array(m.Var, N, lb=0, ub=theta_max)
I = m.Array(m.Var, N, lb=0, ub=1, integer=True)
demand = np.ones((T,1))
demand[0] = 8031887.589
for k in range(1,11):
    demand[k] = demand[k-1] * 1.026
for k in range(11,21):
    demand[k] = demand[k-1] * 1.016
for k in range(21,T):
    demand[k] = demand[k-1] * 1.011
demand = 0.12 * demand
demand = np.tile(demand, N) # T*N matrix
print(np.shape(demand))
m3 = [[m.min3(demand[t,s],theta[s]) for t in range(T)] for s in range(N)]
obj = m.sum([sum([((1/(1+r))**(t+1))*((P_CO*m3[s][t]) \
+ (beta_CO2*P_CO2*m3[s][t]*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s])) \
- (C1_CAPEX*I[s]+C2_CAPEX*theta[s]+C1_FOX*I[s]+C2_FOX*theta[s])\
- (C1_ROX*I[s]+C2_ROX*m3[s][t]+C1_UOX*I[s]+C2_UOX*m3[s][t])) for t in range(T)]) for s in range(N)])
for i in range(N):
    m.Equation(theta[i]<=1000000*I[i])
    m.Equation(-theta[i]<1000000*(1-I[i]))
# obj = m.sum([m.sum([((1/(1+r))**(t+1))*((P_CO*m.min3(demand[t,s], theta[s])) \
# + (beta_CO2*P_CO2*m.min3(demand[t,s], theta[s])*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s])) \
# - (C1_CAPEX+C2_CAPEX*theta[s]+C1_FOX+C2_FOX*theta[s])-(C1_ROX+C2_ROX*m.min3(demand[t,s], theta[s])+C1_UOX+C2_UOX*m.min3(demand[t,s], theta[s]))) for t in range(T)]) for s in range(N)])
m.Maximize(obj/N)
m.solve(disp=True)
# s = m.sum(m.sum(((1/(1+r))**(t+1))*((P_CO*m.min3(demand[t,s], theta[s])) \
# + beta_CO2*P_CO2*m.min3(demand[t,s], theta[s])*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s]) \
# - (C1_CAPEX + C2_CAPEX*theta[s]) - (C1_FOX + C2_FOX*theta[s]) - (C1_ROX + C2_ROX*m.min3(demand[t,s], theta[s])) - (C1_UOX + C2_UOX*m.min3(demand[t,s], theta[s])))
# for s in range(N)) for t in range(T))/N
print(theta)
I solved this issue by increasing the big-M constant in the constraints for the indicator variable I from 1000000 to 10000000:
for i in range(N):
    m.Equation(theta[i]<=10000000*I[i])
    m.Equation(-theta[i]<10000000*(1-I[i]))
I didn't understand why this worked, but the result gave me the solution I wanted: a 200x1 array of all zeros.
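As a general note (my addition, not verified against the solver output): a big-M constant is usually chosen at least as large as the upper bound of the variable it gates, so that I[i] = 1 leaves theta[i] limited only by its own bounds; here the original 1000000 is smaller than theta_max = 1600000. A minimal sketch of that choice, reusing the names from the model above:
# tie the big-M constant to the known bound on theta (theta_max = 1600000 above)
bigM = theta_max  # any value >= theta_max works
for i in range(N):
    m.Equation(theta[i] <= bigM * I[i])
    m.Equation(-theta[i] <= bigM * (1 - I[i]))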
I am trying to track multiple set-points for an interacting quadruple-tank process. Here, the upper limit of each tank is 25 and the lower limit is 0. I want to track the set-point values 5, 12, 7 and 5. Although I am able to track the first 3 set-points (i.e. 5, 12 and 7), I am not able to track the last one because the solver exceeds its maximum iterations. I have attached the code below:
#MHE+MPC model
#to measure computational time of the code
start=time.time()
#Process Model
p = GEKKO(remote=False)
process=0
p.time = [0,0.5]
noise = 0.25
#Constants
g = 981
g1 = .9
g2 = .9
A1=32
A3=32
A2=32
A4=32
a1=0.057
a3=0.057
a2=0.057
a4=0.057
init_h=5
#Controlled process variables
p.h1=p.SV(lb=0,ub=25)
p.h2=p.SV(lb=0,ub=25)
p.h3=p.SV(lb=0,ub=25)
p.h4=p.SV(lb=0,ub=25)
#Manipulated process variables
p.v1=p.MV(value=3.15,lb=0.1,ub=8)
p.v2=p.MV(value=3.15,lb=0.1,ub=8)
#Parameters of process
p.k1=p.Param(value=3.14,lb=0,ub=10)
p.k2=p.Param(value=3.14,lb=0,ub=10)
#Equations process
p.Equation(A1*p.h1.dt()==a3*((2*g*p.h3)**0.5)-(a1*((2*g*p.h1)**0.5))+(g1*p.k1*p.v1))
p.Equation(A2*p.h2.dt()==a4*((2*g*p.h4)**0.5)-(a2*((2*g*p.h2)**0.5))+(g2*p.k2*p.v2))
p.Equation(A3*p.h3.dt()==-a3*((2*g*p.h3)**0.5)+((1-g2)*p.k2*p.v2))
p.Equation(A4*p.h4.dt()==-a4*((2*g*p.h4)**0.5)+((1-g1)*p.k1*p.v1))
#options
p.options.IMODE = 4
#p.h1.TAU=-10^10
#p.h2.TAU=-10^10
#%% MHE Model
m = GEKKO(remote=False)
#prediction horizon
m.time = np.linspace(0,40,41) #0-20 by 0.5 -- discretization must match simulation
#MHE control, manipulated variables and parameters
m.h1=m.CV(lb=0,ub=25)
m.h2=m.CV(lb=0,ub=25)
m.h3=m.SV(lb=0,ub=25)
m.h4=m.SV(lb=0,ub=25)
m.v1=m.MV(value=3.15,lb=0.10,ub=8)
m.v2=m.MV(value=3.15,lb=0.10,ub=8)
m.k1=m.FV(value=3.14,lb=0,ub=10)
m.k2=m.FV(value=3.14,lb=0,ub=10)
#m.h1.TAU=0
#m.h2.TAU=0
#Equations
m.Equation(A1*m.h1.dt()==a3*((2*g*m.h3)**0.5)-(a1*((2*g*m.h1)**0.5))+(g1*m.k1*m.v1))
m.Equation(A2*m.h2.dt()==a4*((2*g*m.h4)**0.5)-(a2*((2*g*m.h2)**0.5))+(g2*m.k2*m.v2))
m.Equation(A3*m.h3.dt()==-a3*((2*g*m.h3)**0.5)+((1-g2)*m.k2*m.v2))
m.Equation(A4*m.h4.dt()==-a4*((2*g*m.h4)**0.5)+((1-g1)*m.k1*m.v1))
#Options
m.options.IMODE = 5 #MHE
m.options.EV_TYPE = 2
# STATUS = 0, optimizer doesn't adjust value
# STATUS = 1, optimizer can adjust
m.v1.STATUS = 0
m.v2.STATUS = 0
m.k1.STATUS=1
m.k2.STATUS=1
m.h1.STATUS = 1
m.h2.STATUS = 1
#m.h3.STATUS = 0
#m.h4.STATUS = 0
# FSTATUS = 0, no measurement
# FSTATUS = 1, measurement used to update model
m.v1.FSTATUS = 1
m.v2.FSTATUS = 1
m.k1.FSTATUS=0
m.k2.FSTATUS=0
m.h1.FSTATUS = 1
m.h2.FSTATUS = 1
m.h3.FSTATUS = 1
m.h4.FSTATUS = 1
#m.options.MAX_ITER=1000
m.options.SOLVER=3
m.options.NODES=3
#%% MPC Model
c = GEKKO(remote=False)
c.time = np.linspace(0,10,11) #0-5 by 0.5 -- discretization must match simulation
c.v1=c.MV(value=3.15,lb=0.10,ub=8)
c.v2=c.MV(value=3.15,lb=0.10,ub=8)
c.k1=c.FV(value=3.14,lb=0,ub=10)
c.k2=c.FV(value=3.14,lb=0,ub=10)
#Variables
c.h1=c.CV(lb=0,ub=25)
c.h2=c.CV(lb=0,ub=25)
c.h3=c.SV(lb=0,ub=25)
c.h4=c.SV(lb=0,ub=25)
#Equations
c.Equation(A1*c.h1.dt()==a3*((2*g*c.h3)**0.5)-(a1*((2*g*c.h1)**0.5))+(g1*c.k1*c.v1))
c.Equation(A2*c.h2.dt()==a4*((2*g*c.h4)**0.5)-(a2*((2*g*c.h2)**0.5))+(g2*c.k2*c.v2))
c.Equation(A3*c.h3.dt()==-a3*((2*g*c.h3)**0.5)+((1-g2)*c.k2*c.v2))
c.Equation(A4*c.h4.dt()==-a4*((2*g*c.h4)**0.5)+((1-g1)*c.k1*c.v1))
#Options
c.options.IMODE = 6 #MPC
c.options.CV_TYPE = 2
# STATUS = 0, optimizer doesn't adjust value
# STATUS = 1, optimizer can adjust
c.v1.STATUS = 1
c.v2.STATUS = 1
c.k1.STATUS=0
c.k2.STATUS=0
c.h1.STATUS = 1
c.h2.STATUS = 1
#c.h3.STATUS = 0
#c.h4.STATUS = 0
# FSTATUS = 0, no measurement
# FSTATUS = 1, measurement used to update model
c.v1.FSTATUS = 0
c.v2.FSTATUS = 0
c.k1.FSTATUS=1
c.k2.FSTATUS=1
c.h1.FSTATUS = 1
c.h2.FSTATUS = 1
c.h3.FSTATUS = 1
c.h4.FSTATUS = 1
sp=5
c.h1.SP=sp
c.h2.SP=sp
p1 = GEKKO(remote=False)
p1.time = [0,0.5]
#Parameters
p1.h1=p1.CV(lb=0,ub=25)
p1.h2=p1.CV(lb=0,ub=25)
p1.h3=p1.CV(lb=0,ub=25)
p1.h4=p1.CV(lb=0,ub=25)
p1.v1=p1.MV(value=3.15,lb=0.1,ub=8)
p1.v2=p1.MV(value=3.15,lb=0.1,ub=8)
p1.k1=p1.Param(lb=0,ub=10,value=3.14)
p1.k2=p1.Param(lb=0,ub=10,value=3.14)
#Equations
p1.Equation(A1*p1.h1.dt()==a3*((2*g*p1.h3)**0.5)-a1*((2*g*p1.h1)**0.5)+g1*p1.k1*p1.v1)
p1.Equation(A2*p1.h2.dt()==a4*((2*g*p1.h4)**0.5)-a2*((2*g*p1.h2)**0.5)+g2*p1.k2*p1.v2)
p1.Equation(A3*p1.h3.dt()==-a3*((2*g*p1.h3)**0.5)+(1-g2)*p1.k2*p1.v2)
p1.Equation(A4*p1.h4.dt()==-a4*((2*g*p1.h4)**0.5)+(1-g1)*p1.k1*p1.v1)
#options
p1.options.IMODE = 4
#%% problem configuration
# number of cycles
cycles = 480
# noise level
#%% run process, estimator and control for cycles
h1_meas = np.empty(cycles)
h2_meas =np.empty(cycles)
h3_meas =np.empty(cycles)
h4_meas=np.empty(cycles)
h1_est = np.empty(cycles)
h2_est = np.empty(cycles)
h3_est = np.empty(cycles)
h4_est = np.empty(cycles)
h1_plant=np.empty(cycles)
h2_plant=np.empty(cycles)
h3_plant=np.empty(cycles)
h4_plant=np.empty(cycles)
h1_measured=np.empty(cycles)
h2_measured=np.empty(cycles)
h3_measured=np.empty(cycles)
h4_measured=np.empty(cycles)
v1_est = np.empty(cycles)
v2_est = np.empty(cycles)
k1_est = np.empty(cycles)
k2_est = np.empty(cycles)
u_cont_k1 = np.empty(cycles)
u_cont_k2 = np.empty(cycles)
sp_store = np.empty(cycles)
sum_est=np.empty(cycles)
sum_model=np.empty(cycles)
# Create plot
plt.figure(figsize=(10,7))
plt.ion()
plt.show()
p.MAX_ITER=20
c.MAX_ITER=20
m.MAX_ITER=20
p1.MAX_ITER=20
for i in range(cycles):
print(i)
# set point changes
if i==cycles/4:
sp = 12
elif i==2*cycles/4:
sp = 7
elif i==3*cycles/4:
sp = 5
sp_store[i] = sp
c.h1.SP=sp
c.h2.SP=sp
c.k1.MEAS = m.k1.NEWVAL
c.k2.MEAS = m.k2.NEWVAL
if p.options.SOLVESTATUS == 1:
# print("going:",i)
c.h1.MEAS = p.h1.MODEL
c.h2.MEAS = p.h2.MODEL
c.h3.MEAS = p.h3.MODEL
c.h4.MEAS = p.h4.MODEL
print(i,'Plant Model:',p.h1.MODEL,p.h2.MODEL,p.h3.MODEL,p.h4.MODEL)
c.solve(disp=False,debug=0)
#print("NEWVAL:",i,c.u,c.u.NEWVAL)
u_cont_k1[i] = c.v1.NEWVAL
u_cont_k2[i] = c.v2.NEWVAL
#print("Horizon:",i,c.h1[0:],c.h2[0:])
#print("Move:",i,c.v1.NEWVAL,c.v2.NEWVAL)
## process simulator
#load control move
p.v1.MEAS = u_cont_k1[i]
p.v2.MEAS = u_cont_k2[i]
#simulate
p.solve(disp=False,debug=0)
#plant model
p1.k1=3.14
p1.k2=3.14
p1.v1.MEAS = u_cont_k1[i]
p1.v2.MEAS = u_cont_k2[i]
p1.solve(disp=False,debug=0)
h1_plant[i]=p1.h1.MODEL
h2_plant[i]=p1.h2.MODEL
h3_plant[i]=p1.h3.MODEL
h4_plant[i]=p1.h4.MODEL
h1_measured[i]=p1.h1.MODEL+(random()*2)*noise
h2_measured[i]=p1.h2.MODEL+(random()*2)*noise
h3_measured[i]=p1.h3.MODEL+(random()*2)*noise
h4_measured[i]=p1.h4.MODEL+(random()*2)*noise
#print("Model process output:",i,p.h1.MODEL,p.h2.MODEL,p.h3.MODEL,p.h4.MODEL)
#load output with white noise
h1_meas[i] = p.h1.MODEL+(random()-0.5)*noise
h2_meas[i] = p.h2.MODEL+(random()-0.5)*noise
h3_meas[i] = p.h3.MODEL+(random()-0.5)*noise
h4_meas[i] = p.h4.MODEL+(random()-0.5)*noise
#Only MPC
## estimator
#load input and measured output
m.v1.MEAS = u_cont_k1[i]
m.v2.MEAS = u_cont_k2[i]
#m.h1.MEAS = h1_meas[i]+(random()*2)*noise
#m.h2.MEAS = h2_meas[i]+(random()*2)*noise
#m.h3.MEAS = h3_meas[i]+(random()*2)*noise
#m.h4.MEAS = h4_meas[i]+(random()*2)*noise
m.h1.MEAS = h1_meas[i]
m.h2.MEAS = h2_meas[i]
m.h3.MEAS = h3_meas[i]
m.h4.MEAS = h4_meas[i]
#m.COLDSTART=2
#optimize parameters
m.solve(disp=False,debug=0)
#store results
if i>=process:
h1_est[i] = m.h1.MODEL
h2_est[i] = m.h2.MODEL
h3_est[i] = m.h3.MODEL
h4_est[i] = m.h4.MODEL
v1_est[i] = m.v1.NEWVAL
v2_est[i] = m.v2.NEWVAL
k1_est[i]= m.k1.NEWVAL
k2_est[i] = m.k2.NEWVAL
print("Estimated h:",i,h1_est[i],h2_est[i],h3_est[i],h4_est[i])
print("Estimated k:",i,k1_est[i],k2_est[i],p.k1[0],p.k2[0])
print("Estimated v:",i,v1_est[i],v2_est[i])
print("dh1/dt:",(a3*((2*g*h3_est[i])**0.5)-(a1*((2*g*h3_est[i])**0.5))+(g1*k1_est[i]*v1_est[i]))/A3)
print("dh2/dt:",(a4*((2*g*h4_est[i])**0.5)-(a2*((2*g*h2_est[i])**0.5))+(g2*k2_est[i]*v2_est[i]))/A2)
print("dh3/dt:",(-a3*((2*g*h3_est[i])**0.5)+((1-g2)*k2_est[i]*v2_est[i]))/A3)
print("dh4/dt:",(-a4*((2*g*h4_est[i])**0.5)+((1-g1)*k1_est[i]*v1_est[i]))/A4)
if i%1==0:
plt.clf()
plt.subplot(4,1,1)
#plt.plot(h1_meas[0:i])
#plt.plot(h2_meas[0:i])
#plt.plot(h3_meas[0:i])
#plt.plot(h4_meas[0:i])
plt.plot(h1_est[0:i])
plt.plot(h2_est[0:i])
plt.plot(sp_store[0:i])
plt.subplot(4,1,2)
plt.plot(h3_est[0:i])
plt.plot(h4_est[0:i])
#plt.legend(('h1_pred','h2_pred','h3_pred','h4_pred'))
plt.subplot(4,1,3)
plt.plot(k1_est[0:i])
plt.plot(k2_est[0:i])
plt.subplot(4,1,4)
plt.plot(v1_est[0:i])
plt.plot(v2_est[0:i])
plt.draw()
plt.pause(0.05)
end=time.time()
print("total time:",end-start)
I feel there is some issue with my MHE+MPC code, but I am not able to find the mistake.
Nice application. I needed a few imports to make the script work. These may be loaded automatically for you.
from gekko import GEKKO
import time
import numpy as np
import matplotlib.pyplot as plt
from random import random
The script solves successfully if a lower bound (1e-6) is included on all the level variables. There is a problem when a level is at or below zero inside the square-root terms ((2*g*h)**0.5): Gekko solvers can't deal with imaginary numbers, so this small adjustment keeps the solver out of the region where the model is undefined.
Although the solution is successful, it appears that the control performance oscillates; some tuning of the application may be needed.
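A minimal sketch of that change, using the variable names from the question's process model (the same lower bound would go on the corresponding variables in m, c and p1 as well):
# keep the tank levels strictly positive so (2*g*h)**0.5 stays real-valued
p.h1 = p.SV(lb=1e-6, ub=25)
p.h2 = p.SV(lb=1e-6, ub=25)
p.h3 = p.SV(lb=1e-6, ub=25)
p.h4 = p.SV(lb=1e-6, ub=25)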
My name is Andy, I am new to Stack Overflow, and this is my first question.
I started learning Python about 40 days ago thanks to COVID-19, jumped into machine learning/Q-learning about 3 weeks ago, and have been stuck there since.
Goal:
have the computer play Rad Racer 2 (NES racing game) using reinforcement learning.
Plans to make this work:
After various tutorials/sites, I decided to use a double network to train/learn.
Two 256-filter convolution layers using Keras, since I have watched a few tutorial videos on Keras basics.
3 actions (hold accelerate (J), accelerate left (JA), accelerate right (JD)).
I am using DirectInput key codes I found online to send inputs to the game, as sending regular key presses does not work.
I know people use retro gym for these types of games, but I wanted to see the inner workings of reward/observation and such, so I use yolov5 to detect lines/objects. Based on the result from yolov5, I calculate the reward for the step.
My input is a series of 4 grayscale images to represent motion, kept in a deque and then stacked with numpy (a short sketch follows this list of plans).
Once I have gathered enough experience in replay memory (1500 transitions), I start training at the end of each episode instead of at every step; I found that training after each step lags badly.
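For illustration, a minimal sketch of that frame stacking (the 84x84 frame size and the names here are placeholders, not taken from my actual script):
from collections import deque
import numpy as np

frame_stack = deque(maxlen=4)                    # the four most recent grayscale frames
for _ in range(4):
    frame_stack.append(np.zeros((84, 84), dtype=np.float32))   # placeholder frames
state = np.stack(frame_stack, axis=2)            # (84, 84, 4) input for the conv net
state = np.expand_dims(state, axis=0)            # add the batch dimension: (1, 84, 84, 4)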
Problem:
My biggest problem currently is that the model does not seem to learn properly. It does slightly okay around episodes 20-30, then after that it gets worse and worse, to the point where it only performs one action for hours.
I have tried playing around with the learning rate (0.1 to 0.00001), different inputs (1 BGR layer, a grayscale layer, 4 layers, etc.), and different epsilon decay rates. I have commented out most of the reward logic and use only a basic reward for now.
Below is most of the code besides the YOLO parts; I had to remove a few lines due to the character limit.
# parameters
training = True
learning_rate = 0.0001
DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000 # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1500 # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 1000 # How many steps (samples) to use for training
batch_size = 32
UPDATE_TARGET_EVERY = 0 # Terminal states (end of episodes)
MODEL_NAME = 'RC'
MIN_REWARD = 0 # For model save
save_every = 5 # save every x episodes
EPISODES = 2_000
# Exploration settings
if training is True:
epsilon = 1 # not a constant, going to be decayed
else:
epsilon = 0
MIN_EPSILON = 0.01
START_EPISODE_DECAY = 0
END_EPISODE_DECAY = 20
if epsilon > MIN_EPSILON:
EPS_DECAY = -(epsilon/((END_EPISODE_DECAY-START_EPISODE_DECAY)/epsilon))
else:
EPS_DECAY = 0
# Agent class
class DQNAgent:
def __init__(self):
# Main model
self.model = self.create_model()
# self.model = self.load_model()
# Target network
self.target_model = self.create_model()
self.target_model.set_weights(self.model.get_weights())
# An array with last n steps for training
self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
# Used to count when to update target network with main network's weights
self.target_update_counter = 0
def create_model(self):
dropout = 0.1
model = Sequential()
model.add(Conv2D(256, (2, 2), input_shape=(int(height/resize_ratio), int(width/resize_ratio), img_channels)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(dropout))
model.add(Conv2D(256, (2, 2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(dropout))
model.add(Flatten())
model.add(Dense(64))
model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear')) # ACTION_SPACE_SIZE = how many choices (9)
model.compile(loss="mse", optimizer=Adam(lr=learning_rate), metrics=['accuracy'])
return model
# Trains main network at end of episode
def train(self, terminal_state):
# Start training only if certain number of samples is already saved
if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
return
minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
current_states = np.array([transition[0] for transition in minibatch])
# from (MINIBATCH_SIZE, 1, h, w, 4) > (MINIBATCH_SIZE, h, w, 4)
current_states = current_states.reshape(current_states.shape[0], current_states.shape[2],
current_states.shape[3], current_states.shape[4])
current_qs_list = self.model.predict(current_states)
new_current_states = np.array([transition[3] for transition in minibatch])
new_current_states = new_current_states.reshape(new_current_states.shape[0], new_current_states.shape[2],
new_current_states.shape[3], new_current_states.shape[4])
# new_current_states = np.expand_dims(new_current_states, axis=-1)
future_qs_list = self.target_model.predict(new_current_states)
X = []
y = []
for index, (current_state_img, current_action, current_reward, new_current_img, current_done) in enumerate(minibatch):
if not current_done:
max_future_q = np.max(future_qs_list[index])
new_q = current_reward + (DISCOUNT * max_future_q)
else:
new_q = 0.0
current_qs = current_qs_list[index]
current_qs[current_action] = new_q
X.append(np.squeeze(current_state_img, axis=0))
y.append(current_qs)
X = np.array(X)
# X = np.expand_dims(X, axis=-1)
# X = X.reshape(X.shape[0], X.shape[2], X.shape[3], X.shape[4])
y = np.array(y)
self.model.fit(X, y, batch_size=batch_size, verbose=0, shuffle=False)
# self.model.train_on_batch(X, y)
if terminal_state:
self.target_update_counter += 1
# If counter reaches set value, update target network with weights of main network
if self.target_update_counter > UPDATE_TARGET_EVERY:
self.target_model.set_weights(self.model.get_weights())
self.target_update_counter = 0
print('target_model trained!')
# Queries main network for Q values given current observation space (environment state)
def get_qs(self, state):
result = agent.model.predict(state)
result = result[0]
return result
agent = DQNAgent()
current_img_stack = deque(maxlen=4)
# make the game active
game = gw.getWindowsWithTitle('Mesen')[0]
game.activate()
time.sleep(1)
release_all()
# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
episode_reward = 0
step = 1
if episode <= START_EPISODE_DECAY - 1:
start_epsilon = False
elif episode >= END_EPISODE_DECAY + 1:
start_epsilon = False
else:
start_epsilon = True
# Reset environment and get initial state
# blackscreens followed by the 1st screen starting out
current_state = env.reset()
blackscreen = np.zeros_like(current_state)
current_img_stack.append(blackscreen)
current_img_stack.append(blackscreen)
current_img_stack.append(blackscreen)
current_img_stack.append(current_state)
stacked_state = np.stack(current_img_stack, axis=2)
stacked_state = np.ascontiguousarray(stacked_state, dtype=np.float32) / 255
stacked_state = np.transpose(stacked_state, (1, 0, 2))
stacked_state = np.expand_dims(stacked_state, axis=0)
start_time = time.time()
# Reset flag and start iterating until episode ends
done = False
while not done:
if np.random.random() > epsilon:
action = np.argmax(agent.get_qs(stacked_state))
else:
action = np.random.randint(0, env.ACTION_SPACE_SIZE)
new_state, reward, done, prediction, preview = env.step(action)
if done is False:
next_img_stack = current_img_stack
next_img_stack.append(new_state)
next_stack = np.stack(next_img_stack, axis=2)
next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
next_stack = np.transpose(next_stack, (1, 0, 2))
next_stack = np.expand_dims(next_stack, axis=0)
# current_state = new_state
current_img_stack = next_img_stack
stacked_state = next_stack
else:
next_img_stack = current_img_stack
next_img_stack.append(blackscreen)
next_stack = np.stack(next_img_stack, axis=2)
next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
next_stack = np.transpose(next_stack, (1, 0, 2))
next_stack = np.expand_dims(next_stack, axis=0)
step += 1
episode_reward += reward
ep_rewards.append(episode_reward)
if SHOW_PREVIEW:
env.render(preview, prediction)
if training is True:
agent.update_replay_memory((stacked_state, action, reward, next_stack, done))
# print(episode_reward)
if done is True:
ep_reward_final.append(episode_reward)
print(' Epsilon(' + str(epsilon) + ') EPtimes(' + str(time.time() - start_time) + ') done('
+ str(done) + ') step(' + str(step) + ') EPreward(' + str(episode_reward) +
') best_reward_this_session(' + str(max(ep_reward_final)) + ') fps(' +
str(step/(time.time() - start_time)) + ')')
# plot(ep_reward_final)
if training is True:
agent.train(done)
# Decay epsilon
if show_info is False and epsilon <= MIN_EPSILON:
print(f"\nEPS_DECAY ended on episode {episode} - epsilon {epsilon}")
epsilon = MIN_EPSILON
show_info = True
elif start_epsilon is True:
epsilon += EPS_DECAY
I am following this online tutorial for coding a DQN: https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/DeepQLearning/torch_deep_q_model.py
However, I am running into a RuntimeError that I am unsure how to debug or work around. Thanks!
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-196-00975d66fd2d> in <module>
28 agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
29 obs= obs_
---> 30 agent.learn(batch_size)
31 lastAction = action
32 scores.append(score)
<ipython-input-191-f6b163cc3a8a> in learn(self, batch_size)
72 Qtarget = Qpred.clone()
73 print(Qnext[1])
---> 74 Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])
75 # epsilon decay action
76 if self.steps > 2000:
RuntimeError: the derivative for 'indices' is not implemented
These are the code blocks from my Jupyter notebook.
class DeepQNetwork(nn.Module):
def __init__(self,Alpha):
super(DeepQNetwork,self).__init__()
self.conv1 = nn.Conv2d(1,32,8,stride=4, padding=1)
self.conv2 = nn.Conv2d(32,64,4,stride=2)
self.conv3 = nn.Conv2d(64,128,3)
self.fc1 = nn.Linear(128* 21* 12,512)
self.fc2 = nn.Linear(512,6)
self.optimizer = optim.RMSprop(self.parameters(), lr = Alpha)
self.loss = nn.MSELoss()
self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
def forward(self,obs):
'''Passing in a sequence of arrays'''
obs = torch.Tensor(obs).to(self.device) # send to the GPU
''' Feed forward the Network Parameters'''
obs = obs.view(-1, 1,200,125)
#print(obs.shape)
obs = F.relu(self.conv1(obs))
#print(obs.shape)
obs = F.relu(self.conv2(obs))
#print(obs.shape)
obs = F.relu(self.conv3(obs))
#print(obs.shape)
obs = obs.view(-1,128* 21* 12)
obs = F.relu(self.fc1(obs))
# 4 Rows and 6 columns
actions = self.fc2(obs)
return actions
This is the agent code, and it contains the line that causes the error:
class DQNAgent(object):
def __init__(self, gamma, epsilon, alpha, maxMemory,
epsEnd = 0.05, replace =10000, actionSpace = [0,1,2,3,4,5]):
'''
Gamma -> discount factor of valuing current reward over future reward
Epsilon -> for trade off between exploration-exploitation
alpha -> learn rate
maxMemory -> max size of Memory buffer
epsEnd -> smallest value of Exploration
repace -> how often to replace target network
'''
self.GAMMA = gamma
self.EPSILON = epsilon
self.EPS_END = epsEnd
self.actionSpace = actionSpace
self.maxMemory = maxMemory
self.steps = 0
self.learn_step_counter = 0
self.memory = []
self.memCount = 0
self.replace_tgt_count = replace
self.Q_eval = DeepQNetwork(alpha)
self.Q_next = DeepQNetwork(alpha)
def storeTransition(self, state, action, reward, state_):
'''Stores Transition states'''
if self.memCount < self.maxMemory:
self.memory.append([state,action,reward,state_])
else:
self.memory[self.memCount%self.maxMemory] = [state,action,reward,state_]
self.memCount +=1
def chooseAction(self,obs):
'''
Exploration if np.random > epsilon
else take epsilon greedy action
'''
rand = np.random.random()
# Get the value for all actions for the current set of states
# Forward pass the stack of frames to get value of each action given subset of staes in obs
actions = self.Q_eval.forward(obs)
if rand<1-self.EPSILON:
action = torch.argmax(actions[1]).item()
else:
action = np.random.choice(self.actionSpace)
self.steps += 1
return action
def learn(self, batch_size):
self.Q_eval.optimizer.zero_grad()
#0 gradient to do batch optimisation
if self.replace_tgt_count is not None and self.learn_step_counter % self.replace_tgt_count==0:
self.Q_next.load_state_dict(self.Q_eval.state_dict())
# memory subsampling
if self.memCount + batch_size < self.maxMemory:
memStart = int(np.random.choice(range(self.memCount)))
else:
memStart = int(np.random.choice(range(self.maxMemory-batch_size-1)))
miniBatch = self.memory[memStart:memStart+batch_size]
memory = np.array(miniBatch)
#feed forward current state and successor state conv to list as memory is array of numpy objects
Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device)
Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device)
maxA = torch.argmax(Qnext,dim = 1).to(self.Q_eval.device)
#calculate rewards
rewards = torch.Tensor(list(memory[:,2])).to(self.Q_eval.device)
# loss for every action except max action to be 0
Qtarget = Qpred.clone()
print(Qnext.shape)
Qtarget[:,maxA] = rewards + self.GAMMA*torch.max(Qnext[1])# PROBLEMATIC LINE
# epsilon decay action
if self.steps > 2000:
if self.EPSILON-1e-4 >self.EPS_END:
self.EPSILON-= 1e-4
else:
self.EPSILON = self.EPS_END
loss = self.Q_eval.loss(Qtarget,Qpred).to(self.Q_eval.device)
loss.backward()
self.Q_eval.optimizer.step()
self.learn_step_counter +=1
env = gym.make("Invader-v0")
agent = DQNAgent(gamma=0.95,epsilon = 1.0,alpha = 0.003, maxMemory = 5000,replace = None)
while agent.memCount < agent.maxMemory:
obs = env.reset()
done = False
lives = 3
while not done:
action = env.action_space.sample()
obs_ , reward, done, info = env.step(action)
if done and info['lives']<lives:
lives = info['lives']
reward -= 200
agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
obs= obs_
initialised = True
scores = []
epsHistory = []
numGames = 50
batch_size = 16
for i in range(numGames):
print(f'starting game {i+1}, epsilon = {agent.EPSILON}')
epsHistory.append(agent.EPSILON)
done = False
obs = env.reset()
frames = [np.sum(obs)]
score = 0
lastAction = 0
lives = 3
while not done:
if len(frames) == 4:
action = agent.chooseAction(frames)
frames = []
else:
action = lastAction
obs_, reward, done, info = env.step(action)
score += score-reward
frames.append(preprocess(obs_))
if done and info['lives'] < lives:
reward -=200
agent.storeTransition(preprocess(obs),action,reward,preprocess(obs_))
obs= obs_
agent.learn(batch_size)
lastAction = action
scores.append(score)
print('score: ', score)
x = [i+1 for i in range(numGames)]
You have to use .detach() here:
Qnext = self.Q_next.forward(list(memory[:,3][:])).detach().to(self.Q_eval.device)
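For context (my addition, not part of the original answer): the target side of the update should not be tracked by autograd, and detaching Qnext removes it from the graph. A small self-contained sketch of the same pattern, with illustrative names rather than the ones from the question:
import torch
import torch.nn as nn

# illustrative stand-ins for Q_eval / Q_next and a batch of transitions
q_eval, q_next = nn.Linear(4, 2), nn.Linear(4, 2)
states, next_states = torch.randn(8, 4), torch.randn(8, 4)
rewards, gamma = torch.randn(8), 0.95

q_pred = q_eval(states)                                   # gradients flow through this
max_next_q = q_next(next_states).detach().max(dim=1)[0]   # target side is detached
q_target = q_pred.detach().clone()
q_target[torch.arange(8), q_pred.argmax(dim=1)] = rewards + gamma * max_next_q

loss = nn.MSELoss()(q_pred, q_target)
loss.backward()                                           # no autograd error from the target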
I have used the Python code below. The model is trained on the sklearn wine dataset using a logistic-regression-style network. The problem is that the weights are not getting updated, and I don't understand where the problem is.
Python code:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
dataset = datasets.load_wine()
x = dataset.data
y = dataset.target
y = y.reshape(178,1)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15,shuffle=True)
print(x_train.shape)
class log_reg():
def __init__(self):
pass
def sigmoid(self,x):
return 1 / (1 + np.exp(-x))
def train(self,x,y,w1,w2,alpha,iterations):
cost_history = [0] * iterations
Y_train = np.zeros([y.shape[0],3])
for i in range(Y_train.shape[0]):
for j in range(Y_train.shape[1]):
if(y[i] == j):
Y_train[i,j] = 1
for iteration in range(iterations):
z1 = x.dot(w1)
a1 = self.sigmoid(z1)
z2 = a1.dot(w2)
a2 = self.sigmoid(z2)
sig_sum = np.sum(np.exp(a2),axis=1)
sig_sum = sig_sum.reshape(a2.shape[0],1)
op = np.exp(a2) / sig_sum
loss = (Y_train * np.log(op))
dl = (op-Y_train)
dz1 = ((dl*(self.sigmoid(z2))*(1-self.sigmoid(z2))).dot(w2.T))*(self.sigmoid(z1))*(1-self.sigmoid(z1))
dz2 = (dl * (self.sigmoid(z2))*(1-self.sigmoid(z2)))
dw1 = x.T.dot(dz1)
dw2 = a1.T.dot(dz2)
w1 += alpha * dw1
w2 += alpha * dw2
cost_history[iteration] = (np.sum(loss)/len(loss))
return w1,w2,cost_history
def predict(self,x,y,w1,w2):
z1 = x.dot(w1)
a1 = self.sigmoid(z1)
z2 = a1.dot(w2)
a2 = self.sigmoid(z2)
sig_sum = np.sum(np.exp(a2),axis=1)
sig_sum = sig_sum.reshape(a2.shape[0],1)
op = np.exp(a2) / sig_sum
y_preds = np.argmax(op,axis=1)
acc = self.accuracy(y_preds,y)
return y_preds,acc
def accuracy(self,y_preds,y):
y_preds = y_preds.reshape(len(y_preds),1)
correct = (y_preds == y)
accuracy = (np.sum(correct) / len(y)) * 100
return (accuracy)
if __name__ == "__main__":
    network = log_reg()
    w1 = np.random.randn(14,4) * 0.01
    w2 = np.random.randn(4,3) * 0.01
    X_train = np.ones([x_train.shape[0],x_train.shape[1]+1])
    X_train[:,:-1] = x_train
    X_test = np.ones([x_test.shape[0],x_test.shape[1]+1])
    X_test[:,:-1] = x_test
    new_w1,new_w2,cost = network.train(X_train,y_train,w1,w2,0.0045,10000)
    y_preds,accuracy = network.predict(X_test,y_test,new_w1,new_w2)
    print(y_preds,accuracy)
In the above code, the parameters are:
x -- training set,
y -- target (output),
w1 -- weights for the first layer,
w2 -- weights for the second layer.
I used logistic regression with 2 hidden layers.
I am trying to train on the wine dataset from sklearn. I don't know where the problem is, but the weights are not updating. Any help would be appreciated.
Your weights are updating, but I think you can't see them changing because you only print them after execution. NumPy arrays are passed by reference in Python, so when you pass w1 into train() and it is modified in place, the original array changes too, and new_w1 and w1 end up being the same array.
Take this example:
import numpy as np
x=np.array([1,2,3,4])
def change(x):
    x += 3
    return x
print(x)
change(x)
print(x)
If you look at the output, it comes out as
[1 2 3 4]
[4 5 6 7]
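Applied to the question's main block, a small sketch (names taken from that code) that keeps untouched copies around so the weights can be compared before and after training:
# keep untouched copies for comparison; train() mutates the arrays it receives
w1_before, w2_before = w1.copy(), w2.copy()
new_w1, new_w2, cost = network.train(X_train, y_train, w1, w2, 0.0045, 10000)
print(np.allclose(w1_before, new_w1))   # False here means the weights did change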
I recommend that you add a bias and fix your accuracy function, as I get an accuracy of 1000.
When I run the code, the w1 and w2 values are indeed changing.
The only thing I changed was the main block (I used the original dataset without the added bias column); please do the same and tell me if your weights are still not updating:
if __name__ == "__main__":
    network = log_reg()
    w1 = np.random.randn(13,4) * 0.01
    w2 = np.random.randn(4,3) * 0.01
    print(w1)
    print(" ")
    print(w2)
    print(" ")
    new_w1,new_w2,cost = network.train(x_train,y_train,w1,w2,0.0045,10000)
    print(w1)
    print(" ")
    print(w2)
    print(" ")
    y_preds,accuracy = network.predict(x_test,y_test,new_w1,new_w2)
    print(y_preds,accuracy)