Marginal distribution integration for likelihood with PyMC3 and theano

I'm trying to reproduce the Random Fatigue-Limit model (Pascual & Meeker 1999; Ryan 2003, doi:10.1198/1061860032012) using PyMC3. The performance of the code I came up with is terrible:
Applied log-transform to s_v and added transformed s_v_log to model.
Applied log-transform to tau and added transformed tau_log to model.
[ 0% ] 2 of 100000 complete in 1.7 sec
[ 0% ] 3 of 100000 complete in 2.9 sec
[ 0% ] 4 of 100000 complete in 3.5 sec
[ 0% ] 5 of 100000 complete in 6.2 sec
[ 0% ] 6 of 100000 complete in 7.5 sec
[ 0% ] 7 of 100000 complete in 13.2 sec
[ 0% ] 8 of 100000 complete in 13.7 sec
[ 0% ] 9 of 100000 complete in 19.4 sec
[ 0% ] 10 of 100000 complete in 113.5 sec
...
[ 0% ] 39 of 100000 complete in 588.8 sec
I suspect the problem lies in the marginal integration of the fatigue life conditioned on the fatigue limit (trapezoidal rule). I left my previous attempts at getting the code to run in as comments, which may help readability. What would be the correct way to do this?
The model is log(lifetime) ~ N(beta0 + beta1*log(stress - fatigue_limit), eps). The fatigue limit is a limiting stress that determines whether the lifetime is finite or not, with log(fatigue_limit) ~ N(mu_v, s_v). Right-censoring is also needed for observations where the test had to be stopped at a finite log-life w (a run-out), so the lumped likelihood 1 - P(W <= w) is used for those observations in my custom likelihood function.
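For reference, the marginal density being approximated can be checked outside Theano. The following is a minimal NumPy/SciPy sketch of the trapezoidal marginalization for a single observation (the function name p_w_numpy is only for illustration and is not part of the model code below):
import numpy as np
from scipy.stats import norm

def p_w_numpy(w, x, b0, b1, mu_v, s_v, tau, N=200):
    # f_W(w | x) = integral of Normal(w | b0 + b1*log(e^x - e^v), 1/sqrt(tau)) * Normal(v | mu_v, s_v) dv
    a = min(x, mu_v - 6.0*s_v)       # lower integration limit
    b = min(x, mu_v + 6.0*s_v)       # upper integration limit
    vs = np.linspace(a, b, N + 1)    # quadrature nodes for the fatigue limit v
    with np.errstate(divide='ignore', invalid='ignore'):
        # below the fatigue limit (vs >= x) the median log-life is effectively infinite
        mu_w = np.where(vs < x, b0 + b1*np.log(np.exp(x) - np.exp(vs)), 1e200)
    integrand = norm.pdf(w, mu_w, 1.0/np.sqrt(tau)) * norm.pdf(vs, mu_v, s_v)
    return np.trapz(integrand, vs)   # trapezoidal rule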
Thank you!
import numpy as np
import pymc3 as pymc
import theano
import theano.tensor as T
from theano.ifelse import ifelse
data = np.array([[161.908352526, 10000000.0],
[181.550578943, 10000000.0],
[201.19280536, 10000000.0],
[220.835031777, 10000000.0],
[240.477258194, 10000000.0],
[260.119484611, 3771909.80463],
[279.761711028, 3031517.02602],
[299.403937445, 246228.344425],
[319.046163862, 164947.588452],
[338.688390279, 57509.1400708],
[358.330616697, 80404.6132032],
[377.972843114, 38003.7533737],
[397.615069531, 5875.28886189],
[417.257295948, 1337.63562072],
[436.899522365, 1641.72977154],
[456.541748782, 184.309099829],
[476.183975199, 239.35420232]])
s = data[:,0] # stresses
y = data[:,1] # lifetimes
infty = 1e7 # Run-out limit
c = np.zeros(y.shape) # Censor vector
c[y <= infty] = 0 # Broken, finite lifetime
c[y > infty] = 1 # Survived, right-censor
x = np.log(s) # Logarithmic stresses
w = np.log(y) # Logarithmic lifetimes
with pymc.Model() as model:
    # Priors
    b0 = pymc.Normal('b0', 70.0, 1.0/35.0**2) # Constant parameter
    b1 = pymc.Normal('b1', -10.0, 1.0/5.0**2) # Slope parameter
    mu_v = pymc.Normal('mu_v', np.log(450.0), 1.0/np.log(0.2**2+1)) # Log-Fatigue limit mean
    s_v = pymc.Lognormal('s_v', np.log(np.sqrt(np.log(0.2**2+1)))-0.5*np.log(0.2**2+1), 1.0/np.log(0.2**2+1)) # Log-Fatigue limit standard deviation
    tau = pymc.Gamma('tau', 0.01, 0.01) # Measurement precision
    v = pymc.Normal('v', mu_v, 1.0/s_v**2) # Log-Fatigue limit

    def mu(x, b0, b1, v): # Logarithmic Random Fatigue-Limit lifetime median value
        # if x-v<=0: # Stress below fatigue limit
        #     return 1e200 # Big number
        # else:
        #     return b0 + b1*np.log(np.exp(x)-np.exp(v))
        results, updates = theano.scan(lambda vi: ifelse(T.lt(x, vi), 1e200, b0 + b1*T.log(T.exp(x)-T.exp(vi))), sequences=v)
        return results

    def p_w(w, x, b0, b1, mu_v, s_v, tau, N=200):
        # Lifetime distribution
        # Integration limits
        # a = min(x, mu_v - 6*s_v)
        # b = min(x, mu_v + 6*s_v)
        a = ifelse(T.lt(mu_v-6*s_v, x), mu_v-6*s_v, x)
        b = ifelse(T.lt(mu_v+6*s_v, x), mu_v+6*s_v, x)
        dv = (b-a)/N
        # Trapezoidal quadrature
        # sum = 0.0
        # for i in range(N+1):
        #     # fi = norm.pdf(w, mu(x, b0, b1, a+i*dv), 1.0/np.sqrt(tau))*norm.pdf(a+i*dv, mu_v, s_v)
        #     fi = T.sqrt(tau/(2.0*np.pi))*T.exp(-tau/2.0*(w - mu(x, b0, b1, a+i*dv))**2)*T.sqrt(1.0/(2.0*np.pi))/s_v*T.exp(-0.5*((a+i*dv - mu_v)/s_v)**2)
        #     if i==0 or i==N: # End points
        #         sum += 0.5*fi
        #     else: # Interior
        #         sum += fi
        # return sum
        vs = a + T.arange(N+1)*dv
        values = T.sqrt(tau/(2.0*np.pi))*T.exp(-tau/2.0*(w - mu(x, b0, b1, vs))**2)*T.sqrt(1.0/(2.0*np.pi))/s_v*T.exp(-0.5*((vs - mu_v)/s_v)**2)
        return dv*(T.sum(values[1:-1]) + 0.5*values[0] + 0.5*values[-1])

    def p_W(w, x, b0, b1, mu_v, s_v, tau, N=200):
        # Cumulative lifetime distribution
        # Integration limits
        # a = min(x, mu_v - 6*s_v)
        # b = min(x, mu_v + 6*s_v)
        a = ifelse(T.lt(mu_v-6*s_v, x), mu_v-6*s_v, x)
        b = ifelse(T.lt(mu_v+6*s_v, x), mu_v+6*s_v, x)
        dv = (b-a)/N
        # Trapezoidal quadrature
        # sum = 0.0
        # for i in range(N+1):
        #     # fi = norm.cdf(w, mu(x, b0, b1, a+i*dv), 1.0/np.sqrt(tau))*norm.pdf(a+i*dv, mu_v, s_v)
        #     fi = 0.5*(1.0 + T.erf(T.sqrt(tau/2.0)*(w - mu(x, b0, b1, a+i*dv))))*T.sqrt(1.0/(2.0*np.pi))/s_v*T.exp(-0.5*((a+i*dv - mu_v)/s_v)**2)
        #     if i==0 or i==N: # End points
        #         sum += 0.5*fi
        #     else: # Interior
        #         sum += fi
        # return sum
        vs = a + T.arange(N+1)*dv
        values = 0.5*(1.0 + T.erf(T.sqrt(tau/2.0)*(w - mu(x, b0, b1, vs))))*T.sqrt(1.0/(2.0*np.pi))/s_v*T.exp(-0.5*((vs - mu_v)/s_v)**2)
        return dv*(T.sum(values[1:-1]) + 0.5*values[0] + 0.5*values[-1])

    def Li(value):
        # Log-likelihood of observation
        # value = np.array([ci, wi, xi])
        # ci = 0 : Broken | 1 : Survived
        # wi : log-lifetime
        # xi : log-stress
        ci = value[0]
        wi = value[1]
        xi = value[2]
        # if ci==0: # Finite lifetime
        #     # return np.log(p_w(wi, xi, b0, b1, mu_v, s_v, tau))
        #     return T.log(p_w(wi, xi, b0, b1, mu_v, s_v, tau))
        # else: # Right-censored observation
        #     # return np.log(1.0-p_W(wi, xi, b0, b1, mu_v, s_v, tau))
        #     return T.log(1.0-p_W(wi, xi, b0, b1, mu_v, s_v, tau))
        return ifelse(T.eq(ci, 0), T.log(p_w(wi, xi, b0, b1, mu_v, s_v, tau)), T.log(1.0-p_W(wi, xi, b0, b1, mu_v, s_v, tau)))

    def L(values):
        # Log-likelihood of observations
        # retval = 0.0
        # for i in range(values.shape[0].eval()):
        #     retval += Li(values[i,:])
        # return retval
        results, updates = theano.scan(lambda i: Li(values[i]), sequences=T.arange(values.shape[0]))
        return T.sum(results)

    data = np.vstack([c, w, x]).T
    mylike = pymc.DensityDist('mylike', L, observed=data)

    # mu, sds, elbo = pymc.variational.advi(n=200000) # pymc advi
    trace = pymc.sample(100000, pymc.NUTS()) # pymc NUTS

Related

Gekko feasible in smaller problem while infeasible in larger problem

I am trying to solve the following problem with Gekko in Python.
I_s is an indicator variable in the problem whose value is 1 if theta is positive and 0 if theta is zero.
I wrote the problem in code using Gekko.
In contrast to my previous posts, I added some constraints with respect to I, the indicator variable.
If I set N=10, the solution theta is all zeros, which is the result I want.
But if I set N=100 or 200, no solution can be found, and I cannot understand why this happens.
I want to check whether theta is also zero for larger N (200).
Is there any way to solve this issue?
My code is as follows.
# Import package
from gekko import GEKKO
import numpy as np
# Define parameters
P_CO = 600 # $/tonCO
beta_CO2 = 1 # no unit
P_CO2 = 80 # $/tonCO2eq
E_ref = 3.1022616 # tonCO2eq/tonCO
E_dir = -1.600570692 # tonCO2eq/tonCO
E_indir_others = 0.3339226804 # tonCO2eq/tonCO
E_indir_elec_cons = 18.46607256 # GJ/tonCO
C1_CAPEX = 285695 # no unit
C2_CAPEX = 188.42 # no unit
C1_FOX = 82282 # no unit
C2_FOX = 24.094 # no unit
C1_ROX = 4471.5 # no unit
C2_ROX = 96.034 # no unit
C1_UOX = 7934.9 # no unit
C2_UOX = 986.9 # no unit
r = 0.08 # discount rate
N = 10 # number of scenarios
T = 30 # total time period
GWP_init = 0.338723235 # 2020 Electricity GWP in EU 27 countries
theta_max = 1600000 # Max capacity
# Function to make GWP_EU matrix (TxN matrix)
def Electricity_GWP(GWP_init, n_years, num_episodes):
    GWP_mean = 0.36258224*np.exp(-0.16395611*np.arange(1, n_years+2)) + 0.03091272
    GWP_mean = GWP_mean.reshape(-1,1)
    GWP_Yearly = np.tile(GWP_mean, num_episodes)
    noise = np.zeros((n_years+1, num_episodes))
    stdev2050 = GWP_mean[-1] * 0.25
    stdev = np.arange(0, stdev2050 * (1 + 1/n_years), stdev2050/n_years)
    for i in range(n_years+1):
        noise[i,:] = np.random.normal(0, stdev[i], num_episodes)
    GWP_forecast = GWP_Yearly + noise
    return GWP_forecast
GWP_EU = Electricity_GWP(GWP_init, T, N) # (T+1)*N matrix
GWP_EU = GWP_EU[1:,:] # T*N matrix
print(np.shape(GWP_EU))
# Build Gekko model
m = GEKKO(remote=False)
theta = m.Array(m.Var, N, lb=0, ub=theta_max)
I = m.Array(m.Var, N, lb=0, ub=1, integer=True)
demand = np.ones((T,1))
demand[0] = 8031887.589
for k in range(1,11):
    demand[k] = demand[k-1] * 1.026
for k in range(11,21):
    demand[k] = demand[k-1] * 1.016
for k in range(21,T):
    demand[k] = demand[k-1] * 1.011
demand = 0.12 * demand
demand = np.tile(demand, N) # T*N matrix
print(np.shape(demand))
m3 = [[m.min3(demand[t,s],theta[s]) for t in range(T)] for s in range(N)]
obj = m.sum([sum([((1/(1+r))**(t+1))*((P_CO*m3[s][t]) \
+ (beta_CO2*P_CO2*m3[s][t]*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s])) \
- (C1_CAPEX*I[s]+C2_CAPEX*theta[s]+C1_FOX*I[s]+C2_FOX*theta[s])\
- (C1_ROX*I[s]+C2_ROX*m3[s][t]+C1_UOX*I[s]+C2_UOX*m3[s][t])) for t in range(T)]) for s in range(N)])
for i in range(N):
    m.Equation(theta[i]<=1000000*I[i])
    m.Equation(-theta[i]<1000000*(1-I[i]))
# obj = m.sum([m.sum([((1/(1+r))**(t+1))*((P_CO*m.min3(demand[t,s], theta[s])) \
# + (beta_CO2*P_CO2*m.min3(demand[t,s], theta[s])*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s])) \
# - (C1_CAPEX+C2_CAPEX*theta[s]+C1_FOX+C2_FOX*theta[s])-(C1_ROX+C2_ROX*m.min3(demand[t,s], theta[s])+C1_UOX+C2_UOX*m.min3(demand[t,s], theta[s]))) for t in range(T)]) for s in range(N)])
m.Maximize(obj/N)
m.solve(disp=True)
# s = m.sum(m.sum(((1/(1+r))**(t+1))*((P_CO*m.min3(demand[t,s], theta[s])) \
# + beta_CO2*P_CO2*m.min3(demand[t,s], theta[s])*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s]) \
# - (C1_CAPEX + C2_CAPEX*theta[s]) - (C1_FOX + C2_FOX*theta[s]) - (C1_ROX + C2_ROX*m.min3(demand[t,s], theta[s])) - (C1_UOX + C2_UOX*m.min3(demand[t,s], theta[s])))
# for s in range(N)) for t in range(T))/N
print(theta)
I solved this issue by increasing the big-M constant in the constraints for the indicator variable I from 1,000,000 to 10,000,000.
for i in range(N):
    m.Equation(theta[i]<=10000000*I[i])
    m.Equation(-theta[i]<10000000*(1-I[i]))
I don't understand why this worked, but the result was a 200x1 solution array of all zeros.
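A side note (this is an assumption on my part, not something verified against this model): one way to avoid guessing the big-M constant is to tie it to the variable's known upper bound, here theta_max, so the constraint can never cap theta below its bound:
# Sketch: big-M tied to theta's upper bound instead of a hand-picked constant
for i in range(N):
    m.Equation(theta[i] <= theta_max*I[i])        # theta[i] can be nonzero only if I[i] = 1
    m.Equation(-theta[i] < theta_max*(1 - I[i]))  # mirrors the original second constraint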

Negative degrees of freedom when using GEKKO Python

I'm trying to solve the optimization problem described above, and my code is below.
It works, but I get a negative degrees-of-freedom warning.
The objective value is also negative, which I did not expect; I expected it to be positive.
I can't understand why this happens or how the problem can be solved.
Can somebody give me a suggestion?
Code
# Import package
from gekko import GEKKO
import numpy as np
# Define parameters
P_CO = 600 # $/tonCO
beta_CO2 = 1 # no unit
P_CO2 = 60 # $/tonCO2eq
E_ref = 3.1022616 # tonCO2eq/tonCO
E_dir = -1.600570692 # tonCO2eq/tonCO
E_indir_others = 0.3339226804 # tonCO2eq/tonCO
E_indir_elec_cons = 18.46607256 # GJ/tonCO
C1_CAPEX = 285695 # no unit
C2_CAPEX = 188.42 # no unit
C1_FOX = 82282 # no unit
C2_FOX = 24.094 # no unit
C1_ROX = 4471.5 # no unit
C2_ROX = 96.034 # no unit
C1_UOX = 1983.7 # no unit
C2_UOX = 249.79 # no unit
r = 0.08 # discount rate
N = 10 # number of scenarios
T = 30 # total time period
GWP_init = 0.338723235 # 2020 Electricity GWP in EU 27 countries
theta_max = 1600000 # Max capacity
# Function to make GWP_EU matrix (TxN matrix)
def Electricity_GWP(GWP_init, n_years, num_episodes):
    GWP_mean = 0.36258224*np.exp(-0.16395611*np.arange(1, n_years+2)) + 0.03091272
    GWP_mean = GWP_mean.reshape(-1,1)
    GWP_Yearly = np.tile(GWP_mean, num_episodes)
    noise = np.zeros((n_years+1, num_episodes))
    stdev2050 = GWP_mean[-1] * 0.25
    stdev = np.arange(0, stdev2050 * (1 + 1/n_years), stdev2050/n_years)
    for i in range(n_years+1):
        noise[i,:] = np.random.normal(0, stdev[i], num_episodes)
    GWP_forecast = GWP_Yearly + noise
    return GWP_forecast
GWP_EU = Electricity_GWP(GWP_init, T, N) # (T+1)*N matrix
GWP_EU = GWP_EU[1:,:] # T*N matrix
print(np.shape(GWP_EU))
# Build Gekko model
m = GEKKO(remote=False)
theta = m.Array(m.Var, N, lb=0, ub=theta_max)
demand = np.ones((T,1))
demand[0] = 8031887.589
for k in range(1,11):
    demand[k] = demand[k-1] * 1.026
for k in range(11,21):
    demand[k] = demand[k-1] * 1.016
for k in range(21,T):
    demand[k] = demand[k-1] * 1.011
demand = 0.12 * demand
demand = np.tile(demand, N) # T*N matrix
print(np.shape(demand))
obj = m.sum([m.sum([((1/(1+r))**(t+1))*((P_CO*m.min3(demand[t,s], theta[s])) \
+ (beta_CO2*P_CO2*m.min3(demand[t,s], theta[s])*(E_ref-E_dir-E_indir_others-E_indir_elec_cons*GWP_EU[t,s])) \
- (C1_CAPEX+C2_CAPEX*theta[s]+C1_FOX+C2_FOX*theta[s])-(C1_ROX+C2_ROX*m.min3(demand[t,s], theta[s])+C1_UOX+C2_UOX*m.min3(demand[t,s], theta[s]))) for t in range(T)]) for s in range(N)])
m.Maximize(obj/N)
m.solve()
Output message
(30, 10)
(30, 10)
----------------------------------------------------------------
APMonitor, Version 1.0.0
APMonitor Optimization Suite
----------------------------------------------------------------
--------- APM Model Size ------------
Each time step contains
Objects : 11
Constants : 0
Variables : 5121
Intermediates: 0
Connections : 321
Equations : 3901
Residuals : 3901
Number of state variables: 5121
Number of total equations: - 3911
Number of slack variables: - 2400
---------------------------------------
Degrees of freedom : -1190
* Warning: DOF <= 0
----------------------------------------------
Steady State Optimization with APOPT Solver
----------------------------------------------
Iter: 1 I: 0 Tm: 18.61 NLPi: 5 Dpth: 0 Lvs: 0 Obj: -1.87E+09 Gap: 0.00E+00
Successful solution
---------------------------------------------------
Solver : APOPT (v1.0)
Solution time : 18.619200000000003 sec
Objective : -1.8677021320161405E+9
Successful solution
---------------------------------------------------
The negative DOF warning is caused by the slack variables that are created when using the min3() function. It is only a warning: if all of the inequalities were active, the problem could become an over-specified system of equations (more equations than variables). If there is a successful solution, the warning can be ignored.
The negative objective is because most solvers require a minimization of the objective. Gekko automatically converts m.Maximize(obj) to m.Minimize(-obj). This is an equivalent objective. If you'd like to report the maximization and the positive objective, use the following at the end:
print('Objective: ',-m.options.OBJFCNVAL)

How to use numpy to speed up code that calculates center of mass?

I made a small block of code that, given n objects with specified masses and vector coordinates over time, calculates the center of mass. I think the code looks clunky (it uses three for-loops), and I was wondering if there are numpy methods to vectorize (or at least speed up) this calculation. As a note, the use of the Body class could probably be avoided for this task, but it is used in other relevant code not shown here.
import numpy as np
class Body():
    def __init__(self, mass, position):
        self.mass = mass
        self.position = position

    def __str__(self):
        return '\n .. mass:\n{}\n\n .. position:\n{}\n'.format(self.mass, self.position)
Three objects are initialized.
mass = 100 # same for all 3 objects
ndim = 3 # 3 dimensional space
nmoments = 10 # 10 moments in time
## initialize bodies
nelems = ndim * nmoments
x = np.arange(nelems).astype(int).reshape((nmoments, ndim))
A = Body(mass, position=x)
B = Body(mass, position=x / 2.)
C = Body(mass, position=x * 2.)
bodies = [A, B, C]
total_mass = sum([body.mass for body in bodies])
# print("\n ** A **\n{}\n".format(A))
# print("\n ** B **\n{}\n".format(B))
# print("\n ** C **\n{}\n".format(C))
## get center of mass
center_of_mass = []
for dim in range(ndim):
    coms = []
    for moment in range(nmoments):
        numerator = 0
        for body in bodies:
            numerator += body.mass * body.position[moment, dim]
        com = numerator / total_mass
        coms.append(com)
    center_of_mass.append(coms)
center_of_mass = np.array(center_of_mass).T
# print("\n .. center of mass:\n{}\n".format(center_of_mass))
As verification that the code works, the print statements in the code above output the following:
** A **
.. mass:
100
.. position:
[[ 0 1 2]
[ 3 4 5]
[ 6 7 8]
[ 9 10 11]
[12 13 14]
[15 16 17]
[18 19 20]
[21 22 23]
[24 25 26]
[27 28 29]]
** B **
.. mass:
100
.. position:
[[ 0. 0.5 1. ]
[ 1.5 2. 2.5]
[ 3. 3.5 4. ]
[ 4.5 5. 5.5]
[ 6. 6.5 7. ]
[ 7.5 8. 8.5]
[ 9. 9.5 10. ]
[10.5 11. 11.5]
[12. 12.5 13. ]
[13.5 14. 14.5]]
** C **
.. mass:
100
.. position:
[[ 0. 2. 4.]
[ 6. 8. 10.]
[12. 14. 16.]
[18. 20. 22.]
[24. 26. 28.]
[30. 32. 34.]
[36. 38. 40.]
[42. 44. 46.]
[48. 50. 52.]
[54. 56. 58.]]
.. center of mass:
[[ 0. 1.16666667 2.33333333]
[ 3.5 4.66666667 5.83333333]
[ 7. 8.16666667 9.33333333]
[10.5 11.66666667 12.83333333]
[14. 15.16666667 16.33333333]
[17.5 18.66666667 19.83333333]
[21. 22.16666667 23.33333333]
[24.5 25.66666667 26.83333333]
[28. 29.16666667 30.33333333]
[31.5 32.66666667 33.83333333]]
Using numpy will speed things up and make the code cleaner. I'm not an expert in n-body problems, so I've hopefully followed the algorithm correctly; the results look to be the same. All the loops become implicit in numpy.
# ***** From the question *****
import numpy as np
class Body():
    def __init__(self, mass, position):
        self.mass = mass
        self.position = position

    def __str__(self):
        return '\n .. mass:\n{}\n\n .. position:\n{}\n'.format(self.mass, self.position)
mass = 100 # same for all 3 objects
ndim = 3 # 3 dimensional space
nmoments = 10 # 10 moments in time
## initialize bodies
nelems = ndim * nmoments
x = np.arange(nelems).astype(int).reshape((nmoments, ndim))
A = Body(mass, position=x)
B = Body(mass, position=x / 2.)
C = Body(mass, position=x * 2.)
bodies = [A, B, C]
# **** End of code from the question ****
# Fill the numpy arrays
np_mass = np.array( [ body.mass for body in bodies ])[ :,None, None ]
# the [:, None, None] turns np_mass into a 3D array for correct broadcasting
np_pos = np.array( [ body.position for body in bodies ]) # 3D
np_mass.shape
# (3, 1, 1) # (n_bodies, 1, 1 ) - The two 'spare' dimensions force the broadcasting to be along the correct axes
np_pos.shape
# (3, 10, 3) # ( n_bodies, nmoments, ndims )
total_mass = np_mass.sum() # Sum the three masses
numerator = (np_mass * np_pos).sum(axis=0) # sum np_mass * np_pos along the body (0) axis.
com = numerator / total_mass # divide by total_mass
# Could be a oneliner
# com = (np_mass * np_pos).sum(axis=0) / np.mass.sum()
print(com)
# array([[ 0. , 1.16666667, 2.33333333],
# [ 3.5 , 4.66666667, 5.83333333],
# [ 7. , 8.16666667, 9.33333333],
# [10.5 , 11.66666667, 12.83333333],
# [14. , 15.16666667, 16.33333333],
# [17.5 , 18.66666667, 19.83333333],
# [21. , 22.16666667, 23.33333333],
# [24.5 , 25.66666667, 26.83333333],
# [28. , 29.16666667, 30.33333333],
# [31.5 , 32.66666667, 33.83333333]])
For just three bodies, the same result can also be computed directly:
center_of_mass = (A.mass * A.position + B.mass * B.position + C.mass * C.position) / total_mass
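As a quick sanity check (assuming both the broadcasted com above and this direct version have been computed), the two results can be compared:
print(np.allclose(com, center_of_mass))  # expected: True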

How to calculate the CDF of a continuous regression

I need to calculate a CDF for a regression. I have N observations and need to re-estimate the coefficients (beta) in a joint distribution.
Yobs contains my observations, and Y is calculated as X (the matrix of predictors) times the array of coefficients (betas).
def CDF(beta):
    Y = X.dot(beta)
    sigma = 0
    for n in range(0, N):
        sigma = sigma + (np.square(Yobs[n] - Y[n]))  # sum of squared residuals
    SSR = sigma / N  # mu (mean or expectation)
    dof = N - P - 1  # degrees of freedom
    var = sigma / dof  # the mean square of residuals
    PDF = np.zeros(N)
    CDF = np.zeros(N)  # I want to calculate F(X < Yobs)
    for n in range(0, N):
        PDF[n] = (1/np.sqrt(2*np.pi*var))*np.exp(-SSR/(2*var))  # probability density function
        CDF[n] = integrate.quad(PDF, -np.inf, (Yobs+a))  # CDF
    return CDF
Where am I wrong? I think the CDF part is wrong, since I haven't properly defined the argument of the integration, but how should I define it? Can I simply use the following?
from scipy.stats import norm
def CDF(beta):
    Y = X.dot(beta)
    sigma = 0
    for n in range(0, N):
        sigma = sigma + (np.square(Yobs[n] - Y[n]))  # sum of squared residuals
    SSR = sigma / N  # mu (mean or expectation)
    dof = N - P - 1  # degrees of freedom
    var = sigma / dof  # the mean square of residuals
    CDF = np.zeros(N)
    for n in range(0, N):  # I want to calculate F(X < Yobs)
        CDF[n] = norm.cdf(Yobs[n], SSR, var)
    return CDF
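If the intent is F(Y <= Yobs) under a normal error model, note that scipy's norm.cdf takes the standard deviation (scale) rather than the variance, and the location would usually be the fitted value Y[n] rather than the mean squared residual. A vectorized sketch under those assumptions (the exact model intended here is not fully specified, so treat this as an illustration only):
from scipy.stats import norm
import numpy as np

def CDF(beta, X, Yobs, P):
    # Sketch: normal residuals, location = fitted value, scale = residual standard deviation
    Y = X.dot(beta)                       # fitted values
    N = len(Yobs)
    dof = N - P - 1                       # residual degrees of freedom
    var = np.sum((Yobs - Y)**2) / dof     # residual variance estimate
    return norm.cdf(Yobs, loc=Y, scale=np.sqrt(var))  # F(Y_n <= Yobs_n) for each observation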

Backpropagation neural network

I need to use a backpropagation neural network for multiclass classification in my application. I found this code and tried to adapt it to my needs. It is based on Andrew Ng's Machine Learning lectures on Coursera.
I tested it on the Iris dataset and achieved good results (classification accuracy around 0.96), whereas on my real data I get terrible results. I assume there is some implementation error, because the data is very simple. But I cannot figure out what exactly the problem is.
Which parameters does it make sense to adjust?
I tried adjusting:
the number of units in the hidden layer
the regularization parameter (lambda)
the number of iterations for the minimization function
The built-in minimization function used in this code confuses me. It is called just once, as @goncalopp mentioned in a comment. Shouldn't it iteratively update the weights? How can that be implemented?
Here is my training data (target class is in the last column):
65535, 3670, 65535, 3885, -0.73, 1
65535, 3962, 65535, 3556, -0.72, 1
65535, 3573, 65535, 3529, -0.61, 1
3758, 3123, 4117, 3173, -0.21, 0
3906, 3119, 4288, 3135, -0.28, 0
3750, 3073, 4080, 3212, -0.26, 0
65535, 3458, 65535, 3330, -0.85, 2
65535, 3315, 65535, 3306, -0.87, 2
65535, 3950, 65535, 3613, -0.84, 2
65535, 32576, 65535, 19613, -0.35, 3
65535, 16657, 65535, 16618, -0.37, 3
65535, 16657, 65535, 16618, -0.32, 3
The dependencies are so obvious that I think it should be easy to classify...
But the results are terrible: I get an accuracy of 0.6 to 0.8, which is unacceptable for my application. Can someone please point out possible improvements I could make in order to achieve better results?
Here is the code:
import numpy as np
from scipy import optimize
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
import math
class NN_1HL(object):
    def __init__(self, reg_lambda=0, epsilon_init=0.12, hidden_layer_size=25, opti_method='TNC', maxiter=500):
        self.reg_lambda = reg_lambda
        self.epsilon_init = epsilon_init
        self.hidden_layer_size = hidden_layer_size
        self.activation_func = self.sigmoid
        self.activation_func_prime = self.sigmoid_prime
        self.method = opti_method
        self.maxiter = maxiter

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def sigmoid_prime(self, z):
        sig = self.sigmoid(z)
        return sig * (1 - sig)

    def sumsqr(self, a):
        return np.sum(a ** 2)

    def rand_init(self, l_in, l_out):
        self.epsilon_init = (math.sqrt(6))/(math.sqrt(l_in + l_out))
        return np.random.rand(l_out, l_in + 1) * 2 * self.epsilon_init - self.epsilon_init

    def pack_thetas(self, t1, t2):
        return np.concatenate((t1.reshape(-1), t2.reshape(-1)))

    def unpack_thetas(self, thetas, input_layer_size, hidden_layer_size, num_labels):
        t1_start = 0
        t1_end = hidden_layer_size * (input_layer_size + 1)
        t1 = thetas[t1_start:t1_end].reshape((hidden_layer_size, input_layer_size + 1))
        t2 = thetas[t1_end:].reshape((num_labels, hidden_layer_size + 1))
        return t1, t2

    def _forward(self, X, t1, t2):
        m = X.shape[0]
        ones = None
        if len(X.shape) == 1:
            ones = np.array(1).reshape(1,)
        else:
            ones = np.ones(m).reshape(m,1)
        # Input layer
        a1 = np.hstack((ones, X))
        # Hidden layer
        z2 = np.dot(t1, a1.T)
        a2 = self.activation_func(z2)
        a2 = np.hstack((ones, a2.T))
        # Output layer
        z3 = np.dot(t2, a2.T)
        a3 = self.activation_func(z3)
        return a1, z2, a2, z3, a3

    def function(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
        m = X.shape[0]
        Y = np.eye(num_labels)[y]
        _, _, _, _, h = self._forward(X, t1, t2)
        costPositive = -Y * np.log(h).T
        costNegative = (1 - Y) * np.log(1 - h).T
        cost = costPositive - costNegative
        J = np.sum(cost) / m
        if reg_lambda != 0:
            t1f = t1[:, 1:]
            t2f = t2[:, 1:]
            reg = (self.reg_lambda / (2 * m)) * (self.sumsqr(t1f) + self.sumsqr(t2f))
            J = J + reg
        return J

    def function_prime(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
        m = X.shape[0]
        t1f = t1[:, 1:]
        t2f = t2[:, 1:]
        Y = np.eye(num_labels)[y]
        Delta1, Delta2 = 0, 0
        for i, row in enumerate(X):
            a1, z2, a2, z3, a3 = self._forward(row, t1, t2)
            # Backprop
            d3 = a3 - Y[i, :].T
            d2 = np.dot(t2f.T, d3) * self.activation_func_prime(z2)
            Delta2 += np.dot(d3[np.newaxis].T, a2[np.newaxis])
            Delta1 += np.dot(d2[np.newaxis].T, a1[np.newaxis])
        Theta1_grad = (1 / m) * Delta1
        Theta2_grad = (1 / m) * Delta2
        if reg_lambda != 0:
            Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + (reg_lambda / m) * t1f
            Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + (reg_lambda / m) * t2f
        return self.pack_thetas(Theta1_grad, Theta2_grad)

    def fit(self, X, y):
        num_features = X.shape[0]
        input_layer_size = X.shape[1]
        num_labels = len(set(y))
        theta1_0 = self.rand_init(input_layer_size, self.hidden_layer_size)
        theta2_0 = self.rand_init(self.hidden_layer_size, num_labels)
        thetas0 = self.pack_thetas(theta1_0, theta2_0)
        options = {'maxiter': self.maxiter}
        _res = optimize.minimize(self.function, thetas0, jac=self.function_prime, method=self.method,
                                 args=(input_layer_size, self.hidden_layer_size, num_labels, X, y, 0), options=options)
        self.t1, self.t2 = self.unpack_thetas(_res.x, input_layer_size, self.hidden_layer_size, num_labels)
        np.savetxt("weights_t1.txt", self.t1, newline="\n")
        np.savetxt("weights_t2.txt", self.t2, newline="\n")

    def predict(self, X):
        return self.predict_proba(X).argmax(0)

    def predict_proba(self, X):
        _, _, _, _, h = self._forward(X, self.t1, self.t2)
        return h
##################
# IR data #
##################
values = np.loadtxt('infrared_data.txt', delimiter=', ', usecols=[0,1,2,3,4])
targets = np.loadtxt('infrared_data.txt', delimiter=', ', dtype=(int), usecols=[5])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(values, targets, test_size=0.4)
nn = NN_1HL()
nn.fit(values, targets)
print("Accuracy of classification: "+str(accuracy_score(y_test, nn.predict(X_test))))
The most obvious problem is that your training dataset is very small.
Since you're using scipy.optimize.minimize instead of the usual iterative gradient descent, I think it's also likely that you're overfitting your model to your training data. Possibly an iterative algorithm works better here. Don't forget to carefully monitor the validation error.
If you try backpropagation with gradient descent, note that, depending on the parameters used in backpropagation, neural networks can take a while to converge.
You can try to feed the network the same training data multiple times or tweak the learning rate, but ideally you should use more diverse data.
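If you do want to experiment with a plain gradient-descent loop instead of scipy.optimize.minimize, a minimal sketch reusing the class's function_prime could look like the following (the learning rate and iteration count are arbitrary assumptions, not tuned values):
# Minimal batch gradient-descent sketch reusing NN_1HL.function_prime
# (assumes X_train, y_train from the question; learning_rate and n_epochs are arbitrary)
nn = NN_1HL()
input_layer_size = X_train.shape[1]
num_labels = len(set(y_train))
thetas = nn.pack_thetas(nn.rand_init(input_layer_size, nn.hidden_layer_size),
                        nn.rand_init(nn.hidden_layer_size, num_labels))
learning_rate = 0.5
n_epochs = 1000
for epoch in range(n_epochs):
    grad = nn.function_prime(thetas, input_layer_size, nn.hidden_layer_size,
                             num_labels, X_train, y_train, nn.reg_lambda)
    thetas = thetas - learning_rate * grad   # gradient-descent update of the packed weights
nn.t1, nn.t2 = nn.unpack_thetas(thetas, input_layer_size, nn.hidden_layer_size, num_labels)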
Correctly normalizing the data solved the problem. I used the preprocessing module from sklearn. Here is an example:
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1., 2.],
[ 2., 0., 0.],
[ 0., 1., -1.]])
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
print(X_train_minmax)
X_test = np.array([[ -3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
print(X_test_minmax)
And the output is:
[[ 0.5 0. 1. ]
[ 1. 0.5 0.3333]
[ 0. 1. 0. ]]
[[-1.5 0. 1.6667]]
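Applied to the problem above, the same pattern would be to fit the scaler on the training features only and reuse it for the test split before calling nn.fit (a sketch, assuming the train/test split from the question):
scaler = preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on training data only
X_test_scaled = scaler.transform(X_test)         # apply the same scaling to the test data
nn = NN_1HL()
nn.fit(X_train_scaled, y_train)
print("Accuracy of classification: " + str(accuracy_score(y_test, nn.predict(X_test_scaled))))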
