Neural network numerical gradient check not working with matrices using Python-numpy - python-3.x

I'm trying to implement a simple numerical gradient check using Python 3 and numpy, to be used for a neural network.
It works well for simple 1D functions but fails when applied to matrices of parameters.
My guess is that either my cost function is not calculated well for a matrix or that the way I do the numerical gradient check is wrong somehow.
See code below and thanks for your help!
import numpy as np
import random
import copy
def gradcheck_naive(f, x):
    """ Gradient check for a function f.
    Arguments:
    f -- a function that takes a single argument (x) and outputs the
         cost (fx) and its gradients grad
    x -- the point (numpy array) to check the gradient at
    """
    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # Evaluate function value at original point
    # fx = cost
    # grad = gradient
    h = 1e-4
    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index  # multi-index number
        random.setstate(rndstate)
        xp = copy.deepcopy(x)
        xp[ix] += h
        fxp, gradp = f(xp)
        random.setstate(rndstate)
        xn = copy.deepcopy(x)
        xn[ix] -= h
        fxn, gradn = f(xn)
        numgrad = (fxp - fxn) / (2 * h)
        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (
                grad[ix], numgrad))
            return
        it.iternext()  # Step to next dimension
    print("Gradient check passed!")
#sanity check with 1D function
exp_f = lambda x: (np.sum(np.exp(x)), np.exp(x))
gradcheck_naive(exp_f, np.random.randn(4,5)) #this works fine
#sanity check with matrices
#forward pass
W = np.random.randn(5,10)
x = np.random.randn(10,3)
D = W.dot(x)
#backpropagation pass
gradx = W
func_f = lambda x: (np.sum(W.dot(x)), gradx)
gradcheck_naive(func_f, np.random.randn(10,3)) #this does not work (grad check fails)

I figured it out! (my math teacher would be so proud...)
The short answer is that I was mixing up the matrix dot product and the element-wise product.
When using an element-wise product, the gradient is:
W = np.array([[2,4],[3,5],[3,1]])
x = np.array([[1,7],[5,-1],[4,7]])
D = W*x #element-wise multiplication
gradx = W
func_f = lambda x: (np.sum(W*x), gradx)
gradcheck_naive(func_f, np.random.randn(3,2))
When using the dot product, the gradient becomes:
W = np.array([[2,4],[3,5]])
x = np.array([[1,7],[5,-1],[5,1]])
D = x.dot(W)
unitary = np.array([[1,1],[1,1],[1,1]])
gradx = unitary.dot(np.transpose(W))
func_f = lambda x: (np.sum(x.dot(W)), gradx)
gradcheck_naive(func_f, np.random.randn(3,2))
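Applying the same reasoning to the original W.dot(x) setup from the question, the gradient with respect to x is W transposed times a matrix of ones shaped like the output (a sketch added here for completeness; gradWx is just an illustrative name):
W = np.random.randn(5, 10)
# f(x) = np.sum(W.dot(x)): x[i, j] multiplies column i of W,
# so d f / d x[i, j] = sum_k W[k, i], independent of j
unitary = np.ones((5, 3))                        # same shape as W.dot(x)
gradWx = np.transpose(W).dot(unitary)            # shape (10, 3), same as x
func_f = lambda x: (np.sum(W.dot(x)), gradWx)
gradcheck_naive(func_f, np.random.randn(10, 3))  # passes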
I was also wondering how the element-wise product behaves with matrices of unequal dimensions, like below:
x = np.random.randn(10)
W = np.random.randn(3,10)
D1 = x*W
D2 = W*x
It turns out that D1 = D2 (with the same dimensions as W, 3x10), and my understanding is that x is broadcast by numpy to a 3x10 matrix to allow the element-wise multiplication.
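A quick way to confirm the broadcasting behaviour (a small check added here):
x = np.random.randn(10)
W = np.random.randn(3, 10)
# broadcasting x against W is equivalent to explicitly tiling x to W's shape
print(np.allclose(x * W, x[None, :] * W))         # True
print(np.allclose(x * W, np.tile(x, (3, 1)) * W)) # True
print((x * W).shape)                              # (3, 10)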
Conclusion: when in doubt, write it out with small matrices to figure out where the error is.

Related

Linear spline loop to quadratic spline loop

import numpy as np
from numpy import linalg as la  # assuming la refers to numpy.linalg (scipy.linalg would work the same here)

def LinearSpline(x, fx):  # to determine the coefficients
    '''
    Valid call:
        coeffs = LinearSpline(x, fx)
    Inputs:
        x  : (array) x values at which we have f(x) values.
        fx : (array) f(x) values associated with x values.
    Output:
        coeffs : (array) coefficients of the linear spline.
    Assumption:
        All inputs are given correctly.
    '''
    nsegs = len(x) - 1
    A = np.zeros((2*nsegs, 2*nsegs))
    b = np.zeros((2*nsegs, 1))
    for i in range(nsegs):
        A[2*i, 2*i] = x[i]
        A[2*i, 2*i+1] = 1.0
        A[2*i+1, 2*i] = x[i+1]
        A[2*i+1, 2*i+1] = 1.0
        b[2*i] = fx[i]
        b[2*i+1] = fx[i+1]
    # solve the system
    coeffs = la.solve(A, b)
    print(A)
    print(b)
    return coeffs
I created a linear spline loop and now need to create a quadratic one as well, but I am having trouble producing the "c" component for the c coefficient. Any help would be greatly appreciated.
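One common way to set up the quadratic version follows the same pattern, with three coefficients (a, b, c) per segment: two interpolation equations per segment, first-derivative continuity at the interior knots, and one extra closing condition (a_0 = 0 below). This is only a sketch under those standard assumptions, not code from the thread; QuadraticSpline is a hypothetical name and la is the same linear algebra module as above.
def QuadraticSpline(x, fx):
    nsegs = len(x) - 1
    A = np.zeros((3*nsegs, 3*nsegs))
    b = np.zeros((3*nsegs, 1))
    row = 0
    # each segment i passes through (x[i], fx[i]) and (x[i+1], fx[i+1])
    for i in range(nsegs):
        A[row, 3*i:3*i+3] = [x[i]**2, x[i], 1.0]
        b[row] = fx[i]
        row += 1
        A[row, 3*i:3*i+3] = [x[i+1]**2, x[i+1], 1.0]
        b[row] = fx[i+1]
        row += 1
    # first derivatives of neighbouring segments match at the interior knots
    for i in range(nsegs - 1):
        A[row, 3*i:3*i+3] = [2*x[i+1], 1.0, 0.0]
        A[row, 3*(i+1):3*(i+1)+3] = [-2*x[i+1], -1.0, 0.0]
        row += 1
    # closing condition: make the first segment linear (a_0 = 0)
    A[row, 0] = 1.0
    coeffs = la.solve(A, b)   # coeffs.reshape(-1, 3) gives rows of [a_i, b_i, c_i]
    return coeffs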

Numpy.linalg.eig is giving different results than numpy.linalg.eigh for Hermitian matrices

I have a Hermitian matrix (specifically, a Hamiltonian). Though the phase of a single eigenvector can be arbitrary, the quantities I am calculating are physical (I reduced the code a bit, keeping just the reproducible part). eig and eigh are giving very different results.
import numpy as np
import numpy.linalg as nlg
import matplotlib.pyplot as plt
def Ham(Ny, Nx, t, phi):
    h = np.zeros((Ny,Ny), dtype=complex)
    for ii in range(Ny-1):
        h[ii+1,ii] = t
    h[Ny-1,0] = t
    h = h + np.transpose(np.conj(h))
    u = np.zeros((Ny,Ny), dtype=complex)
    for ii in range(Ny):
        u[ii,ii] = -t*np.exp(-2*np.pi*1j*phi*ii)
    u = u + 1e-10*np.eye(Ny)
    H = np.kron(np.eye(Nx,dtype=int),h) + np.kron(np.diag(np.ones(Nx-1), 1),u) + np.kron(np.diag(np.ones(Nx-1), -1),np.transpose(np.conj(u)))
    H[0:Ny,Ny*(Nx-1):Ny*Nx] = np.transpose(np.conj(u))
    H[Ny*(Nx-1):Ny*Nx,0:Ny] = u
    x = []; y = []
    for jj in range(1,Nx+1):
        for ii in range(1,Ny+1):
            x.append(jj); y.append(ii)
    x = np.asarray(x)
    y = np.asarray(y)
    return H, x, y

def C_num(Nx, Ny, E, t, phi):
    H, x, y = Ham(Ny, Nx, t, phi)
    ifhermitian = np.allclose(H, np.transpose(np.conj(H)), rtol=1e-5, atol=1e-8)
    assert ifhermitian == True
    Hp = H
    V, wf = nlg.eigh(Hp)  ## Check. eig gives different result
    idx = np.argsort(np.real(V))
    wf = wf[:, idx]
    normmat = wf*np.conj(wf)
    norm = np.sqrt(np.sum(normmat, axis=0))
    wf = wf/(norm*np.sqrt(len(H)))
    wf = wf[:, V<=E]  ## Choose a subset of eigenvectors
    V01 = wf*np.exp(1j*x)[:,None]; V12 = wf*np.exp(1j*y)[:,None]
    V23 = wf*np.exp(1j*x)[:,None]; V30 = wf*np.exp(1j*y)[:,None]
    wff = np.transpose(np.conj(wf))
    C01 = np.dot(wff,V01); C12 = np.dot(wff,V12); C23 = np.dot(wff,V23); C30 = np.dot(wff,V30)
    F = nlg.multi_dot([C01,C12,C23,C30])
    ifhermitian = np.allclose(F, np.transpose(np.conj(F)), rtol=1e-5, atol=1e-8)
    assert ifhermitian == True
    evals, efuns = nlg.eig(F)  ## Check. eig gives different result
    C = (1/(2*np.pi))*np.sum(np.angle(evals))
    return C

C = C_num(16, 16, 0, 1, 1/8)
print(C)
Changing both nlg.eigh calls to nlg.eig, or even changing only the last one, gives very different results.
As I mentioned elsewhere, the ordering of the eigenvalues and the choice of eigenvectors are not unique.
The only thing that is guaranteed is that each eigenpair satisfies $A v = \lambda v$; the matrices returned by eig and eigh both describe such solutions, and it is natural for eig to give inexact (approximate) results.
You can see that both solutions diagonalize your matrix, in different ways:
H, x, y = Ham(16, 16, 1, 1./8)
D, V = nlg.eig(H)
Dh, Vh = nlg.eigh(H)
Then
import matplotlib.pyplot as plt
plt.figure(figsize=(14, 7))
plt.subplot(121);
plt.imshow(abs(np.conj(Vh.T) @ H @ Vh))
plt.title('diagonalized with eigh')
plt.subplot(122);
plt.imshow(abs(np.conj(V.T) @ H @ V))
plt.title('diagonalized with eig')
This plots two panels, 'diagonalized with eigh' and 'diagonalized with eig'.
Both diagonalizations were successful, but the eigenvalues come out in a different order.
If you sort the eigenvalues, you see that they match:
plt.plot(np.diag(np.real(np.conj(Vh.T) @ H @ Vh)))
plt.plot(np.diag(np.imag(np.conj(Vh.T) @ H @ Vh)))
plt.plot(np.sort(np.diag(np.real(np.conj(V.T) @ H @ V))))
plt.title('eigenvalues')
plt.legend(['real eigh', 'imag eigh', 'sorted real eig'], loc='upper left')
Since many eigenvalues are repeated, the eigenvector associated with a given eigenvalue is not unique either; the only thing we can guarantee is that the eigenvectors for a given eigenvalue must span the same subspace.
In my opinion, the diagonalization test is the best check.
Is eigh always better than eig?
If you look for eigenvalue solvers among the LAPACK routines you will find many options, so I cannot discuss each possible implementation here. Common sense suggests that the symmetric/Hermitian routines should perform better, otherwise there would be no reason to add an extra, more limited routine. But I have never carefully tested the behavior of eig vs eigh.
For intuition, compare the tridiagonalization used for symmetric matrices with the reduction of a general matrix to its Hessenberg form, found here.
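As a small illustration of the ordering/phase ambiguity (a minimal sketch, not from the answer above): build a random Hermitian matrix, diagonalize it with both routines, and compare the sorted eigenvalues.
import numpy as np
import numpy.linalg as nlg

rng = np.random.default_rng(0)
M = rng.normal(size=(6, 6)) + 1j*rng.normal(size=(6, 6))
H = M + np.conj(M.T)                      # Hermitian by construction

D, V = nlg.eig(H)                         # eigenvalues in no particular order
Dh, Vh = nlg.eigh(H)                      # eigenvalues ascending, real dtype

print(np.allclose(np.sort(np.real(D)), Dh))              # True: same spectrum
print(np.allclose(np.conj(Vh.T) @ H @ Vh, np.diag(Dh)))  # True: Vh diagonalizes H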

How can I calculate the gradient of a vector field from its values?

I'd like some help with numpy and arrays. I want to calculate the gradient of a vector field.
Suppose I have a function foo that takes a tuple of coordinates (x,y,z) and returns a vector (u,v,w).
Then if I have an array of coordinates POS = [[x1,y1,z1],[x2,y2,z2],[x3,y3,z3],etc] I can generate an array of vectors with origins in POS and directions given by DIR = [[u1,v1,w1],[u2,v2,w2],[u3,v3,w3],etc].
Now how could I calculate the gradient of this vector field at every point of POS? What I need in the end would be something like another array GRAD = [grad1, grad2, grad3, etc] where every grad would be a 3x3 array of the partial derivatives of the vector field at the corresponding point in POS.
PS: I know I could differentiate the function foo by hand and then implement the derivatives in Python, but in my case foo is really complex and I want to do the derivatives this way :)
EDIT1: for now I'm coming up with POS this way:
parts = 100
limit = 10
xs = linspace(-limit, limit, parts)
ys = linspace(-limit, limit, parts)
zs = linspace(-limit, limit, parts)
POS = array([(x, y, z) for z in zs for y in ys for x in xs])
DIR = array([foo(pos) for pos in POS])
which allows me to do this if necessary:
POS = POS.reshape(parts,parts,parts,3)
DIR = DIR.reshape(parts,parts,parts,3)
You can use numpy.gradient for this in the following way:
import numpy as np
N = 100
limit = .1
def vec(x, y, z):  # Example vector field
    return np.array([x, x, z])

x = np.arange(-limit, limit, 2*limit/N)  # np.arange takes the spacing as 3rd arg
y = np.arange(-limit, limit, 2*limit/N)
z = np.arange(-limit, limit, 2*limit/N)
# Create 3D grid from 1D arrays, indexing is important!
X, Y, Z = np.meshgrid(x, y, z, indexing='ij')
V = vec(X, Y, Z)  # Get vector field, shape: (3,N,N,N)
D = np.gradient(V, x, y, z, axis=(1, 2, 3))  # Get gradient, this is a list!
D = np.array(D).transpose((1, 0, 2, 3, 4))  # Make it an array and swap the first two axes.
The resulting array has shape (3,3,N,N,N) where the first axis indexes the component of the vector field and the second axis indexes the coordinate direction w.r.t. which the derivative was computed.
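To make the axis convention concrete, here is a small check of the example above (assuming the code as written): for vec = [x, x, z] the Jacobian is constant, so D[:, :, i, j, k] at any grid point should be approximately [[1,0,0],[1,0,0],[0,0,1]], with D[m, n] holding the derivative of component m with respect to coordinate n.
# D has shape (3, 3, N, N, N); inspect the Jacobian at one grid point
J = D[:, :, N//2, N//2, N//2]
print(np.round(J, 3))
# [[1. 0. 0.]
#  [1. 0. 0.]
#  [0. 0. 1.]]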

How to vectorize a function of two matrices in numpy?

Say, I have a binary (adjacency) matrix A of dimensions nxn and another matrix U of dimensions nxl. I use the following piece of code to compute a new matrix that I need.
import numpy as np
from numpy import linalg as LA
new_U = np.zeros_like(U)
for idx, a in np.ndenumerate(A):
    diff = U[idx[0], :] - U[idx[1], :]
    if a == 1.0:
        new_U[idx[0], :] += 2 * diff
    elif a == 0.0:
        norm_diff = LA.norm(U[idx[0], :] - U[idx[1], :])
        new_U[idx[0], :] += -2 * diff * np.exp(-norm_diff**2)
return new_U
This takes quite a lot of time to run even when n and l are small. Is there a better way to rewrite (vectorize) this code to reduce the runtime?
Edit 1: Sample input and output.
A = np.array([[0,1,0], [1,0,1], [0,1,0]], dtype='float64')
U = np.array([[2,3], [4,5], [6,7]], dtype='float64')
new_U = np.array([[-4.,-4.], [0,0],[4,4]], dtype='float64')
Edit 2: In mathematical notation, I am trying to compute the following:
new_u_{ik} = \sum_{j:(i,j)\in E} 2\,(u_{ik} - u_{jk}) \;-\; \sum_{j:(i,j)\notin E} 2\,(u_{ik} - u_{jk})\, e^{-\lVert u_i - u_j \rVert^2}
where u_ik = U[i, k], u_jk = U[j, k], and u_i = U[i, :]. Also, (i,j) \in E corresponds to a == 1.0 in the code.
Leveraging broadcasting and np.einsum for the sum-reductions -
# Get pair-wise differences between rows for all rows in a vectorized manner
Ud = U[:,None,:]-U
# Compute L2 norms of those differences
L = LA.norm(Ud,axis=2)
# Compute 2 * diff values for all rows, mask with the A==1 condition
# and sum along axis=1 to simulate the accumulating behaviour
p1 = np.einsum('ijk,ij->ik',2*Ud,A==1.0)
# Similarly, compute for the A==0 condition and finally sum the two parts
p2 = np.einsum('ijk,ij,ij->ik',-2*Ud,np.exp(-L**2),A==0.0)
out = p1+p2
Alternatively, use einsum to compute the squared-norm values and use those to get p2 -
Lsq = np.einsum('ijk,ijk->ij',Ud,Ud)
p2 = np.einsum('ijk,ij,ij->ik',-2*Ud,np.exp(-Lsq),A==0.0)
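A quick sanity check of the vectorized version against the sample input from Edit 1 (a small check added here, assuming numpy as np and numpy.linalg as LA as in the question):
A = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype='float64')
U = np.array([[2, 3], [4, 5], [6, 7]], dtype='float64')

Ud = U[:, None, :] - U
L = LA.norm(Ud, axis=2)
p1 = np.einsum('ijk,ij->ik', 2*Ud, A == 1.0)
p2 = np.einsum('ijk,ij,ij->ik', -2*Ud, np.exp(-L**2), A == 0.0)
out = p1 + p2

# matches the expected new_U up to the tiny exp(-||u_i - u_j||**2) terms
print(np.allclose(out, [[-4, -4], [0, 0], [4, 4]], atol=1e-6))  # True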

Theano scan: how do I have a tuple output as input into next step?

I want to do this sort of loop in Theano:
def add_multiply(a, b, k):
    return a+b+k, a*b*k, k

x = 1
y = 2
k = 1
tuples = []
for i in range(5):
    x, y, k = add_multiply(x, y, k)
    tuples.append((x, y, k))
However, when I do
x0 = T.dvector('x0')
i = T.iscalar('i')
results,updates=th.scan(fn=add_multiply,outputs_info=[{'initial':x0,'taps':[-1]}],n_steps=i)
I get TypeError: add_multiply() takes exactly 3 arguments (1 given). If I change it so that the function takes a single tuple instead, I get ValueError: length not known
In particular, I eventually want to differentiate the entire result with respect to k.
The first error is because your add_multiply function takes 3 arguments but, by having only one element in the outputs_info list, you're only providing a single argument. It's not clear if you intended the x0 vector to be the initial value for just a or were expecting it to be spread over a, b, and k. The latter isn't supported by Theano and, in general, tuples are not supported by Theano. In Theano, everything needs to be a tensor (e.g. scalars are just special types of tensors with zero dimensions).
You can achieve a replica of the Python implementation in Theano as follows.
import theano
import theano.tensor as tt
def add_multiply(a, b, k):
    return a + b + k, a * b * k

def python_main():
    x = 1
    y = 2
    k = 1
    tuples = []
    for i in range(5):
        x, y = add_multiply(x, y, k)
        tuples.append((x, y, k))
    return tuples

def theano_main():
    x = tt.constant(1, dtype='uint32')
    y = tt.constant(2, dtype='uint32')
    k = tt.scalar(dtype='uint32')
    outputs, _ = theano.scan(add_multiply, outputs_info=[x, y], non_sequences=[k], n_steps=5)
    g = theano.grad(tt.sum(outputs), k)
    f = theano.function(inputs=[k], outputs=outputs + [g])
    tuples = []
    xvs, yvs, _ = f(1)
    for xv, yv in zip(xvs, yvs):
        tuples.append((xv, yv, 1))
    return tuples
print('Python:', python_main())
print('Theano:', theano_main())
Note that in the Theano version, all the tuple handling happens outside Theano; Python has to convert from the three tensors returned by the Theano function into a list of tuples.
Update:
It's unclear what "the entire result" should refer to but the code has been updated to show how you might differentiate with respect to k. Note that in Theano the symbolic differentiation only works with scalar expressions, but can differentiate with respect to multi-dimensional tensors.
In this update the add_multiply method no longer returns k since that is constant. For similar reasons, the Theano version now accepts k as a non_sequence.
