No performance increase when looping of FFTs in Cython - python-3.x

I'm writing a script that tracks the shifts of a sample by estimating the displacement of an ensemble of particles. The first implementation, in Python, works alright, but it takes too long for a large amount of samples. To combat this, I tried rewriting the method in Cython, but as this was my first time ever using it, I can't seem to get any performance increases. I know 3D FFTs exist and are often faster than looped 2D FFTs, but for this instance, they take too much memory and or slower than for-loops.
Python function:
import numpy as np
from scipy.fft import fftshift
import pyfftw
def python_corr(frame_a, frame_b):
DTYPEf = 'float32'
DTYPEc = 'complex64'
k = frame_a.shape[0]
m = frame_a.shape[1] # size y of 2d sample
n = frame_a.shape[2] # size x of 2d sample
fs = [m,n] # sample shape
bs = [m,n//2+1] # rfft sample shape
corr = np.zeros([k,m,n], DTYPEf) # out
fft_forward = pyfftw.builders.rfft2(
pyfftw.empty_aligned(fs, dtype = DTYPEf),
axes = [-2,-1],
)
fft_backward = pyfftw.builders.irfft2(
pyfftw.empty_aligned(bs, dtype = DTYPEc),
axes = [-2,-1],
)
for ind in range(k): # looping over 2D samples
window_a = frame_a[ind,:,:]
window_b = frame_b[ind,:,:]
corr[ind,:,:] = fftshift( # cross correlation via FFT algorithm
np.real(fft_backward(
np.conj(fft_forward(window_a))*fft_forward(window_b)
)),
axes = [-2,-1]
)
return corr
Cython function:
import numpy as np
from scipy.fft import fftshift
import pyfftw
cimport numpy as np
np.import_array()
cimport cython
DTYPEf = np.float32
ctypedef np.float32_t DTYPEf_t
DTYPEc = np.complex64
ctypedef np.complex64_t DTYPEc_t
#cython.boundscheck(False)
#cython.nonecheck(False)
def cython_corr(
np.ndarray[DTYPEf_t, ndim = 3] frame_a,
np.ndarray[DTYPEf_t, ndim = 3] frame_b,
):
cdef int ind, k, m, n
k = frame_a.shape[0]
m = frame_a.shape[1] # size y of sample
n = frame_a.shape[2] # size x of sample
cdef DTYPEf_t[:,:] window_a = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # sample a
window_a[:,:] = 0.
cdef DTYPEf_t[:,:] window_b = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # sample b
window_b[:,:] = 0.
cdef DTYPEf_t[:,:] corr = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # cross-corr matrix
corr[:,:] = 0.
cdef DTYPEf_t[:,:,:] out = pyfftw.empty_aligned([k,m,n], dtype = DTYPEf) # out
out[:,:] = 0.
cdef object fft_forward
cdef object fft_backward
cdef DTYPEc_t[:,:] f2a = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # rfft out of sample a
f2a[:,:] = 0. + 0.j
cdef DTYPEc_t[:,:] f2b = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # rfft out of sample b
f2b[:,:] = 0. + 0.j
cdef DTYPEc_t[:,:] r = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # power spectrum of sample a and b
r[:,:] = 0. + 0.j
fft_forward = pyfftw.builders.rfft2(
pyfftw.empty_aligned([m,n], dtype = DTYPEf),
axes = [0,1],
)
fft_backward = pyfftw.builders.irfft2(
pyfftw.empty_aligned([m,n//2+1], dtype = DTYPEc),
axes = [0,1],
)
for ind in range(k):
window_a = frame_a[ind,:,:]
window_b = frame_b[ind,:,:]
r = np.conj(fft_forward(window_a))*fft_forward(window_b) # power spectrum of sample a and b
corr = fft_backward(r).real # cross correlation
corr = fftshift(corr, axes = [0,1]) # shift Q1 --> Q3, Q2 --> Q4
# the fftshift could be moved out of the loop, but lets use that as a last resort :)
out[ind,:,:] = corr
return out
Test for methods:
import time
aa = bb = np.empty([14000, 24,24]).astype('float32') # a small test with 14000 24x24px samples
print(f'Number of samples: {aa.shape[0]}')
start = time.time()
corr = python_corr(aa, bb)
print(f'Time for Python: {time.time() - start}')
del corr
start = time.time()
corr = cython_corr(aa, bb)
print(f'Time for Cython: {time.time() - start}')
del corr

Related

Translating a mixed-integer programming formulation to Scipy

I would like to solve the above formulation in Scipy and solve it using milp(). For a given graph (V, E), f_ij and x_ij are the decision variables. f_ij is the flow from i to j (it can be continuous). x_ij is the number of vehicles from i to j. p is the price. X is the available number vehicles in a region. c is the capacity.
I have difficulty in translating the formulation to Scipy milp code. I would appreciate it if anyone could give me some pointers.
What I have done:
The code for equation (1):
f_obj = [p[i] for i in Edge]
x_obj = [0]*len(Edge)
obj = f_obj + v_obj
Integrality:
f_cont = [0 for i in Edge] # continous
x_int = [1]*len(Edge) # integer
integrality = f_cont + x_int
Equation (2):
def constraints(self):
b = []
A = []
const = [0]*len(Edge) # for f_ij
for i in v: # for x_ij
for e in Edge:
if e[0] == i:
const.append(1)
else:
const.append(0)
A.append(const)
b.append(self.accInit[i])
const = [0]*len(Edge) # for f_ij
return A, b
Equation (4):
[(0, demand[e]) for e in Edge]
I'm going to do some wild guessing, given how much you've left open to interpretation. Let's assume that
this is a maximisation problem, since the minimisation problem is trivial
Expression (1) is actually the maximisation objective function, though you failed to write it as such
p and d are floating-point vectors
X is an integer vector
c is a floating-point scalar
the graph edges, since you haven't described them at all, do not matter for problem setup
The variable names are not well-chosen and hide what they actually contain. I demonstrate potential replacements.
import numpy as np
from numpy.random._generator import Generator
from scipy.optimize import milp, Bounds, LinearConstraint
import scipy.sparse
from numpy.random import default_rng
rand: Generator = default_rng(seed=0)
N = 20
price = rand.uniform(low=0, high=10, size=N) # p
demand = rand.uniform(low=0, high=10, size=N) # d
availability = rand.integers(low=0, high=10, size=N) # X aka. accInit
capacity = rand.uniform(low=0, high=10) # c
c = np.zeros(2*N) # f and x
c[:N] = -price # (1) f maximized with coefficients of 'p'
# x not optimized
CONTINUOUS = 0
INTEGER = 1
integrality = np.empty_like(c, dtype=int)
integrality[:N] = CONTINUOUS # f
integrality[N:] = INTEGER # x
upper = np.empty_like(c)
upper[:N] = demand # (4) f
upper[N:] = availability # (2) x
eye_N = scipy.sparse.eye(N)
A = scipy.sparse.hstack((-eye_N, capacity*eye_N)) # (3) 0 <= -f + cx
result = milp(
c=c, integrality=integrality,
bounds=Bounds(lb=np.zeros_like(c), ub=upper),
constraints=LinearConstraint(lb=np.zeros(N), A=A),
)
print(result.message)
flow = result.x[:N]
vehicles = result.x[N:].astype(int)

Discrete Laplacian on a non-regular mesh (python)

I have coded the laplacien function for a non-regular mesh (created with the scipy.spatial.Delaunay function).
I have not errors but the results are not correct : the eigenvectors are correct but the eigenvalues ​​are too high (in absolute value).
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import scipy.spatial
def rect_drum(L,H,U):
vals = []
val = 0
k = 1
l = 1
while val >= -U:
while val >= -U:
val = -np.pi**2*((k/L)**2+(l/H)**2)
if val >= -U:
vals.append(val)
l += 1
l = 1
k += 1
val = -np.pi**2*((k/L)**2+(l/H)**2)
return np.array(vals)
def count_vp(tab,U):
#count the n eigenvalues ​​greater than equal to -U in the array tab
return tab[tab>=-U]
def in_curve(f,fargs,shape,a):
points = [] # the points inside the curve
for j in range(shape[0]):
for i in range(shape[1]):
if f(i*a,j*a,*fargs) < 0:
points.append([i*a,j*a])
return np.array(points)
def triang(points,a,f,fargs,bord):
tri_points = points.copy()
tri_points[:,1] *= np.sqrt(3)
tri_points2 = np.vstack((points,bord))
tri_points2[:,1] *= np.sqrt(3)
tri_points2[:,0] += a/2
tri_points2[:,1] += np.sqrt(3)/2*a
fin = np.vstack((tri_points,tri_points2))
i = 0
eps = 0.01
while i < len(fin):
if f(fin[i,0]+eps,fin[i,1]+eps,*fargs) > 0:
fin = np.delete(fin,i,0)
i -= 1
i += 1
return np.vstack((fin,bord)),len(fin),len(bord)
def tri_ang(points,ind,p0):
# sort the points in trigonometric order
vec=np.arctan2((points-p0)[:,1],(points-p0)[:,0])
values = []
dtype = [('val',float),('n',int)]
for i in range(len(vec)):
values.append((vec[i],i))
values = np.sort(np.array(values,dtype),order='val')
new_points = []
new_ind = []
for tup in values:
new_points.append(points[tup[1]])
new_ind.append(ind[tup[1]])
return np.array(new_points),np.array(new_ind)
def M(points,tri,Nint):
indptr,ind = tri.vertex_neighbor_vertices
W = np.zeros((Nint,Nint)) # cotangents matrix
A = np.zeros((Nint,1)) # surfaces vertex array for each point i (A[i])
for i in range(Nint):
tot = 0
nhb_ind = ind[indptr[i]:indptr[i+1]] # indices of the points close to the point of index k
nhb = points[nhb_ind] # their coordinates
nhb,nhb_ind = tri_ang(nhb,nhb_ind,points[i]) #the coordinates (nhb) and (nhb_ind) of each neighbor of i
for j in range(len(nhb_ind)):
vec = nhb[j]-points[i] # a vector connecting the point to his neighbor of index 0
vec_av = nhb[j-1]-points[i] # another vector but with the Vosin from before
if j+1 >= len(nhb_ind):
k = 0
else:
k = j+1
vec_ap = nhb[k]-points[i] # another vector but with the next neighbor
# another vector but with the next neighbor
A[i] += 0.5/3*np.linalg.norm(np.cross(vec,vec_av))
if nhb_ind[j] < Nint:
# we use the vector and scalar product to calculate the cotangents: A.B/||AxB||
cotan_alpha = np.dot(vec_av,vec_av-vec)/np.linalg.norm(np.cross(vec_av,vec_av-vec))
cotan_beta = np.dot(vec_ap,vec_ap-vec)/np.linalg.norm(np.cross(vec_ap,vec_ap-vec))
# Wij value :
W[i,nhb_ind[j]] = -0.5*(cotan_alpha+cotan_beta)
tot += cotan_alpha+cotan_beta
W[i,i] = -0.5*tot # diagonal values
return (1/A)*W
def rect(x,y,L,H,x0=0,y0=0):
if 0<x-x0<L and 0<y-y0<H:
return -1
else:
return 1
def rect_rim(L,H,a,x0=0,y0=0):
tab1 = np.arange(x0,L+x0,a)[:,np.newaxis]
h = np.hstack((tab1,H*np.ones((len(tab1),1))+y0))
b = np.hstack((tab1,np.zeros((len(tab1),1))+y0))
tab2 = np.arange(y0+a,H+y0,a)[:,np.newaxis]
g = np.hstack((np.zeros((len(tab2),1))+x0,tab2))
d = np.hstack((L*np.ones((len(tab2),1))+x0,tab2))
hp = np.array([[L+x0,H+y0]])
bp = np.array([[L+x0,0]])
return np.vstack((h,b,g,d,hp,bp))
# sample with a square 1*1
L = 1
H = 1
dl = 0.05
sol = in_curve(rect,[L,H],(100,100),dl)
sol_tri,Nint,Nbord = triang(sol,dl,rect,[L,H],rect_rim(L,H,dl))
# plt.plot(sol_tri[:,0],sol_tri[:,1],linestyle="",marker="+",label="tri")
# plt.plot(sol[:,0],sol[:,1],linestyle="",marker="x")
# plt.legend()
# plt.show()
# triangulation
tri = scipy.spatial.Delaunay(sol_tri)
# plt.triplot(sol_tri[:,0],sol_tri[:,1],tri.simplices)
# plt.show()
M = M(sol_tri,tri,Nint)
valp,vecp = np.linalg.eig(M) # eigenvalues and eigenvectors
vecp = np.real(vecp)
# comparison with the exact solution:
T = 1000
U = np.arange(0,T,1)
NUsim = np.array([len(count_vp(valp,u)) for u in U])
NU = np.array([len(rect_drum(L,H,u)) for u in U])
plt.plot(U,NUsim,label='simulation')
plt.plot(U,NU,label='exacts')
plt.legend()
plt.show()
# 3D plot of an eigenvector
vecp_tot = np.vstack((vecp,np.zeros((Nbord,Nint))))
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_trisurf(sol_tri[:,0],sol_tri[:,1],vecp_tot[:,0],triangles=tri.simplices)
plt.show()
The laplacian is the function named "M".
The "in_curve function" return the points inside a curve defined by f(x,y,*fargs) < 0 (a square in the sample).
The "triang" function return points with added points (triangle meshs). The fonction uses an another function for the rim of the curve (for most precision), in the sample it is the "rect_rim" function.
I used the formula given at https://en.wikipedia.org/wiki/Discrete_Laplace_operator ("mesh laplacians").
I have solve my problem : it's a sign and a rim problems.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import scipy.spatial
def rect_drum(L,H,U):
vals = []
val = 0
k = 1
l = 1
while val >= -U:
while val >= -U:
val = -np.pi**2*((k/L)**2+(l/H)**2)
if val >= -U:
vals.append(val)
l += 1
l = 1
k += 1
val = -np.pi**2*((k/L)**2+(l/H)**2)
return np.array(vals)
def count_vp(tab,U):
#count the n eigenvalues ​​greater than equal to -U in the array tab
return tab[tab>=-U]
def in_curve(f,fargs,shape,a):
points = [] # the points inside the curve
for j in range(shape[0]):
for i in range(shape[1]):
if f(i*a,j*a,*fargs) < 0:
points.append([i*a,j*a])
return np.array(points)
def triang(points,a,f,fargs,bord):
tri_points = points.copy()
tri_points[:,1] *= np.sqrt(3)
tri_points2 = np.vstack((points,bord))
tri_points2[:,1] *= np.sqrt(3)
tri_points2[:,0] += a/2
tri_points2[:,1] += np.sqrt(3)/2*a
fin = np.vstack((tri_points,tri_points2))
i = 0
eps = 0.01
while i < len(fin):
if f(fin[i,0]+eps,fin[i,1]+eps,*fargs) > 0:
fin = np.delete(fin,i,0)
i -= 1
i += 1
return np.vstack((fin,bord)),len(fin),len(bord)
def tri_ang(points,ind,p0):
# sort the points in trigonometric order
vec=np.arctan2((points-p0)[:,1],(points-p0)[:,0])
values = []
dtype = [('val',float),('n',int)]
for i in range(len(vec)):
values.append((vec[i],i))
values = np.sort(np.array(values,dtype),order='val')
new_points = []
new_ind = []
for tup in values:
new_points.append(points[tup[1]])
new_ind.append(ind[tup[1]])
return np.array(new_points),np.array(new_ind)
def Laplacian(points,tri,Nint):
indptr,ind = tri.vertex_neighbor_vertices
W = np.zeros((Nint,Nint)) # cotangents matrix
A = np.zeros((Nint,1)) # surfacesvertex aray of point i (A[i])
for i in range(Nint):
tot = 0
nhb_ind = ind[indptr[i]:indptr[i+1]] # indices of the points close to the point of index k
nhb = points[nhb_ind] # their coordinates
nhb,nhb_ind = tri_ang(nhb,nhb_ind,points[i]) #the coordinates (nhb) and (nhb_ind) of each neighbor of i
for j in range(len(nhb_ind)):
vec = nhb[j]-points[i] # a vector connecting the point to his neighbor of index 0
vec_av = nhb[j-1]-points[i] # another vector but with the Vosin from before
if j+1 >= len(nhb_ind):
k = 0
else:
k = j+1
vec_ap = nhb[k]-points[i] # another vector but with the next neighbor
# we use the cross product to calculate the areas of the triangles: ||AxB||/2:
A[i] += 0.5/3*np.linalg.norm(np.cross(vec,vec_av))
# we use the cross product and scalar product to calculate the cotangents: A.B/||AxB||
cotan_alpha = np.dot(vec_av,vec_av-vec)/np.linalg.norm(np.cross(vec_av,vec_av-vec))
cotan_beta = np.dot(vec_ap,vec_ap-vec)/np.linalg.norm(np.cross(vec_ap,vec_ap-vec))
tot += cotan_alpha+cotan_beta
if nhb_ind[j] < Nint:
W[i,nhb_ind[j]] = 0.5*(cotan_alpha+cotan_beta)
W[i,i] = -0.5*tot # diagonal values
return (1/A)*W
def rect(x,y,L,H,x0=0,y0=0):
if 0<x-x0<L and 0<y-y0<H:
return -1
else:
return 1
def rect_rim(L,H,a,x0=0,y0=0):
tab1 = np.arange(x0,L+x0,a)[:,np.newaxis]
h = np.hstack((tab1,H*np.ones((len(tab1),1))+y0))
b = np.hstack((tab1,np.zeros((len(tab1),1))+y0))
tab2 = np.arange(y0+a,H+y0,a)[:,np.newaxis]
g = np.hstack((np.zeros((len(tab2),1))+x0,tab2))
d = np.hstack((L*np.ones((len(tab2),1))+x0,tab2))
hp = np.array([[L+x0,H+y0]])
bp = np.array([[L+x0,0]])
return np.vstack((h,b,g,d,hp,bp))
# sample with a square 1*1
L = 1
H = 1
dl = 0.04
sol = in_curve(rect,[L,H],(100,100),dl)
sol_tri,Nint,Nbord = triang(sol,dl,rect,[L,H],rect_rim(L,H,dl))
# triangulation
tri = scipy.spatial.Delaunay(sol_tri)
M = Laplacian(sol_tri,tri,Nint)
valp,vecp = np.linalg.eig(M) # eigenvalues and eigenvectors
vecp = np.real(vecp)
# comparison with the exact solution:
T = 1000
U = np.arange(0,T,1)
NUsim = np.array([len(count_vp(valp,u)) for u in U])
NU = np.array([len(rect_drum(L,H,u)) for u in U])
plt.plot(U,NUsim,label='simulation')
plt.plot(U,NU,label='exacts')
plt.legend()
plt.show()
# 3D plot of an eigenvector
mode = 0 # change this for an another mode
vecp_tot = np.vstack((vecp,np.zeros((Nbord,Nint))))
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_trisurf(sol_tri[:,0],sol_tri[:,1],vecp_tot[:,mode],triangles=tri.simplices)
plt.show()
Notes :
1- The hight eigenvalues are false : it's an effect of discretisation.
2- If dl is too small, we have false eigenvectors and eigenvalues (at the top of valp and firsts vectors of vecp), it's probably due to the quality of the meshing.

Weights are not getting updated while training logistic regression by using iris dataset

Python code:
I have used the Python code as below. Here, machine is trained by using Logistic Regression algorithm and wine dataset. Here, problem is that weights are not getting updated. I don't understand where is the problem.
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
dataset = datasets.load_wine()
x = dataset.data
y = dataset.target
y = y.reshape(178,1)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15,shuffle=True)
print(x_train.shape)
class log_reg():
def __init__(self):
pass
def sigmoid(self,x):
return 1 / (1 + np.exp(-x))
def train(self,x,y,w1,w2,alpha,iterations):
cost_history = [0] * iterations
Y_train = np.zeros([y.shape[0],3])
for i in range(Y_train.shape[0]):
for j in range(Y_train.shape[1]):
if(y[i] == j):
Y_train[i,j] = 1
for iteration in range(iterations):
z1 = x.dot(w1)
a1 = self.sigmoid(z1)
z2 = a1.dot(w2)
a2 = self.sigmoid(z2)
sig_sum = np.sum(np.exp(a2),axis=1)
sig_sum = sig_sum.reshape(a2.shape[0],1)
op = np.exp(a2) / sig_sum
loss = (Y_train * np.log(op))
dl = (op-Y_train)
dz1 = ((dl*(self.sigmoid(z2))*(1-self.sigmoid(z2))).dot(w2.T))*(self.sigmoid(z1))*(1-self.sigmoid(z1))
dz2 = (dl * (self.sigmoid(z2))*(1-self.sigmoid(z2)))
dw1 = x.T.dot(dz1)
dw2 = a1.T.dot(dz2)
w1 += alpha * dw1
w2 += alpha * dw2
cost_history[iteration] = (np.sum(loss)/len(loss))
return w1,w2,cost_history
def predict(self,x,y,w1,w2):
z1 = x.dot(w1)
a1 = self.sigmoid(z1)
z2 = a1.dot(w2)
a2 = self.sigmoid(z2)
sig_sum = np.sum(np.exp(a2),axis=1)
sig_sum = sig_sum.reshape(a2.shape[0],1)
op = np.exp(a2) / sig_sum
y_preds = np.argmax(op,axis=1)
acc = self.accuracy(y_preds,y)
return y_preds,acc
def accuracy(self,y_preds,y):
y_preds = y_preds.reshape(len(y_preds),1)
correct = (y_preds == y)
accuracy = (np.sum(correct) / len(y)) * 100
return (accuracy)
if __name__ == "__main__":
network = log_reg()
w1 = np.random.randn(14,4) * 0.01
w2 = np.random.randn(4,3) * 0.01
X_train = np.ones([x_train.shape[0],x_train.shape[1]+1])
X_train[:,:-1] = x_train
X_test = np.ones([x_test.shape[0],x_test.shape[1]+1])
X_test[:,:-1] = x_test
new_w1,new_w2,cost = network.train(X_train,y_train,w1,w2,0.0045,10000)
y_preds,accuracy = network.predict(X_test,y_test,new_w1,new_w2)
print(y_preds,accuracy)
In the above code, parameters are mentioned as below
x--training set,
y--target(output),
w1--weights for first layer,
w2--weights for second layer,
I used logistic regression with 2 hidden layers.
I am trying to train dataset wine from sklearn.I don't know where the problem is, but weights are not updating. Any help would be appreciated.
Your weights are updating , but I think you cant see them changing because you are printing them after execution. Python has a object reference method for numpy arrays so when you passed w1 , its values change values too so new_w1 and w1 become the same .
Take this example
import numpy as np
x=np.array([1,2,3,4])
def change(x):
x+=3
return x
print(x)
change(x)
print(x)
if you see the output it comes out as
[1 2 3 4]
[4 5 6 7]
I recommend that you add a bias and fix your accuracy function as I get my accuracy as 1000.
My execution when i run the code
the w1 and w2 values are indeed changing .
the only thing i changed was the main code and enabled the original data set , please do the same and tell if your weights are still not updating
if __name__ == "__main__":
network = log_reg()
w1 = np.random.randn(13,4) * 0.01
w2 = np.random.randn(4,3) * 0.01
print(w1)
print(" ")
print(w2)
print(" ")
new_w1,new_w2,cost = network.train(x_train,y_train,w1,w2,0.0045,10000)
print(w1)
print(" ")
print(w2)
print(" ")
y_preds,accuracy = network.predict(x_test,y_test,new_w1,new_w2)
print(y_preds,accuracy)

why do I get Partition index must be integer in KNeighborsclassifier?

I am trying to use sciklearn to find the goodness of a KNeighborsClassifier on my data.
My code is below (X is a matrix with NUM_MATCHES rows and NUM_FEATURES columns, Y is a column vector with NUM_MATCHES rows). I keep getting the error
TypeError: Partition index must be integer
on this line of the code below
rad_prob = estimator.predict_proba(np.reshape(radiant_query,(1,-1)))[0][1]
I am new to sciklearn not sure what the issue is.
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
import numpy as np
K=2
FOLDS_FINISHED=0
NUM_HEROES = 78
NUM_FEATURES = NUM_HEROES*2
def score(estimator, X, y):
global FOLDS_FINISHED
correct_predictions = 0
for i, radiant_query in enumerate(X):
dire_query = np.concatenate((radiant_query[NUM_HEROES:NUM_FEATURES], radiant_query[0:NUM_HEROES]))
rad_prob = estimator.predict_proba(np.reshape(radiant_query,(1,-1)))[0][1]
dire_prob = estimator.predict_proba(np.reshape(dire_query,(1,-1)))[0][0]
overall_prob = (rad_prob + dire_prob) / 2
prediction = 1 if (overall_prob > 0.5) else -1
result = 1 if prediction == y[i] else 0
correct_predictions += result
FOLDS_FINISHED += 1
accuracy = float(correct_predictions) / len(X)
print ('Accuracy: %f' % accuracy)
return accuracy
preprocessed = np.load('train_9000.npz')
X = preprocessed['X']
Y = preprocessed['Y']
NUM_MATCHES = 3000
X = X[0:NUM_MATCHES]
Y = Y[0:NUM_MATCHES]
k_fold = cross_validation.KFold(n=NUM_MATCHES, n_folds=K, shuffle=True)
d_tries = [3, 4, 5]
d_accuracy_pairs = []
for d_index, d in enumerate(d_tries):
model = KNeighborsClassifier(n_neighbors=NUM_MATCHES/K,metric=my_distance,weights=poly_param(d))
model_accuracies = cross_validation.cross_val_score(model, X, Y, scoring=score, cv=k_fold)
model_accuracy = model_accuracies.mean()
d_accuracy_pairs.append((d, model_accuracy))

Incremental PCA

I've never used incremental PCA which exists in sklearn and I'm a bit confused about it's parameters and not able to find a good explanation of them.
I see that there is batch_size in the constructor, but also, when using partial_fit method you can again pass only a part of your data, I've found the following way:
n = df.shape[0]
chunk_size = 100000
iterations = n//chunk_size
ipca = IncrementalPCA(n_components=40, batch_size=1000)
for i in range(0, iterations):
ipca.partial_fit(df[i*chunk_size : (i+1)*chunk_size].values)
ipca.partial_fit(df[iterations*chunk_size : n].values)
Now, what I don't understand is the following - when using partial fit, does the batch_size play any role at all, or not? And how are they related?
Moreover, if both are considered, how should I change their values properly, when wanting to increase the precision while increasing memory footprint (and the other way around, decrease the memory consumption for the price of decreased accuracy)?
The docs say:
batch_size : int or None, (default=None)
The number of samples to use for each batch. Only used when calling fit...
This param is not used within partial_fit, where the batch-size is controlled by the user.
Bigger batches will increase memory-consumption, smaller ones will decrease it.
This is also written in the docs:
This algorithm has constant memory complexity, on the order of batch_size, enabling use of np.memmap files without loading the entire file into memory.
Despite some checks and parameter-heuristics, the whole fit-function looks like this:
for batch in gen_batches(n_samples, self.batch_size_):
self.partial_fit(X[batch], check_input=False)
Here is some an incremental PCA code based on https://github.com/kevinhughes27/pyIPCA which is an implementation of CCIPCA method.
import scipy.sparse as sp
import numpy as np
from scipy import linalg as la
import scipy.sparse as sps
from sklearn import datasets
class CCIPCA:
def __init__(self, n_components, n_features, amnesic=2.0, copy=True):
self.n_components = n_components
self.n_features = n_features
self.copy = copy
self.amnesic = amnesic
self.iteration = 0
self.mean_ = None
self.components_ = None
self.mean_ = np.zeros([self.n_features], np.float)
self.components_ = np.ones((self.n_components,self.n_features)) / \
(self.n_features*self.n_components)
def partial_fit(self, u):
n = float(self.iteration)
V = self.components_
# amnesic learning params
if n <= int(self.amnesic):
w1 = float(n+2-1)/float(n+2)
w2 = float(1)/float(n+2)
else:
w1 = float(n+2-self.amnesic)/float(n+2)
w2 = float(1+self.amnesic)/float(n+2)
# update mean
self.mean_ = w1*self.mean_ + w2*u
# mean center u
u = u - self.mean_
# update components
for j in range(0,self.n_components):
if j > n: pass
elif j == n: V[j,:] = u
else:
# update the components
V[j,:] = w1*V[j,:] + w2*np.dot(u,V[j,:])*u / la.norm(V[j,:])
normedV = V[j,:] / la.norm(V[j,:])
normedV = normedV.reshape((self.n_features, 1))
u = u - np.dot(np.dot(u,normedV),normedV.T)
self.iteration += 1
self.components_ = V / la.norm(V)
return
def post_process(self):
self.explained_variance_ratio_ = np.sqrt(np.sum(self.components_**2,axis=1))
idx = np.argsort(-self.explained_variance_ratio_)
self.explained_variance_ratio_ = self.explained_variance_ratio_[idx]
self.components_ = self.components_[idx,:]
self.explained_variance_ratio_ = (self.explained_variance_ratio_ / \
self.explained_variance_ratio_.sum())
for r in range(0,self.components_.shape[0]):
d = np.sqrt(np.dot(self.components_[r,:],self.components_[r,:]))
self.components_[r,:] /= d
You can test it with
import pandas as pd, ccipca
df = pd.read_csv('iris.csv')
df = np.array(df)[:,:4].astype(float)
pca = ccipca.CCIPCA(n_components=2,n_features=4)
S = 10
print df[0, :]
for i in range(150): pca.partial_fit(df[i, :])
pca.post_process()
The resulting eigenvectors / values will not exaactly be the same as the batch PCA. Results are approximate, but they are useful.

Resources