Use ray for parallelization of nearest neighbor search - python-3.x

Suppose I have a big sparse matrix. I want to take each row vector of that matrix and compute the cosine distance to its nearest neighbor among the previous window_size rows of the matrix.
Since sklearn.neighbors does not improve run time when using parallelization (see this issue on github ), I tried to parallelize the process using ray. My code does better than sklearn with multiprocessing, but it's still slower than just serial distance computation.
My code is below. Is there something I have done wrong and should improve?
import scipy.sparse
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import NearestNeighbors
import numpy as np
import timeit
import ray
import math
n = 4000
m = 100
big_matrix = scipy.sparse.random(n, m).tocsr()
window_size = 1000
retry = 1
n_jobs = 4
ray.init(num_cpus=n_jobs)
def simple_cosine_distance():
distances = np.zeros(n)
for i in range(1, n):
past = big_matrix[max(0, i - window_size):i]
query = big_matrix[i]
distances[i] = cosine_distances(query, past).min(axis=1)
def sklearn_nneighbor():
distances = np.zeros(n)
for i in range(1, n):
past = big_matrix[max(0, i - window_size):i]
nn = NearestNeighbors(metric="cosine", n_neighbors=1, n_jobs=1)
nn.fit(X=past)
query = big_matrix[i]
distance, _ = nn.kneighbors(query)
distances[i] = distance[0]
def sklearn_nneighbor_parallel():
distances = np.zeros(n)
for i in range(1, n):
past = big_matrix[max(0, i - window_size):i]
nn = NearestNeighbors(metric="cosine", n_neighbors=1, n_jobs=n_jobs)
nn.fit(X=past)
query = big_matrix[i]
distance, _ = nn.kneighbors(query)
distances[i] = distance[0]
#ray.remote
def get_chunk_min(data, indices, indptr, shape, slice_i, slice_j, query):
matrix = scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)
past = matrix[slice_i:slice_j]
query = matrix[query]
chunk_min = cosine_distances(query, past).min(axis=1)
return chunk_min
def ray_parallel():
distances = np.zeros(n)
data = ray.put(big_matrix.data)
indices = ray.put(big_matrix.indices)
indptr = ray.put(big_matrix.indptr)
shape = ray.put(big_matrix.shape)
for i in range(1, n):
chunk_size = math.ceil((i - max(0, i - window_size)) / n_jobs)
chunk_mins = ray.get([
get_chunk_min.remote(
data,
indices,
indptr,
shape,
enum,
enum + chunk_size,
i
) for enum in range(max(0, i - window_size), i, chunk_size)])
distances[i] = min(chunk_mins)
for method in ["simple_cosine_distance", "sklearn_nneighbor", "sklearn_nneighbor_parallel", "ray_parallel"]:
print(method)
print(timeit.timeit(method + "()", setup="from __main__ import " + method, number=retry))
print("*"*50)
Output:
simple_cosine_distance
3.978868665999471
sklearn_nneighbor
4.265772191996803
sklearn_nneighbor_parallel
28.664759318002325
ray_parallel
17.89882287799992

https://docs.python.org/3/library/concurrent.futures.html
This documentation is a good resource to understand some mistakes you can do while are parallelizing workload using Ray.
Also note that, if your remote function is simple enough, the overhead of distributed computing (e.g. scheduling time, serialization time, etc.) can be bigger than function itself's computing time.

Related

Speeding up this PySCIPOpt routine for a Mixed Integer Program

I'm writing code to find median orders of tournaments, given a tournament T with n vertices, a median order is an ordering of the vertices of T such that it maximices the number of edges pointing in the 'increasing' direction with respect to the ordering.
In particular if the vertex set of T is {0,...,n-1}, the following integer problem (maximizing over the set of permutations) yields an optimal answer to the problem where Q is also a boolean n by n matrix.
I've implemented a linealization of this problem, noting that Q is a permutation, the following python code works, but my computer can't handle graphs that are as small as 10 vertices, where i would expect to have fast answers, is there any relatively easy way to speed up this computation?.
import numpy as np
from pyscipopt import Model,quicksum
from networkx.algorithms.tournament import random_tournament as rt
import math
# Some utilities to define the adjacency matrix of an oriented graph
def adjacency_matrix(t,order):
n = len(order)
adj_t = np.zeros((n,n))
for e in t.edges:
adj_t[order.index(e[0]),order.index(e[1])] = 1
return adj_t
# Random tournaments to instanciate the problem
def random_tournament(n):
r_t = rt(n)
adj_t = adjacency_matrix(r_t,list(range(n)))
return r_t, adj_t
###############################################################
############# PySCIPOpt optimization Routine ##################
###############################################################
n = 5 # some arbitrary size parameter
t,adj_t = random_tournament(n)
model = Model()
p,w,r = {},{},{}
# Defining model variables and weights
for k in range(n):
for l in range(n):
p[k,l] = model.addVar(vtype='B')
for i in range(n):
for j in range(i,n):
r[i,k,l,j] = model.addVar(vtype='C')
w[i,k,l,j] = adj_t[k][l]
for i in range(n):
# Forcing p to be a permutation
model.addCons(quicksum(p[s,i] for s in range(n))==1)
model.addCons(quicksum(p[i,s] for s in range(n))==1)
for k in range(n):
for j in range(i,n):
for l in range(n):
# Setting r[i,k,l,j] = min(p[i,k],p[l,j])
model.addCons(r[i,k,l,j] <= p[k,i])
model.addCons(r[i,k,l,j] <= p[l,j])
# Set the objective function
model.setObjective(quicksum(r[i,k,l,j]*w[i,k,l,j] for i in range(n) for j in range(i,n) for k in range(n) for l in range(n)), "maximize")
model.data = p,r
model.optimize()
sol = model.getBestSol()
# Print the solution on a readable format
Q = np.array([math.floor(model.getVal(model.data[0][key])) for key in model.data[0].keys()]).reshape([n,n])
print(f'\nOptimization ended with status {model.getStatus()} in {"{:.2f}".format(end_optimization-end_setup)}s, with {model.getObjVal()} increasing edges and optimal solution:')
print('\n',Q)
order = [int(x) for x in list(np.matmul(Q.T,np.array(range(n))))]
new_adj_t = adjacency_matrix(t,order)
print(f'\nwhich induces the ordering:\n\n {order}')
print(f'\nand induces the following adjacency matrix: \n\n {new_adj_t}')
Right now I've run it for n=5 taking between 5 and 20 seconds, and have ran it succesfully for small integers 6,7 with not much change in time needed.
For n=10, on the other hand, it has been running for around an hour with no solution yet, I suppose the linearization having O(n**4) variables hurts, but I don't understand why it blows up so fast. Is this normal? How would a better implementation be in case there is one?.

No performance increase when looping of FFTs in Cython

I'm writing a script that tracks the shifts of a sample by estimating the displacement of an ensemble of particles. The first implementation, in Python, works alright, but it takes too long for a large amount of samples. To combat this, I tried rewriting the method in Cython, but as this was my first time ever using it, I can't seem to get any performance increases. I know 3D FFTs exist and are often faster than looped 2D FFTs, but for this instance, they take too much memory and or slower than for-loops.
Python function:
import numpy as np
from scipy.fft import fftshift
import pyfftw
def python_corr(frame_a, frame_b):
DTYPEf = 'float32'
DTYPEc = 'complex64'
k = frame_a.shape[0]
m = frame_a.shape[1] # size y of 2d sample
n = frame_a.shape[2] # size x of 2d sample
fs = [m,n] # sample shape
bs = [m,n//2+1] # rfft sample shape
corr = np.zeros([k,m,n], DTYPEf) # out
fft_forward = pyfftw.builders.rfft2(
pyfftw.empty_aligned(fs, dtype = DTYPEf),
axes = [-2,-1],
)
fft_backward = pyfftw.builders.irfft2(
pyfftw.empty_aligned(bs, dtype = DTYPEc),
axes = [-2,-1],
)
for ind in range(k): # looping over 2D samples
window_a = frame_a[ind,:,:]
window_b = frame_b[ind,:,:]
corr[ind,:,:] = fftshift( # cross correlation via FFT algorithm
np.real(fft_backward(
np.conj(fft_forward(window_a))*fft_forward(window_b)
)),
axes = [-2,-1]
)
return corr
Cython function:
import numpy as np
from scipy.fft import fftshift
import pyfftw
cimport numpy as np
np.import_array()
cimport cython
DTYPEf = np.float32
ctypedef np.float32_t DTYPEf_t
DTYPEc = np.complex64
ctypedef np.complex64_t DTYPEc_t
#cython.boundscheck(False)
#cython.nonecheck(False)
def cython_corr(
np.ndarray[DTYPEf_t, ndim = 3] frame_a,
np.ndarray[DTYPEf_t, ndim = 3] frame_b,
):
cdef int ind, k, m, n
k = frame_a.shape[0]
m = frame_a.shape[1] # size y of sample
n = frame_a.shape[2] # size x of sample
cdef DTYPEf_t[:,:] window_a = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # sample a
window_a[:,:] = 0.
cdef DTYPEf_t[:,:] window_b = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # sample b
window_b[:,:] = 0.
cdef DTYPEf_t[:,:] corr = pyfftw.empty_aligned([m,n], dtype = DTYPEf) # cross-corr matrix
corr[:,:] = 0.
cdef DTYPEf_t[:,:,:] out = pyfftw.empty_aligned([k,m,n], dtype = DTYPEf) # out
out[:,:] = 0.
cdef object fft_forward
cdef object fft_backward
cdef DTYPEc_t[:,:] f2a = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # rfft out of sample a
f2a[:,:] = 0. + 0.j
cdef DTYPEc_t[:,:] f2b = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # rfft out of sample b
f2b[:,:] = 0. + 0.j
cdef DTYPEc_t[:,:] r = pyfftw.empty_aligned([m, n//2+1], dtype = DTYPEc) # power spectrum of sample a and b
r[:,:] = 0. + 0.j
fft_forward = pyfftw.builders.rfft2(
pyfftw.empty_aligned([m,n], dtype = DTYPEf),
axes = [0,1],
)
fft_backward = pyfftw.builders.irfft2(
pyfftw.empty_aligned([m,n//2+1], dtype = DTYPEc),
axes = [0,1],
)
for ind in range(k):
window_a = frame_a[ind,:,:]
window_b = frame_b[ind,:,:]
r = np.conj(fft_forward(window_a))*fft_forward(window_b) # power spectrum of sample a and b
corr = fft_backward(r).real # cross correlation
corr = fftshift(corr, axes = [0,1]) # shift Q1 --> Q3, Q2 --> Q4
# the fftshift could be moved out of the loop, but lets use that as a last resort :)
out[ind,:,:] = corr
return out
Test for methods:
import time
aa = bb = np.empty([14000, 24,24]).astype('float32') # a small test with 14000 24x24px samples
print(f'Number of samples: {aa.shape[0]}')
start = time.time()
corr = python_corr(aa, bb)
print(f'Time for Python: {time.time() - start}')
del corr
start = time.time()
corr = cython_corr(aa, bb)
print(f'Time for Cython: {time.time() - start}')
del corr

multprocessing for a stochastic process with multiple arguments

I want to solve a stochastic differential equation using multiprocessing. A simplified not-parallel code is like:
import numpy as np
x = np.zeros((2, 3, 4)) #matrix
z = np.random.normal(0, 1, (2,3,4)) #noise
z_array = z
for i in range(2):
for j in range(3):
x[i,j,0] = i
for k in range(3):
x[i,j,k+1] = x[i,j,k]*z_array[i,j,k]
The outcomes are the noisez_array and the corresponding matrix x. I want to use multiprocessing for the second loop. The problem is that I don't know how to incorporate the noise z in the parallel code. A naive implementation is like
import os
import numpy as np
import functools as ft
from multiprocess import Pool
def fun(i, k):
x = np.zeros(4)
x[0] = i
for k in range(2):
z = np.random.normal(0, 1)
x[k+1] = x[k]*z
return x
if __name__=='__main__':
pool = Pool(os.cpu_count()-1)
x = np.zeros((2, 3, 4))
for i in range(2):
result = np.array(pool.map(ft.partial(fun, i), range(3)))
x[i] = result
pool.close()
pool.join()
Since the code involves random numbers, I am not sure whether parallel code is correct or not and I don't know how to get the noises z. Any ideas?
You can try pre-generating the noise z and passing it to the argument along with k as a tuple. That way you have the noise with you and you do not need to generate it in the function. You can also add the first loop with i in the original function in the tuple to run it in the multiprocessing code.
For the code below:
In the second code you wrote, you ran the k loop inside the fun as range(2), which I assume is a typo and I am keeping it till range(3) as in the original code
I have incorporated the first loop into the multiprocessing setup too
If memory is not an issue and the matrix is small, use the below option which is cleaner and the equivalency of your original code and multiprocessing code is easier to read. If memory is an issue, you can compute only smaller matrices inside the fun and then reshape the result rather than adding (let me know if you want that solution).
Main code:
import os
import numpy as np
from multiprocessing import Pool
def fun(t):
i, j, z = t
x = np.zeros((2, 3, 4))
x[i, j, 0] = i
for k in range(3):
x[i, j, k + 1] = x[i, j, k] * z[k]
return x
if __name__=='__main__':
z = np.random.normal(0, 1, (2,3,4))
pool = Pool(os.cpu_count() - 1)
map_args = ((i, j, z[i, j, :]) for i in range(2) for j in range (3))
result = np.array(pool.map(fun, map_args))
x = np.sum(result, 0)
pool.close()
pool.join()

Monte Carlo Optimization

I am trying to do Monte Carlo minimization to solve for parameters of a given equation. My equation has 4 parameters, making my iteration about 4**n
when I try iteration n = 100, I saw it is not a good idea to search all the parameter space.
Here is my code:
import sys
import numpy as np
#import matplotlib.pyplot as plt
#import pandas as pd
import random
#method returns sum square for given parameter m and c
def currentFunc(x,alpha1,alpha2,alpha3,alpha4):
term = -(x/alpha4)
term_Norm = term
expoterm = np.exp(term_Norm)
expoterm = np.exp(term_Norm)
#print('check term: x: %0.10f %0.10f exp: %0.10f' % (x,term_Norm,expoterm) )
return(-alpha1*( (alpha2/(alpha3+ expoterm )) - 1))
def sumsquarecurr(x,y,a1,a2,a3,a4):
xsize = len(x)
ysize = len(y)
sumsqdiff = 0
if(xsize != ysize):
print("check your X and Y length exiting ...")
sys.exit(0)
for i in range(ysize):
diff = y[i] - currentFunc(x[i],a1,a2,a3,a4)
sumsqdiff+=diff*diff
return sumsqdiff
# number of random number (this affects the accuracy of the Monte Carlo method
n = 10
a_rnad = []
b_rnad = []
c_rnad = []
d_rnad = []
for i in range(n):
#random.seed(555)
xtemp = random.uniform(0.0, 2.0)
print('check %.4f ' % (xtemp))
a_rnad.append(xtemp)
b_rnad.append(xtemp)
c_rnad.append(xtemp)
d_rnad.append(xtemp)
Yfit=[-7,-5,-3,-1,1,3,5,7]
Xfit=[8.077448e-07,6.221196e-07,4.231292e-07,1.710039e-07,-4.313762e-05,-8.248818e-05,-1.017410e-04,-1.087409e-04]
# placeholder for the parameters and the minimun sum squared
#[alpha1,alpha2,alpha3,alpha4,min]
minparam = [0,0,0,0,99999999999.0]
for j in range(len(a_rnad)):
for i in range(len(b_rnad)):
for k in range(len(c_rnad)):
for m in range(len(d_rnad)):
minsumsqdiff_temp =sumsquarecurr(Xfit,Yfit,a_rnad[j],b_rnad[i],c_rnad[k],d_rnad[m])
print('alpha1: %.4f alpha2: %.4f alpha3: %.4f alpha4: %.4f min: %0.4f' % (a_rnad[j],b_rnad[i],c_rnad[k],d_rnad[m],minsumsqdiff_temp))
if(minsumsqdiff_temp<minparam[4]):
minparam[0] = a_rnad[j]
minparam[1] = b_rnad[i]
minparam[2] = c_rnad[k]
minparam[3] = d_rnad[m]
minparam[4] = minsumsqdiff_temp
print('minimazation: alpha1: %.4f alpha2: %.4f alpha3: %.4f alpha4: %.4f min: %0.4f' % (minparam[0],minparam[1],minparam[2],minparam[3],minparam[4]))
Question:
is there a way to make this algorithm run faster (either by cutting the search/phase space down)?
I feel I am reinventing the wheel. please does anyone know a python module that can do what I am trying to do?
Thanks in advance for your help

Incremental PCA

I've never used incremental PCA which exists in sklearn and I'm a bit confused about it's parameters and not able to find a good explanation of them.
I see that there is batch_size in the constructor, but also, when using partial_fit method you can again pass only a part of your data, I've found the following way:
n = df.shape[0]
chunk_size = 100000
iterations = n//chunk_size
ipca = IncrementalPCA(n_components=40, batch_size=1000)
for i in range(0, iterations):
ipca.partial_fit(df[i*chunk_size : (i+1)*chunk_size].values)
ipca.partial_fit(df[iterations*chunk_size : n].values)
Now, what I don't understand is the following - when using partial fit, does the batch_size play any role at all, or not? And how are they related?
Moreover, if both are considered, how should I change their values properly, when wanting to increase the precision while increasing memory footprint (and the other way around, decrease the memory consumption for the price of decreased accuracy)?
The docs say:
batch_size : int or None, (default=None)
The number of samples to use for each batch. Only used when calling fit...
This param is not used within partial_fit, where the batch-size is controlled by the user.
Bigger batches will increase memory-consumption, smaller ones will decrease it.
This is also written in the docs:
This algorithm has constant memory complexity, on the order of batch_size, enabling use of np.memmap files without loading the entire file into memory.
Despite some checks and parameter-heuristics, the whole fit-function looks like this:
for batch in gen_batches(n_samples, self.batch_size_):
self.partial_fit(X[batch], check_input=False)
Here is some an incremental PCA code based on https://github.com/kevinhughes27/pyIPCA which is an implementation of CCIPCA method.
import scipy.sparse as sp
import numpy as np
from scipy import linalg as la
import scipy.sparse as sps
from sklearn import datasets
class CCIPCA:
def __init__(self, n_components, n_features, amnesic=2.0, copy=True):
self.n_components = n_components
self.n_features = n_features
self.copy = copy
self.amnesic = amnesic
self.iteration = 0
self.mean_ = None
self.components_ = None
self.mean_ = np.zeros([self.n_features], np.float)
self.components_ = np.ones((self.n_components,self.n_features)) / \
(self.n_features*self.n_components)
def partial_fit(self, u):
n = float(self.iteration)
V = self.components_
# amnesic learning params
if n <= int(self.amnesic):
w1 = float(n+2-1)/float(n+2)
w2 = float(1)/float(n+2)
else:
w1 = float(n+2-self.amnesic)/float(n+2)
w2 = float(1+self.amnesic)/float(n+2)
# update mean
self.mean_ = w1*self.mean_ + w2*u
# mean center u
u = u - self.mean_
# update components
for j in range(0,self.n_components):
if j > n: pass
elif j == n: V[j,:] = u
else:
# update the components
V[j,:] = w1*V[j,:] + w2*np.dot(u,V[j,:])*u / la.norm(V[j,:])
normedV = V[j,:] / la.norm(V[j,:])
normedV = normedV.reshape((self.n_features, 1))
u = u - np.dot(np.dot(u,normedV),normedV.T)
self.iteration += 1
self.components_ = V / la.norm(V)
return
def post_process(self):
self.explained_variance_ratio_ = np.sqrt(np.sum(self.components_**2,axis=1))
idx = np.argsort(-self.explained_variance_ratio_)
self.explained_variance_ratio_ = self.explained_variance_ratio_[idx]
self.components_ = self.components_[idx,:]
self.explained_variance_ratio_ = (self.explained_variance_ratio_ / \
self.explained_variance_ratio_.sum())
for r in range(0,self.components_.shape[0]):
d = np.sqrt(np.dot(self.components_[r,:],self.components_[r,:]))
self.components_[r,:] /= d
You can test it with
import pandas as pd, ccipca
df = pd.read_csv('iris.csv')
df = np.array(df)[:,:4].astype(float)
pca = ccipca.CCIPCA(n_components=2,n_features=4)
S = 10
print df[0, :]
for i in range(150): pca.partial_fit(df[i, :])
pca.post_process()
The resulting eigenvectors / values will not exaactly be the same as the batch PCA. Results are approximate, but they are useful.

Resources