Parallel C wrapper for cython code - multithreading

Following DavidW's recommendation in this topic,
I'm trying to write a C wrapper function that uses OpenMP in order to multithread Cython code.
Here is what I have:
The C file "paral.h":
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

void paral(void (*func)(int, int), int nthreads){
    int t;
    #pragma omp parallel for
    for (t = 0; t < nthreads; t++){
        (*func)(t, nthreads);
    }
}
The test.pyx file:
import time
import random
cimport cython
from libc.stdlib cimport malloc, realloc, free

ctypedef void (*func)(int, int)

cdef extern from "paral.h":
    void paral(func function, int nthreads) nogil

cdef double *a = <double *> malloc(1000000 * sizeof(double))
cdef double *b = <double *> malloc(1000000 * sizeof(double))
cdef double *c = <double *> malloc(1000000 * sizeof(double))

cdef int i
for i in range(1000000):
    a[i] = random.random()
    b[i] = random.random()

cdef void sum_ab(int thread, int nthreads):
    cdef int start, stop, i
    start = thread * (1000000 / nthreads)
    stop = start + (1000000 / nthreads)
    for i in range(start, stop):
        c[i] = a[i] + b[i]

t0 = time.clock()
with nogil:
    paral(sum_ab, 4)
print(time.clock() - t0)

t0 = time.clock()
with nogil:
    paral(sum_ab, 1)
print(time.clock() - t0)
I'm using Visual Studio, so in setup.py I added:
extra_compile_args=["/openmp"],
extra_link_args=["/openmp"]
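(For reference, a minimal setup.py along these lines might look like the sketch below; the module and file names are placeholders matching test.pyx above, and the flags simply mirror the ones given.)

from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize

# "test" / "test.pyx" are placeholder names for the module above
ext = Extension("test", sources=["test.pyx"],
                extra_compile_args=["/openmp"],
                extra_link_args=["/openmp"])

setup(ext_modules=cythonize([ext]))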
Results:
The 4-thread version is slightly slower than the 1-thread version.
Does anyone know what I'm doing wrong here?
Edit:
In response to Zultan.
To make sure the time measured by time.clock() is correct, I make the execution last a few seconds, so that I can compare the value from time.clock() with the time I measure with a stopwatch.
Something like this:
print("start timer 1")
t1 = time.clock()
for i in range(10000):
with nogil:
paral(sum_ab,4)
t2 = time.clock()
print(t2-t1)
print("strart timer 2")
t1 = time.clock()
for i in range(10000):
with nogil:
paral(sum_ab,1)
t2 = time.clock()
print(t2-t1)
print("stop")
With time.clock() I get 15.0 s for the 4-thread run and 14.5 s for the 1-thread run, and I see no noticeable difference from what I measure with the stopwatch.
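(One caveat, in case it matters: time.clock() measures wall-clock time on Windows but CPU time on Unix. A sketch using time.perf_counter() sidesteps that ambiguity:)

import time

t0 = time.perf_counter()   # wall-clock timer on every platform
with nogil:
    paral(sum_ab, 4)
print(time.perf_counter() - t0)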
Edit 2:
I think I've figured out what is happening here. I read that in some cases memory bandwidth can be saturated.
If I replace:
c[i] = a[i] + b[i]
with a more complex operation, for example:
c[i] = a[i]**b[i]
then I get a significant speedup between the single-threaded and multi-threaded versions (nearly 2x).
However, I'm still about 2x slower than a classic prange loop!
I see no reason why prange should be that much faster. Maybe I need to change the C code...
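For comparison, a classic prange version of the same loop might look roughly like the sketch below (assuming the same module-level a, b, c arrays as above):

from cython.parallel cimport prange

def sum_ab_prange():
    cdef int i
    # prange emits the OpenMP pragma directly over the data loop
    for i in prange(1000000, nogil=True, num_threads=4, schedule='static'):
        c[i] = a[i] + b[i]

One difference is that prange gives the C compiler the whole loop body in one place, while the wrapper goes through a function pointer per chunk; whether that accounts for the 2x gap would need profiling.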

Related

Cython: define arguments of unknown types

In the process of learning Cython, I am reproducing the example shown here on how to solve ordinary differential equations. I report it here to keep things simple:
import numpy as np
cimport numpy as np

cdef class Problem:
    cpdef double rhs(self, double u, double t):
        return 0

cdef class Problem1(Problem):
    cpdef double rhs(self, double u, double t):
        return -u + 1  # u = 1-exp(-t)

from math import exp

cdef class Problem2(Problem):
    cpdef double rhs(self, double u, double t):
        return -u + exp(-2*t)

cdef class ODEMethod:
    cpdef double advance(self, np.ndarray u, int n,
                         np.ndarray t, Problem p):
        return 0

cdef class Method_RK2(ODEMethod):
    cpdef double advance(self, np.ndarray u, int n,
                         np.ndarray t, Problem p):
        cdef double K1, K2, unew, dt
        dt = t[n+1] - t[n]
        K1 = dt*p.rhs(u[n], t[n])
        K2 = dt*p.rhs(u[n] + 0.5*K1, t[n] + 0.5*dt)
        unew = u[n] + K2
        return unew

# Create names compatible with ode0.py
RK2 = Method_RK2()
problem1 = Problem1()
problem2 = Problem2()

cpdef solver(Problem f, double I, np.ndarray t, ODEMethod method):
    cdef int N = len(t)-1
    cdef np.ndarray u = np.zeros(N+1, dtype=np.float)
    u[0] = I
    cdef int n
    for n in range(N):
        u[n+1] = method.advance(u, n, t, f)
    return u, t
The class Problem is meant to make the code general. However, to make it even more general, I would like to include additional arguments whose number and types are not known in advance, something similar to *args, as when you define in pure Python:

class Problem:
    def __init__(self, *args):
        self.args = args

Is there a way to do this, or even a better one?
Thanks in advance!
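For what it's worth, extension types do accept *args in __init__, so one possible sketch (untested, reusing the names from the question) is:

from math import exp

cdef class Problem:
    cdef tuple args                      # holds the extra arguments

    def __init__(self, *args):
        self.args = args

    cpdef double rhs(self, double u, double t):
        return 0

cdef class Problem2(Problem):
    cpdef double rhs(self, double u, double t):
        cdef double k = self.args[0]     # unpack to a typed local for speed
        return -u + exp(-k*t)

If the extra parameters are known to be numeric, declaring typed attributes on the subclass (e.g. cdef double k) avoids the Python-tuple overhead inside rhs.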

PyOpenCL how to modify a matrix locally within the kernel function

I am trying to modify a matrix (Pbis) locally within a PyOpenCL kernel function, and filling this matrix with 0 alters the result matrix R. When executing this code we obtain weird values in the R matrix. It is probably due to memory allocation, but we cannot figure out how to fix it. Normally R should be composed exclusively of the init value.
program = cl.Program(context, """
__kernel void generate_paths(__global float *P, ushort const n,
                             ushort N, ushort init, __global float *R){
    int i = get_global_id(0);
    __private float* Pbis;
    for (int k=0; k<n; k++){
        Pbis[k] = 0;
    }
    for (int j=0; j<n; j++)
    {
        R[i*(n+1) + j] = init;
    }
    R[i*(n+1) + n] = init;
}
""").build()
The parameters for the generation are:
program.generate_paths(queue, res_np.shape, None, P_buf, np.uint16(n), np.uint16(N), np.uint16(init), res_buf)
Here is the entire code for reproducibility:
import numpy as np
import pyopencl as cl
import numpy.linalg as la
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
os.environ['PYOPENCL_CTX'] = '1'
(n, N) = (3,6)
U = np.random.uniform(0,1, size=(n+1)*N)
U = U.astype(np.float32)
P = np.matrix([[0, 1/3, 1/3, 1/3], [1/3, 0, 1/3, 1/3], [1/3, 1/3, 0, 1/3], [1/3, 1/3, 1/3, 0]])
P = P.astype(np.float32)
res_np = np.zeros((N, n+1),dtype = np.float32)
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)
mf = cl.mem_flags
U_buf = cl.Buffer(context, mf.COPY_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=U)
P_buf = cl.Buffer(context, mf.COPY_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=P)
res_buf = cl.Buffer(context, mf.WRITE_ONLY, res_np.nbytes)
init = 0
program = cl.Program(context, """
__kernel void generate_paths(__global const float *U, __global float *P, ushort const n,
                             ushort N, ushort init, __global float *R){
    int i = get_global_id(0);
    int current = init;
    __private float* Pbis;
    for (int k=0; k<n; k++){
        Pbis[k] = 0;
    }
    for (int j=0; j<n; j++)
    {
        R[i*(n+1) + j] = current;
    }
    R[i*(n+1) + n] = init;
}
""").build()
#prg.multiply(queue, c.shape, None,
# np.uint16(n), np.uint16(m), np.uint16(p),
# a_buf, b_buf, c_buf)
# a_mul_b = np.empty_like(c)
# cl.enqueue_copy(queue, a_mul_b, c_buf)
program.generate_paths(queue, res_np.shape, None, U_buf, P_buf, np.uint16(n), np.uint16(N), np.uint16(init), res_buf)
chem_gen = np.empty_like(res_np)
cl.enqueue_copy(queue, chem_gen, res_buf)
print("Platform Selected = %s" %platform.name)
print("Device Selected = %s" %device.name)
print("Generated Paths:")
print (chem_gen)
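One thing that stands out in the kernel: Pbis is declared as a bare __private float* that never points to any storage, so Pbis[k] = 0 writes through an uninitialized pointer, which would explain R coming back with garbage. A private scratch array normally needs a compile-time size, e.g. (a sketch; MAX_N is a hypothetical upper bound for n):

program = cl.Program(context, """
#define MAX_N 64
__kernel void generate_paths(__global const float *U, __global float *P, ushort const n,
                             ushort N, ushort init, __global float *R){
    int i = get_global_id(0);
    float Pbis[MAX_N];               // private array backed by real storage
    for (int k = 0; k < n; k++){
        Pbis[k] = 0.0f;
    }
    for (int j = 0; j < n; j++){
        R[i*(n+1) + j] = init;
    }
    R[i*(n+1) + n] = init;
}
""").build()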

Enabling Parallelism with Cython

I am trying to get the prange function of Cython's parallel package to work, and it seems there is no parallelism in effect. To have an MWE, I have taken the example code from the book Cython: A Guide for Python Programmers and modified it a little by adding a few print statements. The example code is freely available on GitHub, and the code I'm referring to is at: examples/12-parallel-cython/02-prange-parallel-loops/.
The following is my modification of the julia.pyx file.
# distutils: extra_compile_args = -fopenmp
# distutils: extra_link_args = -fopenmp
from cython cimport boundscheck, wraparound
from cython cimport parallel
import numpy as np

cdef inline double norm2(double complex z) nogil:
    return z.real * z.real + z.imag * z.imag

cdef int escape(double complex z,
                double complex c,
                double z_max,
                int n_max) nogil:
    cdef:
        int i = 0
        double z_max2 = z_max * z_max
    while norm2(z) < z_max2 and i < n_max:
        z = z * z + c
        i += 1
    return i

@boundscheck(False)
@wraparound(False)
def calc_julia(int resolution, double complex c,
               double bound=1.5, double z_max=4.0, int n_max=1000):
    cdef:
        double step = 2.0 * bound / resolution
        int i, j
        double complex z
        double real, imag
        int[:, ::1] counts
    counts = np.zeros((resolution+1, resolution+1), dtype=np.int32)
    for i in parallel.prange(resolution + 1, nogil=True,
                             schedule='static', chunksize=1):
        real = -bound + i * step
        for j in range(resolution + 1):
            imag = -bound + j * step
            z = real + imag * 1j
            counts[i,j] = escape(z, c, z_max, n_max)
    return np.asarray(counts)

@boundscheck(False)
@wraparound(False)
def julia_fraction(int[:,::1] counts, int maxval=1000):
    cdef:
        unsigned int thread_id
        int total = 0
        int i, j, N, M
    N = counts.shape[0]; M = counts.shape[1]
    print("N = %d" % N)
    with nogil:
        for i in parallel.prange(N, schedule="static", chunksize=10):
            thread_id = parallel.threadid()
            with gil:
                print("Thread %d." % (thread_id))
            for j in range(M):
                if counts[i,j] == maxval:
                    total += 1
    return total / float(counts.size)
When I compile using the setup_julia.py given by
from distutils.core import setup
from Cython.Build import cythonize
from distutils.extension import Extension

setup(name="julia",
      ext_modules=cythonize(Extension('julia', ['julia.pyx'],
                                      extra_compile_args=['-fopenmp'],
                                      extra_link_args=['-fopenmp'])))
with the command
python setup_julia.py build_ext --inplace
and then run the run_julia.py file, I see that all iterations of the for loop use only one thread -- Thread 0. The terminal output looks like this:
poulin8:02-prange-parallel-loops poulingroup$ python run_julia.py
time: 0.892143
julia fraction: N = 1001
Thread 0.
Thread 0.
Thread 0.
Thread 0.
.
.
.
.
Thread 0.
0.236994773458
As I understand it, the for loop should simply run in parallel. Could someone guide me on what I must do to get the for loop to distribute the load amongst many threads?
I have also tried setting the environment variable OMP_NUM_THREADS to a number greater than 1, with no effect.
I am running the tests on OS X 10.11.6, with Python 2.7.10 and gcc 5.2.0.
I've got the same problem on Windows 7.
It was running serially.
I noticed this compilation message:
python setup_julia.py build_ext --inplace
cl : Command line warning D9002 : ignoring unknown option '-fopenmp'
Apparently with Visual Studio it has to be -openmp:
# distutils: extra_compile_args = -openmp
# distutils: extra_link_args = -openmp
Now it runs in parallel.
As @danny noted, you can use fprintf:
from cython.parallel cimport prange, threadid
from libc.stdio cimport stdout, fprintf

def julia_fraction(int[:,::1] counts, int maxval=1000):
    ...
    thread_id = threadid()
    fprintf(stdout, "%d\n", <int>thread_id)
    ...
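As a quick sanity check that OpenMP really was enabled by the rebuild, Cython also exposes the OpenMP runtime through cimport openmp, so a small sketch like this can report the thread count:

cimport openmp

def max_threads():
    # returns 1 if the module was built without OpenMP support
    return openmp.omp_get_max_threads()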

PyOpenCL Kernel in Loop Crashes GPU

I am writing a brute-force neighbor look-up routine using PyOpenCL. Later on it will fit into my smoothed particle hydro code. Brute force is certainly not efficient, but it's simple and it's a starting point. I have been testing my look-up kernel, and I find that when I run it in a loop it crashes. I don't get any error messages in Python, but the screen flickers off, then comes back on with a note that the graphics drivers failed but have been recovered. The odd thing is that if the number of particles searched over is small (~1000 or less), it does just fine. If I increase the count (~10k) it crashes. I tried adding barriers, wait commands, and a finish command, to no avail. I checked for an array overrun but cannot find one. I am including the relevant code and apologize up front for its size, but I wanted to give everything so people can look at it. I am hoping someone can run this and recreate the error, or tell me where I am going wrong. My setup is Python 3.5 using Spyder, with PyOpenCL 2016.1 installed.
Thanks,
Seth
First, the main file:
import numpy as np
import gpuParameters as gpuParameters
import pyopencl as cl
import pyopencl.array as ar
from BruteForceSearch import BruteForceSearch
import time as time
dim = 3 # dimensions of the problem
n = 15000 # number of particles
nbs = 50 # number of neighbors
x = np.random.rand(n) # randomly choose some x
y = np.random.rand(n) # randomly choose some y
z = np.random.rand(n) # randomly choose some z
h = np.ones(n) # smoothing parameter for the b spline
# setup gpu context
gpu = gpuParameters.gpuParameters()
# neighbor list
nlist = -1*np.ones(n*nbs, dtype=np.int32)
# data to gpu
xg = ar.to_device(gpu.queue, x) # x pos on gpu
yg = ar.to_device(gpu.queue, y) # y pos on gpu
zg = ar.to_device(gpu.queue, z) # z pos on gpu
hg = ar.to_device(gpu.queue, h) # h pos on gpu
num_p = ar.to_device(gpu.queue, np.array(n, dtype=np.int32)) # num of particles
nb = ar.to_device(gpu.queue, np.array(nbs, dtype=np.int32)) # num of neighbors
nlst = ar.to_device(gpu.queue, nlist) # neighbor list on gpu
dg = ar.to_device(gpu.queue, np.array(dim, dtype=np.int32)) # dimension on gpu
out = ar.zeros(gpu.queue, n, np.float64) # debug parameter
# call the Brute force neighbor search and h parameter set class
srch = BruteForceSearch(gpu) # instatiate
s = time.time() # timer start
for ii in range(100):
    # set a marker; I really didn't think this would be necessary
    mark = cl.enqueue_marker(gpu.queue)  # set a marker for kernel complete
    srch.search.search(gpu.queue, x.shape, None,
                       num_p.data, nb.data, dg.data, xg.data, yg.data, zg.data,
                       hg.data, nlst.data, out.data)  # run the kernel
    cl.Event.wait(mark)  # wait for complete run of kernel before next iteration
    # gpu.queue.finish()
    print('iteration: ', ii)  # print the iteration to show me it's running
e = time.time() # end the timer
cs = time.time() # clock the time it takes to return the array
nlist = nlst.get()
ce = time.time()
# output the times
print('time to calculate: ', e-s)
print('time to copy back: ', ce - cs)
GPU Context Class
import pyopencl as cl

class gpuParameters:
    def __init__(self, dType = []):
        # will set up the proper context based on the given device preference
        # if no device preference is given, defaults to the first value
        if dType == []:
            pltfrms = cl.get_platforms()[0]
            devices = pltfrms.get_devices(cl.device_type.GPU)
        context = cl.Context(devices)  # create a device context
        print(context)
        print(devices)
        self.cntxt = context  # keep this context in motion
        self.queue = cl.CommandQueue(self.cntxt)  # create a command queue for this context
        self.mF = cl.mem_flags
Neighbor Look-up
import numpy as np
import pyopencl as cl
import gpu_sph_assistance_functions as gsaf

class BruteForceSearch:
    def __init__(self, gpu):
        # instantiation of the search routine, primarily for pre-compiling
        # the function
        self.gpu = gpu  # save the gpu context
        # set up and compile the search
        self.bruteSearch()

    def bruteSearch(self):
        W = gsaf.gpu_sph_kernel()
        self.search = cl.Program(
            self.gpu.cntxt,
            W + '''__kernel void search(__global int *nP, __global int *nN,
                                        __global int *dim,
                                        __global double *x, __global double *y,
                                        __global double *z, __global double *h,
                                        __global int *nlist, __global double *out)
            {
                // indices
                int gid = get_global_id(0); // current particle
                int idv = 0;                // unrolled array id
                int count = 0;              // count
                int dm = *dim;              // problem dimension
                int itr = 0;                // start iteration
                int mxitr = 25;             // max number of iterations
                // calculated variables
                double dms = 1.0/(*dim);    // 1 over dimension for pow
                double xi = x[gid];         // current x position
                double yi = y[gid];         // current y position
                double zi = z[gid];         // current z position
                double dx = 0;              // difference in x
                double dy = 0;              // difference in y
                double dz = 0;              // difference in z
                double r = 0;               // radius
                double hg = h[gid];         // smoothing parameter
                double Wsum = 0;            // sum of weights
                double W = 0;               // current weight
                double dwdx = 0;            // derivative of weight in x direction
                double dwdy = 0;            // derivative of weight in y direction
                double dwdz = 0;            // derivative of weight in z direction
                double dwdr = 0;            // derivative of weight in r direction
                double V = 0;               // volume of particle
                double hn = 0;              // holding value for comparison
                double err = 10;            // error
                double tol = 1e-7;          // tolerance
                double diff = 0;            // difference

                // first clean the array of neighbors
                for (int ii = 0; ii < *nN; ii++) // length of num of neighbors
                {
                    idv = *nN*gid + ii; // unrolled index
                    nlist[idv] = -1;    // this is a trigger for excluding values
                }
                // next calculate the h parameter
                while (err > tol)
                {
                    Wsum = 0; // clean summation
                    for (int jj = 0; jj < *nP; jj++) // loop over all particles
                    {
                        dx = xi - x[jj];
                        dy = yi - y[jj];
                        dz = zi - z[jj];
                        // spline for weights
                        quintic_spline(dm, hg, dx, dy, dz, &W,
                                       &dwdx, &dwdy, &dwdz, &dwdr);
                        Wsum += W; // add to store
                    }
                    V = 1.0/Wsum;     // volume
                    hn = pow(V, dms); // new h parameter
                    diff = hn - hg;   // difference
                    err = fabs(diff); // error
                    out[gid] = err;   // store error for debug
                    hg = hn;          // reset h
                    itr++;            // update iter
                    if (itr > mxitr)  // break out
                    { break; }
                }
                h[gid] = hg; // store h
                /* // get all neighbors in vicinity of particle not
                   // currently assessed
                for (int ii = 0; ii < *nP; ii++)
                {
                    dx = xi - x[ii];
                    dy = yi - y[ii];
                    dz = zi - z[ii];
                    r = sqrt(dx*dx + dy*dy + dz*dz);
                    if (r < 3.25*hg & count < *nN)
                    {
                        idv = *nN*gid + count;
                        nlist[idv] = ii;
                        count++;
                    }
                }
                */
            }
            ''').build()
The Spline function for weighting
W = '''void quintic_spline(
    int dim, double h, double dx, double dy, double dz, double *W,
    double *dWdx, double *dWdy, double *dWdz, double *dWdrO)
{
    double pi = 3.141592654; // pi
    double m3q = 0;          // prefix values
    double m2q = 0;          // prefix values
    double m1q = 0;          // prefix values
    double T1 = 0;           // prefix values
    double T2 = 0;           // prefix values
    double T3 = 0;           // prefix values
    double D1 = 0;           // prefix values
    double D2 = 0;           // prefix values
    double D3 = 0;           // prefix values
    double Ch = 0;           // normalizing parameter for kernel
    double C = 0;            // normalizing prior to h
    double r = sqrt(dx*dx + dy*dy + dz*dz);
    double q = r/h;          // normalized radius
    double dqdr = 1.0/h;     // intermediate derivative
    double dWdq = 0;         // intermediate derivative
    double dWdr = 0;         // intermediate derivative
    double drdx = dx/r;      // intermediate derivative
    double drdy = dy/r;      // intermediate derivative
    double drdz = dz/r;      // intermediate derivative
    if (dim == 1)
    {
        C = 1.0/120.0;
    }
    else if (dim == 2)
    {
        C = 7.0/(pi*478.0);
    }
    else if (dim == 3)
    {
        C = 1.0/(120.0*pi);
    }
    Ch = C/pow(h, dim);
    if (r <= 0)
    {
        drdx = 0.0;
        drdy = 0.0;
        drdz = 0.0;
    }
    // local prefix constants
    m1q = 1.0 - q;
    m2q = 2.0 - q;
    m3q = 3.0 - q;
    // smoothing parameter constants
    T1 = Ch*pow(m3q, 5);
    T2 = -6.0*Ch*pow(m2q, 5);
    T3 = 15.0*Ch*pow(m1q, 5);
    // derivatives of spline coefficients
    D1 = -5.0*Ch*pow(m3q, 4);
    D2 = 30.0*Ch*pow(m2q, 4);
    D3 = -75.0*Ch*pow(m1q, 4);
    // W calculation
    if (q < 1.0)
    {
        *W = T1 + T2 + T3;
        dWdq = D1 + D2 + D3;
    }
    else if (q >= 1.0 && q < 2.0)
    {
        *W = T1 + T2;
        dWdq = D1 + D2;
    }
    else if (q >= 2.0 && q < 3.0)
    {
        *W = T1;
        dWdq = D1;
    }
    else
    {
        *W = 0.0;
        dWdq = 0.0;
    }
    dWdr = dWdq*dqdr;
    // assign the derivatives
    *dWdx = dWdr*drdx;
    *dWdy = dWdr*drdy;
    *dWdz = dWdr*drdz;
    *dWdrO = dWdr;
}'''
I tested the code on an Intel i7-4790K CPU with AMD Accelerated Parallel Processing. It does not crash at n=150000 (I only ran one iteration). The only odd thing I discovered while quickly looking into the code was that the kernel reads from and writes to the array h. This should not be a problem, but I still usually try to avoid it.
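If you want to rule out the in-place update of h, a separate output buffer is a simple change, e.g. (a sketch; hng and h_new are hypothetical names):

# host side: extra buffer for the updated smoothing lengths
hng = ar.zeros(gpu.queue, n, np.float64)

# kernel side (sketch): add "__global double *h_new" to the signature
# and replace "h[gid] = hg;" with "h_new[gid] = hg;"

srch.search.search(gpu.queue, x.shape, None,
                   num_p.data, nb.data, dg.data, xg.data, yg.data, zg.data,
                   hg.data, hng.data, nlst.data, out.data)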

Cython: Calling an extension type cdef method from a cdef function called by python

I'm trying to write a Cython module that calculates pairwise distances as part of a larger class of locality-sensitive hashes. Instead of writing code for each type and each distance metric, I am attempting to create one cdef function that takes various extension types that inherit from Metric:
cdef class Metric:
    def __init__(self):
        pass

cdef class Euclidean(Metric):
    cdef numeric c_evaluate(self, numeric[:] x, numeric[:] y, int dims):
        ....

cdef numeric[:,:] pairwise(numeric[:] x, numeric[:] y, Metric func, bint symmetric):
    ...
    dm[i,j] = func.c_evaluate(x,y,dims)
    ...
To access this function from Python:
def py_pairwise(numeric[:,:] x, numeric[:,:] y, str func, bint symmetric = 1, **kwargs):
    cdef Metric mfunc = to_Metric(func, **kwargs)
    return pairwise(x, y, mfunc, symmetric)
However, I keep getting the error that "c_distance.[Metric] object has no attribute 'c_evaluate'". I'm wondering if the c_evaluate method isn't accessible because the class object is created in Python code through the Python function to_Metric, though I thought def and cdef functions were supposed to be able to call each other freely within a Cython module. The method works if I change c_evaluate to a cpdef method, but I'm not sure whether that fixes the problem by allowing the cdef object to pass through Python to Cython, or simply uses the slower Python method. Any suggestions? (I'm also not at my home computer, so I don't have all the code right now. Will update later/on request.)
Edit: That typo isn't in the original functions (there could still be others):
ctypedef fused floating:
    float
    double

cdef class Euclidean(Metric):
    cdef public floating c_evaluate(self, floating[:] x, floating[:] y, int dims):
        cdef int i
        cdef floating tmp, d = 0
        for i in range(dims):
            tmp = x[i]-y[i]
            d += tmp*tmp
        return sqrt(d)

#@cython.boundscheck(False)
#@cython.wraparound(False)
def py_pairwise(numeric[:,::1] x, numeric[:,::1] y, str metric, bint symmetric, **kwargs):
    cdef Metric func = to_Metric(metric, **kwargs)
    return pairwise(x, y, func, symmetric)

cdef numeric[:,::1] pairwise(numeric[:,::1] x, numeric[:,::1] y, Metric met, bint symmetric):
    cdef int n,m,k,i,j
    n = x.shape[0]
    m = y.shape[0]
    dims = x.shape[1]
    if numeric in floating:
        mdtype = np.float
    else:
        mdtype = np.int
    #mdtype = np.float
    cdef numeric[:,::1] dm = (np.empty((n,m),dtype = mdtype)).fill(0)
    if symmetric:
        interval = lambda i,n,m: range(i+1,m)
    else:
        interval = lambda i,n,m: range(m)
    for i in range(n):
        for j in interval(i,n,m):
            dm[i,j] = met.c_evaluate(x[i,:],y[j,:],dims)
    return np.asarray(dm)
Also, to_Metric:
def to_Metric(str m, **kwargs):
    if len(kwargs) == 0:
        if m == 'euclidean':
            met = Euclidean()
        elif m in {'cos','cosine'}:
            met = Cosine()
        elif m in {'hamming','matching'}:
            met = Hamming()
        else:
            raise ValueError('Unrecognized metric {}'.format('\''+m+'\''))
    else:
        if m in {'pnorm','p-norm'}:
            met = Pnorm(kwargs['p'])
        elif m == 'maximal':
            met = Maximal(kwargs['m1'], kwargs['m2'], kwargs['sep'])
        else:
            raise ValueError('Unrecognized metric {}'.format('\''+m+'\''))
    return met
The issue is that c_evaluate is associated with the class Euclidean, and because of this it can only be used with objects that are known to be of type Euclidean. However, in pairwise you declare the type of met to be Metric.
Because you declared the c_evaluate function as cdef, it can only be found at compile time. If you want c_evaluate to be found at runtime like a standard Python function, you should declare it as def.
If you need the function to be found at compile time (which makes calling it quicker), then you should either make c_evaluate a method of the Metric class, or make pairwise take only a Euclidean object.
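To make the first option concrete, here is a sketch of declaring the method on the base class (with plain double instead of the fused type, for brevity):

from libc.math cimport sqrt

cdef class Metric:
    cdef double c_evaluate(self, double[:] x, double[:] y, int dims):
        return 0.0

cdef class Euclidean(Metric):
    cdef double c_evaluate(self, double[:] x, double[:] y, int dims):
        cdef int i
        cdef double tmp, d = 0.0
        for i in range(dims):
            tmp = x[i] - y[i]
            d += tmp * tmp
        return sqrt(d)

cdef double pairwise_one(double[:] x, double[:] y, Metric met, int dims):
    # met is typed as Metric, so c_evaluate is resolved at compile time
    # and dispatches to the Euclidean override at run time
    return met.c_evaluate(x, y, dims)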
