PyOpenCL how to modify a matrix locally within the kernel function - python-3.x

I am trying to modify a matrix (Pbis) locally within a PyOpenCL kernel function, but filling this matrix with zeros alters the result matrix R: when we execute this code, we obtain weird values in R. It is probably due to memory allocation, but we cannot figure out how to fix it. Normally, R should consist exclusively of the init value.
program = cl.Program(context, """
__kernel void generate_paths(__global float *P, ushort const n,
ushort N, ushort init, __global float *R){
int i = get_global_id(0);
__private float* Pbis;
for (int k=0; k<n; k++){
Pbis[k] = 0;
for (int j=0; j<n; j++)
R[i*(n+1) + j] = init;
R[i*(n+1) + n] = init;
The parameters for the generation are:
program.generate_paths(queue, res_np.shape, None, P_buf, np.uint16(n), np.uint16(N), np.uint16(init), res_buf)
Here is the entire code for reproducibility:
import numpy as np
import pyopencl as cl
import numpy.linalg as la
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
os.environ['PYOPENCL_CTX'] = '1'
(n, N) = (3,6)
U = np.random.uniform(0,1, size=(n+1)*N)
U = U.astype(np.float32)
P = np.matrix([[0, 1/3, 1/3, 1/3], [1/3, 0, 1/3, 1/3], [1/3, 1/3, 0, 1/3], [1/3, 1/3, 1/3, 0]])
P = P.astype(np.float32)
res_np = np.zeros((N, n+1),dtype = np.float32)
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)
mf = cl.mem_flags
U_buf = cl.Buffer(context, mf.COPY_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=U)
P_buf = cl.Buffer(context, mf.COPY_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=P)
res_buf = cl.Buffer(context, mf.WRITE_ONLY, res_np.nbytes)
init = 0
program = cl.Program(context, """
__kernel void generate_paths(__global const float *U, __global float *P, ushort const n,
ushort N, ushort init, __global float *R){
int i = get_global_id(0);
int current = init;
__private float* Pbis;
for (int k=0; k<n; k++){
Pbis[k] = 0;
for (int j=0; j<n; j++)
R[i*(n+1) + j] = current;
R[i*(n+1) + n] = init;
#prg.multiply(queue, c.shape, None,
# np.uint16(n), np.uint16(m), np.uint16(p),
# a_buf, b_buf, c_buf)
# a_mul_b = np.empty_like(c)
# cl.enqueue_copy(queue, a_mul_b, c_buf)
program.generate_paths(queue, res_np.shape, None, U_buf, P_buf, np.uint16(n), np.uint16(N), np.uint16(init), res_buf)
chem_gen = np.empty_like(res_np)
cl.enqueue_copy(queue, chem_gen, res_buf)
print("Platform Selected = %s"
print("Device Selected = %s"
print("Generated Paths:")
print (chem_gen)


Multithreaded Nagel–Schreckenberg model (traffic simulation) with OpenMP

I'm trying to write a multithreaded Nagel–Schreckenberg model simulation in C and have run into problems when a thread accesses data that hasn't been calculated yet.
Here is a working code which only parallelizes velocity calculation per line:
#define L 3000 // number of cells in row
#define num_iters 3000 // number of iterations
#define density 0.48 // how many positives
#define vmax 2
#define p 0.2
for (int i = 0; i < num_iters - 1; i++)
int temp[L] = {0};
#pragma omp parallel for
for (int x = 0; x < L; x++)
if (iterations[i][x] > -1)
int vi = iterations[i][x]; // velocity of previews iteration
int d = 1; // index of the next vehicle
while (iterations[i][(x + d) % L] < 0)
int vtemp = min(min(vi + 1, d - 1), vmax); // increase speed, but avoid hitting the next car
int v = r2() < p ? max(vtemp - 1, 0) : vtemp; // stop the vehicle with probability p
temp[x] = v;
for (int x = 0; x < L; x++) // write the velocities to the next line
if (iterations[i][x] > -1)
int v = temp[x];
iterations[i + 1][(x + v) % L] = v;
This works fine, but it's not fast enough. I'm trying to use convolution to increase the performance, but half of the time a thread can't read its neighbor thread's data because it hasn't been calculated yet. Here is the code I used:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#define L 4000 // number of cells in row
#define num_iters 4000 // number of iterations
#define density 0.48 // how many positives
#define vmax 2
#define p 0.2
#define BLOCKS_Y 4
#define BLOCKS_X 4
time_t t;
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
// Shuffle the n ints of `array` in place, drawing swap positions from rand().
// Arrays with fewer than two elements are left unchanged.
void shuffle(int *array, size_t n)
if (n > 1)
size_t i;
for (i = 0; i < n - 1; i++)
// j lands in [i, n-1]: the divisor (RAND_MAX / (n - i) + 1) maps rand() onto n - i buckets
size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
// swap array[i] and array[j]; this local t hides the file-scope `time_t t` while in scope
int t = array[j];
array[j] = array[i];
array[i] = t;
// Return a pseudo-random double in [0, 1], computed as rand() / RAND_MAX.
// NOTE(review): rand() keeps hidden global state and is not required to be
// thread-safe; confirm before calling this from an OpenMP parallel region.
double r2()
return (double)rand() / (double)RAND_MAX;
// Write the simulation grid to `filename` as a 24-bit BMP.
// iterations[i][j] is read for i in [0, num_iters) and j in [0, L); the image is
// num_iters pixels wide and L pixels high. Cells equal to 0 become black, every
// other value (including -1) becomes white.
// NOTE(review): neither malloc nor fopen is checked, and no fclose/free is visible
// in this excerpt; confirm the stream and buffer are released.
void writeImage(int *iterations[], char filename[])
int h = L;
int w = num_iters;
FILE *f;
unsigned char *img = NULL;
// 54 = 14-byte file header + 40-byte info header; pixel data counted without row padding
int filesize = 54 + 3 * w * h;
img = (unsigned char *)malloc(3 * w * h);
memset(img, 0, 3 * w * h);
for (int i = 0; i < w; i++)
for (int j = 0; j < h; j++)
// iteration index i is the image column, cell index j is mirrored vertically
int x = i;
int y = (h - 1) - j;
int color = iterations[i][j] == 0 ? 0 : 255;
// same value in all three channels -> grey-scale pixel
img[(x + y * w) * 3 + 2] = (unsigned char)(color);
img[(x + y * w) * 3 + 1] = (unsigned char)(color);
img[(x + y * w) * 3 + 0] = (unsigned char)(color);
// header templates: byte 10 of the file header is the pixel-data offset (54);
// the info header starts with its own size (40), byte 12 = 1 and byte 14 = 24
unsigned char bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
unsigned char bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
unsigned char bmppad[3] = {0, 0, 0};
// file size, width and height are stored byte by byte, least significant byte first
bmpfileheader[2] = (unsigned char)(filesize);
bmpfileheader[3] = (unsigned char)(filesize >> 8);
bmpfileheader[4] = (unsigned char)(filesize >> 16);
bmpfileheader[5] = (unsigned char)(filesize >> 24);
bmpinfoheader[4] = (unsigned char)(w);
bmpinfoheader[5] = (unsigned char)(w >> 8);
bmpinfoheader[6] = (unsigned char)(w >> 16);
bmpinfoheader[7] = (unsigned char)(w >> 24);
bmpinfoheader[8] = (unsigned char)(h);
bmpinfoheader[9] = (unsigned char)(h >> 8);
bmpinfoheader[10] = (unsigned char)(h >> 16);
bmpinfoheader[11] = (unsigned char)(h >> 24);
f = fopen(filename, "wb");
fwrite(bmpfileheader, 1, 14, f);
fwrite(bmpinfoheader, 1, 40, f);
for (int i = 0; i < h; i++)
// rows are emitted starting from the last row of img
fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
// pad each row to a multiple of 4 bytes (0..3 zero bytes)
fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
// Run the traffic simulation on a num_iters x L grid and, when L <= 4000, dump it
// with writeImage. A cell holds -1 when empty and the vehicle's velocity (>= 0)
// when occupied. Row 0 is the shuffled initial state; row y + 1 is derived from row y.
// NOTE(review): BLOCKSIZEX / BLOCKSIZEY are used below but no definition is visible
// in this listing; presumably L / BLOCKS_X and num_iters / BLOCKS_Y. Confirm.
void simulation()
printf("L=%d, num_iters=%d\n", L, num_iters);
// z, current_index and success_moves are not used anywhere in the visible body
int z = 0;
int current_index = 0;
int success_moves = 0;
const int cars_num = (int)(density * L);
// one heap-allocated row of L ints per iteration
int **iterations = (int **)malloc(num_iters * sizeof(int *));
for (int i = 0; i < num_iters; i++)
iterations[i] = (int *)malloc(L * sizeof(int));
for (int i = 0; i < L; i++)
// NOTE(review): `<=` marks cars_num + 1 cells as occupied; confirm `<` is not intended
iterations[0][i] = i <= cars_num ? 0 : -1;
shuffle(iterations[0], L);
// every later row starts out empty
for (int i = 0; i < num_iters - 1; i++)
for (int x = 0; x < L; x++)
iterations[i + 1][x] = -1;
// random numbers are drawn up front, outside the parallel region
double *randoms = (double *)malloc(L * num_iters * sizeof(double));
for (int i = 0; i < L * num_iters; i++) {
randoms[i] = r2();
// the (blocky, blockx) tiles are distributed over threads with no ordering between
// them, so a tile may read row y before the tile that owns row y - 1 has written
// it (the race discussed in the answer below)
#pragma omp parallel for collapse(2)
for (int blocky = 0; blocky < BLOCKS_Y; blocky++)
for (int blockx = 0; blockx < BLOCKS_X; blockx++)
int ystart = blocky * BLOCKSIZEY;
int yend = ystart + BLOCKSIZEY;
int xstart = blockx * BLOCKSIZEX;
int xend = xstart + BLOCKSIZEX;
for (int y = ystart; y < yend; y++)
for (int x = xstart; x < xend; x++)
if (iterations[y][x] > -1)
int vi = iterations[y][x];
// d = distance to the next occupied cell: scan forward from x + 1, wrapping once
int d = 1;
int start = (x + d) % L;
int i;
for (i = start; i < L && iterations[y][i] < 0; ++i);
d += i - start;
if (i == L)
for (i = 0; i < start && iterations[y][i] < 0; ++i);
d += i;
// accelerate by one, but stay behind the next vehicle and below vmax
int vtemp = min(min(vi + 1, d - 1), vmax);
// NOTE(review): x * y is not a unique per-cell index (row 0 and column 0 always
// read randoms[0]); y * L + x may be intended. Confirm.
int v = randoms[x * y] < p ? max(vtemp - 1, 0) : vtemp;
// NOTE(review): if yend can reach num_iters, y + 1 indexes one row past the
// allocation; confirm the bound.
iterations[y + 1][(x + v) % L] = v;
if (L <= 4000)
writeImage(iterations, "img.bmp");
void main() {
As you can see, when the second block gets calculated, the first one has probably not been calculated yet, which produces that empty space.
I think it's possible to solve this with the convolution, but I'm just doing something wrong and I'm not sure what. If you could give any advice on how to fix this problem, I would really appreciate it.
There is a race condition in the second code because iterations can be read by one thread while being written by another. More specifically, iterations[y + 1][(x + v) % L] = v sets a value that another thread needs to read when checking iterations[y][x] or iterations[y][(x + d) % L]; this happens when two threads are working on consecutive y values (belonging to two consecutive blocky values).
Moreover, the r2 function has to be thread-safe. It appears to be a random number generator (RNG), but such random functions are generally implemented using global variables and are often not thread-safe. One simple and efficient solution is to use thread_local variables instead. An alternative solution is to explicitly pass a mutable state as a parameter to the random function. The latter is good practice when you design parallel applications, since it makes the mutation of the internal state visible and provides a way to better control the determinism of the RNG.
Besides this, please note that modulus operations are generally expensive, especially if L is not a compile-time constant. You can remove some of them by pre-computing the remainder before a loop, or by splitting a loop so that the checks are performed only near the boundaries. Here is an (untested) example for the while loop:
int start = (x + d) % L;
int i;
for(i=start ; i < L && iterations[y][i] < 0 ; ++i);
d += i - start;
if(i == L) {
for(i=0 ; i < start && iterations[y][i] < 0 ; ++i);
d += i;
Finally, please note that the grid dimensions should be divisible by the number of blocks (4 in each direction here). Otherwise, the current code is not valid (a min/max clamping of the block bounds is likely needed).

Parallel QuickSort, can someone help me?

I am trying to parallelize quicksort by moving the step that separates the list into two other lists, according to how each element compares to the pivot (pivo), into a kernel. I am having problems with the syntax and with saving the pointer to the end of the two new lists (i.e. their sizes). How do I get rid of the syntax errors and save the list sizes at the end of the kernel?
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
def quickSort_paralleloGlobal(listElements: list) -> list:
if len(listElements) <= 1:
return listElements
pivo = listElements.pop()
list1 = []
list2 = []
kernel_code_template = """
__global__ void separateQuick(int *listElements, int *list1, int *list2, int pivo)
int index1 = 0, index2 = 0;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
if (lista[i] < pivo
list1[index2] = listElements[i];
list2[index2] = listElements[i];
SIZE = len(listElements)
listElements = np.asarray(listElements)
listElements = listElements.astype(
lista_gpu = cuda.mem_alloc(listElements.nbytes)
cuda.memcpy_htod(lista_gpu, listElements)
list1_gpu = cuda.mem_alloc(listElements.nbytes)
list2_gpu = cuda.mem_alloc(listElements.nbytes)
kernel_code = kernel_code_template % {
mod = compiler.SourceModule(kernel_code)
arraysQuick = mod.get_function("separateQuick")
arraysQuick(lista_gpu, list1_gpu, list2_gpu, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
list1 = list1_gpu.get()
list2 = list2_gpu.get()
np.allclose(list1, list1_gpu.get())
np.allclose(list2, list2_gpu.get())
return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
Here is the runtime error:
Traceback (most recent call last):
File "C:/Users/mateu/Documents/GitHub/ppc_Sorting_and_Merging/", line 104, in <module>
print(quickSort_paraleloGlobal([1, 5, 4, 2, 0]))
File "C:/Users/mateu/Documents/GitHub/ppc_Sorting_and_Merging/", line 60, in quickSort_paraleloGlobal
mod = compiler.SourceModule(kernel_code)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\", line 291, in __init__
arch, code, cache_dir, include_dirs)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\", line 254, in compile
return compile_plain(source, options, keep, nvcc, cache_dir, target)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\", line 137, in compile_plain
stderr=stderr.decode("utf-8", "replace"))
pycuda.driver.CompileError: nvcc compilation of C:\Users\mateu\AppData\Local\Temp\tmpefxgkfkk\ failed
[command: nvcc --cubin -arch sm_61 -m64 -Ic:\users\mateu\documents\github\ppc_sorting_and_merging\venv\lib\site-packages\pycuda\cuda]
[stderr: error: expected a ")" warning: parsing restarts here after previous syntax error error: expected a statement warning: variable "indexMenor" was declared but never referenced warning: variable "indexMaior" was declared but never referenced
2 errors detected in the compilation of "C:/Users/mateu/AppData/Local/Temp/tmpxft_00004260_00000000-10_kernel.cpp1.ii".
Process finished with exit code 1
There are a number of problems with your code. I don't think I will be able to list them all. However one of the central problems is that you have attempted to do a naive conversion of a serial quicksort into a thread-parallel quicksort, and such a simple conversion is not possible.
To allow threads to work in a parallel fashion, while dividing up an input list into one of two separate output lists, requires a number of changes to your kernel code.
However we can address most of the other issues by limiting your kernel launches to one thread each.
With that idea, the following code appears to sort the given input correctly:
$ cat
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
def quickSort_paralleloGlobal(listElements):
if len(listElements) <= 1:
return listElements
pivo = listElements.pop()
pivo = np.int32(pivo)
kernel_code_template = """
__global__ void separateQuick(int *listElements, int *list1, int *list2, int *l1_size, int *l2_size, int pivo)
int index1 = 0, index2 = 0;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
if (listElements[i] < pivo)
list1[index1] = listElements[i];
list2[index2] = listElements[i];
*l1_size = index1;
*l2_size = index2;
SIZE = len(listElements)
listElements = np.asarray(listElements)
listElements = listElements.astype(np.int32)
lista_gpu = cuda.mem_alloc(listElements.nbytes)
cuda.memcpy_htod(lista_gpu, listElements)
list1_gpu = cuda.mem_alloc(listElements.nbytes)
list2_gpu = cuda.mem_alloc(listElements.nbytes)
l1_size = cuda.mem_alloc(4)
l2_size = cuda.mem_alloc(4)
kernel_code = kernel_code_template % {
mod = compiler.SourceModule(kernel_code)
arraysQuick = mod.get_function("separateQuick")
arraysQuick(lista_gpu, list1_gpu, list2_gpu, l1_size, l2_size, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
l1_sh = np.zeros(1, dtype = np.int32)
l2_sh = np.zeros(1, dtype = np.int32)
cuda.memcpy_dtoh(l1_sh, l1_size)
cuda.memcpy_dtoh(l2_sh, l2_size)
list1 = np.zeros(l1_sh, dtype=np.int32)
list2 = np.zeros(l2_sh, dtype=np.int32)
cuda.memcpy_dtoh(list1, list1_gpu)
cuda.memcpy_dtoh(list2, list2_gpu)
list1 = list1.tolist()
list2 = list2.tolist()
return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
print(quickSort_paralleloGlobal([1, 5, 4, 2, 0]))
$ python
[0, 1, 2, 4, 5]
The next step in the porting process would be to convert your naive serial kernel to one that could operate in a thread-parallel fashion. One relatively simple approach would be to use atomics to manage all output data (both lists, as well as updates to the sizes of each list).
Here is one possible approach:
$ cat
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
def quickSort_paralleloGlobal(listElements):
if len(listElements) <= 1:
return listElements
pivo = listElements.pop()
pivo = np.int32(pivo)
kernel_code_template = """
__global__ void separateQuick(int *listElements, int *list1, int *list2, int *l1_size, int *l2_size, int pivo)
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
if (listElements[i] < pivo)
list1[atomicAdd(l1_size, 1)] = listElements[i];
list2[atomicAdd(l2_size, 1)] = listElements[i];
SIZE = len(listElements)
listElements = np.asarray(listElements)
listElements = listElements.astype(np.int32)
lista_gpu = cuda.mem_alloc(listElements.nbytes)
cuda.memcpy_htod(lista_gpu, listElements)
list1_gpu = cuda.mem_alloc(listElements.nbytes)
list2_gpu = cuda.mem_alloc(listElements.nbytes)
l1_size = cuda.mem_alloc(4)
l2_size = cuda.mem_alloc(4)
kernel_code = kernel_code_template % {
mod = compiler.SourceModule(kernel_code)
arraysQuick = mod.get_function("separateQuick")
l1_sh = np.zeros(1, dtype = np.int32)
l2_sh = np.zeros(1, dtype = np.int32)
cuda.memcpy_htod(l1_size, l1_sh)
cuda.memcpy_htod(l2_size, l2_sh)
arraysQuick(lista_gpu, list1_gpu, list2_gpu, l1_size, l2_size, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
cuda.memcpy_dtoh(l1_sh, l1_size)
cuda.memcpy_dtoh(l2_sh, l2_size)
list1 = np.zeros(l1_sh, dtype=np.int32)
list2 = np.zeros(l2_sh, dtype=np.int32)
cuda.memcpy_dtoh(list1, list1_gpu)
cuda.memcpy_dtoh(list2, list2_gpu)
list1 = list1.tolist()
list2 = list2.tolist()
return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
print(quickSort_paralleloGlobal([1, 5, 4, 2, 0]))
$ python
[0, 1, 2, 4, 5]
I'm not suggesting that the above examples are perfect or defect free. Also, I have not identified each and every change I made to your code. I suggest you study the differences between these examples and your posted code.
I should also mention that this isn't a fast or efficient way to sort numbers on the GPU. I assume this is for a learning exercise. If you're interested in fast parallel sorting, you are encouraged to use a library implementation. If you want to do this from Python, one possible implementation is provided by CuPy.

How to detect the fuzzy edge of a raindrop?

I want to extract the edge of the raindrop.
This is raindrop's photo.
I divide the picture into 8*8 blocks and extract the edges using Sobel and Canny. Now I can get a rough edge.
This is the edge I got.
I can't get the fuzzy edge of the raindrop.
This fuzzy edge I can't get
// Return an edge-strength image for src: the x and y derivatives are computed into
// 16-bit signed images, converted to absolute 8-bit values and blended 50/50.
Mat SobelProcess(Mat src)
Mat Output;
// SobelImage is only referenced by the commented-out lines near the end
Mat grad_x, grad_y, abs_grad_x, abs_grad_y, SobelImage;
// CV_SCHARR is passed in the kernel-size position; scale = 1, delta = 1
Sobel(src, grad_x, CV_16S, 1, 0, CV_SCHARR, 1, 1, BORDER_DEFAULT);
Sobel(src, grad_y, CV_16S, 0, 1, CV_SCHARR, 1, 1, BORDER_DEFAULT);
convertScaleAbs(grad_x, abs_grad_x);
convertScaleAbs(grad_y, abs_grad_y);
// equal-weight sum of the two absolute gradients
addWeighted(abs_grad_x, 0.5, abs_grad_y, 0.5, 0, Output);
//subtract(grad_x, grad_y, SobelImage);
//convertScaleAbs(SobelImage, Output);
return Output;
// Pipeline: load rain.bmp as grey-scale, blur it, split it into M * N blocks, then
// for each block compute an Otsu threshold and run Sobel -> blur -> Canny with
// thresholds derived from it, and finally merge and display the blocks.
// M, N, ImageSegment, Otsu and ImageMerge are defined outside this excerpt.
int main()
Mat Src;
// NOTE(review): this statement has no terminating semicolon in the listing
Src = imread("rain.bmp",0)
imshow("src", Src);
// Gauss is declared but unused in the visible lines; the blur below is applied to Src in place
Mat Gauss;
GaussianBlur(Src, Src, Size(5, 5), 0.5);
imshow("Gauss", Src);
//M * N = 8 * 8
int OtsuThresh[M * N];
vector<Mat>tempThresh = ImageSegment(Src);
for (int i = 0; i < M * N; i++)
OtsuThresh[i] = Otsu(tempThresh[i]); //get Otsu Threshold
// NOTE(review): no declaration of `temp` is visible in this excerpt; presumably a vector<Mat>
temp = ImageSegment(Src);//ImageSegment() is a function to divide the picture into 8*8 blocks
for (int i = 0; i < M * N; i++)
temp[i] = SobelProcess(temp[i]);
GaussianBlur(temp[i], temp[i], Size(3, 3), 0.5);
// Canny hysteresis thresholds: low = Otsu / 3, high = Otsu
Canny(temp[i], temp[i], OtsuThresh[i] / 3, OtsuThresh[i]);
Mat Tem;
Tem = ImageMerge(temp);//ImageMerge() is a function to merge the blocks
imshow("Tem", Tem);
Then I tried watershed, but I can't get an ideal result with it.

PyOpenCl Kernel in Loop Crashes GPU

I am writing a brute-force neighbor look-up routine using PyOpenCL. Later on it will fit into my smoothed particle hydrodynamics code. Brute force certainly is not efficient, but it's simple and it's a starting point. I have been testing my look-up kernel and I find that when I run it in a loop it crashes. I don't get any error messages in Python, but the screen flickers off, then comes back on with a note that the graphics drivers failed but have been recovered. The odd thing is that if the number of particles that are searched over is small (~1000 or less), it does just fine. If I increase the count (~10k), it crashes. I tried adding barriers and wait commands, and a finish command, to no avail. I checked to see if I have an array overrun, but I cannot find one. I am including the relevant code and apologize upfront for the size of it, but I wanted to include everything so people can look at it. I am hoping someone can run this and recreate the error, or tell me where I am going wrong. My setup is Python 3.5 using Spyder with PyOpenCL 2016.1 installed.
First The main file
import numpy as np
import gpuParameters as gpuParameters
import pyopencl as cl
import pyopencl.array as ar
from BruteForceSearch import BruteForceSearch
import time as time
dim = 3 # dimensions of the problem
n = 15000 # number of particles
nbs = 50 # number of neighbors
x = np.random.rand(n) # randomly choose some x
y = np.random.rand(n) # randomly choose some y
z = np.random.rand(n) # randomly choose some z
h = np.ones(n) # smoothing parameter for the b spline
# setup gpu context
gpu = gpuParameters.gpuParameters()
# neighbor list
nlist = -1*np.ones(n*nbs, dtype=np.int32)
# data to gpu
xg = ar.to_device(gpu.queue, x) # x pos on gpu
yg = ar.to_device(gpu.queue, y) # y pos on gpu
zg = ar.to_device(gpu.queue, z) # z pos on gpu
hg = ar.to_device(gpu.queue, h) # h pos on gpu
num_p = ar.to_device(gpu.queue, np.array(n, dtype=np.int32)) # num of particles
nb = ar.to_device(gpu.queue, np.array(nbs, dtype=np.int32)) # num of neighbors
nlst = ar.to_device(gpu.queue, nlist) # neighbor list on gpu
dg = ar.to_device(gpu.queue, np.array(dim, dtype=np.int32)) # dimension on gpu
out = ar.zeros(gpu.queue, n, np.float64) # debug parameter
# call the Brute force neighbor search and h parameter set class
srch = BruteForceSearch(gpu) # instatiate
s = time.time() # timer start
for ii in range(100):
# set a marker I really didn't think this would be necessary
mark = cl.enqueue_marker(gpu.queue) # set a marker for kernel complete, x.shape, None,,,,,,,,, # run the kernel
cl.Event.wait(mark) # wait for complete run of kernel before next iteration
# gpu.queue.finish()
print('iteration: ', ii) # print iteration time to show me its running
e = time.time() # end the timer
cs = time.time() # clock the time it takes to return the array
nlist = nlst.get()
ce = time.time()
# output the times
print('time to calculate: ', e-s)
print('time to copy back: ', ce - cs)
GPU Context Class
import pyopencl as cl
# Holds the OpenCL objects shared by the rest of the code: a context (self.cntxt),
# a command queue on that context (self.queue) and the mem_flags namespace (self.mF).
class gpuParameters:
# NOTE(review): dType=[] is a mutable default argument; it is only compared here,
# never mutated, but None is the safer sentinel.
def __init__(self, dType = []):
# will set up the proper context based on the given device preference;
# if no device preference is given, defaults to the first platform's GPU devices
if dType == []:
pltfrms = cl.get_platforms()[0]
devices = pltfrms.get_devices(cl.device_type.GPU)
context = cl.Context(devices) # create a device context
self.cntxt = context# keep this context alive on the instance
self.queue = cl.CommandQueue(self.cntxt) # create a command queue for this context
self.mF = cl.mem_flags
Neighbor Look-up
import numpy as np
import pyopencl as cl
import gpu_sph_assistance_functions as gsaf
class BruteForceSearch:
def __init__(self, gpu):
# instantiation of the search routine primarilly for pre compiling of
# the function
self.gpu = gpu # save the gpu context
# setup and compile the search
def bruteSearch(self):
W = gsaf.gpu_sph_kernel() = cl.Program(
W + '''__kernel void search(__global int *nP, __global int *nN,
__global int *dim,
__global double *x, __global double *y,
__global double *z, __global double *h,
__global int *nlist, __global double *out)
// indices
int gid = get_global_id(0); // current particle
int idv = 0; // unrolled array id
int count = 0; // count
int dm = *dim; // problem dimension
int itr = 0; // start iteration
int mxitr = 25; // max number of iterations
// calculate variables
double dms = 1.0/(*dim); // 1 over dimension for pow
double xi = x[gid]; // current x position
double yi = y[gid]; // current y position
double zi = z[gid]; // current z position
double dx = 0; // difference in x
double dy = 0; // difference in y
double dz = 0; // difference in z
double r = 0; // radius
double hg = h[gid]; // smoothing parametre
double Wsum = 0; // sum of weights
double W = 0; // current weight
double dwdx = 0; // derivative of weight in x direction
double dwdy = 0; // derivative of weight in y direction
double dwdz = 0; // derivative of weight in z direction
double dwdr = 0; // derivative of weight in r direction
double V = 0; // Volume of particle
double hn = 0; // holding value for comparison
double err = 10; // error
double tol = 1e-7; // tolerance
double diff = 0; // difference
// first clean the array of neighbors
for (int ii = 0; ii < *nN; ii++) // length of num of neighbors
idv = *nN*gid + ii; // unrolled index
nlist[idv] = -1; // this is a trigger for excluding values
// Next calculate the h parameter
while (err > tol)
Wsum = 0; // clean summation
for (int jj = 0; jj < *nP; jj++) // loop over all particles
dx = xi - x[jj];
dy = yi - y[jj];
dz = zi - z[jj];
// spline for weights
quintic_spline(dm, hg, dx, dy, dz, &W,
&dwdx, &dwdy, &dwdz, &dwdr);
Wsum += W; // add to store
V = 1.0/Wsum; /// volume
hn = pow(V, dms); // new h parameter
diff = hn - hg; // difference
err = fabs(diff); // error
out[gid] = err; // store error for debug
hg = hn; // reset h
itr ++; // update iter
if (itr > mxitr) // break out
{ break; }
h[gid] = hg; // store h
/* // get all neighbors in vicinity of particle not
// currently assessed
for(int ii = 0; ii < *nP; ii++)
dx = xi - x[ii];
dy = yi - y[ii];
dz = zi - z[ii];
r = sqrt(dx*dx + dy*dy + dz*dz);
if (r < 3.25*hg & count < *nN)
idv = *nN*gid + count;
nlist[idv] = ii;
The Spline function for weighting
W = '''void quintic_spline(
int dim, double h, double dx, double dy, double dz, double *W,
double *dWdx, double *dWdy, double *dWdz, double *dWdrO)
double pi = 3.141592654; // pi
double m3q = 0; // prefix values
double m2q = 0; // prefix values
double m1q = 0; // prefix values
double T1 = 0; // prefix values
double T2 = 0; // prefix values
double T3 = 0; // prefix values
double D1 = 0; // prefix values
double D2 = 0; // prefix values
double D3 = 0; // prefix values
double Ch = 0; // normalizing parameter for kernel
double C = 0; // normalizing prior to h
double r = sqrt(dx*dx + dy*dy + dz*dz);
double q = r/h; // normalized radius
double dqdr = 1.0/h; // intermediate derivative
double dWdq = 0; // intermediate derivative
double dWdr = 0; // intermediate derivative
double drdx = dx/r; // intermediate derivative
double drdy = dy/r; // intermediate derivative
double drdz = dz/r; // intermediate derivative
if (dim == 1)
C = 1.0/120.0;
else if (dim == 2)
C = 7.0/(pi*478.0);
else if (dim == 3)
C = 1.0/(120.0*pi);
Ch = C/pow(h, dim);
if (r <= 0)
drdx = 0.0;
drdy = 0.0;
drdz = 0.0;
// local prefix constants
m1q = 1.0 - q;
m2q = 2.0 - q;
m3q = 3.0 - q;
// smoothing parameter constants
T1 = Ch*pow(m3q, 5);
T2 = -6.0*Ch*pow(m2q, 5);
T3 = 15.0*Ch*pow(m1q, 5);
//derivative of spline coefficients
D1 = -5.0*Ch*pow(m3q,4);
D2 = 30.0*Ch*pow(m2q,4);
D3 = -75.0*Ch*pow(m1q,4);
// W calculation
if (q < 1.0)
*W = T1 + T2 + T3;
dWdq = D1 + D2 + D3;
else if (q >= 1.0 && q < 2.0)
*W = T1 + T2;
dWdq = D1 + D2;
else if (q >= 2.0 && q < 3.0)
*W = T1;
dWdq = D1;
*W = 0.0;
dWdq = 0.0;
dWdr = dWdq*dqdr;
// assign the derivatives
*dWdx = dWdr*drdx;
*dWdy = dWdr*drdy;
*dWdz = dWdr*drdz;
*dWdrO = dWdr;
I tested the code on an Intel i7-4790K CPU with AMD Accelerated Parallel Processing. It does not crash at n=150000 (I only ran one iteration). The only odd thing I discovered while quickly looking through the code was that the kernel both reads from and writes to the array h. This should not be a problem, but I usually try to avoid it.

(computer graphics) radial image distortion

I need to create an effect, that radially distorts a bitmap, by stretching or shrinking its "layers of pixels" radially (as shown on the image):
The colored circles (their thickness) show the transform that is applied to the image.
What approach should I take? I have a bitmap (an array of pixels) and another bitmap that should hold the result of applying such a filter (as a result, there should be some kind of round water ripple on the bitmap).
Where could I read about creating such effects?
Thank you.
Try to look here
Zoom and Spin Blur
It is Java, but it could nevertheless be adapted to fit your request.
Well, the most accurate results would come from mapping the Cartesian (Euclidean) coordinates to a polar matrix. Then you would very easily be able to stretch them out. Then just translate them back to a Cartesian representation and save. I'll write some code and edit it in in a second.
Alright, I got a bit carried away, but here's my code. It will take a bitmap, convert it to and from polar coordinates, and save it. Now, radial-based distortion should be a breeze.
#define PI 3.141592654
#define C_R 1000
#define C_S 1000
#define C_M 2000
typedef struct{ int r,g,b; } color;
typedef struct{ int t; color* data; int w, h; } bitmap;
typedef struct{ int t; color* data; int r, s, w, h; } r_bitmap;
// Load a 24-bit BMP from `fname` into a newly allocated bitmap (t = 0).
// Returns 0 if the file cannot be opened or the "BM" signature check fails;
// otherwise the caller owns the returned bitmap and its data array.
bitmap* bmp_load_from_file( const char* fname ){
FILE* b = fopen( fname, "rb" );
// NOTE(review): relational comparison of a pointer with 0; `b == NULL` is the portable test
if( b <= 0 ) return 0;
// NOTE(review): num is only written if the "BM" literal matched, otherwise it is read
// uninitialised below; this early return also leaves the file open
int num;
fscanf( b, "BM%n", &num );
if( num < 2 ) return 0;
// remaining header fields after the 2-byte signature: 13 ints (planes and bpp share one via bit-fields)
struct{ int size, reserved, offset;
int hsize, wid, hig, planes:16, bpp:16, comp, bmpsize, hres, vres, colors, important; } head;
// reads 4 items of 13 bytes each = 52 bytes straight into the struct
fread( &head, 13, 4, b );
bitmap* bmp = malloc( sizeof( bitmap ) );
bmp->data = malloc( head.wid * head.hig * sizeof( color ) );
bmp->w = head.wid;
bmp->h = head.hig;
// the first row in the file is stored at the last row index of data
for( int y = head.hig - 1; y >= 0; --y ){
int x;
for( x = 0; x < head.wid; ++x ){
// NOTE(review): bytes are assigned to r, g, b in file order; 24-bit BMP stores B, G, R.
// bmp_save writes them back in the same order, so a round-trip is unaffected.
color t;
t.r = fgetc( b );
t.g = fgetc( b );
t.b = fgetc( b );
bmp->data[x+y*bmp->w] = t;
// skip row padding
// NOTE(review): no update of x is visible in this loop; confirm it terminates
while( x%4 != 0 ){
fgetc( b );
bmp->t = 0;
fclose( b );
return bmp;
// Write `bmp` to `fname` as a 24-bit BMP: "BM" signature, a 52-byte header dumped
// from a local struct, then the pixel rows.
void bmp_save( const char* fname, bitmap* bmp ){
FILE* b = fopen( fname, "wb" );
// NOTE(review): returns a value from a void function, and compares a pointer with `<=`;
// `if( b == NULL ) return;` is the conforming form
if( b <= 0 ) return 0;
struct{ int size, reserved, offset;
int hsize, wid, hig, planes:16, bpp:16, comp, bmpsize, hres, vres, colors, important; } head;
fprintf( b, "BM" );
// NOTE(review): evaluates left to right as ((3 * (w + 4)) / 4) * 4 * h + 54; confirm
// this matches the intended padded row size
head.size = 3 * (bmp->w+4)/4*4 * bmp->h + 54;
// head.reserved is never assigned before the struct is written out
head.offset = 54;
head.hsize = 40;
head.wid = bmp->w;
head.hig = bmp->h;
head.planes = 1;
head.bpp = 24;
head.comp = 0;
head.bmpsize = 3 * (bmp->w+4)/4*4 * bmp->h;
head.hres = 72;
head.vres = 72;
head.colors = 0;
head.important = 0;
// writes 4 items of 13 bytes each = 52 bytes of the struct
fwrite( &head, 13, 4, b );
// the last row of data is written first
for( int y = bmp->h - 1; y >= 0; --y ){
int x;
for( x = 0; x < bmp->w; ++x ){
fputc( bmp->data[x + y * bmp->w].r, b );
fputc( bmp->data[x + y * bmp->w].g, b );
fputc( bmp->data[x + y * bmp->w].b, b );
// row padding with zero bytes
// NOTE(review): no update of x is visible in this loop; confirm it terminates
while( x % 4 != 0 ){
fputc(0, b);
fclose( b );
// Blend two colors channel by channel with integer arithmetic and return the result.
// offset = 0 returns a unchanged, offset = 255 returns b.
color color_mix( color a, color b, int offset ){ /*offset is a value between 0 and 255 to determine the weight. the lower it is the more color a gets*/
//if( offset > 255 || offset < 0)
//printf("%i\t", offset);
// a is passed by value, so the caller's color is not modified
a.r += ( b.r - a.r ) * offset / 255;
a.g += ( b.g - a.g ) * offset / 255;
a.b += ( b.b - a.b ) * offset / 255;
return a;
// Resample bitmap b onto a polar grid centred on the image centre and return it
// as a newly allocated r_bitmap (t = 1). data holds radius * step colors, indexed
// as [angle + ring * step]; w and h keep the source dimensions.
r_bitmap* bmp_to_r( bitmap* b ){
r_bitmap* r = malloc( sizeof( r_bitmap ) );
r->t = 1;
// number of rings: half the image diagonal scaled by C_R / C_M, plus a margin of 2
int radius = sqrt( b->w * b->w + b->h * b->h ) / 2 * C_R / C_M + 2;
// number of angular samples per ring
int step = C_S * ( b->w + b->h ) / C_M;
r->data = malloc( radius * step * sizeof( color ) );
r->r = radius;
r->s = step;
r->w = b->w;
r->h = b->h;
color black = {0, 0, 0};
// i = ring index, j = angular index; both are doubles so the trigonometry below is done in floating point
for( double i = 0; i < radius; ++ i ){
for( double j = 0; j < step; ++j ){
// source position for ring i at angle 2 * PI * j / step
double x = i * C_M * cos( 2 * PI * j / step ) / C_R + b->w / 2;
double y = i * C_M * sin( 2 * PI * j / step ) / C_R + b->h / 2;
int ix = x;
int iy = y;
// samples that fall outside the source image are black
// NOTE(review): no else/continue is visible after this assignment, so the lookup
// below appears to run for out-of-range ix/iy as well; confirm against the full source
if( x < 0 || x >= b->w || y < 0 || y >= b->h )
r->data[(int)(j + i * step)] = black;
color tmp = b->data[ix + iy * b->w];
// blend towards the pixel one row down, then one column right, weighted by the fractional parts of y and x
if( iy < b->h - 1 ){
int off = 255 * (y - iy);
tmp = color_mix( tmp, b->data[ix + (iy+1) * b->w], off );
if( ix < b->w - 1 ){
int off = 255 * ( x - ix );
tmp = color_mix( tmp, b->data[ix +1 + iy * b->w], off );
r->data[(int)(j + i * step)] = tmp;
return r;
// Convert a polar r_bitmap back into a newly allocated w x h bitmap (t = 0).
// Each output pixel is looked up in r->data by truncated ring and angle index.
// The interpolation near the end is commented out in the listing.
bitmap* bmp_from_r( r_bitmap* r ){
bitmap* b = malloc( sizeof( bitmap ) );
b->t = 0;
b->data = malloc( r->w * r->h * sizeof( color ) );
b->w = r->w;
b->h = r->h;
for( int y = 0; y < b->h; ++y ){
for( int x = 0; x < b->w; ++x ){
// offset of the pixel from the image centre
int tx = x - b->w/2;
int ty = y - b->h/2;
// ring index: distance from the centre scaled by C_R / C_M
double rad = sqrt( tx*tx+ty*ty ) * C_R / C_M;
// angle in [0, 2 * PI), then scaled to an angular sample index in [0, r->s)
double s = atan2( ty, tx );
if( s < 0 ) s += 2 * PI;
s *= r->s / ( 2 * PI );
int is = s;
int irad = rad;
color tmp = r->data[(int)(is + irad * r->s)];
/*if( x > 0 && x < r->w - 1 && y > 0 && y < r->h - 1 ){
tmp = color_mix(tmp, r->data[((int)(is+1)%r->s + irad * r->s)], abs(255* rad - floor(rad)));
tmp = color_mix(tmp, r->data[(is + (irad + 1) * r->s)], abs(255* s - floor(s)));
b->data[x+y*b->w] = tmp;
return b;
// Demo: load foo.bmp, convert it to the polar representation and back, and save
// the round-tripped image as lol.bmp.
int main( ) {
// NOTE(review): bmp_load_from_file can return 0, and the result is used unchecked below
bitmap* b = bmp_load_from_file( "foo.bmp" );
r_bitmap* r = bmp_to_r( b );
bitmap* c = bmp_from_r( r );
// none of the allocated bitmaps are freed in the visible lines
bmp_save( "lol.bmp", c );
