I am trying to build a fast sparse solver with Eigen and OpenMP for Python, using the pybind11 package for the interface between the solver and Python. The solver basically works fine, but unfortunately it only runs on one core, and I cannot figure out how to use all cores of my CPU. The OpenMP test function, on the other hand, does use the entire CPU.
Here is the C++ code:
#include <iostream>
#include <cmath>
#include <omp.h>
#include <unistd.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
void openmp_test()
{
int N;
N = 8;
// omp_set_num_threads(4);
#pragma omp parallel for
for (int i=0; i<N; i=i+1)
{
sleep(10);
}
}
void eigen_test(int N)
{
Eigen::SparseMatrix<double> A(N, N);
Eigen::SparseMatrix<double> b(N, 1);
Eigen::SparseMatrix<double> x(N, 1);
Eigen::BiCGSTAB<Eigen::SparseMatrix<double>> solver;
A.reserve(5*N);
b.reserve(N);
x.reserve(N);
for(int i=0; i<N; i++)
{
b.insert(i, 0) = 20.0;
for(int j=(i-2); j<=(i+2); j++)
{
if(j == i)
{
A.insert(j, i) = 10.0;
}
else if((j >= 0) && (j < N))
{
A.insert(j, i) = 5.0;
}
}
}
solver.compute(A);
x = solver.solve(b);
}
PYBIND11_MODULE(mytest, m)
{
m.def("openmp_test", &openmp_test, py::call_guard<py::gil_scoped_release>());
m.def("eigen_test", &eigen_test, py::call_guard<py::gil_scoped_release>());
}
Here is the compile command:
g++ \
-O3 \
-Wall \
-shared \
-std=c++14 \
-fopenmp \
-fPIC \
-I /usr/local/lib/python3.10/dist-packages/pybind11/include \
-I /usr/include/python3.10 \
-I /workspaces/pybind11/external/eigen \
mytest.cpp \
-o mytest.so
And finally, here is the Python code:
from time import time
import mytest
N = 10 * 10**3
start = time()
mytest.openmp_test()
print("runtime: {:.3f} s".format(time() - start))
start = time()
mytest.eigen_test(N)
print("runtime: {:.3e} s".format(time() - start))
Does anyone have an idea how to fix this problem?
Thanks a lot.
See the Eigen documentation on multi-threading:
Currently, the following algorithms can make use of multi-threading:
general dense matrix - matrix products
PartialPivLU
row-major-sparse * dense vector/matrix products
ConjugateGradient with Lower|Upper as the UpLo template parameter.
BiCGSTAB with a row-major sparse matrix format.
LeastSquaresConjugateGradient
Only the row-major sparse matrix format is supported for multi-threading. Therefore, you will need to have:
Eigen::SparseMatrix<double, Eigen::RowMajor> A(N, N);
Eigen::BiCGSTAB<Eigen::SparseMatrix<double, Eigen::RowMajor>> solver;
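In addition, the file has to be compiled with OpenMP enabled (your -fopenmp flag already takes care of that). If you want to verify or limit how many threads Eigen actually uses at runtime, here is a minimal sketch (assuming only <Eigen/Core> and <iostream>; the 4 is just an example value):
#include <iostream>
#include <Eigen/Core>   // declares Eigen::setNbThreads / Eigen::nbThreads
void report_eigen_threads()
{
    // Optional: cap Eigen at 4 threads; without this call OMP_NUM_THREADS applies.
    Eigen::setNbThreads(4);
    std::cout << "Eigen will use " << Eigen::nbThreads() << " threads" << std::endl;
}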
I have noticed that inserting elements into the matrix takes longer than computing the solution. Since the former is sequential, it may be hard to observe the parallelism. You can increase the maximum number of solver iterations (e.g. solver.setMaxIterations(1e9)) to force the solve to take longer (and thus make the CPU occupation easier to observe). You can also print checkpoints to see which part of your code is executing at any given moment. For example:
void eigen_test(int N)
{
Eigen::SparseMatrix<double, Eigen::RowMajor> A(N, N);
Eigen::SparseMatrix<double> b(N, 1);
Eigen::SparseMatrix<double> x(N, 1);
Eigen::BiCGSTAB<Eigen::SparseMatrix<double, Eigen::RowMajor>> solver;
solver.setMaxIterations(1e9);
A.reserve(5*N);
b.reserve(N);
x.reserve(N);
std::cout << "checkpoint: insert elems..." << std::endl;
for(int i=0; i<N; i++)
{
b.insert(i, 0) = 20.0;
for(int j=(i-2); j<=(i+2); j++)
{
if(j == i)
{
A.insert(j, i) = 10.0;
}
else if((j >= 0) && (j < N))
{
A.insert(j, i) = 5.0;
}
}
}
std::cout << "checkpoint: find solution..." << std::endl;
solver.compute(A);
x = solver.solve(b);
std::cout << "checkpoint: done!" << std::endl;
}
I'm trying to optimize an application with OpenACC. In main, I have an iteration loop of this type:
while(t<tstop){
add(&data, nx);
}
where data is a variable of type Data, defined by this structure:
typedef struct Data_{
double *x;
}Data;
The function I'm calling in the while loop is parallelizable, but what I cannot manage is to keep the array x[] in device memory between the different calls of the function.
void add(Data *data, int n){
#pragma acc data pcopy(data[0:1])
#pragma acc data pcopy(data->x[0:n])
#pragma acc parallel loop
for(int i=0; i < n ; i++){
data->x[i] += 1.;
}
#pragma acc exit data copyout(data->x[0:n])
#pragma acc exit data copyout(data[0:1])
}
I know the program seems to make no sense; I just wrote something minimal to reproduce the problem I have in the real code.
I tried to use an unstructured data region:
#pragma acc enter data copyin(data[0:1])
#pragma acc enter data copyin(data->x[0:n])
#pragma acc data present(data[:1], data->x[:n])
#pragma acc parallel loop
for(int i=0; i < n ; i++){
data->x[i] += 1.;
}
#pragma acc exit data copyout(data->x[0:n])
#pragma acc exit data copyout(data[0:1])
but for some reason I get an error of this type:
FATAL ERROR: variable in data clause is partially present on the device: name=data
I'm not able to reproduce the partially present error from the code snippet provided, so it's unclear why this error is occurring. In general, the error occurs when the size of the variable in the present table differs from the size being used in the data clause. If you can provide a reproducing example, I can take a look and determine why it's happening here.
To answer the topic question, device variables can be accessed anywhere within the scope of the data region they are in, even across subroutines. For unstructured data regions (i.e. enter data/exit data), the scope is defined at runtime between the enter and exit calls. For structured data regions, the scope is defined by the structured block.
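For illustration, here is a minimal sketch (not the questioner's code; scale is a hypothetical helper) contrasting the two kinds of data regions:
#include <stdlib.h>
static void scale(double *x, int n) {
    // x[0:n] is already present on the device here, so no copy clauses are needed
    #pragma acc parallel loop present(x[0:n])
    for (int i = 0; i < n; i++) x[i] *= 2.0;
}
int main() {
    int n = 1000;
    double *x = (double *) malloc(sizeof(double) * n);
    for (int i = 0; i < n; i++) x[i] = 1.0;
    // Structured region: x lives on the device only for the extent of this block
    #pragma acc data copy(x[0:n])
    {
        scale(x, n);
        scale(x, n);
    }
    // Unstructured region: the device copy persists between enter and exit,
    // across any number of function calls in between
    #pragma acc enter data copyin(x[0:n])
    scale(x, n);
    scale(x, n);
    #pragma acc exit data copyout(x[0:n])
    free(x);
    return 0;
}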
Here's an example using the structure you define above (though I've included the size of x as part of the struct).
% cat test.c
#include <stdio.h>
#include <stdlib.h>
typedef struct Data_{
double *x;
int n;
}Data;
void add(Data *data){
#pragma acc parallel loop present(data)
for(int i=0; i < data->n ; i++){
data->x[i] += 1.;
}
}
int main () {
Data *data;
data = (Data*) malloc(sizeof(Data));
data->n = 64;
data->x = (double *) malloc(sizeof(double)*data->n);
for(int i=0; i < data->n ; i++){
data->x[i] = (double) i;
}
#pragma acc enter data copyin(data[0:1])
#pragma acc enter data copyin(data->x[0:data->n])
add(data);
#pragma acc exit data copyout(data->x[0:data->n])
#pragma acc exit data delete(data)
for(int i=0; i < data->n ; i++){
printf("%d:%f\n",i,data->x[i]);
}
free(data->x);
free(data);
}
% pgcc test.c -ta=tesla -Minfo=accel; a.out
add:
12, Generating present(data[:])
Generating Tesla code
13, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
28, Generating enter data copyin(data[:1])
29, Generating enter data copyin(data->x[:data->n])
31, Generating exit data copyout(data->x[:data->n])
32, Generating exit data delete(data[:1])
0:1.000000
1:2.000000
2:3.000000
3:4.000000
4:5.000000
5:6.000000
6:7.000000
7:8.000000
8:9.000000
9:10.000000
10:11.000000
11:12.000000
12:13.000000
13:14.000000
14:15.000000
15:16.000000
16:17.000000
17:18.000000
18:19.000000
19:20.000000
20:21.000000
21:22.000000
22:23.000000
23:24.000000
24:25.000000
25:26.000000
26:27.000000
27:28.000000
28:29.000000
29:30.000000
30:31.000000
31:32.000000
32:33.000000
33:34.000000
34:35.000000
35:36.000000
36:37.000000
37:38.000000
38:39.000000
39:40.000000
40:41.000000
41:42.000000
42:43.000000
43:44.000000
44:45.000000
45:46.000000
46:47.000000
47:48.000000
48:49.000000
49:50.000000
50:51.000000
51:52.000000
52:53.000000
53:54.000000
54:55.000000
55:56.000000
56:57.000000
57:58.000000
58:59.000000
59:60.000000
60:61.000000
61:62.000000
62:63.000000
63:64.000000
Also, here's a second example, but now with "data" being an array where the size of each "x" can be different.
% cat test2.c
#include <stdio.h>
#include <stdlib.h>
#define M 16
typedef struct Data_{
double *x;
int n;
}Data;
void add(Data *data){
#pragma acc parallel loop present(data)
for(int i=0; i < data->n ; i++){
data->x[i] += 1.;
}
}
int main () {
Data *data;
data = (Data*) malloc(sizeof(Data)*M);
#pragma acc enter data create(data[0:M])
for (int i =0; i < M; ++i) {
data[i].n = i+1;
data[i].x = (double *) malloc(sizeof(double)*data[i].n);
for(int j=0; j < data[i].n ; j++){
data[i].x[j] = (double)((i*data[i].n) + j);
}
#pragma acc update device(data[i].n)
#pragma acc enter data copyin(data[i].x[0:data[i].n])
}
for (int i =0; i < M; ++i) {
add(&data[i]);
}
for (int i =0; i < M; ++i) {
#pragma acc update self(data[i].x[:data[i].n])
for(int j=0; j < data[i].n ; j++){
printf("%d:%d:%f\n",i,j,data[i].x[j]);
}}
for (int i =0; i < M; ++i) {
#pragma acc exit data delete(data[i].x)
free(data[i].x);
}
#pragma acc exit data delete(data)
free(data);
}
% pgcc test2.c -ta=tesla -Minfo=accel; a.out
add:
11, Generating present(data[:1])
Generating Tesla code
14, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
22, Generating enter data create(data[:16])
32, Generating update device(data->n)
Generating enter data copyin(data->x[:data->n])
38, Generating update self(data->x[:data->n])
46, Generating exit data delete(data->x[:1])
49, Generating exit data delete(data[:1])
0:0:1.000000
1:0:3.000000
1:1:4.000000
2:0:7.000000
2:1:8.000000
2:2:9.000000
3:0:13.000000
3:1:14.000000
3:2:15.000000
3:3:16.000000
4:0:21.000000
4:1:22.000000
4:2:23.000000
4:3:24.000000
4:4:25.000000
5:0:31.000000
5:1:32.000000
5:2:33.000000
5:3:34.000000
5:4:35.000000
5:5:36.000000
6:0:43.000000
6:1:44.000000
6:2:45.000000
6:3:46.000000
6:4:47.000000
6:5:48.000000
6:6:49.000000
7:0:57.000000
7:1:58.000000
7:2:59.000000
7:3:60.000000
7:4:61.000000
7:5:62.000000
7:6:63.000000
7:7:64.000000
8:0:73.000000
8:1:74.000000
8:2:75.000000
8:3:76.000000
8:4:77.000000
8:5:78.000000
8:6:79.000000
8:7:80.000000
8:8:81.000000
9:0:91.000000
9:1:92.000000
9:2:93.000000
9:3:94.000000
9:4:95.000000
9:5:96.000000
9:6:97.000000
9:7:98.000000
9:8:99.000000
9:9:100.000000
10:0:111.000000
10:1:112.000000
10:2:113.000000
10:3:114.000000
10:4:115.000000
10:5:116.000000
10:6:117.000000
10:7:118.000000
10:8:119.000000
10:9:120.000000
10:10:121.000000
11:0:133.000000
11:1:134.000000
11:2:135.000000
11:3:136.000000
11:4:137.000000
11:5:138.000000
11:6:139.000000
11:7:140.000000
11:8:141.000000
11:9:142.000000
11:10:143.000000
11:11:144.000000
12:0:157.000000
12:1:158.000000
12:2:159.000000
12:3:160.000000
12:4:161.000000
12:5:162.000000
12:6:163.000000
12:7:164.000000
12:8:165.000000
12:9:166.000000
12:10:167.000000
12:11:168.000000
12:12:169.000000
13:0:183.000000
13:1:184.000000
13:2:185.000000
13:3:186.000000
13:4:187.000000
13:5:188.000000
13:6:189.000000
13:7:190.000000
13:8:191.000000
13:9:192.000000
13:10:193.000000
13:11:194.000000
13:12:195.000000
13:13:196.000000
14:0:211.000000
14:1:212.000000
14:2:213.000000
14:3:214.000000
14:4:215.000000
14:5:216.000000
14:6:217.000000
14:7:218.000000
14:8:219.000000
14:9:220.000000
14:10:221.000000
14:11:222.000000
14:12:223.000000
14:13:224.000000
14:14:225.000000
15:0:241.000000
15:1:242.000000
15:2:243.000000
15:3:244.000000
15:4:245.000000
15:5:246.000000
15:6:247.000000
15:7:248.000000
15:8:249.000000
15:9:250.000000
15:10:251.000000
15:11:252.000000
15:12:253.000000
15:13:254.000000
15:14:255.000000
15:15:256.000000
Note: be careful about copying structs with dynamic data members. Copying the struct itself, as you do above with "#pragma acc exit data copyout(data[0:1])", will overwrite the host address of "x" with the device address. Instead, copy back only "data->x" and delete "data", as sketched below.
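A minimal sketch of that cleanup, assuming the first example's Data struct with member array x of length data->n:
void cleanup(Data *data) {
    // Copy back only the array member; copying the struct itself with
    // copyout(data[0:1]) would clobber the host pointer data->x with the
    // device address.
    #pragma acc exit data copyout(data->x[0:data->n])
    #pragma acc exit data delete(data)
}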
This is a follow-up question to dqrng with Rcpp for drawing from a normal and a binomial distribution. I tried to implement the answer, but instead of drawing from a single distribution I'm drawing from three. This is the code that I wrote:
// [[Rcpp::depends(dqrng, BH, RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <boost/random/binomial_distribution.hpp>
#include <xoshiro.h>
#include <dqrng_distribution.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
arma::mat parallel_random_matrix(int n, int m, int ncores, double p=0.5) {
dqrng::xoshiro256plus rng(42);
arma::mat out(n*m,3);
// ok to use rng here
#pragma omp parallel num_threads(ncores)
{
dqrng::xoshiro256plus lrng(rng); // make thread local copy of rng
lrng.jump(omp_get_thread_num() + 1); // advance rng by 1 ... ncores jumps
int iter = 0;
#pragma omp for
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
iter = i * n + j;
// p can be a function of i and j
boost::random::binomial_distribution<int> dist_binomial(1,p);
auto gen_bernoulli = std::bind(dist_binomial, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal1(2.0,1.0);
auto gen_normal1 = std::bind(dist_normal1, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal2(4.0,3.0);
auto gen_normal2 = std::bind(dist_normal2, std::ref(lrng));
out(iter,0) = gen_bernoulli();
out(iter,1) = gen_normal1();
out(iter,2) = gen_normal2();
}
}
}
// ok to use rng here
return out;
}
/*** R
parallel_random_matrix(5, 5, 4, 0.75)
*/
When I try to run it, RStudio crashes. However, when I change the code as follows, it does work:
// [[Rcpp::depends(dqrng, BH, RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <boost/random/binomial_distribution.hpp>
#include <xoshiro.h>
#include <dqrng_distribution.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
arma::mat parallel_random_matrix(int n, int m, int ncores, double p=0.5) {
dqrng::xoshiro256plus rng(42);
arma::mat out(n*m,3);
// ok to use rng here
#pragma omp parallel num_threads(ncores)
{
dqrng::xoshiro256plus lrng(rng); // make thread local copy of rng
lrng.jump(omp_get_thread_num() + 1); // advance rng by 1 ... ncores jumps
int iter = 0;
#pragma omp for
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
iter = i * n + j;
// p can be a function of i and j
boost::random::binomial_distribution<int> dist_binomial(1,p);
auto gen_bernoulli = std::bind(dist_binomial, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal1(2.0,1.0);
auto gen_normal1 = std::bind(dist_normal1, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal2(4.0,3.0);
auto gen_normal2 = std::bind(dist_normal2, std::ref(lrng));
out(iter,0) = gen_bernoulli();
out(iter,1) = 2.0;//gen_normal1();
out(iter,2) = 3.0;//gen_normal2();
}
}
}
// ok to use rng here
return out;
}
/*** R
parallel_random_matrix(5, 5, 4, 0.75)
*/
What am I doing wrong?
Here lies the problem:
boost::random::normal_distribution<int> dist_normal1(2.0,1.0);
^^^
This distribution is meant for real types, not integral types; cf. https://www.boost.org/doc/libs/1_69_0/doc/html/boost/random/normal_distribution.html. Correct would be:
boost::random::normal_distribution<double> dist_normal1(2.0,1.0);
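With that single change (applied to both normal distributions) the original code runs. Here is a minimal sketch of the corrected inner-loop body, assuming the same lrng, out, iter and p as in the question (the std::bind wrappers are optional; the distributions can be called with the engine directly):
boost::random::binomial_distribution<int> dist_binomial(1, p);       // 0/1 draws stay integer
boost::random::normal_distribution<double> dist_normal1(2.0, 1.0);   // real-valued, so <double>
boost::random::normal_distribution<double> dist_normal2(4.0, 3.0);
out(iter, 0) = dist_binomial(lrng);
out(iter, 1) = dist_normal1(lrng);
out(iter, 2) = dist_normal2(lrng);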
As an OpenMP & Rcpp performance test, I wanted to check how fast I could calculate the Mandelbrot set in R using the most straightforward and simple Rcpp + OpenMP implementation. Here is what I currently have:
#include <Rcpp.h>
#include <omp.h>
// [[Rcpp::plugins(openmp)]]
using namespace Rcpp;
// [[Rcpp::export]]
Rcpp::NumericMatrix mandelRcpp(const double x_min, const double x_max, const double y_min, const double y_max,
const int res_x, const int res_y, const int nb_iter) {
Rcpp::NumericMatrix ret(res_x, res_y);
double x_step = (x_max - x_min) / res_x;
double y_step = (y_max - y_min) / res_y;
int r,c;
#pragma omp parallel for default(shared) private(c) schedule(dynamic,1)
for (r = 0; r < res_y; r++) {
for (c = 0; c < res_x; c++) {
double zx = 0.0, zy = 0.0, new_zx;
double cx = x_min + c*x_step, cy = y_min + r*y_step;
int n = 0;
for (n=0; (zx*zx + zy*zy < 4.0 ) && ( n < nb_iter ); n++ ) {
new_zx = zx*zx - zy*zy + cx;
zy = 2.0*zx*zy + cy;
zx = new_zx;
}
ret(c,r) = n;
}
}
return ret;
}
And then in R:
library(Rcpp)
sourceCpp("mandelRcpp.cpp")
xlims=c(-0.74877,-0.74872);
ylims=c(0.065053,0.065103);
x_res=y_res=1080L; nb_iter=10000L;
system.time(m <- mandelRcpp(xlims[[1]], xlims[[2]], ylims[[1]], ylims[[2]], x_res, y_res, nb_iter))
# 0.92s
rainbow=c(rgb(0.47,0.11,0.53),rgb(0.27,0.18,0.73),rgb(0.25,0.39,0.81),rgb(0.30,0.57,0.75),rgb(0.39,0.67,0.60),rgb(0.51,0.73,0.44),rgb(0.67,0.74,0.32),rgb(0.81,0.71,0.26),rgb(0.89,0.60,0.22),rgb(0.89,0.39,0.18),rgb(0.86,0.13,0.13))
cols=c(colorRampPalette(rainbow)(100),rev(colorRampPalette(rainbow)(100)),"black") # palette
par(mar=c(0, 0, 0, 0))
system.time(image(m^(1/7), col=cols, asp=diff(ylims)/diff(xlims), axes=F, useRaster=T))
# 0.5s
I was unsure, though, whether there are any other obvious speed improvements I could take advantage of aside from OpenMP multithreading, e.g. via SIMD vectorization? (Using SIMD options in the OpenMP #pragma didn't seem to do anything.)
PS: at first my code was crashing, but I later found this was solved by replacing ret[r,c] = n; with ret(r,c) = n;
Using Armadillo classes as suggested in the answer below makes things very slightly faster, though the timings are almost the same. I also flipped x and y around so that the set comes out in the right orientation when plotted with image(). Using 8 threads, the speed is ca. 350 times faster than the vectorized plain R Mandelbrot version here and also about 7.3 times faster than the (non-multithreaded) Python/Numba version here (similar to PyCUDA or PyOpenCL speeds), so I'm quite happy with that... Rasterizing/display now seems to be the bottleneck in R...
Do not use OpenMP with Rcpp's *Vector or *Matrix objects as they mask SEXP functions / memory allocations that are single-threaded. OpenMP is a multi-threaded approach.
This is why the code is crashing.
One way to get around this limitation is to use a non-R data structure to store the results. Any of the following will be sufficient: arma::mat, Eigen::MatrixXd, or std::vector<T>... As I favor Armadillo, I will change the ret matrix from Rcpp::NumericMatrix to arma::mat. Thus, the following will execute your code in parallel:
#include <RcppArmadillo.h> // Note the changed include and new attribute
// [[Rcpp::depends(RcppArmadillo)]]
// Avoid including header if openmp not on system
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::plugins(openmp)]]
// Note the changed return type
// [[Rcpp::export]]
arma::mat mandelRcpp(const double x_min, const double x_max,
const double y_min, const double y_max,
const int res_x, const int res_y, const int nb_iter) {
arma::mat ret(res_x, res_y); // note change
double x_step = (x_max - x_min) / res_x;
double y_step = (y_max - y_min) / res_y;
unsigned r,c;
#pragma omp parallel for shared(ret) private(c)
for (r = 0; r < res_y; r++) {
for (c = 0; c < res_x; c++) {
double zx = 0.0, zy = 0.0, new_zx;
double cx = x_min + c*x_step, cy = y_min + r*y_step;
unsigned n = 0;
for (; (zx*zx + zy*zy < 4.0 ) && ( n < nb_iter ); n++ ) {
new_zx = zx*zx - zy*zy + cx;
zy = 2.0*zx*zy + cy;
zx = new_zx;
}
if(n == nb_iter) {
n = 0;
}
ret(r, c) = n;
}
}
return ret;
}
With the test code (note: y and x were not defined, so I assumed y = ylims and x = xlims) we have:
xlims = ylims = c(-2.0, 2.0)
x_res = y_res = 400L
nb_iter = 256L
system.time(m <-
mandelRcpp(xlims[[1]], xlims[[2]],
ylims[[1]], ylims[[2]],
x_res, y_res, nb_iter))
rainbow = c(
rgb(0.47, 0.11, 0.53),
rgb(0.27, 0.18, 0.73),
rgb(0.25, 0.39, 0.81),
rgb(0.30, 0.57, 0.75),
rgb(0.39, 0.67, 0.60),
rgb(0.51, 0.73, 0.44),
rgb(0.67, 0.74, 0.32),
rgb(0.81, 0.71, 0.26),
rgb(0.89, 0.60, 0.22),
rgb(0.89, 0.39, 0.18),
rgb(0.86, 0.13, 0.13)
)
cols = c(colorRampPalette(rainbow)(100),
rev(colorRampPalette(rainbow)(100)),
"black") # palette
par(mar = c(0, 0, 0, 0))
image(m,
col = cols,
asp = diff(range(ylims)) / diff(range(xlims)),
axes = F)
I went ahead and vectorized the OP's code using GCC's and Clang's vector extensions. Before I show how I did this, let me show the performance on the following hardware:
Skylake (SKL) at 3.1 GHz with 4 cores
Knights Landing (KNL) at 1.5 GHz with 68 cores
ARMv8 Cortex-A57 aarch64 (Nvidia Jetson TX1), 4 cores at ? GHz
All runs use nb_iter = 1000000.
                    GCC           Clang
SKL_scalar          6m5,422s
SKL_SSE41           3m18,058s
SKL_AVX2            1m37,843s     1m39,943s
SKL_scalar_omp      0m52,237s
SKL_SSE41_omp       0m29,624s     0m31,356s
SKL_AVX2_omp        0m14,156s     0m16,783s

ARM_scalar          15m28.285s
ARM_vector          9m26.384s
ARM_scalar_omp      3m54.242s
ARM_vector_omp      2m21.780s

KNL_scalar          19m34.121s
KNL_SSE41           11m30.280s
KNL_AVX2            5m0.005s      6m39.568s
KNL_AVX512          2m40.934s     6m20.061s
KNL_scalar_omp      0m9.108s
KNL_SSE41_omp       0m6.666s      0m6.992s
KNL_AVX2_omp        0m2.973s      0m3.988s
KNL_AVX512_omp      0m1.761s      0m3.335s
The theoretical speed-up of KNL vs. SKL is
(68 cores / 4 cores) * (1.5 GHz / 3.1 GHz) * (8 doubles per lane / 4 doubles per lane) = 16.45
I went into detail about GCC's and Clang's vector extension capabilities here. To vectorize the OP's code, here are three additional vector operations that we need to define.
1. Broadcasting
For a vector v and a scalar s, GCC cannot do v = s, but Clang can. However, I found a nice solution here which works for both GCC and Clang. For example:
vsi v = s - (vsi){};
2. An any() function like in OpenCL or in R.
The best I came up with is this generic function:
static bool any(vli const & x) {
for(int i=0; i<VLI_SIZE; i++) if(x[i]) return true;
return false;
}
Clang actually generates relatively efficient code for this using the ptest instruction (but not for AVX512) but GCC does not.
3. Compression
The calculations are done with 64-bit doubles, but the result is written out as 32-bit integers. So the iteration counts are accumulated in two vectors of 64-bit integers, which are then compressed into one vector of 32-bit integers. I came up with a generic solution which Clang handles well:
static vsi compress(vli const & lo, vli const & hi) {
vsi lo2 = (vsi)lo, hi2 = (vsi)hi, z;
for(int i=0; i<VLI_SIZE; i++) z[i+0*VLI_SIZE] = lo2[2*i];
for(int i=0; i<VLI_SIZE; i++) z[i+1*VLI_SIZE] = hi2[2*i];
return z;
}
The following solution works better for GCC but is no better for Clang. Since this function is not performance-critical, I just use the generic version.
static vsi compress(vli const & low, vli const & high) {
#if defined(__clang__)
return __builtin_shufflevector((vsi)low, (vsi)high, MASK);
#else
return __builtin_shuffle((vsi)low, (vsi)high, (vsi){MASK});
#endif
}
These definitions don't rely on anything x86-specific, and the code (shown below) compiles for ARM processors as well, with both GCC and Clang.
Now that these are defined, here is the code:
#include <string.h>
#include <inttypes.h>
#include <Rcpp.h>
using namespace Rcpp;
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::plugins(cpp14)]]
#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
static const int SIMD_SIZE = 64;
#elif defined ( __AVX2__ )
static const int SIMD_SIZE = 32;
#else
static const int SIMD_SIZE = 16;
#endif
static const int VSI_SIZE = SIMD_SIZE/sizeof(int32_t);
static const int VLI_SIZE = SIMD_SIZE/sizeof(int64_t);
static const int VDF_SIZE = SIMD_SIZE/sizeof(double);
#if defined(__clang__)
typedef int32_t vsi __attribute__ ((ext_vector_type(VSI_SIZE)));
typedef int64_t vli __attribute__ ((ext_vector_type(VLI_SIZE)));
typedef double vdf __attribute__ ((ext_vector_type(VDF_SIZE)));
#else
typedef int32_t vsi __attribute__ ((vector_size (SIMD_SIZE)));
typedef int64_t vli __attribute__ ((vector_size (SIMD_SIZE)));
typedef double vdf __attribute__ ((vector_size (SIMD_SIZE)));
#endif
static bool any(vli const & x) {
for(int i=0; i<VLI_SIZE; i++) if(x[i]) return true;
return false;
}
static vsi compress(vli const & lo, vli const & hi) {
vsi lo2 = (vsi)lo, hi2 = (vsi)hi, z;
for(int i=0; i<VLI_SIZE; i++) z[i+0*VLI_SIZE] = lo2[2*i];
for(int i=0; i<VLI_SIZE; i++) z[i+1*VLI_SIZE] = hi2[2*i];
return z;
}
// [[Rcpp::export]]
IntegerVector frac(double x_min, double x_max, double y_min, double y_max, int res_x, int res_y, int nb_iter) {
IntegerVector out(res_x*res_y);
vdf x_minv = x_min - (vdf){}, y_minv = y_min - (vdf){};
vdf x_stepv = (x_max - x_min)/res_x - (vdf){}, y_stepv = (y_max - y_min)/res_y - (vdf){};
double a[VDF_SIZE] __attribute__ ((aligned(SIMD_SIZE)));
for(int i=0; i<VDF_SIZE; i++) a[i] = 1.0*i;
vdf vi0 = *(vdf*)a;
#pragma omp parallel for schedule(dynamic) collapse(2)
for (int r = 0; r < res_y; r++) {
for (int c = 0; c < res_x/(VSI_SIZE); c++) {
vli nv[2] = {0 - (vli){}, 0 - (vli){}};
for(int j=0; j<2; j++) {
vdf c2 = 1.0*VDF_SIZE*(2*c+j) + vi0;
vdf zx = 0.0 - (vdf){}, zy = 0.0 - (vdf){}, new_zx;
vdf cx = x_minv + c2*x_stepv, cy = y_minv + r*y_stepv;
vli t = -1 - (vli){};
for (int n = 0; any(t = zx*zx + zy*zy < 4.0) && n < nb_iter; n++, nv[j] -= t) {
new_zx = zx*zx - zy*zy + cx;
zy = 2.0*zx*zy + cy;
zx = new_zx;
}
}
vsi sp = compress(nv[0], nv[1]);
memcpy(&out[r*res_x + VSI_SIZE*c], (int*)&sp, SIMD_SIZE);
}
}
return out;
}
The R code is almost the same as the OP's code:
library(Rcpp)
sourceCpp("frac.cpp", verbose=TRUE, rebuild=TRUE)
xlims=c(-0.74877,-0.74872);
ylims=c(0.065053,0.065103);
x_res=y_res=1080L; nb_iter=100000L;
t = system.time(m <- frac(xlims[[1]], xlims[[2]], ylims[[1]], ylims[[2]], x_res, y_res, nb_iter))
print(t)
m2 = matrix(m, ncol = x_res)
rainbow = c(
rgb(0.47, 0.11, 0.53),
rgb(0.27, 0.18, 0.73),
rgb(0.25, 0.39, 0.81),
rgb(0.30, 0.57, 0.75),
rgb(0.39, 0.67, 0.60),
rgb(0.51, 0.73, 0.44),
rgb(0.67, 0.74, 0.32),
rgb(0.81, 0.71, 0.26),
rgb(0.89, 0.60, 0.22),
rgb(0.89, 0.39, 0.18),
rgb(0.86, 0.13, 0.13)
)
cols = c(colorRampPalette(rainbow)(100),
rev(colorRampPalette(rainbow)(100)),"black") # palette
par(mar = c(0, 0, 0, 0))
image(m2^(1/7), col=cols, asp=diff(ylims)/diff(xlims), axes=F, useRaster=T)
To compile for GCC or Clang, change the file ~/.R/Makevars to:
CXXFLAGS= -Wall -std=c++14 -O3 -march=native -ffp-contract=fast -fopenmp
#uncomment the following two lines for clang
#CXX=clang-5.0
#LDFLAGS= -lomp
If you are having trouble getting OpenMP to work with Clang, see this.
The code produces more or less the same image.