CUDA: string comparison

I wrote a simple program that compares two strings. The code is shown below.
It is very simple: compare string a and string b element by element; if the corresponding elements are
the same, assign 5 to the new matrix s, and if they are different,
assign -3 to s. There is no compilation error, but the result is not what
I expected. Please give me some useful suggestions. Thank you!
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include "book.h"

#define M 6
#define BLOCK_SIZE 30 // maximum 1024 threads per block
#define GRID_SIZE 30  // 900 blocks per grid
#define P (900 * 900)

void Init();

char *gpu_a;
char *gpu_b;
float *gpu_s;
float *cpu_s;
char cpu_a[6] = {'A', 'T', 'G', 'C', 'G', 'T'};
char cpu_b[6] = {'G', 'T', 'G', 'A', 'T', 'G'};

void cpu_Allocate1dArray()
{
    //cpu_a = (char*) malloc( M * sizeof(char) );
    //cpu_b = (char*) malloc( M * sizeof(char) );
    cpu_s = (float*) malloc( M * sizeof(float) );
}

void gpu_Allocate1dArray()
{
    cudaMalloc( (void**)&gpu_a, M * sizeof(char) );
    cudaMalloc( (void**)&gpu_b, M * sizeof(char) );
    cudaMalloc( (void**)&gpu_s, M * sizeof(float) );
}

__global__ void mykernel( char *gpu_a, char *gpu_b, float *gpu_s)
{
    int i, j, tid;
    i = threadIdx.x + blockIdx.x * blockDim.x;
    j = threadIdx.y + blockIdx.y * blockDim.y;
    tid = i + j * blockDim.x * gridDim.x;
    if ( tid < P)
    {
        if( gpu_a[i] == gpu_b[j])
        {
            gpu_s[tid] = 5;
        }
        else
            gpu_s[tid] = -3;
    }
}

int main()
{
    int q;
    cpu_Allocate1dArray();
    gpu_Allocate1dArray();
    Init();

    dim3 gridDim;
    dim3 blockDim;
    blockDim.x = blockDim.y = BLOCK_SIZE;
    gridDim.x = gridDim.y = GRID_SIZE;

    cudaMemcpy( gpu_a, cpu_a, sizeof(char) * M, cudaMemcpyHostToDevice);
    cudaMemcpy( gpu_b, cpu_b, sizeof(char) * M, cudaMemcpyHostToDevice);

    mykernel<<<gridDim, blockDim>>>(gpu_a, gpu_b, gpu_s);

    cudaMemcpy( cpu_s, gpu_s, sizeof(float) * M, cudaMemcpyDeviceToHost);

    for (q = 0; q < M; q++)
        printf("%f ", cpu_s[q]);
    printf("\n");

    //Free device memory
    free(cpu_s);
    cudaFree(gpu_s);
    cudaFree(gpu_a);
    cudaFree(gpu_b);
    return 0;
}

void Init()
{
    int i;
    for (i = 0; i < M; i++)
        cpu_s[i] = 0;
}
The result is:
[Smith#server]$ ./test88.exe
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

Your code reaches beyond the array bounds: gpu_s is only 6 * sizeof(float) bytes long, while tid can be as large as 900*900.
Setting P to 6 prints out:
-3.000000 -3.000000 5.000000 -3.000000 5.000000 -3.000000
Note - you can easily detect such problems by running your application with cuda-memcheck.
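For reference, here is a minimal sketch (not code from this thread) of an element-wise version sized for the six-character inputs, with basic error checking added. The CHECK macro and the compare kernel name are illustrative choices of mine, not part of the CUDA API:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define M 6

// Hypothetical helper macro: abort with a message if a CUDA call fails.
#define CHECK(call)                                                     \
    do {                                                                \
        cudaError_t err = (call);                                       \
        if (err != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                 \
                    cudaGetErrorString(err), __FILE__, __LINE__);       \
            exit(1);                                                    \
        }                                                               \
    } while (0)

// One thread per character; the guard keeps extra threads from writing past s.
__global__ void compare(const char *a, const char *b, float *s, int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
        s[i] = (a[i] == b[i]) ? 5.0f : -3.0f;
}

int main()
{
    char cpu_a[M] = {'A', 'T', 'G', 'C', 'G', 'T'};
    char cpu_b[M] = {'G', 'T', 'G', 'A', 'T', 'G'};
    float cpu_s[M];
    char *gpu_a, *gpu_b;
    float *gpu_s;

    CHECK(cudaMalloc((void**)&gpu_a, M * sizeof(char)));
    CHECK(cudaMalloc((void**)&gpu_b, M * sizeof(char)));
    CHECK(cudaMalloc((void**)&gpu_s, M * sizeof(float)));
    CHECK(cudaMemcpy(gpu_a, cpu_a, M * sizeof(char), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(gpu_b, cpu_b, M * sizeof(char), cudaMemcpyHostToDevice));

    compare<<<1, M>>>(gpu_a, gpu_b, gpu_s, M);  // a single block of M threads is enough here
    CHECK(cudaGetLastError());                  // catches bad launch configurations
    CHECK(cudaMemcpy(cpu_s, gpu_s, M * sizeof(float), cudaMemcpyDeviceToHost));

    for (int q = 0; q < M; q++)
        printf("%f ", cpu_s[q]);
    printf("\n");

    cudaFree(gpu_a);
    cudaFree(gpu_b);
    cudaFree(gpu_s);
    return 0;
}

Launching only as many threads as there are elements, and checking cudaGetLastError() after the launch, surfaces configuration problems immediately instead of leaving them for cuda-memcheck to find.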

Fast way to write pairs int-double to binary file on disk

What I tried:
#include <Rcpp.h>
#include <fstream>
using namespace Rcpp;
using namespace std;

// [[Rcpp::export]]
void write_indval(const char * filename,
                  const IntegerVector& i,
                  const NumericVector& x) {

  ofstream myFile(filename, ios::out | ios::binary);

  const int * pI = i.begin();
  const double * pX = x.begin();

  int K = i.size();
  for (int k = 0; k < K; k++, pI++, pX++) {
    myFile.write(reinterpret_cast<const char*>(pI), 4);
    myFile.write(reinterpret_cast<const char*>(pX), 8);
  }

  myFile.close();
}
If I try it in R:
N <- 1e8
i <- sample(1000, N, replace = TRUE)
x <- runif(N)
system.time(
write_indval("test.bin", i, x)
)
It takes 16 seconds to write this 1.1 GB of data.
The data written seems to be what I want, but I am only writing at about 72 MB/s.
I think I can write at 300 MB/s on my computer. So, does anyone know a faster alternative to this function?
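No answer is included in this excerpt, but one direction worth testing (a sketch only; the function name write_indval_buffered, the 65,536-record chunk size, and any speed benefit are my assumptions, not results from the thread) is to pack many records into a single memory buffer and issue far fewer, larger writes:

#include <Rcpp.h>
#include <fstream>
#include <vector>
#include <cstring>
using namespace Rcpp;

// [[Rcpp::export]]
void write_indval_buffered(const char* filename,
                           const IntegerVector& i,
                           const NumericVector& x) {
  std::ofstream myFile(filename, std::ios::out | std::ios::binary);

  const int rec   = 4 + 8;     // bytes per (int, double) record, as in the question
  const int chunk = 65536;     // records per flush -- an arbitrary choice
  std::vector<char> buf(static_cast<size_t>(rec) * chunk);

  int K = i.size();
  int filled = 0;
  for (int k = 0; k < K; k++) {
    char* p = buf.data() + static_cast<size_t>(rec) * filled;
    int    iv = i[k];
    double xv = x[k];
    std::memcpy(p,     &iv, 4);
    std::memcpy(p + 4, &xv, 8);
    if (++filled == chunk) {   // buffer full: one large write
      myFile.write(buf.data(), static_cast<std::streamsize>(rec) * filled);
      filled = 0;
    }
  }
  if (filled > 0)              // flush whatever is left
    myFile.write(buf.data(), static_cast<std::streamsize>(rec) * filled);
}

It is called from R exactly like the original function; whether it helps depends on how much of the 16 seconds is spent in the many small stream writes rather than in the disk itself.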

Finding the Krishnamurthy Number using C

To find a Krishnamurthy number, we first have to take the factorial of each digit and then add those factorials (for example, 1! + 4! + 5! = 145).
Below is my code, where I have written a factorial function. But the output is wrong (it reports that 145 is not a Krishnamurthy number).
#include <stdio.h>
#include <stdlib.h>

void main()
{
    int digit, factorial = 1, temp, input, sum = 0;

    printf("Enter a Number:\n");
    scanf("%d", &input);

    int Factorial(int digit){
        factorial = factorial*digit;
        return 0;
    }

    temp = input;
    while(temp > 0){
        digit = temp % 10;
        temp = temp / 10;
        sum = sum + Factorial(digit);
    }

    if(sum == input){
        printf("%d is a Krishnamurthy Number", input);
    }
    else{
        printf("%d is not a Krishnamurthy Number", input);
    }
}
Have I done anything wrong in the logic, or in the function declaration or definition? Please help.

Your factorial function is not performing correctly. The factorial of n is the product of all integers from n down to 1:
n * (n-1) * (n-2) * ... * 1
but your function does not compute that:
int Factorial(int digit){
    factorial = factorial*digit;
    return 0;
}
You need to change that function so that it returns the factorial of a number. You can either loop down to one or use a recursive approach:
int Factorial(int digit){
    int result = 1;
    for(int i = digit; i >= 1; i--){
        result *= i;
    }
    return result;
}
or
int Factorial(int digit) {
    if(digit <= 1) return 1;
    return digit * Factorial(digit - 1);
}
You can follow the thread to understand how the recursive function mentioned above works.
#include <stdio.h>

int main(int argc, char* argv[], char* envp[])
{
    int sum = 0, a, p = 0, d, i, fact;

    //code
    printf("Enter a number: ");
    scanf("%d", &a);

    p = a;
    while (a > 0)
    {
        fact = 1;
        d = a % 10;
        a /= 10;
        for (i = d; i >= 1; i--)
        {
            fact *= i;
        }
        sum += fact;
    }

    if (sum == p)
        printf("It is a Krishnamurthy number.\n");
    else
        printf("It is not a Krishnamurthy number.\n");

    printf("\n\n");
    return(0);
}

Multithreaded & SIMD vectorized Mandelbrot in R using Rcpp & OpenMP

As an OpenMP & Rcpp performance test I wanted to check how fast I could calculate the Mandelbrot set in R using the most straightforward and simple Rcpp+OpenMP implementation. Currently what I did was:
#include <Rcpp.h>
#include <omp.h>
// [[Rcpp::plugins(openmp)]]
using namespace Rcpp;

// [[Rcpp::export]]
Rcpp::NumericMatrix mandelRcpp(const double x_min, const double x_max, const double y_min, const double y_max,
                               const int res_x, const int res_y, const int nb_iter) {
  Rcpp::NumericMatrix ret(res_x, res_y);
  double x_step = (x_max - x_min) / res_x;
  double y_step = (y_max - y_min) / res_y;
  int r,c;
  #pragma omp parallel for default(shared) private(c) schedule(dynamic,1)
  for (r = 0; r < res_y; r++) {
    for (c = 0; c < res_x; c++) {
      double zx = 0.0, zy = 0.0, new_zx;
      double cx = x_min + c*x_step, cy = y_min + r*y_step;
      int n = 0;
      for (n=0; (zx*zx + zy*zy < 4.0 ) && ( n < nb_iter ); n++ ) {
        new_zx = zx*zx - zy*zy + cx;
        zy = 2.0*zx*zy + cy;
        zx = new_zx;
      }
      ret(c,r) = n;
    }
  }
  return ret;
}
And then in R:
library(Rcpp)
sourceCpp("mandelRcpp.cpp")
xlims=c(-0.74877,-0.74872);
ylims=c(0.065053,0.065103);
x_res=y_res=1080L; nb_iter=10000L;
system.time(m <- mandelRcpp(xlims[[1]], xlims[[2]], ylims[[1]], ylims[[2]], x_res, y_res, nb_iter))
# 0.92s
rainbow=c(rgb(0.47,0.11,0.53),rgb(0.27,0.18,0.73),rgb(0.25,0.39,0.81),rgb(0.30,0.57,0.75),rgb(0.39,0.67,0.60),rgb(0.51,0.73,0.44),rgb(0.67,0.74,0.32),rgb(0.81,0.71,0.26),rgb(0.89,0.60,0.22),rgb(0.89,0.39,0.18),rgb(0.86,0.13,0.13))
cols=c(colorRampPalette(rainbow)(100),rev(colorRampPalette(rainbow)(100)),"black") # palette
par(mar=c(0, 0, 0, 0))
system.time(image(m^(1/7), col=cols, asp=diff(ylims)/diff(xlims), axes=F, useRaster=T))
# 0.5s
I was unsure, though, whether there are any other obvious speed improvements I could take advantage of aside from OpenMP multithreading, e.g. via SIMD vectorization (using the simd options in the OpenMP #pragma didn't seem to do anything).
PS: at first my code was crashing, but I later found this was solved by replacing ret[r,c] = n; with ret(r,c) = n;
Using Armadillo classes as suggested in the answer below makes things very slightly faster, though the timings are almost the same. I also flipped x and y around so the set comes out in the right orientation when plotted with image(). With 8 threads, the speed is ca. 350 times faster than the vectorized plain R Mandelbrot version here and also about 7.3 times faster than the (non-multithreaded) Python/Numba version here (similar to PyCUDA or PyOpenCL speeds), so I'm quite happy with that... Rasterizing/display now seems to be the bottleneck in R.
Do not use OpenMP with Rcpp's *Vector or *Matrix objects, as they mask SEXP functions and memory allocations that are single-threaded; OpenMP is a multi-threaded approach.
This is why the code is crashing.
One way to get around this limitation is to use a non-R data structure to store the results. Any of the following will be sufficient: arma::mat, Eigen::MatrixXd, or std::vector<T>... As I favor Armadillo, I will change the ret matrix from Rcpp::NumericMatrix to arma::mat. Thus, the following will execute your code in parallel:
#include <RcppArmadillo.h> // Note the changed include and new attribute
// [[Rcpp::depends(RcppArmadillo)]]

// Avoid including the OpenMP header if OpenMP is not on the system
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::plugins(openmp)]]

// Note the changed return type
// [[Rcpp::export]]
arma::mat mandelRcpp(const double x_min, const double x_max,
                     const double y_min, const double y_max,
                     const int res_x, const int res_y, const int nb_iter) {
  arma::mat ret(res_x, res_y); // note change
  double x_step = (x_max - x_min) / res_x;
  double y_step = (y_max - y_min) / res_y;
  unsigned r, c;

  #pragma omp parallel for shared(ret) private(c)
  for (r = 0; r < res_y; r++) {
    for (c = 0; c < res_x; c++) {
      double zx = 0.0, zy = 0.0, new_zx;
      double cx = x_min + c*x_step, cy = y_min + r*y_step;
      unsigned n = 0;
      for (; (zx*zx + zy*zy < 4.0 ) && ( n < nb_iter ); n++ ) {
        new_zx = zx*zx - zy*zy + cx;
        zy = 2.0*zx*zy + cy;
        zx = new_zx;
      }
      if(n == nb_iter) {
        n = 0;
      }
      ret(r, c) = n;
    }
  }
  return ret;
}
With the test code (note y and x were not defined, thus I assumed y = ylims and x = xlims) we have:
xlims = ylims = c(-2.0, 2.0)
x_res = y_res = 400L
nb_iter = 256L
system.time(m <-
mandelRcpp(xlims[[1]], xlims[[2]],
ylims[[1]], ylims[[2]],
x_res, y_res, nb_iter))
rainbow = c(
rgb(0.47, 0.11, 0.53),
rgb(0.27, 0.18, 0.73),
rgb(0.25, 0.39, 0.81),
rgb(0.30, 0.57, 0.75),
rgb(0.39, 0.67, 0.60),
rgb(0.51, 0.73, 0.44),
rgb(0.67, 0.74, 0.32),
rgb(0.81, 0.71, 0.26),
rgb(0.89, 0.60, 0.22),
rgb(0.89, 0.39, 0.18),
rgb(0.86, 0.13, 0.13)
)
cols = c(colorRampPalette(rainbow)(100),
rev(colorRampPalette(rainbow)(100)),
"black") # palette
par(mar = c(0, 0, 0, 0))
image(m,
col = cols,
asp = diff(range(ylims)) / diff(range(xlims)),
axes = F)
I went ahead and vectorized the OP's code using GCC's and Clang's vector extensions. Before I show how I did this, let me show the performance on the following hardware:
Skylake (SKL) at 3.1 GHz with 4 cores
Knights Landing (KNL) at 1.5 GHz with 68 cores
ARMv8 Cortex-A57 aarch64 (Nvidia Jetson TX1), 4 cores at ? GHz
nb_iter = 1000000

                  GCC          Clang
SKL_scalar        6m5,422s
SKL_SSE41         3m18,058s
SKL_AVX2          1m37,843s    1m39,943s
SKL_scalar_omp    0m52,237s
SKL_SSE41_omp     0m29,624s    0m31,356s
SKL_AVX2_omp      0m14,156s    0m16,783s

ARM_scalar        15m28.285s
ARM_vector         9m26.384s
ARM_scalar_omp     3m54.242s
ARM_vector_omp     2m21.780s

KNL_scalar        19m34.121s
KNL_SSE41         11m30.280s
KNL_AVX2           5m0.005s     6m39.568s
KNL_AVX512         2m40.934s    6m20.061s
KNL_scalar_omp     0m9.108s
KNL_SSE41_omp      0m6.666s     0m6.992s
KNL_AVX2_omp       0m2.973s     0m3.988s
KNL_AVX512_omp     0m1.761s     0m3.335s
The theoretical speedup of KNL vs. SKL is
(68 cores / 4 cores) * (1.5 GHz / 3.1 GHz) * (8 doubles per lane / 4 doubles per lane) = 16.45
I went into detail about the capabilities of GCC's and Clang's vector extensions here. To vectorize the OP's code, here are three additional vector operations that we need to define.
1. Broadcasting
For a vector v and a scalar s, GCC cannot do v = s, but Clang can. However, I found a nice solution which works for both GCC and Clang here. For example:
vsi v = s - (vsi){};
2. An any() function like in OpenCL or in R.
The best I came up with is a generic function:
static bool any(vli const & x) {
    for(int i=0; i<VLI_SIZE; i++) if(x[i]) return true;
    return false;
}
Clang actually generates relatively efficient code for this using the ptest instruction (but not for AVX512) but GCC does not.
3. Compression
The calculations are done with 64-bit doubles, but the result is written out as 32-bit integers. So the counts are accumulated in two vectors of 64-bit integers, which are then compressed into one vector of 32-bit integers. I came up with a generic solution which Clang handles well:
static vsi compress(vli const & lo, vli const & hi) {
    vsi lo2 = (vsi)lo, hi2 = (vsi)hi, z;
    for(int i=0; i<VLI_SIZE; i++) z[i+0*VLI_SIZE] = lo2[2*i];
    for(int i=0; i<VLI_SIZE; i++) z[i+1*VLI_SIZE] = hi2[2*i];
    return z;
}
The following solution works better for GCC but is no better for Clang. Since this function is not critical, I just use the generic version.
static vsi compress(vli const & low, vli const & high) {
#if defined(__clang__)
    return __builtin_shufflevector((vsi)low, (vsi)high, MASK);
#else
    return __builtin_shuffle((vsi)low, (vsi)high, (vsi){MASK});
#endif
}
These definitions don't rely on anything x86-specific, and the code (defined below) compiles for ARM processors as well with GCC and Clang.
Now that these are defined, here is the code:
#include <string.h>
#include <inttypes.h>
#include <Rcpp.h>
using namespace Rcpp;

#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::plugins(cpp14)]]

#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
static const int SIMD_SIZE = 64;
#elif defined ( __AVX2__ )
static const int SIMD_SIZE = 32;
#else
static const int SIMD_SIZE = 16;
#endif

static const int VSI_SIZE = SIMD_SIZE/sizeof(int32_t);
static const int VLI_SIZE = SIMD_SIZE/sizeof(int64_t);
static const int VDF_SIZE = SIMD_SIZE/sizeof(double);

#if defined(__clang__)
typedef int32_t vsi __attribute__ ((ext_vector_type(VSI_SIZE)));
typedef int64_t vli __attribute__ ((ext_vector_type(VLI_SIZE)));
typedef double  vdf __attribute__ ((ext_vector_type(VDF_SIZE)));
#else
typedef int32_t vsi __attribute__ ((vector_size (SIMD_SIZE)));
typedef int64_t vli __attribute__ ((vector_size (SIMD_SIZE)));
typedef double  vdf __attribute__ ((vector_size (SIMD_SIZE)));
#endif

static bool any(vli const & x) {
    for(int i=0; i<VLI_SIZE; i++) if(x[i]) return true;
    return false;
}

static vsi compress(vli const & lo, vli const & hi) {
    vsi lo2 = (vsi)lo, hi2 = (vsi)hi, z;
    for(int i=0; i<VLI_SIZE; i++) z[i+0*VLI_SIZE] = lo2[2*i];
    for(int i=0; i<VLI_SIZE; i++) z[i+1*VLI_SIZE] = hi2[2*i];
    return z;
}

// [[Rcpp::export]]
IntegerVector frac(double x_min, double x_max, double y_min, double y_max, int res_x, int res_y, int nb_iter) {
    IntegerVector out(res_x*res_y);
    vdf x_minv = x_min - (vdf){}, y_minv = y_min - (vdf){};
    vdf x_stepv = (x_max - x_min)/res_x - (vdf){}, y_stepv = (y_max - y_min)/res_y - (vdf){};
    double a[VDF_SIZE] __attribute__ ((aligned(SIMD_SIZE)));
    for(int i=0; i<VDF_SIZE; i++) a[i] = 1.0*i;
    vdf vi0 = *(vdf*)a;

    #pragma omp parallel for schedule(dynamic) collapse(2)
    for (int r = 0; r < res_y; r++) {
        for (int c = 0; c < res_x/(VSI_SIZE); c++) {
            vli nv[2] = {0 - (vli){}, 0 - (vli){}};
            for(int j=0; j<2; j++) {
                vdf c2 = 1.0*VDF_SIZE*(2*c+j) + vi0;
                vdf zx = 0.0 - (vdf){}, zy = 0.0 - (vdf){}, new_zx;
                vdf cx = x_minv + c2*x_stepv, cy = y_minv + r*y_stepv;
                vli t = -1 - (vli){};
                for (int n = 0; any(t = zx*zx + zy*zy < 4.0) && n < nb_iter; n++, nv[j] -= t) {
                    new_zx = zx*zx - zy*zy + cx;
                    zy = 2.0*zx*zy + cy;
                    zx = new_zx;
                }
            }
            vsi sp = compress(nv[0], nv[1]);
            memcpy(&out[r*res_x + VSI_SIZE*c], (int*)&sp, SIMD_SIZE);
        }
    }
    return out;
}
The R code is almost the same as the OP's code
library(Rcpp)
sourceCpp("frac.cpp", verbose=TRUE, rebuild=TRUE)
xlims=c(-0.74877,-0.74872);
ylims=c(0.065053,0.065103);
x_res=y_res=1080L; nb_iter=100000L;
t = system.time(m <- frac(xlims[[1]], xlims[[2]], ylims[[1]], ylims[[2]], x_res, y_res, nb_iter))
print(t)
m2 = matrix(m, ncol = x_res)
rainbow = c(
rgb(0.47, 0.11, 0.53),
rgb(0.27, 0.18, 0.73),
rgb(0.25, 0.39, 0.81),
rgb(0.30, 0.57, 0.75),
rgb(0.39, 0.67, 0.60),
rgb(0.51, 0.73, 0.44),
rgb(0.67, 0.74, 0.32),
rgb(0.81, 0.71, 0.26),
rgb(0.89, 0.60, 0.22),
rgb(0.89, 0.39, 0.18),
rgb(0.86, 0.13, 0.13)
)
cols = c(colorRampPalette(rainbow)(100),
rev(colorRampPalette(rainbow)(100)),"black") # palette
par(mar = c(0, 0, 0, 0))
image(m2^(1/7), col=cols, asp=diff(ylims)/diff(xlims), axes=F, useRaster=T)
To compile for GCC or Clang change the file ~/.R/Makevars to
CXXFLAGS= -Wall -std=c++14 -O3 -march=native -ffp-contract=fast -fopenmp
#uncomment the following two lines for clang
#CXX=clang-5.0
#LDFLAGS= -lomp
If you are having trouble getting OpenMP to work for Clang see this.
The code produces more or less the same image.

speed up multi-threading in boost

I wrote a program to calculate the eigenvalues of 2x2 random matrices. I generated 50,000 2x2 random matrices and computed their eigenvalues.
Using Boost, I used multiple threads in the member function getEigVal() of myClass, but I found that the CPU utilization is only about 35%.
How can I speed up getEigVal() with multithreading?
#define _USE_MATH_DEFINES
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread.hpp>
#include <boost/thread/future.hpp>
#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <complex>
#include <chrono>
using namespace std;
using namespace std::chrono;

class myClass {
private:
    int numOfRun;
    double var;
    vector <vector<complex<double>>> eigVal;

    vector<complex<double>> quad_root(double a, double b, double c) {//quadratic formula
        vector<complex<double>> root(2, complex<double>(0, 0));
        complex<double> delta = sqrt(complex<double>(pow(b, 2) - 4 * a*c, 0));
        root[0] = (-b + delta) / 2.0 / a;
        root[1] = (-b - delta) / 2.0 / a;
        return root;
    }

    vector<complex<double>> eig(vector<vector<double>> A) {//compute eigenvalues
        double a = 1.0;
        double b = -A[0][0] - A[1][1];
        double c = A[0][0] * A[1][1] - A[0][1] * A[1][0];
        vector<complex<double>> r = quad_root(a, b, c);
        return r;
    }

public:
    myClass(int run = 5e4, double v = 1) :
        numOfRun(run), var(v), eigVal(numOfRun, vector<complex<double>>(2)){
    }

    vector <vector<complex<double>>> getEigVal() {
        random_device rd;
        mt19937 e2(rd());
        normal_distribution<> a(0.0, var);
        vector <vector<double>> A(2, vector<double>(2));

        for (int i = 0; i < numOfRun; i++) {
            A = { { a(e2), a(e2) }, { a(e2), a(e2) } };//generate a 2x2 random matrix
            boost::packaged_task<vector<complex<double>>> task{ bind(&myClass::eig, this, A) };
            boost::future<vector<complex<double>>> f = task.get_future();
            boost::thread t{ std::move(task) };
            eigVal[i] = f.get();
        }
        return eigVal;
    }
};

int main() {
    myClass Test;

    auto start = steady_clock::now();
    vector <vector<complex<double>>> result = Test.getEigVal();
    auto end = steady_clock::now();

    cout << "Time elapsed: " << (duration_cast<milliseconds>(end - start).count())/1e3 << " seconds\n";//13.826 s
    return 0;
}
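The accepted answer is not included in this excerpt. As a point of comparison only, here is a sketch (my own restructuring, with hypothetical names such as getEigValParallel) that uses one worker per hardware thread and gives each worker its own slice of the results, instead of creating one thread and one future per matrix; std::thread is used for brevity, but boost::thread would work the same way:

#include <vector>
#include <complex>
#include <random>
#include <thread>
#include <algorithm>
#include <cmath>

using Eig = std::vector<std::complex<double>>;

// Eigenvalues of a 2x2 matrix via the quadratic formula, as in the question.
static Eig eig2x2(double a11, double a12, double a21, double a22) {
    double b = -a11 - a22;
    double c = a11 * a22 - a12 * a21;
    std::complex<double> delta = std::sqrt(std::complex<double>(b * b - 4.0 * c, 0.0));
    return { (-b + delta) / 2.0, (-b - delta) / 2.0 };
}

std::vector<Eig> getEigValParallel(int numOfRun, double var) {
    std::vector<Eig> eigVal(numOfRun);
    unsigned nThreads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;
    for (unsigned t = 0; t < nThreads; ++t) {
        workers.emplace_back([&, t] {
            std::mt19937 e2(std::random_device{}() + t);   // per-thread RNG
            std::normal_distribution<> a(0.0, var);
            for (int i = t; i < numOfRun; i += nThreads)   // disjoint, strided slices
                eigVal[i] = eig2x2(a(e2), a(e2), a(e2), a(e2));
        });
    }
    for (auto& w : workers) w.join();
    return eigVal;
}

int main() {
    std::vector<Eig> result = getEigValParallel(50000, 1.0);
    return result.size() == 50000 ? 0 : 1;
}

Each index is written by exactly one thread, so no locking is needed inside the loop, and the per-thread RNG avoids sharing a single mt19937 engine across threads.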

OpenMP and MPI hybrid dynamic scheduling

As the number of threads increases, the count stored in "temp" decreases.
When I set the number of threads to 1 it gives the correct answer, but as the number of threads increases, the running time gets shorter while the answer becomes wrong.
#include <stdio.h>
#include <mpi.h>
#include <complex.h>
#include <time.h>
#include <omp.h>

#define MAXITERS 1000

// globals
int count = 0;
int nptsside;
float side2;
float side4;
int temp = 0;

int inset(double complex c) {
    int iters;
    float rl, im;
    double complex z = c;
    for (iters = 0; iters < MAXITERS; iters++) {
        z = z*z + c;
        rl = creal(z);
        im = cimag(z);
        if (rl*rl + im*im > 4) return 0;
    }
    return 1;
}

int main(int argc, char **argv)
{
    nptsside = atoi(argv[1]);
    side2 = nptsside / 2.0;
    side4 = nptsside / 4.0;

    //struct timespec bgn,nd;
    //clock_gettime(CLOCK_REALTIME, &bgn);

    int x, y; float xv, yv;
    double complex z;
    int i;
    int mystart, myend;
    int nrows;
    int nprocs, mype;
    int data;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &mype);

    nrows = nptsside/nprocs;
    printf("%d\n", nprocs);
    mystart = mype*nrows;
    myend = mystart + nrows - 1;

    #pragma omp parallel shared(mystart, myend, temp)
    {
        int nth = omp_get_num_threads();
        printf("%d\n", nth);

#ifdef STATIC
        #pragma omp for reduction(+:temp) schedule(static)
#elif defined DYNAMIC
        #pragma omp for reduction(+:temp) schedule(dynamic)
#elif defined GUIDED
        #pragma omp for reduction(+:temp) schedule(guided)
#endif
        for (x=mystart; x<=myend; x++) {
            for ( y=0; y<nptsside; y++) {
                xv = (x - side2) / side4;
                yv = (y - side2) / side4;
                z = xv + yv*I;
                if (inset(z)) {
                    temp++;
                }
            }
        }
    }

    if(mype==0) {
        count += temp;
        printf("%d\n", temp);
        for (i = 1; i < nprocs; i++) {
            MPI_Recv(&temp, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status);
            count += temp;
            printf("%d\n", temp);
        }
    }
    else {
        MPI_Send(&temp, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
    }

    MPI_Finalize();

    if(mype==0) {
        printf("%d\n", count);
    }

    //clock_gettime(CLOCK_REALTIME, &nd);
    //printf("%f\n",timediff(bgn,nd));
}
You are not defining any private variables for when you enter the OpenMP loop.
First off, you must always declare your loop counter for your OpenMP loop (as well as any loop counters for nested loops inside your OpenMP loop) private.
Secondly, you have three variables (xv, yv, and z) that each depend on your iterations in these loops. Thus, each thread needs to have its own private copy of these variables as well. Changing your parallel statement to
#pragma omp parallel shared(mystart, myend, temp) private(x, y, xv, yv, z)
should fix your OpenMP problems.
Seeing as you say that setting your number of threads to 1 yields the correct answer, I have not looked at your MPI code.
EDIT: OK, I lied, I briefly looked into your MPI code now. Instead of all of your sends and receives, you should be using a single reduction. This collective will be much faster than the blocking point-to-point communication you have set up currently:
MPI_Reduce(&temp, &count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
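For illustration, here is a minimal, self-contained program (not the question's code) showing the collective in isolation: each rank contributes one integer and rank 0 receives the sum, which is exactly the role the send/receive loop plays above:

#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int mype, temp, count = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mype);

    temp = mype + 1;   // stand-in for each rank's local count

    // Sum every rank's temp into count on rank 0 with one collective call.
    MPI_Reduce(&temp, &count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (mype == 0)
        printf("total = %d\n", count);

    MPI_Finalize();
    return 0;
}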
