Rstudio crashes with Rcpp and OpenMP function - rcpp

This is a follow up question to dqrng with Rcpp for drawing from a normal and a binomial distribution. I tried to implement the answer but instead of drawing from a single distribution I'm drawing from 3. This is the code that I wrote:
// [[Rcpp::depends(dqrng, BH, RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <boost/random/binomial_distribution.hpp>
#include <xoshiro.h>
#include <dqrng_distribution.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
arma::mat parallel_random_matrix(int n, int m, int ncores, double p=0.5) {
dqrng::xoshiro256plus rng(42);
arma::mat out(n*m,3);
// ok to use rng here
#pragma omp parallel num_threads(ncores)
{
dqrng::xoshiro256plus lrng(rng); // make thread local copy of rng
lrng.jump(omp_get_thread_num() + 1); // advance rng by 1 ... ncores jumps
int iter = 0;
#pragma omp for
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
iter = i * n + j;
// p can be a function of i and j
boost::random::binomial_distribution<int> dist_binomial(1,p);
auto gen_bernoulli = std::bind(dist_binomial, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal1(2.0,1.0);
auto gen_normal1 = std::bind(dist_normal1, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal2(4.0,3.0);
auto gen_normal2 = std::bind(dist_normal2, std::ref(lrng));
out(iter,0) = gen_bernoulli();
out(iter,1) = gen_normal1();
out(iter,2) = gen_normal2();
}
}
}
// ok to use rng here
return out;
}
/*** R
parallel_random_matrix(5, 5, 4, 0.75)
*/
When I try to run it Rstudio crashes. However, when I change the code like follows it does work:
// [[Rcpp::depends(dqrng, BH, RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <boost/random/binomial_distribution.hpp>
#include <xoshiro.h>
#include <dqrng_distribution.h>
// [[Rcpp::plugins(openmp)]]
#include <omp.h>
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
arma::mat parallel_random_matrix(int n, int m, int ncores, double p=0.5) {
dqrng::xoshiro256plus rng(42);
arma::mat out(n*m,3);
// ok to use rng here
#pragma omp parallel num_threads(ncores)
{
dqrng::xoshiro256plus lrng(rng); // make thread local copy of rng
lrng.jump(omp_get_thread_num() + 1); // advance rng by 1 ... ncores jumps
int iter = 0;
#pragma omp for
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
iter = i * n + j;
// p can be a function of i and j
boost::random::binomial_distribution<int> dist_binomial(1,p);
auto gen_bernoulli = std::bind(dist_binomial, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal1(2.0,1.0);
auto gen_normal1 = std::bind(dist_normal1, std::ref(lrng));
boost::random::normal_distribution<int> dist_normal2(4.0,3.0);
auto gen_normal2 = std::bind(dist_normal2, std::ref(lrng));
out(iter,0) = gen_bernoulli();
out(iter,1) = 2.0;//gen_normal1();
out(iter,2) = 3.0;//gen_normal2();
}
}
}
// ok to use rng here
return out;
}
/*** R
parallel_random_matrix(5, 5, 4, 0.75)
*/
What am I doing wrong?

Here lies the problem:
boost::random::normal_distribution<int> dist_normal1(2.0,1.0);
^^^
This distribution is meant for real types, not integral types, c.f. https://www.boost.org/doc/libs/1_69_0/doc/html/boost/random/normal_distribution.html. Correct would be
boost::random::normal_distribution<double> dist_normal1(2.0,1.0);

Related

Multithreaded & SIMD vectorized Mandelbrot in R using Rcpp & OpenMP

As an OpenMP & Rcpp performance test I wanted to check how fast I could calculate the Mandelbrot set in R using the most straightforward and simple Rcpp+OpenMP implementation. Currently what I did was:
#include <Rcpp.h>
#include <omp.h>
// [[Rcpp::plugins(openmp)]]
using namespace Rcpp;
// [[Rcpp::export]]
Rcpp::NumericMatrix mandelRcpp(const double x_min, const double x_max, const double y_min, const double y_max,
const int res_x, const int res_y, const int nb_iter) {
Rcpp::NumericMatrix ret(res_x, res_y);
double x_step = (x_max - x_min) / res_x;
double y_step = (y_max - y_min) / res_y;
int r,c;
#pragma omp parallel for default(shared) private(c) schedule(dynamic,1)
for (r = 0; r < res_y; r++) {
for (c = 0; c < res_x; c++) {
double zx = 0.0, zy = 0.0, new_zx;
double cx = x_min + c*x_step, cy = y_min + r*y_step;
int n = 0;
for (n=0; (zx*zx + zy*zy < 4.0 ) && ( n < nb_iter ); n++ ) {
new_zx = zx*zx - zy*zy + cx;
zy = 2.0*zx*zy + cy;
zx = new_zx;
}
ret(c,r) = n;
}
}
return ret;
}
And then in R:
library(Rcpp)
sourceCpp("mandelRcpp.cpp")
xlims=c(-0.74877,-0.74872);
ylims=c(0.065053,0.065103);
x_res=y_res=1080L; nb_iter=10000L;
system.time(m <- mandelRcpp(xlims[[1]], xlims[[2]], ylims[[1]], ylims[[2]], x_res, y_res, nb_iter))
# 0.92s
rainbow=c(rgb(0.47,0.11,0.53),rgb(0.27,0.18,0.73),rgb(0.25,0.39,0.81),rgb(0.30,0.57,0.75),rgb(0.39,0.67,0.60),rgb(0.51,0.73,0.44),rgb(0.67,0.74,0.32),rgb(0.81,0.71,0.26),rgb(0.89,0.60,0.22),rgb(0.89,0.39,0.18),rgb(0.86,0.13,0.13))
cols=c(colorRampPalette(rainbow)(100),rev(colorRampPalette(rainbow)(100)),"black") # palette
par(mar=c(0, 0, 0, 0))
system.time(image(m^(1/7), col=cols, asp=diff(ylims)/diff(xlims), axes=F, useRaster=T))
# 0.5s
I was unsure though if there is any other obvious speed improvements I could take advantage of aside from OpenMP multithreading, e.g. via simd vectorization? (using simd options in the openmp #pragma didn't seem to do anything)
PS at first my code was crashing but I later found this was solved by replacing ret[r,c] = n; with ret(r,c) = n;
Using Armadillo classes as suggested in the answer below make things very slightly faster, though the timings are almost the same. Also flipped around x and y so it comes out in the right orientation when plotted with image(). Using 8 threads speed is ca. 350 times faster than the vectorized plain R Mandelbrot version here and also about 7.3 times faster than the (non-multithreaded) Python/Numba version here (similar to PyCUDA or PyOpenCL speeds), so quite happy with that... Rasterizing/display now seems the bottleneck in R....
Do not use OpenMP with Rcpp's *Vector or *Matrix objects as they mask SEXP functions / memory allocations that are single-threaded. OpenMP is a multi-threaded approach.
This is why the code is crashing.
One way to get around this limitation is to use a non-R data structure to store the results. One of the following will be sufficient: arma::mat or Eigen::MatrixXd or std::vector<T>... As I favor armadillo, I will change the res matrix to arma::mat from Rcpp::NumericMatrix. Thus, the following will execute your code in parallel:
#include <RcppArmadillo.h> // Note the changed include and new attribute
// [[Rcpp::depends(RcppArmadillo)]]
// Avoid including header if openmp not on system
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::plugins(openmp)]]
// Note the changed return type
// [[Rcpp::export]]
arma::mat mandelRcpp(const double x_min, const double x_max,
const double y_min, const double y_max,
const int res_x, const int res_y, const int nb_iter) {
arma::mat ret(res_x, res_y); // note change
double x_step = (x_max - x_min) / res_x;
double y_step = (y_max - y_min) / res_y;
unsigned r,c;
#pragma omp parallel for shared(res)
for (r = 0; r < res_y; r++) {
for (c = 0; c < res_x; c++) {
double zx = 0.0, zy = 0.0, new_zx;
double cx = x_min + c*x_step, cy = y_min + r*y_step;
unsigned n = 0;
for (; (zx*zx + zy*zy < 4.0 ) && ( n < nb_iter ); n++ ) {
new_zx = zx*zx - zy*zy + cx;
zy = 2.0*zx*zy + cy;
zx = new_zx;
}
if(n == nb_iter) {
n = 0;
}
ret(r, c) = n;
}
}
return ret;
}
With the test code (note y and x were not defined, thus I assumed y = ylims and x = xlims) we have:
xlims = ylims = c(-2.0, 2.0)
x_res = y_res = 400L
nb_iter = 256L
system.time(m <-
mandelRcpp(xlims[[1]], xlims[[2]],
ylims[[1]], ylims[[2]],
x_res, y_res, nb_iter))
rainbow = c(
rgb(0.47, 0.11, 0.53),
rgb(0.27, 0.18, 0.73),
rgb(0.25, 0.39, 0.81),
rgb(0.30, 0.57, 0.75),
rgb(0.39, 0.67, 0.60),
rgb(0.51, 0.73, 0.44),
rgb(0.67, 0.74, 0.32),
rgb(0.81, 0.71, 0.26),
rgb(0.89, 0.60, 0.22),
rgb(0.89, 0.39, 0.18),
rgb(0.86, 0.13, 0.13)
)
cols = c(colorRampPalette(rainbow)(100),
rev(colorRampPalette(rainbow)(100)),
"black") # palette
par(mar = c(0, 0, 0, 0))
image(m,
col = cols,
asp = diff(range(ylims)) / diff(range(xlims)),
axes = F)
For:
I went ahead and vectorized the OP's code using GCC's and Clang's vector extensions. Before I show how I did this let me show the performance with the following hardware:
Skylake (SKL) at 3.1 GHz with 4 cores
Knights Landing (KNL) at 1.5 GHz with 68 cores
ARMv8 Cortex-A57 arch64 (Nvidia Jetson TX1) 4 cores at ? GHz
nb_iter = 1000000
GCC Clang
SKL_scalar 6m5,422s
SKL_SSE41 3m18,058s
SKL_AVX2 1m37,843s 1m39,943s
SKL_scalar_omp 0m52,237s
SKL_SSE41_omp 0m29,624s 0m31,356s
SKL_AVX2_omp 0m14,156s 0m16,783s
ARM_scalar 15m28.285s
ARM_vector 9m26.384s
ARM_scalar_omp 3m54.242s
ARM_vector_omp 2m21.780s
KNL_scalar 19m34.121s
KNL_SSE41 11m30.280s
KNL_AVX2 5m0.005s 6m39.568s
KNL_AVX512 2m40.934s 6m20.061s
KNL_scalar_omp 0m9.108s
KNL_SSE41_omp 0m6.666s 0m6.992s
KNL_AVX2_omp 0m2.973s 0m3.988s
KNL_AVX512_omp 0m1.761s 0m3.335s
The theoretical speed up of KNL vs. SKL is
(68 cores/4 cores)*(1.5 GHz/3.1 Ghz)*
(8 doubles per lane/4 doubles per lane) = 16.45
I went into detail about GCC's and Clang's vector extensions capabilities here. To vectorize the OP's code here are three additional vector operations that we need to define.
1. Broadcasting
For a vector v and a scalar s GCC cannot do v = s but Clang can. But I found a nice solution which works for GCC and Clang here. For example
vsi v = s - (vsi){};
2. A any() function like in OpenCL or like in R.
The best I came up with is a generic function
static bool any(vli const & x) {
for(int i=0; i<VLI_SIZE; i++) if(x[i]) return true;
return false;
}
Clang actually generates relatively efficient code for this using the ptest instruction (but not for AVX512) but GCC does not.
3. Compression
The calculations are done as 64-bit doubles but the result is written out as 32-bit integers. So two calculations are done using 64-bit integers and then the two calculations are compressed into one vector of 32-bit integers. I came up with a generic solution which Clang does a good job with
static vsi compress(vli const & lo, vli const & hi) {
vsi lo2 = (vsi)lo, hi2 = (vsi)hi, z;
for(int i=0; i<VLI_SIZE; i++) z[i+0*VLI_SIZE] = lo2[2*i];
for(int i=0; i<VLI_SIZE; i++) z[i+1*VLI_SIZE] = hi2[2*i];
return z;
}
The follow solution works better for GCC but is no better for Clang. But since this function is not critical I just use the generic version.
static vsi compress(vli const & low, vli const & high) {
#if defined(__clang__)
return __builtin_shufflevector((vsi)low, (vsi)high, MASK);
#else
return __builtin_shuffle((vsi)low, (vsi)high, (vsi){MASK});
#endif
}
These definitions don't rely on anything x86 specific and the code (defined below) compiles for ARM processors as well with GCC and Clang.
Now that these are defined here is the code
#include <string.h>
#include <inttypes.h>
#include <Rcpp.h>
using namespace Rcpp;
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::plugins(cpp14)]]
#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
static const int SIMD_SIZE = 64;
#elif defined ( __AVX2__ )
static const int SIMD_SIZE = 32;
#else
static const int SIMD_SIZE = 16;
#endif
static const int VSI_SIZE = SIMD_SIZE/sizeof(int32_t);
static const int VLI_SIZE = SIMD_SIZE/sizeof(int64_t);
static const int VDF_SIZE = SIMD_SIZE/sizeof(double);
#if defined(__clang__)
typedef int32_t vsi __attribute__ ((ext_vector_type(VSI_SIZE)));
typedef int64_t vli __attribute__ ((ext_vector_type(VLI_SIZE)));
typedef double vdf __attribute__ ((ext_vector_type(VDF_SIZE)));
#else
typedef int32_t vsi __attribute__ ((vector_size (SIMD_SIZE)));
typedef int64_t vli __attribute__ ((vector_size (SIMD_SIZE)));
typedef double vdf __attribute__ ((vector_size (SIMD_SIZE)));
#endif
static bool any(vli const & x) {
for(int i=0; i<VLI_SIZE; i++) if(x[i]) return true;
return false;
}
static vsi compress(vli const & lo, vli const & hi) {
vsi lo2 = (vsi)lo, hi2 = (vsi)hi, z;
for(int i=0; i<VLI_SIZE; i++) z[i+0*VLI_SIZE] = lo2[2*i];
for(int i=0; i<VLI_SIZE; i++) z[i+1*VLI_SIZE] = hi2[2*i];
return z;
}
// [[Rcpp::export]]
IntegerVector frac(double x_min, double x_max, double y_min, double y_max, int res_x, int res_y, int nb_iter) {
IntegerVector out(res_x*res_y);
vdf x_minv = x_min - (vdf){}, y_minv = y_min - (vdf){};
vdf x_stepv = (x_max - x_min)/res_x - (vdf){}, y_stepv = (y_max - y_min)/res_y - (vdf){};
double a[VDF_SIZE] __attribute__ ((aligned(SIMD_SIZE)));
for(int i=0; i<VDF_SIZE; i++) a[i] = 1.0*i;
vdf vi0 = *(vdf*)a;
#pragma omp parallel for schedule(dynamic) collapse(2)
for (int r = 0; r < res_y; r++) {
for (int c = 0; c < res_x/(VSI_SIZE); c++) {
vli nv[2] = {0 - (vli){}, 0 - (vli){}};
for(int j=0; j<2; j++) {
vdf c2 = 1.0*VDF_SIZE*(2*c+j) + vi0;
vdf zx = 0.0 - (vdf){}, zy = 0.0 - (vdf){}, new_zx;
vdf cx = x_minv + c2*x_stepv, cy = y_minv + r*y_stepv;
vli t = -1 - (vli){};
for (int n = 0; any(t = zx*zx + zy*zy < 4.0) && n < nb_iter; n++, nv[j] -= t) {
new_zx = zx*zx - zy*zy + cx;
zy = 2.0*zx*zy + cy;
zx = new_zx;
}
}
vsi sp = compress(nv[0], nv[1]);
memcpy(&out[r*res_x + VSI_SIZE*c], (int*)&sp, SIMD_SIZE);
}
}
return out;
}
The R code is almost the same as the OP's code
library(Rcpp)
sourceCpp("frac.cpp", verbose=TRUE, rebuild=TRUE)
xlims=c(-0.74877,-0.74872);
ylims=c(0.065053,0.065103);
x_res=y_res=1080L; nb_iter=100000L;
t = system.time(m <- frac(xlims[[1]], xlims[[2]], ylims[[1]], ylims[[2]], x_res, y_res, nb_iter))
print(t)
m2 = matrix(m, ncol = x_res)
rainbow = c(
rgb(0.47, 0.11, 0.53),
rgb(0.27, 0.18, 0.73),
rgb(0.25, 0.39, 0.81),
rgb(0.30, 0.57, 0.75),
rgb(0.39, 0.67, 0.60),
rgb(0.51, 0.73, 0.44),
rgb(0.67, 0.74, 0.32),
rgb(0.81, 0.71, 0.26),
rgb(0.89, 0.60, 0.22),
rgb(0.89, 0.39, 0.18),
rgb(0.86, 0.13, 0.13)
)
cols = c(colorRampPalette(rainbow)(100),
rev(colorRampPalette(rainbow)(100)),"black") # palette
par(mar = c(0, 0, 0, 0))
image(m2^(1/7), col=cols, asp=diff(ylims)/diff(xlims), axes=F, useRaster=T)
To compile for GCC or Clang change the file ~/.R/Makevars to
CXXFLAGS= -Wall -std=c++14 -O3 -march=native -ffp-contract=fast -fopenmp
#uncomment the following two lines for clang
#CXX=clang-5.0
#LDFLAGS= -lomp
If you are having trouble getting OpenMP to work for Clang see this.
The code produces more or less the same image.

speed up multi-threading in boost

I wrote a program to calculate the eigen-values of a 2-by-2 random matrix. I generated 50,000 2x2 random matrices and computed their eigen-values.
With boost, I used multi-thread in the member function getEigVal() of myClass, but I found that the CPU utilization is only 35%.
How can I speed up the process of getEigVal() with multi-threading?
#define _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread.hpp>
#include <boost/thread/future.hpp>
#include <vector>
#include <cmath>
#include <random>
#include <complex>
#include <chrono>
using namespace std;
using namespace std::chrono;
class myClass {
private:
int numOfRun;
double var;
vector <vector<complex<double>>> eigVal;
vector<complex<double>> quad_root(double a, double b, double c) {//quadratic formula
vector<complex<double>> root(2, complex<double>(0, 0));
complex<double> delta = sqrt(complex<double>(pow(b, 2) - 4 * a*c, 0));
root[0] = (-b + delta) / 2.0 / a;
root[1] = (-b - delta) / 2.0 / a;
return root;
}
vector<complex<double>> eig(vector<vector<double>> A) {//compute eigenvalues
double a = 1.0;
double b = -A[0][0] - A[1][1];
double c = A[0][0] * A[1][1] - A[0][1] * A[1][0];
vector<complex<double>> r = quad_root(a, b, c);
return r;
}
public:
myClass(int run = 5e4, double v = 1) :
numOfRun(run), var(v), eigVal(numOfRun, vector<complex<double>>(2)){
}
vector <vector<complex<double>>> getEigVal() {
random_device rd;
mt19937 e2(rd());
normal_distribution<> a(0.0, var);
vector <vector<double>> A(2, vector<double>(2));
for (int i = 0; i < numOfRun; i++) {
A = { { a(e2), a(e2) }, { a(e2), a(e2) } };//generate a 2x2 random matrix
boost::packaged_task<vector<complex<double>>> task{ bind(&myClass::eig, this, A) };
boost::future<vector<complex<double>>> f = task.get_future();
boost::thread t{ std::move(task) };
eigVal[i] = f.get();
}
return eigVal;
}
};
int main() {
myClass Test;
auto start = steady_clock::now();
vector <vector<complex<double>>> result = Test.getEigVal();
auto end = steady_clock::now();
cout << "Time elapsed: " << (duration_cast<milliseconds>(end - start).count())/1e3 << " seconds\n";//13.826 s
return 0;
}

C++ Threaded Template Vector Quicksort

Threaded quick sort method:
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include "MD5.h"
#include <thread>
using namespace std;
template<typename T>
void quickSort(vector<T> &arr, int left, int right) {
int i = left, j = right; //Make local copys to modify
T tmp; //Termorary variable to use for swaping.
T pivot = arr[(left + right) / 2]; //Find the centerpoint. if 0.5 truncate.
while (i <= j) {
while (arr[i] < pivot) //is i < pivot?
i++;
while (arr[j] > pivot) //Is j > pivot?
j--;
if (i <= j) { //Swap
tmp = arr[i];
arr[i] = arr[j];
arr[j] = tmp;
i++;
j--;
}
};
thread left_t; //Left thread
thread right_t; //Right thread
if (left < j)
left_t = thread(quickSort<T>, ref(arr), left, j);
if (i < right)
right_t = thread(quickSort<T>, ref(arr), i, right);
if (left < j)
left_t.join();
if (left < j)
right_t.join();
}
int main()
{
vector<int> table;
for (int i = 0; i < 100; i++)
{
table.push_back(rand() % 100);
}
cout << "Before" << endl;
for each(int val in table)
{
cout << val << endl;
}
quickSort(table, 0, 99);
cout << "After" << endl;
for each(int val in table)
{
cout << val << endl;
}
char temp = cin.get();
return 0;
}
Above program lags like mad hell and Spams "abort()" has been called.
Im thinking it has something to do with vectors and it Having threading issues
Iv seen the Question asked by Daniel Makardich, His Utilizes a Vector int While mine uses Vector T
You don't have any problem with quick sort, but with passing a templated function to a thread. There is no function quickSort. You need to explicitly give type, to instantiate the function template:
#include <thread>
#include <iostream>
template<typename T>
void f(T a) { std::cout << a << '\n'; }
int main () {
std::thread t;
int a;
std::string b("b");
t = std::thread(f, a); // Won't work
t = std::thread(f<int>, a);
t.join();
t = std::thread(f<decltype(b)>, b); // a bit fancier, more dynamic way
t.join();
return 0;
}
I suspect in your case this should do:
left_t = thread(quickSort<T>, ref(arr), left, j);
And similar for right_t. Also, you have mistake there trying to use operator()() instead of constructing an object. That is why the error is different.
Can't verify though, cause there's no minimal verifiable example =/
I don't know if it's possible to make compiler to use automatic type deduction for f passed as a param, if anyone knows that would probably make it a better answer.
Problem was with thread joins and what #luk32 said
Needed to convert the threads to pointers to threads.
thread* left_t = nullptr; //Left thread
thread* right_t = nullptr; //Right thread
if (left < j)
left_t = new thread(quickSort<T>, ref(arr), left, j);
if (i < right)
right_t = new thread(quickSort<T>, ref(arr), i, right);
if (left_t)
{
left_t->join();
delete left_t;
}
if (right_t)
{
right_t->join();
delete right_t;
}
Seems like if you create a default constructed thread object. But don't use it, it still wants to be joined. and if you do join it, it will complain.

Why is this triggering a breakpoint?

I have looked extensively for the problem in this code, but I can't seem to figure out what tragic error I made and why it is triggering a breakpoint.
(After 3 or 4 inputs, it triggers and I don't know why it doesn't trigger at the start or what is causing it)
#include <conio.h> // For function getch()
#include <cstdlib> // For several general-purpose functions
#include <fstream> // For file handling
#include <iomanip> // For formatted output
#include <iostream> // For cin, cout, and system
#include <string> // For string data type
using namespace std; // So "std::cout" may be abbreviated to "cout", for example.
string convertDecToBin(int dec)
{
int *arrayHex, arraySize = 0;
arrayHex = new int[];
string s = " ";
int r = dec;
for (int i = 0; r != 0; i++)
{
arrayHex[i] = r % 2;
r = r / 2;
arraySize++;
}
for (int j = 0; j < arraySize; j++)
{
s = s + to_string(arrayHex[arraySize - 1 - j]);
}
delete[] arrayHex;
return s;
}
string convertDecToOct(int dec)
{
int *arrayHex, arraySize = 0;
arrayHex = new int[];
string s = " ";
int r = dec;
for (int i = 0; r != 0; i++)
{
arrayHex[i] = r % 8;
r = r / 8;
arraySize++;
}
for (int j = 0; j < arraySize; j++)
{
s = s + to_string(arrayHex[arraySize - 1 - j]);
}
delete[] arrayHex;
return s;
}
int main()
{
int input = 0;
while (input != -1)
{
cout << "\nEnter a decimal number (-1 to exit loop): ";
cin >> input;
if (input != -1)
{
cout << "Your decimal number in binary expansion: " << convertDecToBin(input);
cout << "\nYour decimal number in octal ecpression: " << convertDecToOct(input);
}
}
cout << "\n\nPress any key to exit. . .";
_getch();
return 0;
}
arrayHex = new int[] is your problem - C\C++ does not support dynamic sizing arrays. You need to specify a size for the array to allocation, otherwise you'll get memory block overruns.

OpenMP and MPI hybrid dynamic scheduling

As the number of threads increase, the count which is "temp" decreases..
When I sent the number of threads as "1" it gives an correct answer but as the number of threads increases, running time shorter but gives wrong answer
#include <stdio.h>
#include <mpi.h>
#include <complex.h>
#include <time.h>
#include <omp.h>
#define MAXITERS 1000
// globals
int count = 0;
int nptsside;
float side2;
float side4;
int temp = 0;
int inset(double complex c) {
int iters;
float rl,im;
double complex z = c;
for (iters = 0; iters < MAXITERS; iters++) {
z = z*z + c;
rl = creal(z);
im = cimag(z);
if (rl*rl + im*im > 4) return 0;
}
return 1;
}
int main(int argc, char **argv)
{
nptsside = atoi(argv[1]);
side2 = nptsside / 2.0;
side4 = nptsside / 4.0;
//struct timespec bgn,nd;
//clock_gettime(CLOCK_REALTIME, &bgn);
int x,y; float xv,yv;
double complex z;
int i;
int mystart, myend;
int nrows;
int nprocs, mype;
int data;
MPI_Status status;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &mype);
nrows = nptsside/nprocs;
printf("%d\n", nprocs);
mystart = mype*nrows;
myend = mystart + nrows - 1;
#pragma omp parallel shared(mystart, myend, temp)
{
int nth = omp_get_num_threads();
printf("%d\n", nth);
#ifdef STATIC
#pragma omp for reduction(+:temp) schedule(static)
#elif defined DYNAMIC
#pragma omp for reduction(+:temp) schedule(dynamic)
#elif defined GUIDED
#pragma omp for reduction(+:temp) schedule(guided)
#endif
for (x=mystart; x<=myend; x++) {
for ( y=0; y<nptsside; y++) {
xv = (x - side2) / side4;
yv = (y - side2) / side4;
z = xv + yv*I;
if (inset(z)) {
temp++;
}
}
}
}
if(mype==0) {
count += temp;
printf("%d\n", temp);
for (i = 1; i < nprocs; i++) {
MPI_Recv(&temp, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status);
count += temp;
printf("%d\n", temp);
}
}
else{
MPI_Send(&temp, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
MPI_Finalize();
if(mype==0) {
printf("%d\n", count);
}
//clock_gettime(CLOCK_REALTIME, &nd);
//printf("%f\n",timediff(bgn,nd));
}
You are not defining any private variables for when you enter the OpenMP loop.
First off, you must always declare your loop counter for your OpenMP loop (as well as any loop counters for nested loops inside your OpenMP loop) private.
Secondly, you have three variables (xv, yv, and z) that each depend on your iterations in these loops. Thus, each thread needs to have its own private copy of these variables as well. Changing your parallel statement to
#pragma omp parallel shared(mystart, myend, temp) private(x, y, xv, yv, z)
should fix your OpenMP problems.
Seeing as you say that setting your number of threads to 1 yields the correct answer, I have not looked at your MPI code.
EDIT: OK I lied, I briefly looked into your MPI code now. Instead of all of your sends and receives, you should be writing a single reduce. This collective will be much faster than the blocking communication you set up currently.
MPI_Reduce(&temp, &count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

Resources