Working with labels in OpenMP: How to or is it possible to jump in Threads? - multithreading

I am trying to parallelize some code, and the algorithm must remain the same. Here is the original code:
#include <stdio.h>
#define N 50
main()
{
int prime[N] ;
int j ;
int k ;
int n ;
int quo,rem ;
P1: prime[0] = 2 ;
n = 3 ;
j = 0 ;
P2: j = j+1 ;
prime[j] = n ;
P3: if (j == (N-1)) goto P9 ;
P4: n = n + 2 ;
P5: k = 1 ;
P6: quo = n / prime[k] ;
rem = n % prime[k] ;
if (rem == 0) goto P4 ;
P7: if (quo <= prime[k]) goto P2 ;
P8: k = k+1 ;
goto P6 ;
P9: for(j=0 ; j < N ; j++) printf("%d ",prime[j]) ;
}
And I changed it into this:
#include <stdio.h>
#include <omp.h>
#include <array>
#include <vector>
#define N 50
int tChecked = 0;
std::vector<int> tempArr;
std::array<int, 4> storeArr;
int main()
{
int prime[N];
int j;
int k;
int n;
int quo, rem;
int test = 0;
P1: prime[0] = 2;
n = 3;
j = 0;
P2:
if (tempArr.empty())
{
j = j + 1;
prime[j] = n;
}
else
{
std::sort(std::begin(tempArr), std::end(tempArr));
for (int i = 0; i < tempArr.size(); i++)
{
j = j + 1;
prime[j] = tempArr[i];
}
}
tChecked = 0;
tempArr.clear();
P3: if (j == (N - 1)) goto P9;
//P4: n = n + 2;
PX:
#pragma omp parallel num_threads(4)
{
int ID = omp_get_thread_num();
if (tChecked = 1)
{
n = storeArr[ID];
goto P6;
}
P4:
n = n + 2 * (ID + 1);
storeArr[ID] = n;
P5: k = 1;
P6: quo = n / prime[k];
rem = n % prime[k];
if (rem == 0) goto P4;
P7:
if (quo <= prime[k])
{
tempArr.push_back(n);
}
}
if (!tempArr.empty()) goto P2;
P8: k = k + 1;
tChecked = 1;
goto PX;
P9: for (j = 0; j < N; j++) printf("%d ", prime[j]);
getchar();
return 0;
}
I am using Visual Studio 2015 with OpenMP support turned on. When I run the code it crashes like this:
Exception thrown at 0x00ED9D29 in HW1.exe: 0xC0000005: Access
violation reading location 0x33612FA8.
When I look at the local variables, I see absurd values like:
prime[2] to prime[49] = -858993460
rem, quo, k = -858993460
What is this -858993460 anyway? I assume it is related to jumping inside threads, because the original code works fine.
So can you explain where this '-858993460' comes from and why? Is it related to the jumps or to something else? And is it possible to jump between labels inside parallel threads?
Note: I am also sharing the assignment text, in case it helps:
Implement an OpenMP program that generates prime numbers in a given
interval. You should use the prime generation method given on the next
page (do NOT use any other method!). Your program should generate a csv
file called results.csv that reports the timing results in the
following format.
(table of the required results.csv timing format omitted)
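For reference, -858993460 is the signed 32-bit reading of 0xCCCCCCCC, the pattern the Visual Studio debug runtime uses to fill uninitialized stack memory, so values like that mean a variable or array slot was read before anything wrote to it. A tiny standalone check (not part of the assignment) that reproduces the number:
#include <cstdio>
int main()
{
    // 0xCC is the MSVC debug-build fill byte for uninitialized stack memory;
    // four of them read as a signed 32-bit integer give -858993460 on
    // two's-complement targets such as MSVC on x86/x64.
    unsigned int fill = 0xCCCCCCCCu;
    std::printf("%d\n", (int)fill); // prints -858993460
    return 0;
}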

Related

Why the output goes wrong when using pthread_join?

I am still learning about threads and I am trying to solve this problem in my code. When I put pthread_join(threads[i], NULL) outside the loop that creates the threads, it always gives me wrong output: the thread with ID = 0 does not work (never calls the median function) and the last thread works twice. For a better understanding, see the output below:
ThreadID= 0, startRow= 0, endRow= 0 // first thread doesn't call the median func
ThreadID= 1, startRow= 1, endRow= 1
ThreadID 1 numOfBright 0 numOfDark 1 numOfNormal 4
ThreadID= 2, startRow= 2, endRow= 2
ThreadID 2 numOfBright 0 numOfDark 1 numOfNormal 4
ThreadID= 3, startRow= 3, endRow= 3
ThreadID 3 numOfBright 0 numOfDark 0 numOfNormal 5
ThreadID= 4, startRow= 4, endRow= 4
ThreadID 4 numOfBright 0 numOfDark 5 numOfNormal 0
ThreadID 4 numOfBright 0 numOfDark 5 numOfNormal 0 // last thread is calling the median func two times.
This is the part of the code that prints the start and end row of each thread.
pthread_t* threads = new pthread_t[num_threads];
struct Th_Range* RANGE = (struct Th_Range*)malloc(sizeof(struct Th_Range*));
int thread_status;
RANGE->SizeOfImage = r; // 2d array with size (n*n) so rows(r) = columns(c)
if (n == num_threads) { //rows = num of threads then every thread will work in a single row
for (int i = 0; i < num_threads; i++) {
RANGE->ThreadId = i;
RANGE->StartRow = RANGE->EndRow = i;
cout << "ThreadID= " << i << ", startRow= " << RANGE->StartRow << ", endRow= " << RANGE->EndRow << endl;
thread_status = pthread_create(&threads[i], NULL, Median, RANGE);
if (thread_status)
exit(-1);
} //for loop ends here
for (int i = 0; i < num_threads; i++)
pthread_join(threads[i],NULL);
} //end of if statement
Here is the rest of the code, if needed, with the median function and the above if statement.
#include <iostream>
#include <bits/stdc++.h>
#include <pthread.h>
pthread_mutex_t Lock;
pthread_mutex_t Pixels;
pthread_mutex_t Pixels2;
using namespace std;
int numOfBright, numOfDark, numOfNormal;
int** Oimage, ** Fimage; //original and filtered image
struct Th_Range {
int SizeOfImage;
int StartRow;
int EndRow;
int ThreadId;
};
void* Median(void* par)
{
struct Th_Range* Num = (struct Th_Range*)par;
int StartRow = Num->StartRow;
int EndRow = Num->EndRow;
int Size = Num->SizeOfImage;
int Neighbour[9] = { 0 };
int dark = 0, bright = 0, normal = 0;
if (EndRow == StartRow)
EndRow += 2;
else
EndRow++;
for (int i = StartRow +1; i < EndRow ; i++)
{
for (int j = 1; j < Size - 1; j++)
{
Neighbour[0] = Oimage[i - 1][j - 1];
Neighbour[1] = Oimage[i - 1][j];
Neighbour[2] = Oimage[i - 1][j + 1];
Neighbour[3] = Oimage[i][j - 1];
Neighbour[4] = Oimage[i][j];
Neighbour[5] = Oimage[i][j + 1];
Neighbour[6] = Oimage[i + 1][j - 1];
Neighbour[7] = Oimage[i + 1][j];
Neighbour[8] = Oimage[i + 1][j + 1];
pthread_mutex_lock(&Pixels); //it can be moved only to lock the Fimage and the numOfBright or any other global variables
sort(Neighbour, Neighbour + 9);
Fimage[i][j] = Neighbour[4];
if (Neighbour[4] > 200) {
bright++;
numOfBright++;
}
else if (Neighbour[4] < 50) {
dark++;
numOfDark++;
}
else {
normal++;
numOfNormal++;
}
pthread_mutex_unlock(&Pixels);
}
}
pthread_mutex_lock(&Pixels2); //when I try to remove this lock the output gets interrupted
cout << "ThreadID " << Num->ThreadId << " numOfBright " << bright << " numOfDark " << dark << " numOfNormal " << normal<<endl;
pthread_mutex_unlock(&Pixels2);
pthread_exit(NULL);
}
int main(int argc, char* argv[])
{
int num_threads, n, r, c; // n is the size of the matrix r and c are rows and columns
numOfNormal = numOfDark = numOfBright = 0;
if (argc >= 2)
num_threads = atoi(argv[1]);
else
exit(-1);
ifstream cin("input.txt");
cin >> n;
r = c = n + 2;
Oimage = new int* [r]();
Fimage = new int* [r]();
for (int i = 0; i < c; i++)
{
Oimage[i] = new int[c]();
Fimage[i] = new int[c]();
}
for (int i = 1; i < r - 1; i++)
for (int j = 1; j < c - 1; j++)
cin >> Oimage[i][j];
pthread_t* threads = new pthread_t[num_threads];
struct Th_Range* RANGE = (struct Th_Range*)malloc(sizeof(struct Th_Range*));
RANGE->SizeOfImage = r;
if (n == num_threads) { //rows = num of threads then every thread will work in a single row
//n+2
int thread_status;
for (int i = 0; i < num_threads; i++) {
RANGE->ThreadId = i;
RANGE->StartRow = RANGE->EndRow = i;
// pthread_mutex_lock(&Lock);
cout << "ThreadID= " << i << ", startRow= " << RANGE->StartRow << ", endRow= " << RANGE->EndRow << endl;
thread_status = pthread_create(&threads[i], NULL, Median, RANGE);
if (thread_status)
exit(-1);
}
}
I tried moving pthread_join inside the pthread_create loop; that gives correct output, but of course it is a wrong solution. I have no idea what to do next. Thanks in advance.
Maybe you should use #include <...> or (using namespace std); it must work.
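For what it's worth, the posted loop passes the same RANGE pointer to every pthread_create call and then overwrites that one struct on the next iteration, so the threads race on a single shared argument (and malloc(sizeof(struct Th_Range*)) only reserves room for a pointer, not for a whole struct). A minimal sketch of giving each thread its own argument, assuming the rest of the program from the question:
struct Th_Range* ranges = new Th_Range[num_threads];   // one argument struct per thread
for (int i = 0; i < num_threads; i++) {
    ranges[i].SizeOfImage = r;
    ranges[i].ThreadId = i;
    ranges[i].StartRow = ranges[i].EndRow = i;
    cout << "ThreadID= " << i << ", startRow= " << ranges[i].StartRow
         << ", endRow= " << ranges[i].EndRow << endl;
    if (pthread_create(&threads[i], NULL, Median, &ranges[i]) != 0)
        exit(-1);
}
for (int i = 0; i < num_threads; i++)   // join after all threads have been created
    pthread_join(threads[i], NULL);
delete[] ranges;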

Stack around the variable A (array) or n (size of the array) was corrupted

This program checks distinctness in an array (no repeated values in the array, i.e. if 1 2 3 3 4 is the array, it is not distinct). This code won't run correctly, although (I believe) the array index does not go out of range in the for loop.
The Run-Time Check Failure says stack around the variable 'n' was corrupted when I enter n = 12, BUT it says stack around the variable 'A' was corrupted when I enter n = 10, with exactly the same values entered into the array in both cases (the error shows up after entering the fourth integer).
#include <iostream>
using namespace std;
int main()
{
int n;
int A[] = {0};
int integer;
cout<<"Enter the size of the array\n";
cin>>n;
cout<<"enter "<<n<<" integers\n";
for (int i = 0 ; i < n ; i++)
{
cin>>A[i];
}
for (int i = 0 ; i < n ; i++)
{
for (int j = 0 ; j < n - i; j++)
{
if(A[j+1] > A[j])
{
int temp;
temp = A[j];
A[j+1] = A[j];
A[j+1] = temp;
}
}
}
for (int i = 0 ; i < n; i++)
{
if (A[i] - A[i+1] ==0 ){
cout<<"\nThe Array Is Not Distinct !\n";
break;
}
else
{
cout<<"\nThe Array Is Distinct !\n";
}
}
system("pause");
return 0;
}
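The declaration int A[] = {0} creates an array with exactly one element, so the input loop writes past the end of A as soon as n > 1, which is what the run-time check reports. A minimal sketch of sizing the storage from n instead (using std::vector here as one option; the rest of the checks would stay as in the question):
#include <iostream>
#include <vector>
using namespace std;
int main()
{
    int n;
    cout << "Enter the size of the array\n";
    cin >> n;
    vector<int> A(n);                 // n elements, instead of a fixed 1-element array
    cout << "enter " << n << " integers\n";
    for (int i = 0; i < n; i++)
        cin >> A[i];
    // ... sorting and distinctness check as in the question ...
    return 0;
}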

Hybrid MPI+OpenMP Vs MPI Performance

I am converting a 3-D Jacobi solver from pure MPI to hybrid MPI+OpenMP. I have a 192x192x192 array which is divided among 24 processes in pure MPI with a 1-D decomposition, i.e. each process has a 192/24 x 192 x 192 = 8 x 192 x 192 slab of data. Now I do:
for(i=0 ; i <= 7; i++)
for(j=0; j<= 191; j++)
for(k=0; k<= 191; k++)
{
unew[i][j][k] = 1/6.0 * (u[i+1][j][k]+u[i-1][j][k]+
u[i][j+1][k]+u[i][j-1][k]+
u[i][j][k+1]+u[i][j][k-1]);
}
This update takes around 60 seconds for each process.
Now with hybrid MPI, I run two processes (1 process per socket: --bind-to socket --map-by socket, with OMP_PROC_PLACES=cores and OMP_PROC_BIND=close). I create 12 threads per MPI process (i.e. 12 threads per socket or processor). Now each MPI process has an array of size 192/2 x 192 x 192 = 96 x 192 x 192 elements. Each thread works on a 96/12 x 192 x 192 = 8 x 192 x 192 portion of the array owned by its process. I do the same triple-loop update using threads, but the time is approximately 76 seconds for each thread. The load balance is perfect in both cases. What could be the possible causes of the performance degradation? Is it false sharing, because threads could be invalidating the cache lines close to each other's chunk of data? If yes, then how do I reduce this performance degradation? (I have purposefully not mentioned ghost data, but initially I am NOT overlapping communication with computation.)
In response to the comments below, I am posting the code. Apologies for the long MWE, but you can very safely ignore (1) the header file declarations, (2) the variable declarations, (3) the memory allocation routine, (4) the formation of the Cartesian topology, (5) setting the boundary conditions in an OpenMP parallel region, (6) the declaration of the MPI_Type_create_subarray datatypes and (7) the MPI_Isend() and MPI_Irecv() calls, and just concentrate on (a) the INDEPENDENT UPDATE OpenMP parallel region and (b) the independent_update(...) routine called from there.
/* IGNORE THIS PORTION */
#include<mpi.h>
#include<omp.h>
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#define MIN(a,b) (a < b ? a : b)
#define Tol 0.00001
/* IGNORE THIS ROUTINE */
void input(int *X, int *Y, int *Z)
{
int a=193, b=193, c=193;
*X = a;
*Y = b;
*Z = c;
}
/* IGNORE THIS ROUTINE */
float*** allocate_mem(int X, int Y, int Z)
{
int i,j;
float ***matrix;
float *arr;
arr = (float*)calloc(X*Y*Z, sizeof(float));
matrix = (float***)calloc(X, sizeof(float**));
for(i = 0 ; i<= X-1; i++)
matrix[i] = (float**)calloc(Y, sizeof(float*));
for(i = 0 ; i <= X-1; i++)
for(j=0; j<= Y-1; j++)
matrix[i][j] = &(arr[i*Y*Z + j*Z]);
return matrix ;
}
/* THIS ROUTINE IS IMPORTANT */
float independent_update(float ***old, float ***new, int NX, int NY, int NZ, int tID, int chunk)
{
int i,j,k, start, end;
float error = 0.0;
float diff;
start = tID * chunk + 1;
end = MIN( (tID+1)*chunk, NX-2 );
for(i = start; i <= end ; i++)
{
for(j = 1; j<= NY-2; j++)
{
#pragma omp simd
for(k = 1; k<= NZ-2; k++)
{
new[i][j][k] = (1/6.0) *(old[i-1][j][k] + old[i+1][j][k] + old[i][j-1][k] + old[i][j+1][k] + old[i][j][k-1] + old[i][j][k+1] );
diff = 1.0 - new[i][j][k];
diff = (diff > 0 ? diff : -1.0 * diff );
if(diff > error)
error = diff;
}
}
}
return error;
}
int main(int argc, char *argv[])
{
/* IGNORE VARIABLE DECLARATION */
int size, rank; //Size of old_comm and rank of process
int i, j, k,l; //General loop variables
MPI_Comm old_comm, new_comm; //MPI_COMM_WORLD handle and for MPI_Cart_create()
int N[3]; //For taking input of size of matrix from user
int P; //Represent number of processes i.e. same as size
int dims[3]; //For dimensions of Cartesian topology
int PX, PY, PZ; //X dim, Y dim, Z dim of each process
float ***old, ***new, ***temp; //Matrices for results dimensions is (Px+2)*(PY+2)*(PZ+2)
int period[3]; //Periodicity for each dimension
int reorder; //Whether processes should be reordered in new cartesian topology
int ndims; //Number of dimensions (which is 3)
int Z_TOWARDS_U, Z_AWAY_U; //Z neighbour towards you and away from you (Z const)
int X_DOWN, X_UP; //Below plane and above plane (X const)
int Y_LEFT, Y_RIGHT; //Left plane and right plane (Y const)
int coords[3]; //Finding coordinates of processes
int dimension; //Used in MPI_Cart_shift() , values = 0, 1,2
int displacement; //Used in MPI_Cart_shift(), values will be +1 to find immediate neighbours
float l_max_err; //Local maximum error on process
float l_max_err_new; //For dependent faces.
float G_max_err = 1.0; //Maximum error for stopping criterion
int iterations = 0 ; //Counting number of iterations
MPI_Request send[6], recv[6]; //For MPI_Isend and MPI_Irecv
int start[3]; //Start will be defined in MPI_Isend() and MPI_Irecv()
int gsize[3]; //Defining global size of subarray
MPI_Datatype x_subarray; //For sending X_UP and X_DOWN
int local_x[3]; //Defining local plane size for X_UP/X_DOWN
MPI_Datatype y_subarray; //For sending Y_LEFT and Y_RIGHT
int local_y[3]; //Defining local plane for Y_LEFT/Y_RIGHT
MPI_Datatype z_subarray; //For sending Z_TOWARDS_U and Z_AWAY_U
int local_z[3]; //Defining local plan size for XY plane i.e. where Z=0
double strt, end; //For measuring time
double strt1, end1, delta1; //For measuring trivial time 1
double strt2, end2, delta2; //For measuring trivial time 2
double t_i_strt, t_i_end, t_i_sum=0; //Time for independent computational kernel
double t_up_strt, t_up_end, t_up_sum=0; //Time for X_UP
double t_down_strt, t_down_end, t_down_sum=0; //Time for X_DOWN
double t_left_strt, t_left_end, t_left_sum=0; //Time for Y_LEFT
double t_right_strt, t_right_end, t_right_sum=0; //Time for Y_RIGHT
double t_towards_strt, t_towards_end, t_towards_sum=0; //For Z_TOWARDS_U
double t_away_strt, t_away_end, t_away_sum=0; //For Z_AWAY_U
double t_comm_strt, t_comm_end, t_comm_sum=0; //Time comm + independent update (need to subtract to get comm time)
double t_setup_strt,t_setup_end; //Set-up start and end time
double t_allred_strt,t_allred_end,t_allred_total=0.0; //Measuring Allreduce time separately.
int threadID; //ID of a thread
int nthreads; //Total threads in OpenMP region
int chunk; //chunk - used to calculate iterations of a thread
/* IGNORE MPI STARTUP ETC */
MPI_Init(&argc, &argv);
t_setup_strt = MPI_Wtime();
old_comm = MPI_COMM_WORLD;
MPI_Comm_size(old_comm, &size);
MPI_Comm_rank(old_comm, &rank);
P = size;
if(rank == 0)
{
input(&N[0], &N[1], &N[2]);
}
MPI_Bcast(N, 3, MPI_INT, 0, old_comm);
dims[0] = 0;
dims[1] = 0;
dims[2] = 0;
period[0] = period[1] = period[2] = 0; //All dimensions aperiodic
reorder = 0 ; //No reordering of ranks in new_comm
ndims = 3;
MPI_Dims_create(P,ndims,dims);
MPI_Cart_create(old_comm, ndims, dims, period, reorder, &new_comm);
if( (N[0]-1) % dims[0] == 0 && (N[1]-1) % dims[1] == 0 && (N[2]-1) % dims[2] == 0 )
{
PX = (N[0]-1)/dims[0]; //Rows of unknowns each process gets
PY = (N[1]-1)/dims[1]; //Columns of unknowns each process gets
PZ = (N[2]-1)/dims[2]; //Depth of unknowns each process gets
}
old = allocate_mem(PX+2, PY+2, PZ+2); //3D arrays with ghost points
new = allocate_mem(PX+2, PY+2, PZ+2); //3D arrays with ghost points
dimension = 0;
displacement = 1;
MPI_Cart_shift(new_comm, dimension, displacement, &X_UP, &X_DOWN); //Find UP and DOWN neighbours
dimension = 1;
MPI_Cart_shift(new_comm, dimension, displacement, &Y_LEFT, &Y_RIGHT); //Find LEFT and RIGHT neighbours
dimension = 2;
MPI_Cart_shift(new_comm, dimension, displacement, &Z_TOWARDS_U, &Z_AWAY_U); //Find TOWARDS and AWAY neighbours
/* IGNORE BOUNDARY SETUPS FOR PDE */
#pragma omp parallel for default(none) shared(old,new,PX,PY,PZ) private(i,j,k) schedule(static)
for(i = 0; i <= PX+1; i++)
{
for(j = 0; j <= PY+1; j++)
{
for(k = 0; k <= PZ+1; k++)
{
old[i][j][k] = 0.0;
new[i][j][k] = 0.0;
}
}
}
#pragma omp parallel default(none) shared(X_DOWN,X_UP,Y_LEFT,Y_RIGHT,Z_TOWARDS_U,Z_AWAY_U,old,new,PX,PY,PZ) private(i,j,k,threadID,nthreads)
{
threadID = omp_get_thread_num();
nthreads = omp_get_num_threads();
if(threadID == 0)
{
if(X_DOWN == MPI_PROC_NULL) //X is constant here, this is YZ upper plane
{
for(j = 1 ; j<= PY ; j++)
for(k = 1 ; k<= PZ ; k++)
{
old[0][j][k] = 1;
new[0][j][k] = 1; //Set boundaries in new also
}
}
}
if(threadID == (nthreads-1))
{
if(X_UP == MPI_PROC_NULL) //YZ lower plane
{
for(j = 1 ; j<= PY ; j++)
for(k = 1; k<= PZ ; k++)
{
old[PX+1][j][k] = 1;
new[PX+1][j][k] = 1;
}
}
}
if(Y_LEFT == MPI_PROC_NULL) //Y is constant, this is left XZ plane, possibly can use collapse(2)
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX ; i++)
for(k = 1; k<= PZ; k++)
{
old[i][0][k] = 1;
new[i][0][k] = 1;
}
}
if(Y_RIGHT == MPI_PROC_NULL) //XZ right plane, again collapse(2) potential
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX; i++)
for(k = 1; k<= PZ ; k++)
{
old[i][PY+1][k] = 1;
new[i][PY+1][k] = 1;
}
}
if(Z_TOWARDS_U == MPI_PROC_NULL) //Z is constant here, towards you XY plane, collapse(2)
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX ; i++)
for(j = 1; j<= PY ; j++)
{
old[i][j][0] = 1;
new[i][j][0] = 1;
}
}
if(Z_AWAY_U == MPI_PROC_NULL) //Away from you XY plane, collapse(2)
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX; i++)
for(j = 1; j<= PY ; j++)
{
old[i][j][PZ+1] = 1;
new[i][j][PZ+1] = 1;
}
}
}
/* IGNORE SUBARRAY DECLARATION */
gsize[0] = PX+2; //Global sizes of 3-D cubes for each process
gsize[1] = PY+2;
gsize[2] = PZ+2;
start[0] = 0; //Will specify starting location while sending/receiving
start[1] = 0;
start[2] = 0;
local_x[0] = 1;
local_x[1] = PY;
local_x[2] = PZ;
MPI_Type_create_subarray(ndims, gsize, local_x, start, MPI_ORDER_C, MPI_FLOAT, &x_subarray);
MPI_Type_commit(&x_subarray);
local_y[0] = PX;
local_y[1] = 1;
local_y[2] = PZ;
MPI_Type_create_subarray(ndims, gsize, local_y, start, MPI_ORDER_C, MPI_FLOAT, &y_subarray);
MPI_Type_commit(&y_subarray);
local_z[0] = PX;
local_z[1] = PY;
local_z[2] = 1;
MPI_Type_create_subarray(ndims, gsize, local_z, start, MPI_ORDER_C, MPI_FLOAT, &z_subarray);
MPI_Type_commit(&z_subarray);
t_setup_end = MPI_Wtime();
strt = MPI_Wtime();
while(G_max_err > Tol) //iterations < ITERATIONS)
{
iterations++ ;
t_comm_strt = MPI_Wtime();
/* IGNORE MPI COMMUNICATION */
MPI_Irecv(&old[0][1][1], 1, x_subarray, X_DOWN, 10, new_comm, &recv[0]);
MPI_Irecv(&old[PX+1][1][1], 1, x_subarray, X_UP, 20, new_comm, &recv[1]);
MPI_Irecv(&old[1][PY+1][1], 1, y_subarray, Y_RIGHT, 30, new_comm, &recv[2]);
MPI_Irecv(&old[1][0][1], 1, y_subarray, Y_LEFT, 40, new_comm, &recv[3]);
MPI_Irecv(&old[1][1][PZ+1], 1, z_subarray, Z_AWAY_U, 50, new_comm, &recv[4]);
MPI_Irecv(&old[1][1][0], 1, z_subarray, Z_TOWARDS_U, 60, new_comm, &recv[5]);
MPI_Isend(&old[PX][1][1], 1, x_subarray, X_UP, 10, new_comm, &send[0]);
MPI_Isend(&old[1][1][1], 1, x_subarray, X_DOWN, 20, new_comm, &send[1]);
MPI_Isend(&old[1][1][1], 1, y_subarray, Y_LEFT, 30, new_comm, &send[2]);
MPI_Isend(&old[1][PY][1], 1, y_subarray, Y_RIGHT, 40, new_comm, &send[3]);
MPI_Isend(&old[1][1][1], 1, z_subarray, Z_TOWARDS_U, 50, new_comm, &send[4]);
MPI_Isend(&old[1][1][PZ], 1, z_subarray, Z_AWAY_U, 60, new_comm, &send[5]);
MPI_Waitall(6, send, MPI_STATUSES_IGNORE);
MPI_Waitall(6, recv, MPI_STATUSES_IGNORE);
t_comm_end = MPI_Wtime();
t_comm_sum = t_comm_sum + (t_comm_end - t_comm_strt);
/* Use threads in Independent update */
t_i_strt = MPI_Wtime();
l_max_err = 0.0; //Very important, Reduction result is combined with this !
/* THIS IS THE IMPORTANT REGION */
#pragma omp parallel default(none) shared(old,new,PX,PY,PZ,chunk) private(threadID,nthreads) reduction(max:l_max_err)
{
nthreads = omp_get_num_threads();
threadID = omp_get_thread_num();
chunk = (PX-1+1) / nthreads ;
l_max_err = independent_update(old, new, PX+2, PY+2, PZ+2, threadID, chunk);
}
t_i_end = MPI_Wtime();
t_i_sum = t_i_sum + (t_i_end - t_i_strt) ;
/* IGNORE THE REMAINING CODE */
t_allred_strt = MPI_Wtime();
MPI_Allreduce(&l_max_err, &G_max_err, 1, MPI_FLOAT, MPI_MAX, new_comm);
t_allred_end = MPI_Wtime();
t_allred_total = t_allred_total + (t_allred_end - t_allred_strt);
temp = new ;
new = old;
old = temp;
}
MPI_Barrier(new_comm);
end = MPI_Wtime();
if( rank == 0)
{
printf("\nIterations = %d, G_max_err = %f", iterations, G_max_err);
printf("\nThe total SET-UP time for MPI and boundary conditions is %lf", (t_setup_end-t_setup_strt));
printf("\nThe total time for SOLVING is %lf", (end-strt));
printf("\nThe total time for INDEPENDENT COMPUTE %lf", t_i_sum);
printf("\nThe total time for COMMUNICATION OVERHEAD is %lf", t_comm_sum);
printf("\nThe total time for MPI_ALLREDUCE() is %lf", t_allred_total);
}
MPI_Type_free(&x_subarray);
MPI_Type_free(&y_subarray);
MPI_Type_free(&z_subarray);
free(&old[0][0][0]);
free(&new[0][0][0]);
MPI_Finalize();
return 0;
}
P.S. : I am almost sure that the cost of spawning/waking the threads is not the reason for such a huge difference in the timing.
Please find attached Scalasca snapshot for INDEPENDENT COMPUTE of the Hybrid Program.
Using loop simd construct
#pragma omp parallel default(none) shared(old,new,PX,PY,PZ,l_max_err) private(i,j,k,diff)
{
#pragma omp for simd schedule(static) reduction(max:l_max_err)
for(i = 1; i <= PX ; i++)
{
for(j = 1; j<= PY; j++)
{
for(k = 1; k<= PZ; k++)
{
new[i][j][k] = (1/6.0) *(old[i-1][j][k] + old[i+1][j][k] + old[i][j-1][k] + old[i][j+1][k] + old[i][j][k-1] + old[i][j][k+1] );
diff = 1.0 - new[i][j][k];
diff = (diff > 0 ? diff : -1.0 * diff );
if(diff > l_max_err)
l_max_err = diff;
}
}
}
}
You frequently get memory access and cache issues when you just do one MPI process per socket on a CPU with multiple memory controllers. It can be on either the read or the write side, so you can't really say which. This is especially an issue when doing thread-parallel execution with lightweight compute tasks (e.g. math on arrays). One MPI process per socket in this case tends to fare significantly worse than pure MPI.
In your BIOS, set up whatever the maximal NUMA per socket option is
Use one MPI process per NUMA node.
Try some different parameter values in schedule(static). I've rarely found the default to be best.
Essentially what this will do is ensure each bundle of threads only works on a single pool of memory.
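To make that concrete: with Open MPI, one process per NUMA node can usually be requested with something like mpirun --map-by numa --bind-to numa (the exact option names depend on the MPI implementation and version), keeping OMP_PROC_BIND=close so each rank's threads stay on their own memory pool. For the schedule experiments, below is a standalone sketch (not the poster's code; the sizes match one rank's slab from the question, and CHUNK is only a placeholder to start tuning from) that times the same 6-point update with an explicit chunk in schedule(static, CHUNK):
#include <omp.h>
#include <cstdio>
#include <vector>
int main()
{
    const int NX = 96, NY = 192, NZ = 192;   // one rank's slab, as in the question
    const int CHUNK = 4;                     // i-planes handed to a thread at a time; tune this
    std::vector<float> u((size_t)(NX + 2) * (NY + 2) * (NZ + 2), 0.0f);
    std::vector<float> unew(u.size(), 0.0f);
    auto idx = [&](int i, int j, int k) { return ((size_t)i * (NY + 2) + j) * (NZ + 2) + k; };
    double t0 = omp_get_wtime();
    float err = 0.0f;
    #pragma omp parallel for schedule(static, CHUNK) reduction(max:err)
    for (int i = 1; i <= NX; ++i)
        for (int j = 1; j <= NY; ++j)
            for (int k = 1; k <= NZ; ++k) {
                float v = (1.0f / 6.0f) * (u[idx(i-1,j,k)] + u[idx(i+1,j,k)] +
                                           u[idx(i,j-1,k)] + u[idx(i,j+1,k)] +
                                           u[idx(i,j,k-1)] + u[idx(i,j,k+1)]);
                unew[idx(i,j,k)] = v;             // same 6-point Jacobi update as the question
                float d = 1.0f - v;
                if (d < 0.0f) d = -d;
                if (d > err) err = d;             // track the maximum residual, as in the question
            }
    std::printf("chunk=%d time=%f max_err=%f\n", CHUNK, omp_get_wtime() - t0, err);
    return 0;
}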

Suffix Array Implementation Bugs

I've coded a Suffix Array implementation and discovered an issue in it. Concretely, I've output the first few suffix array ranks (RA[0] through RA[6]) of a string (length = 10^5) and got the following output:
80994
84360
87854
91517
95320
99277
83068
But the correct output should be (everything shifted by 23):
81017
84383
87877
91540
95343
99300
83091
I know two ways to fix it, but I don't know why they work.
The first way was adding S[N++] = '$'; at the top of the buildSA() function (then the output was 1 less than the correct one, but that doesn't matter).
I also found another solution: decreasing the MAX_N constant to 1e5 + 10!
This is too much magic for me, and I really need to know why this bug happened, because I don't want to have it again.
#include <cstdio>
#include <cstring>
#include <algorithm>
using std::max;
const int MAX_N = 2e5 + 10;
int SA[MAX_N]; // The ith element is the index of the suffix
int RA[MAX_N]; // The rank of the suffix at i
int tmp[MAX_N]; // A temporary array
int B[MAX_N]; // An array for the buckets
int N;
char S[MAX_N];
void bucketSort(int k){
int i, m = max(256, N);
for(i = 0; i < m; i++)
B[i] = 0;
for(i = 0; i < N; i++)
B[i + k < N ? RA[i + k] : 0] ++;
for(i = 1; i < m; i++)
B[i] += B[i - 1];
for(i = N - 1; i >= 0; i--)
tmp[--B[SA[i] + k < N ? RA[SA[i] + k] : 0]] = SA[i];
for(i = 0; i < N; i++)
SA[i] = tmp[i];
}
void buildSA(){
for(int i = 0; i < N; i++){
SA[i] = i;
RA[i] = S[i];
}
for(int k = 1; k < N; k <<= 1){
bucketSort(k);
bucketSort(0);
int norder = 0;
tmp[SA[0]] = 0;
for(int i = 1; i < N; i++){
if(RA[SA[i]] == RA[SA[i - 1]] && RA[SA[i] + k] == RA[SA[i - 1] + k])
{} else norder++;
tmp[SA[i]] = norder;
}
for(int i = 0; i < N; i++)
RA[i] = tmp[i];
if(norder == N)
break;
}
}
void printSA(){
for(int i = 0; i < N; i++){
printf("%d: %s\n", SA[i], S + SA[i]);
}
}
int main(){
scanf("%s", S);
N = strlen(S);
buildSA();
for(int i = 0; i < 7; i++){
printf("%d\n",RA[i]);
}
return 0;
}
In the following line:
if(RA[SA[i]] == RA[SA[i - 1]] && RA[SA[i] + k] == RA[SA[i - 1] + k])
SA[i] + k can be >= N (and the same holds for SA[i - 1] + k).
It should be (SA[i] + k) % N instead.
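In code, that suggested change would look like this (a sketch, keeping the rest of buildSA() from the question unchanged):
if(RA[SA[i]] == RA[SA[i - 1]] &&
   RA[(SA[i] + k) % N] == RA[(SA[i - 1] + k) % N]) // wrap the offset instead of reading past RA[N-1]
{} else norder++;
tmp[SA[i]] = norder;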
I think I got it after many wasted hours. Sometimes the smallest mistakes can result in wrong answers.
The "bad" code line is:
if(RA[SA[i]] == RA[SA[i - 1]] && RA[SA[i] + k] == RA[SA[i - 1] + k])
{} else norder++;
I verified this by using a very simple test case (I couldn't generate them randomly...), like:
abab
The resulting suffix array was
0: abab
2: ab
3: b
1: bab
which is clearly wrong.
At step k = 2, if we are comparing two suffixes like ab and abab, we realize that they have the same rank, since their first k = 2 characters match. ab is suffix #2, so by adding k = 2 we go out of range.
I've often coded it like this because I've always appended an auxiliary character (e.g. '$') to the end. If I don't append such a character (as in my case), SA[i] + k can actually be >= N and this code crashes.
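That is also why the asker's first fix works; as a sketch, the sentinel variant only changes the start of buildSA() (assuming '$' compares smaller than every character that can appear in the input):
void buildSA(){
    S[N++] = '$';   // sentinel suffix, ranked lowest; all other ranks shift up by 1
    for(int i = 0; i < N; i++){
        SA[i] = i;
        RA[i] = S[i];
    }
    // ... rest of the function exactly as in the question ...
}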

CodeJam 2014: How to solve task "New Lottery Game"?

I want to know an efficient approach for the New Lottery Game problem.
The Lottery is changing! The Lottery used to have a machine to generate a random winning number. But due to cheating problems, the Lottery has decided to add another machine. The new winning number will be the result of the bitwise-AND operation between the two random numbers generated by the two machines.
To find the bitwise-AND of X and Y, write them both in binary; then a bit in the result in binary has a 1 if the corresponding bits of X and Y were both 1, and a 0 otherwise. In most programming languages, the bitwise-AND of X and Y is written X&Y.
For example:
The old machine generates the number 7 = 0111.
The new machine generates the number 11 = 1011.
The winning number will be (7 AND 11) = (0111 AND 1011) = 0011 = 3.
With this measure, the Lottery expects to reduce the cases of fraudulent claims, but unfortunately an employee from the Lottery company has leaked the following information: the old machine will always generate a non-negative integer less than A and the new one will always generate a non-negative integer less than B.
Catalina wants to win this lottery and to give it a try she decided to buy all non-negative integers less than K.
Given A, B and K, Catalina would like to know in how many different ways the machines can generate a pair of numbers that will make her a winner.
For small inputs we can check all possible pairs, but how do we do it for large inputs? I guess we represent the numbers as binary strings first and then count the combinations that give a result less than K. But I can't seem to figure out how to count the possible combinations of two binary strings.
I used a general DP technique that I described in a lot of detail in another answer.
We want to count the pairs (a, b) such that a < A, b < B and a & b < K.
The first step is to convert the numbers to binary and to pad them to the same size by adding leading zeroes. I just padded them to a fixed size of 40. The idea is to build up the valid a and b bit by bit.
Let f(i, loA, loB, loK) be the number of valid suffix pairs of a and b of size 40 - i. If loA is true, it means that the prefix up to i is already strictly smaller than the corresponding prefix of A. In that case there is no restriction on the next possible bit for a. If loA is false, A[i] is an upper bound on the next bit we can place at the end of the current prefix. loB and loK have an analogous meaning.
Now we have the following transition:
long long f(int i, bool loA, bool loB, bool loK) {
// TODO add memoization
if (i == 40)
return loA && loB && loK;
int hiA = loA ? 1: A[i]-'0'; // upper bound on the next bit in a
int hiB = loB ? 1: B[i]-'0'; // upper bound on the next bit in b
int hiK = loK ? 1: K[i]-'0'; // upper bound on the next bit in a & b
long long res = 0;
for (int a = 0; a <= hiA; ++a)
for (int b = 0; b <= hiB; ++b) {
int k = a & b;
if (k > hiK) continue;
res += f(i+1, loA || a < A[i]-'0',
loB || b < B[i]-'0',
loK || k < K[i]-'0');
}
return res;
}
The result is f(0, false, false, false).
The runtime is O(max(log A, log B)) if memoization is added to ensure that every subproblem is only solved once.
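As a small illustration of the padding step (the helper name is made up; f() and the 40-bit pad size are the ones from the answer above):
#include <string>
// Write x as a 40-character binary string with leading zeroes, so A, B and K
// can be indexed bit by bit inside f().
std::string toBits(long long x) {
    std::string s(40, '0');
    for (int i = 39; i >= 0; --i) {
        s[i] = char('0' + (x & 1));
        x >>= 1;
    }
    return s;
}
// Usage, assuming the input bounds are a, b and k:
//   A = toBits(a); B = toBits(b); K = toBits(k);
//   long long ways = f(0, false, false, false);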
What I did was just identify when the answer is A * B.
Otherwise, I just brute-force the rest; this code passed the large input.
// for each test cases
long count = 0;
if ((K > A) || (K > B)) {
count = A * B;
continue; // print count and go to the next test case
}
count = A * B - (A-K) * (B-K);
for (int i = K; i < A; i++) {
for (int j = K; j < B; j++) {
if ((i&j) < K) count++;
}
}
I hope this helps!
Just as Niklas B. said; the whole answer is:
#include <algorithm>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
#define MAX_SIZE 32
int A, B, K;
int arr_a[MAX_SIZE];
int arr_b[MAX_SIZE];
int arr_k[MAX_SIZE];
bool flag [MAX_SIZE][2][2][2];
long long matrix[MAX_SIZE][2][2][2];
long long
get_result();
int main(int argc, char *argv[])
{
int case_amount = 0;
cin >> case_amount;
for (int i = 0; i < case_amount; ++i)
{
const long long result = get_result();
cout << "Case #" << 1 + i << ": " << result << endl;
}
return 0;
}
long long
dp(const int h,
const bool can_A_choose_1,
const bool can_B_choose_1,
const bool can_K_choose_1)
{
if (MAX_SIZE == h)
return can_A_choose_1 && can_B_choose_1 && can_K_choose_1;
if (flag[h][can_A_choose_1][can_B_choose_1][can_K_choose_1])
return matrix[h][can_A_choose_1][can_B_choose_1][can_K_choose_1];
int cnt_A_max = arr_a[h];
int cnt_B_max = arr_b[h];
int cnt_K_max = arr_k[h];
if (can_A_choose_1)
cnt_A_max = 1;
if (can_B_choose_1)
cnt_B_max = 1;
if (can_K_choose_1)
cnt_K_max = 1;
long long res = 0;
for (int i = 0; i <= cnt_A_max; ++i)
{
for (int j = 0; j <= cnt_B_max; ++j)
{
int k = i & j;
if (k > cnt_K_max)
continue;
res += dp(h + 1,
can_A_choose_1 || (i < cnt_A_max),
can_B_choose_1 || (j < cnt_B_max),
can_K_choose_1 || (k < cnt_K_max));
}
}
flag[h][can_A_choose_1][can_B_choose_1][can_K_choose_1] = true;
matrix[h][can_A_choose_1][can_B_choose_1][can_K_choose_1] = res;
return res;
}
long long
get_result()
{
cin >> A >> B >> K;
memset(arr_a, 0, sizeof(arr_a));
memset(arr_b, 0, sizeof(arr_b));
memset(arr_k, 0, sizeof(arr_k));
memset(flag, 0, sizeof(flag));
memset(matrix, 0, sizeof(matrix));
int i = 31;
while (i >= 1)
{
arr_a[i] = A % 2;
A /= 2;
arr_b[i] = B % 2;
B /= 2;
arr_k[i] = K % 2;
K /= 2;
i--;
}
return dp(1, 0, 0, 0);
}
