For my open source project cachegrand we are implementing AARCH64 support and although most of the port is completed we are sorting out a feature to perform an accelerated array search using NEON instructions.
The logic we use is pretty simple:
in input there is an array of 14 uint32 elements, the value to find and a mask to ignore certain matches
the code has to find any value that matches a specific uint32
build a bitmask
the least significant bits of the bitmask match the begin of the array
the bitmask is then & with the skip indices mask
and then the trailing zeros are counted to determine the index of the first occurrence
It's a very rare occurrence that the skip indices mask is actually used; I would say that in 99.9% of the cases it will be zero.
I have come up with the following implementation, but I have no experience with ARMv8 NEON instructions and it feels a bit clunky, so I was wondering if there is a way to make it faster and/or better.
For reference, currently the code is compiled only with GCC.
/**
 * Finds the first element equal to `hash` in an array of 14 uint32 values
 * using ARMv8 NEON.
 *
 * A bitmask is built with bit i set when hashes[i] == hash, the bits listed in
 * skip_indexes_mask are cleared, and the index of the lowest remaining bit is
 * returned.
 *
 * @param hash               value to look for
 * @param hashes             pointer to at least 14 readable uint32 elements
 * @param skip_indexes_mask  bit i set => a match at index i is ignored
 * @return index (0..13) of the first non-skipped match, or 32 when there is
 *         none (the value _tzcnt_u32 yields for an all-zero mask in the AVX2
 *         variant; __builtin_ctz(0) itself is undefined).
 */
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
        uint32_t hash,
        volatile uint32_t* hashes,
        uint32_t skip_indexes_mask) {
    /* Lane i of a window is shifted left by i so the four 0/1 lanes sum up to
     * a 4-bit group. */
    static const int32x4_t shift = {0, 1, 2, 3};
    /* Start index of each 4-lane window. The last window starts at 10 and
     * overlaps the previous one so that only elements 0..13 are read; the
     * overlapping bits are simply OR-ed twice. */
    static const uint8_t window_offsets[] = {0, 4, 8, 10};

    uint32_t compacted_result_mask = 0;
    uint32x4_t cmp_vector = vdupq_n_u32(hash);

    for (unsigned int window = 0;
            window < sizeof(window_offsets) / sizeof(window_offsets[0]);
            window++) {
        uint32_t offset = window_offsets[window];
        uint32x4_t ring_vector = vld1q_u32((hashtable_hash_half_t*)hashes + offset);
        /* vceqq_u32 yields all-ones per matching lane; keep a single 1 bit. */
        uint32x4_t matches = vshrq_n_u32(vceqq_u32(ring_vector, cmp_vector), 31);
        compacted_result_mask |= vaddvq_u32(vshlq_u32(matches, shift)) << offset;
    }

    compacted_result_mask &= ~skip_indexes_mask;
    if (compacted_result_mask == 0) {
        /* No match left: avoid the undefined __builtin_ctz(0). */
        return 32;
    }

    return (uint8_t)__builtin_ctz(compacted_result_mask);
}
Just for reference, here the AVX2 code
// AVX2 reference implementation: builds a bitmask with bit i set when
// hashes[i] == hash (for the 14 elements 0..13), clears the bits listed in
// skip_indexes_mask and returns the trailing-zero count of what is left.
// NOTE(review): the closing braces of the loop and of the function are not
// present in this copy of the snippet.
static inline uint8_t hashtable_mcmp_support_hash_search_avx2_14(
uint32_t hash,
volatile uint32_t* hashes,
uint32_t skip_indexes_mask) {
uint32_t compacted_result_mask = 0;
uint32_t skip_indexes_mask_inv = ~skip_indexes_mask;
// Broadcast the searched value to all 8 lanes
__m256i cmp_vector = _mm256_set1_epi32(hash);
// Two iterations (base_index 0 and 6). _mm256_loadu_si256 always loads 8 x uint32, so the second load
// covers elements 6..13 and overlaps the first one on elements 6 and 7
for(uint8_t base_index = 0; base_index < 12; base_index += 6) {
__m256i ring_vector = _mm256_loadu_si256((__m256i*) (hashes + base_index));
__m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);
// Uses _mm256_movemask_ps to collapse the 8 lane results into 8 bits, shifted to the position of the window
compacted_result_mask |= (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << (base_index);
// Index of the lowest match that is not masked out by skip_indexes_mask
return _tzcnt_u32(compacted_result_mask & skip_indexes_mask_inv);
On a side question, do you think it's worth to implement support for SVE2 instructions? Especially taking into account that this is a pretty simple operation and looks like there might not be mandatory support for 256 bits registers (which probably would be the biggest benefit of using SVE2 in this specific context)
Booleans don't need 32 bits each: shrink them to 8 bits ASAP with vuzp1 and vmovn prior to doing further operations.
/**
 * Finds the first element equal to `hash` in an array of 14 uint32 values.
 *
 * The 32-bit comparison results are narrowed to one byte per element, each
 * byte is AND-ed with its bit weight and the bytes are summed horizontally,
 * giving a 14-bit mask with bit i set when hashes[i] == hash.
 *
 * @param hash               value to look for
 * @param hashes             pointer to at least 14 readable uint32 elements
 * @param skip_indexes_mask  bit i set => a match at index i is ignored
 * @return index (0..13) of the first non-skipped match, or 32 when there is
 *         none (__builtin_ctz(0) is undefined, so that case is handled
 *         explicitly).
 */
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
        uint32_t hash,
        volatile uint32_t* hashes,
        uint32_t skip_indexes_mask)
{
    /* Bit weight of each of the 8 bytes in a narrowed half. */
    static const uint8x8_t mask = {1, 2, 4, 8, 16, 32, 64, 128};

    uint32x4_t cmp_vector = vdupq_n_u32(hash);

    /* Elements 0..11 in one go, then an overlapping load for 10..13 so that
     * nothing beyond element 13 is read. */
    uint32x4x3_t ring_vector_0_11 = vld1q_u32_x3((uint32_t *)hashes);
    uint32x4_t ring_vector_10_13 = vld1q_u32((uint32_t *)hashes + 10);

    uint32x4_t eq_0_3 = vceqq_u32(ring_vector_0_11.val[0], cmp_vector);
    uint32x4_t eq_4_7 = vceqq_u32(ring_vector_0_11.val[1], cmp_vector);
    uint32x4_t eq_8_11 = vceqq_u32(ring_vector_0_11.val[2], cmp_vector);
    uint32x4_t eq_10_13 = vceqq_u32(ring_vector_10_13, cmp_vector);

    /* Move the results for elements 12 and 13 into lanes 0 and 1 and zero the
     * rest. Without this, elements 10..13 would land on bits 12..15 and a
     * match at index 12 or 13 would be reported as 14 or 15. */
    uint32x4_t eq_12_13 = vextq_u32(eq_10_13, vdupq_n_u32(0), 2);

    /* Keep the low 16 bits of every 32-bit lane: 8 results per vector. */
    uint16x8_t tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(eq_0_3),
                                   vreinterpretq_u16_u32(eq_4_7));
    uint16x8_t tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(eq_8_11),
                                   vreinterpretq_u16_u32(eq_12_13));

    /* Narrow to bytes (0x00 / 0xFF) and keep only each byte's own bit. */
    uint8x8_t tmp8a = vand_u8(vmovn_u16(tmp16a), mask);
    uint8x8_t tmp8b = vand_u8(vmovn_u16(tmp16b), mask);

    /* Horizontal adds give bits 0..7 and bits 8..13 of the match mask. */
    uint32_t tmp = (uint32_t)vaddv_u8(tmp8a) | ((uint32_t)vaddv_u8(tmp8b) << 8);

    tmp &= ~skip_indexes_mask;
    if (tmp == 0) {
        return 32;
    }

    return (uint8_t)__builtin_ctz(tmp);
}
And I don't think sve will bring a meaningful performance boost since the performance is more or less crippled at the end (vaddv and especially the transfer to arm registers)
If you are dealing with thousands of 14 entry arrays, you should consider redesigning your function to writing into an 8bit array instead of returning in arm register each and every time. That will eliminate the most time consuming pipeline hazard caused by the Neon to arm transfer.
#include <arm_neon.h>
#include <arm_acle.h>
/*
 * Batch variant: for each of number_of_arrays arrays, searches for `hash` and
 * stores one result byte through pDst instead of returning it in a general
 * purpose register.
 *
 * Each iteration loads 16 uint32 elements (vld1q_u32_x4) and advances hashes
 * by 16, so the arrays are expected to be laid out back to back with a stride
 * of 16 elements. All 16 loaded elements take part in the comparison.
 *
 * The same skip_indexes_mask is applied to every array.
 *
 * NOTE(review): the braces delimiting the function body and the while-loop
 * body are not present in this copy of the snippet.
 */
void hashtable_mcmp_support_hash_search_armv8a_neon_14_b(
uint8_t *pDst,
uint32_t hash,
volatile uint32_t* hashes,
uint32_t skip_indexes_mask, uint32_t number_of_arrays)
uint16x8_t tmp16a, tmp16b;
uint16x4_t tmp;
uint8x8_t tmp8a, tmp8b;
/* Bit weights in descending order: the first element of each group of 8 gets the most significant bit */
static const uint8x8_t mask = {128, 64, 32, 16, 8, 4, 2, 1};
uint32x4_t cmp_vector = vdupq_n_u32(hash);
/* Bit-reverse the skip mask and keep the upper 16 bits so that it follows the same descending bit order */
skip_indexes_mask = __rbit(skip_indexes_mask)>>16;
uint16x4_t index_mask = vdup_n_u16((uint16_t) skip_indexes_mask);
uint32x4x4_t ring_vector;
while (number_of_arrays--)
ring_vector = vld1q_u32_x4((uint32_t *)hashes);
hashes += 16;
/* Lane-wise equality: all-ones where the element equals hash */
ring_vector.val[0] = vceqq_u32(ring_vector.val[0], cmp_vector);
ring_vector.val[1] = vceqq_u32(ring_vector.val[1], cmp_vector);
ring_vector.val[2] = vceqq_u32(ring_vector.val[2], cmp_vector);
ring_vector.val[3] = vceqq_u32(ring_vector.val[3], cmp_vector);
/* Narrow 32-bit results to 16 bits (even halves), then to 8 bits */
tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[0]), vreinterpretq_u16_u32(ring_vector.val[1]));
tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[2]), vreinterpretq_u16_u32(ring_vector.val[3]));
tmp8a = vmovn_u16(tmp16a);
tmp8b = vmovn_u16(tmp16b);
tmp8a = vand_u8(tmp8a, mask);
tmp8b = vand_u8(tmp8b, mask);
/* Byte 1 receives the mask of elements 0..7 and byte 0 the mask of elements 8..15; */
/* presumably they form the high and low byte of 16-bit lane 0 (little-endian lane layout) - TODO confirm */
tmp8a[1] = vaddv_u8(tmp8a);
tmp8a[0] = vaddv_u8(tmp8b);
/* Clear the skipped indexes, then count leading zeros of each 16-bit lane */
tmp = vbic_u16(vreinterpret_u16_u8(tmp8a), index_mask);
tmp = vclz_u16(tmp);
/* Store byte 0 of the result and advance the destination pointer */
vst1_lane_u8(pDst++,vreinterpret_u8_u16(tmp), 0);
Above is an "improved" version
It assumes the arrays are in contiguous memory with 8 bytes of padding after each one, which is preferable for cache efficiency unless the memory requirement is a problem.
Instead of returning an 8bit result, it writes the results into memory directly, avoiding pipeline hazards caused by neon to arm transfer.
It still suffers from vaddv latency(8 cycles). You can unroll the loop so that it processes 2 or even 4 arrays per iteration in order to hide that latency.
Please Help!
I am using MPI (= Message Passing Interface) in Python for a ring communication, which means that every rank sends to and receives from its neighbours. I know one way to realize this is by using, for instance, MPI.COMM_WORLD.issend() and MPI.COMM_WORLD.recv(); this is working and done.
Now I want to realize the same Output on a different way by using MPI.Topocomm.Neighbor_alltoallw but this is not working. I wrote a C Code and is working there, so the same output can be reached with this function, but when I implement this in python it is not working. Please find below the C Code and the Python Code
The definition of the Function says (mpi4py Package for Python):
Topocomm.Neighbor_alltoallw(self, sendbuf, recvbuf)
Neighbor All-to-All Generalized
I do not understand following things:
why is recvbuf not a return value? It seems to be an argument here
how can this be implemented for a ring communication in Python?
Thank you for your time and support!
my working C Code:
#include <stdio.h>
#include <mpi.h>
#define to_right 201
#define max_dims 1
/*
 * Ring example: every rank passes a value to its right neighbour `size` times
 * and accumulates what it receives, using MPI_Neighbor_alltoallw on a 1-D
 * periodic Cartesian communicator with absolute addresses (MPI_BOTTOM).
 *
 * NOTE(review): this copy of the program is incomplete. The braces are
 * missing, the `int dims[max_dims],` declaration ends with a comma, `periods`
 * and `reorder` are used without a visible declaration, the MPI_Cart_create
 * call is cut after `periods,`, and the comment opened before the
 * MPI_Issend/MPI_Recv sequence is never closed.
 */
int main (int argc, char *argv[])
int my_rank, size;
int snd_buf, rcv_buf;
int right, left;
int sum, i;
MPI_Comm new_comm;
int dims[max_dims],
MPI_Aint snd_displs[2], rcv_displs[2];
int snd_counts[2], rcv_counts[2];
MPI_Datatype snd_types[2], rcv_types[2];
MPI_Status status;
MPI_Request request;
MPI_Init(&argc, &argv);
/* Get process info. */
MPI_Comm_size(MPI_COMM_WORLD, &size);
/* Set cartesian topology. */
dims[0] = size;
periods[0] = 1;
reorder = 1;
MPI_Cart_create(MPI_COMM_WORLD, max_dims, dims, periods,
/* Get coords */
MPI_Comm_rank(new_comm, &my_rank);
/* MPI_Cart_coords(new_comm, my_rank, max_dims, my_coords); */
/* Get nearest neighbour rank. */
MPI_Cart_shift(new_comm, 0, 1, &left, &right);
/* Compute global sum. */
sum = 0;
snd_buf = my_rank;
rcv_buf = -1000; /* unused value, should be overwritten by first MPI_Recv; only for test purpose */
/* Entry [0] of each array receives one int into rcv_buf and sends nothing; */
/* entry [1] sends one int from snd_buf and receives nothing. */
/* The displacements are absolute addresses because the buffers are passed as MPI_BOTTOM. */
/* All four datatype entries are set to MPI_INT, although each line sets the type of the opposite direction. */
rcv_counts[0] = 1; MPI_Get_address(&rcv_buf, &rcv_displs[0]); snd_types[0] = MPI_INT;
rcv_counts[1] = 0; rcv_displs[1] = 0 /*unused*/; snd_types[1] = MPI_INT;
snd_counts[0] = 0; snd_displs[0] = 0 /*unused*/; rcv_types[0] = MPI_INT;
snd_counts[1] = 1; MPI_Get_address(&snd_buf, &snd_displs[1]); rcv_types[1] = MPI_INT;
for( i = 0; i < size; i++)
/* Substituted by MPI_Neighbor_alltoallw() :
MPI_Issend(&snd_buf, 1, MPI_INT, right, to_right,
new_comm, &request);
MPI_Recv(&rcv_buf, 1, MPI_INT, left, to_right,
new_comm, &status);
MPI_Wait(&request, &status);
MPI_Neighbor_alltoallw(MPI_BOTTOM, snd_counts, snd_displs, snd_types,
MPI_BOTTOM, rcv_counts, rcv_displs, rcv_types, new_comm);
snd_buf = rcv_buf;
sum += rcv_buf;
printf ("PE%i:\tSum = %i\n", my_rank, sum);
My not working Python Code:
from mpi4py import MPI
size = MPI.COMM_WORLD.Get_size()
my_rank = MPI.COMM_WORLD.Get_rank()
to_right =201
dims = [max_dims]
reorder = True
my_rank= new_comm.Get_rank()
left_right= MPI.Cartcomm.Shift(new_comm,0,1)
rcv_buf=-1000 #unused value, should be overwritten, only for test purpose
for counter in range(0,size):
print('PE ', my_rank,'sum=',sum)
I am converting a 3-D Jacobi solver from pure MPI to Hybrid MPI+OpenMP. I have a 192x192x192 array which is divided among 24 processes in Pure MPI in 1-D decomposition i.e. each process has 192/24 x 192 x 192 = 8 x 192 x 192 slab of data. Now I do :
for(i=0 ; i <= 7; i++)
for(j=0; j<= 191; j++)
for(k=0; k<= 191; k++)
unew[i][j][k] = 1/6.0 * (u[i+1][j][k]+u[i-1][j][k]+
This update takes around 60 seconds for each process.
Now with Hybrid MPI, I run two processes (1 process per socket: --bind-to socket --map-by socket and OMP_PLACES=cores with OMP_PROC_BIND=close). I create 12 threads per MPI process (i.e. 12 threads per socket or processor). Now each MPI process has an array of size 192/2 x 192 x 192 = 96 x 192 x 192 elements. Each thread works on a 96/12 x 192 x 192 = 8 x 192 x 192 portion of the array owned by its process. I do the same triple-loop update using threads, but the time is approximately 76 seconds for each thread. The load balance is perfect in both cases. What could be the possible causes of the performance degradation? Is it false sharing, because threads could be invalidating cache lines close to each other's chunk of data? If yes, how do I reduce this performance degradation? (I have purposefully not mentioned ghost data, but initially I am NOT overlapping communication with computation.)
In response to the comments below, am posting the code. Apologies for the long MWE but you can very safely ignore (1) Header files declaration (2) Variable Declaration (3) Memory allocation routine (4) Formation of Cartesian Topology (5) Setting boundary conditions in parallel using OpenMP parallel region (6) Declaration of MPI_Type_subarray datatype (7) MPI_Isend() and MPI_Irecv() calls and just concentrate on (a) INDEPENDENT UPDATE OpenMP parallel region (b) independent_update(...) routine being called from here.
/* Smaller of a and b. Arguments and the whole expansion are parenthesized so
 * that operands containing low-precedence operators (|, &, ?:, ...) expand
 * correctly. Note that the selected argument is still evaluated twice, so do
 * not pass expressions with side effects. */
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define Tol 0.00001
/*
 * Supplies the global problem size: writes 193 to each of *X, *Y and *Z.
 * All three pointers must be valid.
 */
void input(int *X, int *Y, int *Z)
{
    *X = 193;
    *Y = 193;
    *Z = 193;
}
/*
 * Allocates a zero-initialized X x Y x Z array of floats that can be indexed
 * as matrix[i][j][k].
 *
 * The float data lives in one contiguous block (element (i,j,k) is at offset
 * i*Y*Z + j*Z + k from &matrix[0][0][0]); matrix and every matrix[i] are
 * separately allocated pointer tables into that block.
 *
 * Returns NULL if any dimension is not positive, if the element count does not
 * fit in size_t, or if an allocation fails (everything allocated so far is
 * released in that case).
 */
float*** allocate_mem(int X, int Y, int Z)
{
    if (X <= 0 || Y <= 0 || Z <= 0) {
        return NULL;
    }

    size_t nx = (size_t)X;
    size_t ny = (size_t)Y;
    size_t nz = (size_t)Z;

    /* Do the size arithmetic in size_t and reject products that would wrap. */
    if (ny > (size_t)-1 / nx || nz > (size_t)-1 / (nx * ny)) {
        return NULL;
    }

    float *arr = calloc(nx * ny * nz, sizeof *arr);
    float ***matrix = calloc(nx, sizeof *matrix);
    if (arr == NULL || matrix == NULL) {
        goto fail;
    }

    for (size_t i = 0; i < nx; i++) {
        matrix[i] = calloc(ny, sizeof *matrix[i]);
        if (matrix[i] == NULL) {
            goto fail;
        }
        for (size_t j = 0; j < ny; j++) {
            matrix[i][j] = &arr[i * ny * nz + j * nz];
        }
    }

    return matrix;

fail:
    if (matrix != NULL) {
        /* Rows that were never allocated are still zero from calloc, which is
         * assumed to read back as a null pointer; free(NULL) is a no-op. */
        for (size_t i = 0; i < nx; i++) {
            free(matrix[i]);
        }
    }
    free(matrix);
    free(arr);
    return NULL;
}
/*
 * Performs one Jacobi sweep over the slab of planes owned by one thread.
 *
 * For every interior point of planes start..end (start = tID*chunk + 1,
 * end = (tID+1)*chunk clamped to NX-2) the new value is the average of the six
 * neighbours in `old`. Points on the outer faces (index 0 or N-1 in any
 * direction) are only read.
 *
 * old, new    arrays of extent NX x NY x NZ indexable as a[i][j][k]
 * NX, NY, NZ  extents including the boundary/ghost layers
 * tID         zero-based index of the calling thread
 * chunk       number of i-planes handled per thread
 *
 * Returns the largest |1.0 - new[i][j][k]| over the updated points, or 0.0
 * when the slab is empty.
 */
float independent_update(float ***old, float ***new, int NX, int NY, int NZ, int tID, int chunk)
{
    int i, j, k;
    float error = 0.0;
    float diff;

    int start = tID * chunk + 1;
    int end = (tID + 1) * chunk;
    if (end > NX - 2) {
        end = NX - 2;       /* never touch the last (boundary/ghost) plane */
    }

    for (i = start; i <= end; i++) {
        for (j = 1; j <= NY - 2; j++) {
            /* error is carried across iterations, so it has to be declared as
             * a max reduction for the vectorized loop to be well defined;
             * diff is a per-iteration temporary. */
            #pragma omp simd private(diff) reduction(max:error)
            for (k = 1; k <= NZ - 2; k++) {
                new[i][j][k] = (1/6.0) * (old[i-1][j][k] + old[i+1][j][k]
                                        + old[i][j-1][k] + old[i][j+1][k]
                                        + old[i][j][k-1] + old[i][j][k+1]);
                diff = 1.0 - new[i][j][k];
                diff = (diff > 0 ? diff : -1.0 * diff);
                if (diff > error) {
                    error = diff;
                }
            }
        }
    }

    return error;
}
// Hybrid MPI+OpenMP Jacobi driver: builds a 3-D Cartesian communicator, allocates the local
// `old`/`new` arrays with one ghost layer per side, sets the physical boundaries to 1, then
// iterates (halo exchange -> threaded independent_update -> MPI_Allreduce of the max error ->
// swap old/new) until the global error drops to Tol, and finally prints timings on rank 0.
// NOTE(review): the braces of this function and of its nested blocks are not present in this
// copy, so block extents below have to be inferred from the statement order.
int main(int argc, char *argv[])
int size, rank; //Size of old_comm and rank of process
int i, j, k,l; //General loop variables
MPI_Comm old_comm, new_comm; //MPI_COMM_WORLD handle and for MPI_Cart_create()
int N[3]; //For taking input of size of matrix from user
int P; //Represent number of processes i.e. same as size
int dims[3]; //For dimensions of Cartesian topology
int PX, PY, PZ; //X dim, Y dim, Z dim of each process
float ***old, ***new, ***temp; //Matrices for results dimensions is (Px+2)*(PY+2)*(PZ+2)
int period[3]; //Periodicity for each dimension
int reorder; //Whether processes should be reordered in new cartesian topology
int ndims; //Number of dimensions (which is 3)
int Z_TOWARDS_U, Z_AWAY_U; //Z neighbour towards you and away from you (Z const)
int X_DOWN, X_UP; //Below plane and above plane (X const)
int Y_LEFT, Y_RIGHT; //Left plane and right plane (Y const)
int coords[3]; //Finding coordinates of processes
int dimension; //Used in MPI_Cart_shift() , values = 0, 1,2
int displacement; //Used in MPI_Cart_shift(), values will be +1 to find immediate neighbours
float l_max_err; //Local maximum error on process
float l_max_err_new; //For dependent faces.
float G_max_err = 1.0; //Maximum error for stopping criterion
int iterations = 0 ; //Counting number of iterations
MPI_Request send[6], recv[6]; //For MPI_Isend and MPI_Irecv
int start[3]; //Start will be defined in MPI_Isend() and MPI_Irecv()
int gsize[3]; //Defining global size of subarray
MPI_Datatype x_subarray; //For sending X_UP and X_DOWN
int local_x[3]; //Defining local plane size for X_UP/X_DOWN
MPI_Datatype y_subarray; //For sending Y_LEFT and Y_RIGHT
int local_y[3]; //Defining local plane for Y_LEFT/Y_RIGHT
MPI_Datatype z_subarray; //For sending Z_TOWARDS_U and Z_AWAY_U
int local_z[3]; //Defining local plan size for XY plane i.e. where Z=0
double strt, end; //For measuring time
double strt1, end1, delta1; //For measuring trivial time 1
double strt2, end2, delta2; //For measuring trivial time 2
double t_i_strt, t_i_end, t_i_sum=0; //Time for independent computational kernel
double t_up_strt, t_up_end, t_up_sum=0; //Time for X_UP
double t_down_strt, t_down_end, t_down_sum=0; //Time for X_DOWN
double t_left_strt, t_left_end, t_left_sum=0; //Time for Y_LEFT
double t_right_strt, t_right_end, t_right_sum=0; //Time for Y_RIGHT
double t_towards_strt, t_towards_end, t_towards_sum=0; //For Z_TOWARDS_U
double t_away_strt, t_away_end, t_away_sum=0; //For Z_AWAY_U
double t_comm_strt, t_comm_end, t_comm_sum=0; //Time comm + independent update (need to subtract to get comm time)
double t_setup_strt,t_setup_end; //Set-up start and end time
double t_allred_strt,t_allred_end,t_allred_total=0.0; //Measuring Allreduce time separately.
int threadID; //ID of a thread
int nthreads; //Total threads in OpenMP region
int chunk; //chunk - used to calculate iterations of a thread
MPI_Init(&argc, &argv);
t_setup_strt = MPI_Wtime();
old_comm = MPI_COMM_WORLD;
MPI_Comm_size(old_comm, &size);
MPI_Comm_rank(old_comm, &rank);
P = size;
//Rank 0 obtains the global sizes and broadcasts them to every rank
if(rank == 0)
input(&N[0], &N[1], &N[2]);
MPI_Bcast(N, 3, MPI_INT, 0, old_comm);
//NOTE(review): dims[] is zero-filled and then handed to MPI_Cart_create and used as a divisor below;
//no MPI_Dims_create(P, ndims, dims) call is visible in this copy - confirm it was not lost.
dims[0] = 0;
dims[1] = 0;
dims[2] = 0;
period[0] = period[1] = period[2] = 0; //All dimensions aperiodic
reorder = 0 ; //No reordering of ranks in new_comm
ndims = 3;
MPI_Cart_create(old_comm, ndims, dims, period, reorder, &new_comm);
//Only proceed when the N-1 unknowns per direction divide evenly among the processes
if( (N[0]-1) % dims[0] == 0 && (N[1]-1) % dims[1] == 0 && (N[2]-1) % dims[2] == 0 )
PX = (N[0]-1)/dims[0]; //Rows of unknowns each process gets
PY = (N[1]-1)/dims[1]; //Columns of unknowns each process gets
PZ = (N[2]-1)/dims[2]; //Depth of unknowns each process gets
old = allocate_mem(PX+2, PY+2, PZ+2); //3D arrays with ghost points
new = allocate_mem(PX+2, PY+2, PZ+2); //3D arrays with ghost points
dimension = 0;
displacement = 1;
MPI_Cart_shift(new_comm, dimension, displacement, &X_UP, &X_DOWN); //Find UP and DOWN neighbours
dimension = 1;
MPI_Cart_shift(new_comm, dimension, displacement, &Y_LEFT, &Y_RIGHT); //Find LEFT and RIGHT neighbours
dimension = 2;
MPI_Cart_shift(new_comm, dimension, displacement, &Z_TOWARDS_U, &Z_AWAY_U); //Find TOWARDS and AWAY neighbours
//Zero both arrays, ghost layers included, in parallel
#pragma omp parallel for default(none) shared(old,new,PX,PY,PZ) private(i,j,k) schedule(static)
for(i = 0; i <= PX+1; i++)
for(j = 0; j <= PY+1; j++)
for(k = 0; k <= PZ+1; k++)
old[i][j][k] = 0.0;
new[i][j][k] = 0.0;
//Set the physical boundary faces (those without a neighbour, i.e. MPI_PROC_NULL) to 1.
//The two X faces are written by thread 0 and by the last thread; the Y and Z faces are work-shared.
#pragma omp parallel default(none) shared(X_DOWN,X_UP,Y_LEFT,Y_RIGHT,Z_TOWARDS_U,Z_AWAY_U,old,new,PX,PY,PZ) private(i,j,k,threadID,nthreads)
threadID = omp_get_thread_num();
nthreads = omp_get_num_threads();
if(threadID == 0)
if(X_DOWN == MPI_PROC_NULL) //X is constant here, this is YZ upper plane
for(j = 1 ; j<= PY ; j++)
for(k = 1 ; k<= PZ ; k++)
old[0][j][k] = 1;
new[0][j][k] = 1; //Set boundaries in new also
if(threadID == (nthreads-1))
if(X_UP == MPI_PROC_NULL) //YZ lower plane
for(j = 1 ; j<= PY ; j++)
for(k = 1; k<= PZ ; k++)
old[PX+1][j][k] = 1;
new[PX+1][j][k] = 1;
if(Y_LEFT == MPI_PROC_NULL) //Y is constant, this is left XZ plane, possibly can use collapse(2)
#pragma omp for schedule(static)
for(i = 1 ; i<= PX ; i++)
for(k = 1; k<= PZ; k++)
old[i][0][k] = 1;
new[i][0][k] = 1;
if(Y_RIGHT == MPI_PROC_NULL) //XZ right plane, again collapse(2) potential
#pragma omp for schedule(static)
for(i = 1 ; i<= PX; i++)
for(k = 1; k<= PZ ; k++)
old[i][PY+1][k] = 1;
new[i][PY+1][k] = 1;
if(Z_TOWARDS_U == MPI_PROC_NULL) //Z is constant here, towards you XY plane, collapse(2)
#pragma omp for schedule(static)
for(i = 1 ; i<= PX ; i++)
for(j = 1; j<= PY ; j++)
old[i][j][0] = 1;
new[i][j][0] = 1;
if(Z_AWAY_U == MPI_PROC_NULL) //Away from you XY plane, collapse(2)
#pragma omp for schedule(static)
for(i = 1 ; i<= PX; i++)
for(j = 1; j<= PY ; j++)
old[i][j][PZ+1] = 1;
new[i][j][PZ+1] = 1;
//Subarray datatypes describing one face (PY x PZ, PX x PZ, PX x PY) inside the local (PX+2)x(PY+2)x(PZ+2) array.
//The subarray start is (0,0,0); the actual face is selected by the buffer address passed to MPI_Isend/MPI_Irecv.
//NOTE(review): no MPI_Type_commit call for these datatypes is visible in this copy - confirm.
gsize[0] = PX+2; //Global sizes of 3-D cubes for each process
gsize[1] = PY+2;
gsize[2] = PZ+2;
start[0] = 0; //Will specify starting location while sending/receiving
start[1] = 0;
start[2] = 0;
local_x[0] = 1;
local_x[1] = PY;
local_x[2] = PZ;
MPI_Type_create_subarray(ndims, gsize, local_x, start, MPI_ORDER_C, MPI_FLOAT, &x_subarray);
local_y[0] = PX;
local_y[1] = 1;
local_y[2] = PZ;
MPI_Type_create_subarray(ndims, gsize, local_y, start, MPI_ORDER_C, MPI_FLOAT, &y_subarray);
local_z[0] = PX;
local_z[1] = PY;
local_z[2] = 1;
MPI_Type_create_subarray(ndims, gsize, local_z, start, MPI_ORDER_C, MPI_FLOAT, &z_subarray);
t_setup_end = MPI_Wtime();
strt = MPI_Wtime();
//Main iteration: repeat until the global maximum error is no larger than Tol
while(G_max_err > Tol) //iterations < ITERATIONS)
iterations++ ;
t_comm_strt = MPI_Wtime();
//Halo exchange: receive into the six ghost faces of `old`, send the six outermost interior faces
MPI_Irecv(&old[0][1][1], 1, x_subarray, X_DOWN, 10, new_comm, &recv[0]);
MPI_Irecv(&old[PX+1][1][1], 1, x_subarray, X_UP, 20, new_comm, &recv[1]);
MPI_Irecv(&old[1][PY+1][1], 1, y_subarray, Y_RIGHT, 30, new_comm, &recv[2]);
MPI_Irecv(&old[1][0][1], 1, y_subarray, Y_LEFT, 40, new_comm, &recv[3]);
MPI_Irecv(&old[1][1][PZ+1], 1, z_subarray, Z_AWAY_U, 50, new_comm, &recv[4]);
MPI_Irecv(&old[1][1][0], 1, z_subarray, Z_TOWARDS_U, 60, new_comm, &recv[5]);
MPI_Isend(&old[PX][1][1], 1, x_subarray, X_UP, 10, new_comm, &send[0]);
MPI_Isend(&old[1][1][1], 1, x_subarray, X_DOWN, 20, new_comm, &send[1]);
MPI_Isend(&old[1][1][1], 1, y_subarray, Y_LEFT, 30, new_comm, &send[2]);
MPI_Isend(&old[1][PY][1], 1, y_subarray, Y_RIGHT, 40, new_comm, &send[3]);
MPI_Isend(&old[1][1][1], 1, z_subarray, Z_TOWARDS_U, 50, new_comm, &send[4]);
MPI_Isend(&old[1][1][PZ], 1, z_subarray, Z_AWAY_U, 60, new_comm, &send[5]);
MPI_Waitall(6, send, MPI_STATUSES_IGNORE);
MPI_Waitall(6, recv, MPI_STATUSES_IGNORE);
t_comm_end = MPI_Wtime();
t_comm_sum = t_comm_sum + (t_comm_end - t_comm_strt);
/* Use threads in Independent update */
t_i_strt = MPI_Wtime();
l_max_err = 0.0; //Very important, Reduction result is combined with this !
//Each thread updates its own slab of `chunk` consecutive i-planes; the per-thread errors are max-reduced.
//NOTE(review): chunk is shared but assigned by every thread (all write the same value); computing it
//once outside the region or making it private would avoid the concurrent write.
//NOTE(review): chunk uses integer division, so when PX is not a multiple of nthreads the planes
//beyond nthreads*chunk are not visited by independent_update.
#pragma omp parallel default(none) shared(old,new,PX,PY,PZ,chunk) private(threadID,nthreads) reduction(max:l_max_err)
nthreads = omp_get_num_threads();
threadID = omp_get_thread_num();
chunk = (PX-1+1) / nthreads ;
l_max_err = independent_update(old, new, PX+2, PY+2, PZ+2, threadID, chunk);
t_i_end = MPI_Wtime();
t_i_sum = t_i_sum + (t_i_end - t_i_strt) ;
t_allred_strt = MPI_Wtime();
MPI_Allreduce(&l_max_err, &G_max_err, 1, MPI_FLOAT, MPI_MAX, new_comm);
t_allred_end = MPI_Wtime();
t_allred_total = t_allred_total + (t_allred_end - t_allred_strt);
//Swap the roles of the two arrays for the next iteration
temp = new ;
new = old;
old = temp;
end = MPI_Wtime();
//Rank 0 reports the iteration count, final error and accumulated timings
if( rank == 0)
printf("\nIterations = %d, G_max_err = %f", iterations, G_max_err);
printf("\nThe total SET-UP time for MPI and boundary conditions is %lf", (t_setup_end-t_setup_strt));
printf("\nThe total time for SOLVING is %lf", (end-strt));
printf("\nThe total time for INDEPENDENT COMPUTE %lf", t_i_sum);
printf("\nThe total time for COMMUNICATION OVERHEAD is %lf", t_comm_sum);
printf("\nThe total time for MPI_ALLREDUCE() is %lf", t_allred_total);
//NOTE(review): no MPI_Finalize call is visible before the return in this copy - confirm.
return 0;
P.S. : I am almost sure that the cost of spawning/waking the threads is not the reason for such a huge difference in the timing.
Please find attached Scalasca snapshot for INDEPENDENT COMPUTE of the Hybrid Program.
Using loop simd construct
#pragma omp parallel default(none) shared(old,new,PX,PY,PZ,l_max_err) private(i,j,k,diff)
#pragma omp for simd schedule(static) reduction(max:l_max_err)
for(i = 1; i <= PX ; i++)
for(j = 1; j<= PY; j++)
for(k = 1; k<= PZ; k++)
new[i][j][k] = (1/6.0) *(old[i-1][j][k] + old[i+1][j][k] + old[i][j-1][k] + old[i][j+1][k] + old[i][j][k-1] + old[i][j][k+1] );
diff = 1.0 - new[i][j][k];
diff = (diff > 0 ? diff : -1.0 * diff );
if(diff > l_max_err)
l_max_err = diff;
You frequently get memory access and cache issues when you just do one MPI process per socket on a CPU with multiple memory controllers. It can be on either the read or the write side, so you can't really say which. This is especially an issue when doing thread-parallel execution with lightweight compute tasks (e.g. math on arrays). One MPI process per socket in this case tends to fare significantly worse than pure MPI.
In your BIOS, set up whatever the maximal NUMA per socket option is
Use one MPI process per NUMA node.
Try some different parameter values in schedule(static). I've rarely found the default to be best.
Essentially what this will do is ensure each bundle of threads only works on a single pool of memory.
Followed this guide here
I am tasked with "using map and unmap methods to draw a line across the screen by setting pixel byte data to rgb red values".
I have the sprite and background displaying but have no idea how to get the data.
I also tried doing this:
//Create device
ZeroMemory(&desc, sizeof(D3D11_TEXTURE2D_DESC));
desc.Width = 500;
desc.Height = 300;
desc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
desc.Usage = D3D11_USAGE_DYNAMIC;
desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
desc.MiscFlags = 0;
desc.MipLevels = 1;
desc.ArraySize = 1;
desc.SampleDesc.Count = 1;
desc.SampleDesc.Quality = 0;
desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
m_d3dDevice->CreateTexture2D(&desc, nullptr, &texture);
m_d3dDevice->CreateShaderResourceView(texture, 0, &textureView);
// Render
m_d3dContext->Map(texture, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped);
data = (BYTE*)mapped.pData;
rows = (BYTE)sizeof(data);
std::cout << "hi" << std::endl;
m_d3dContext->Unmap(texture, 0);
The problem is that in that case the data array appears to have size 0 even though the pointer is set. Does this mean that I am pointing to a texture that doesn't have any data, or am I misunderstanding something?
currently I found
desc.Buffer; // buffer
I felt the need to create an Answer for this as when I searched for how do this. This question pops up first and the supplied answer didn't really solve the problem for me and wasn't quite the way I wanted to do it anyways...
In my program I have a method as below.
// Creates a width x height R8G8B8A8_UNORM texture initialized from `data` and creates a shader
// resource view for it at the address returned by ContentLoader::GetTextureAddress(index).
//   data   - source pixels, one uint32_t per pixel, rows of `width` pixels (SysMemPitch = width * 4)
//   width  - texture width in pixels
//   height - texture height in pixels
//   index  - slot passed to ContentLoader::GetTextureAddress
// NOTE(review): the declaration of initData and the function's braces are not visible in this copy;
// initData is presumably a D3D11_SUBRESOURCE_DATA - confirm.
// NOTE(review): the results of CreateTexture2D and CreateShaderResourceView are not checked.
void ContentLoader::WritePixelsToShaderIndex(uint32_t *data, int width, int height, int index)
D3D11_TEXTURE2D_DESC desc = {};
desc.Width = width;
desc.Height = height;
desc.MipLevels = 1;
desc.ArraySize = 1;
desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
desc.SampleDesc.Count = 1;
desc.SampleDesc.Quality = 0;
// No CPU access after creation: the pixel data is supplied once through initData
desc.Usage = D3D11_USAGE_DEFAULT;
desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
desc.CPUAccessFlags = 0;
desc.MiscFlags = 0;
initData.pSysMem = data;
// Bytes per row (4 bytes per pixel)
initData.SysMemPitch = width * 4;
// Bytes in the whole image
initData.SysMemSlicePitch = width * height * 4;
Microsoft::WRL::ComPtr<ID3D11Texture2D> tex;
Engine::device->CreateTexture2D(&desc, &initData, tex.GetAddressOf());
Engine::device->CreateShaderResourceView(tex.Get(), NULL, ContentLoader::GetTextureAddress(index));
Then, using the code below, I tested drawing a blue square with a white line, and it works perfectly fine. The issue I was having was setting the system memory pitch and slice pitch; after looking in the WICTextureLoader class I was able to figure out how the data is stored. So it appears that:
MemPitch = The Row's Size in Bytes.
MemSlice = The Total Image Pixels Size In Bytes.
const int WIDTH = 200;
const int HEIGHT = 200;
const uint32_t RED = 255 | (0 << 8) | (0 << 16) | (255 << 24);
const uint32_t WHITE = 255 | (255 << 8) | (255 << 16) | (255 << 24);
const uint32_t BLUE = 0 | (0 << 8) | (255 << 16) | (255 << 24);
uint32_t *buffer = new uint32_t[WIDTH * HEIGHT];
bool flip = false;
for (int X = 0; X < WIDTH; ++X)
for (int Y = 0; Y < HEIGHT; ++Y)
int pixel = X + Y * WIDTH;
buffer[pixel] = flip ? BLUE : WHITE;
flip = true;
WritePixelsToShaderIndex(buffer, WIDTH, HEIGHT, 3);
delete [] buffer;
First of all, most of those functions return HRESULT values that you are ignoring. That's not safe as you will miss important errors that invalidate the remaining code. You can use if(FAILED(...)) if you want, or you can use ThrowIfFailed, but you can't just ignore the return value in a functioning app.
HRESULT hr = m_d3dDevice->CreateTexture2D(&desc, nullptr, &texture);
if (FAILED(hr))
// error!
hr = m_d3dDevice->CreateShaderResourceView(texture, 0, &textureView);
if (FAILED(hr))
// error!
// Render
hr = m_d3dContext->Map(texture, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped);
if (FAILED(hr))
// error!
Second, you should enable the Debug Device and look for diagnostic output which will likely point you to the reason for the failure.
sizeof(data) is always going to be 4 or 8 since data is a BYTE* i.e. the size of a pointer. It has nothing to do with the size of your data array. The locked buffer pointed to by mapped.pData is going to be mapped.RowPitch * desc.Height bytes in size.
You have to copy your pixel data into it row-by-row. Depending on the format and other factors, mapped.RowPitch is not necessarily going to be 4 * desc.Width--4 bytes per pixel is because you are using a format of DXGI_FORMAT_B8G8R8A8_UNORM. It should be at least that big, but it could be bigger to align the overall size.
This is pseudo-code and not necessarily an efficient way to do it, but:
// Visit every pixel of the mapped texture and write its four channels.
// Rows are mapped.RowPitch bytes apart (which may be more than 4 * desc.Width);
// within a row each pixel takes 4 bytes in B, G, R, A order for
// DXGI_FORMAT_B8G8R8A8_UNORM.
for (UINT y = 0; y < desc.Height; ++y)
{
    for (UINT x = 0; x < desc.Width; ++x)
    {
        // Find the memory location of the pixel at (x,y)
        size_t pixel = size_t(y) * mapped.RowPitch + size_t(x) * 4;

        // Take the ADDRESS of each channel byte (data[pixel] alone is the byte's value)
        BYTE* blue = &data[pixel];
        BYTE* green = blue + 1;
        BYTE* red = blue + 2;
        BYTE* alpha = blue + 3;

        *blue = /* value between 0 and 255 */;
        *green = /* value between 0 and 255 */;
        *red = /* value between 0 and 255 */;
        *alpha = /* value between 0 and 255 */;
    }
}
You should take a look at DirectXTex which does a lot of this kind of row-by-row processing.
I would like to Generate Checksum for Strings/Data
1. The same data should produce the same Checksum
2. Two different data strings should not produce the same checksum. A random collision rate of 0.1% is negligible.
3. No encryption/decryption of data
4. Checksum length need not be too huge and contains letters and characters.
5. Must be too fast and efficient. Imagine generating checksum(s) for 100 Mb of text data should be in less than 5mins. Generating 1000 checksums for less than 1 KB of each segment data should be in less than 10 seconds.
Any algorithm or implementation reference and suggestions are most appreciated.
You can write a custom hash function: (c++)
// Simple multiplicative string hash.
// Starting from 7, every character updates the state as
//   k = ((k * 23 + c) * 13) mod 1000000009
// so the result is always in [0, 1000000009) and the intermediate products
// stay well within the range of long long.
long long int hash(String s){
    long long k = 7;
    for(int i = 0; i < s.length(); i++){
        k *= 23;
        // Use the unsigned byte value: a negative char could otherwise drive
        // k (and therefore the result of %) below zero.
        k += (unsigned char)s[i];
        k *= 13;
        k %= 1000000009;
    }
    return k;
}
This should give you a good hash value (collision-free for most samples).
A very common, fast checksum is the CRC-32, a 32-bit polynomial cyclic redundancy check. Here are three implementations in C, which vary in speed vs. complexity, of the CRC-32: (This is from
#include <stdio.h>
#include <stdlib.h>
// ---------------------------- reverse --------------------------------
// Reverses (reflects) bits in a 32-bit word.
// Adjacent bits are swapped first, then adjacent bit pairs, then nibbles, and
// finally the four bytes are put in the opposite order.
// Assumes `unsigned` is 32 bits wide.
unsigned reverse(unsigned x) {
   x = ((x & 0x55555555) <<  1) | ((x >>  1) & 0x55555555);
   x = ((x & 0x33333333) <<  2) | ((x >>  2) & 0x33333333);
   x = ((x & 0x0F0F0F0F) <<  4) | ((x >>  4) & 0x0F0F0F0F);
   x = (x << 24) | ((x & 0xFF00) << 8) |
       ((x >> 8) & 0xFF00) | (x >> 24);
   return x;
}
// ----------------------------- crc32a --------------------------------
/* This is the basic CRC algorithm with no optimizations. It follows the
logic circuit as closely as possible.

message is a NUL-terminated byte string; the terminator is not included in
the checksum. Each byte is bit-reversed, fed into the register from the most
significant end with polynomial 0x04C11DB7, and the complemented register is
bit-reversed again for the result. Assumes unsigned int is 32 bits wide. */
unsigned int crc32a(unsigned char *message) {
   int i, j;
   unsigned int byte, crc;

   i = 0;
   crc = 0xFFFFFFFF;
   while (message[i] != 0) {
      byte = message[i];            // Get next byte.
      byte = reverse(byte);         // 32-bit reversal.
      for (j = 0; j <= 7; j++) {    // Do eight times.
         // Test the top bit directly instead of relying on the
         // implementation-defined unsigned -> int conversion.
         if ((crc ^ byte) & 0x80000000u)
              crc = (crc << 1) ^ 0x04C11DB7;
         else crc = crc << 1;
         byte = byte << 1;          // Ready next msg bit.
      }
      i = i + 1;
   }
   return reverse(~crc);
}
// ----------------------------- crc32b --------------------------------
/* This is the basic CRC-32 calculation with some optimization but no
table lookup. The byte reversal is avoided by shifting the crc reg
right instead of left and by using a reversed 32-bit word to represent
the polynomial.
   When compiled to Cyclops with GCC, this function executes in 8 + 72n
instructions, where n is the number of bytes in the input message. It
should be doable in 4 + 61n instructions.
   If the inner loop is strung out (approx. 5*8 = 40 instructions),
it would take about 6 + 46n instructions.

message is a NUL-terminated byte string; the terminator is not included in
the checksum. Assumes unsigned int is 32 bits wide. */
unsigned int crc32b(unsigned char *message) {
   int i, j;
   unsigned int byte, crc, mask;

   i = 0;
   crc = 0xFFFFFFFF;
   while (message[i] != 0) {
      byte = message[i];            // Get next byte.
      crc = crc ^ byte;
      for (j = 7; j >= 0; j--) {    // Do eight times.
         mask = -(crc & 1);         // All ones if the low bit is set, else 0.
         crc = (crc >> 1) ^ (0xEDB88320 & mask);
      }
      i = i + 1;
   }
   return ~crc;
}
// ----------------------------- crc32c --------------------------------
/* This is derived from crc32b but does table lookup. First the table
itself is calculated, if it has not yet been set up.
   Not counting the table setup (which would probably be a separate
function), when compiled to Cyclops with GCC, this function executes in
7 + 13n instructions, where n is the number of bytes in the input
message. It should be doable in 4 + 9n instructions. In any case, two
of the 13 or 9 instructions are load byte.
   This is Figure 14-7 in the text.

message is a NUL-terminated byte string; the terminator is not included in
the checksum. The 256-entry table is a function-local static that is filled
in on the first call (table[1] is non-zero once it has been built); that lazy
setup is not synchronized between threads. Assumes unsigned int is 32 bits
wide. */
unsigned int crc32c(unsigned char *message) {
   int i, j;
   unsigned int byte, crc, mask;
   static unsigned int table[256];

   /* Set up the table, if necessary. */

   if (table[1] == 0) {
      for (byte = 0; byte <= 255; byte++) {
         crc = byte;
         for (j = 7; j >= 0; j--) {    // Do eight times.
            mask = -(crc & 1);
            crc = (crc >> 1) ^ (0xEDB88320 & mask);
         }
         table[byte] = crc;
      }
   }

   /* Through with table setup, now calculate the CRC. */

   i = 0;
   crc = 0xFFFFFFFF;
   while ((byte = message[i]) != 0) {
      crc = (crc >> 8) ^ table[(crc ^ byte) & 0xFF];
      i = i + 1;
   }
   return ~crc;
}
If you simply google "CRC32", you will get more info than you could possibly absorb.