I'm having a hard time getting Rust code to compute the Walsh–Hadamard transform efficiently in place. The algorithm is by nature highly parallelizable, but it requires two nested for loops.
This is easy in C; see the OpenMP code below, which is fast.
void FWHT32(int32_t* array, int n){
    uint32_t d, i, j;
    for(d=n; d > 0; d--){
        uint32_t D = ((uint32_t)1<<d);                    // block size at this stage
        #pragma omp parallel for collapse(2)
        for(j=0; j < ((uint32_t) 1 << (n-d)); j++){       // blocks
            for(i=0; i < (D >> 1); i++){                  // butterflies within a block
                // butterfly: (a, b) -> (a + b, a - b)
                array[D*j+i] = array[D*j+i] + array[D*j+(D>>1)+i];
                array[D*j+(D>>1)+i] = array[D*j+i] - 2*array[D*j+(D>>1)+i];
            }
        }
    }
}
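To make the semantics concrete, the transform of {1, 0, 1, 0} is {2, 2, 0, 0}; a throwaway driver (assuming FWHT32 and the usual headers above):

int main()
{
    int32_t x[4] = {1, 0, 1, 0};
    FWHT32(x, 2);                                     // length is 2^2 = 4
    printf("%d %d %d %d\n", x[0], x[1], x[2], x[3]);  // prints: 2 2 0 0
    return 0;
}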
But in Rust I have a really hard time iterating in parallel over two distant elements; my best attempt so far doesn't parallelize the inner loop:
use std::ops::{AddAssign, Sub};
use rayon::prelude::*;

pub fn _par_fast_walsh_hadamard_tr<T: Send + Copy + Sized + AddAssign + Sub<Output = T>>(array: &mut [T]) {
    let arlen = array.len();
    let n = arlen.trailing_zeros() as usize;
    if arlen != (1 << n) {
        panic!("Length must be a power of 2.");
    }
    for step in 0..n {
        let s = 1 << (n - step); // current block size
        // parallel over blocks only; the butterflies inside a block stay sequential
        array.par_chunks_exact_mut(s).for_each(|chunk| {
            for j in 0..s / 2 {
                let a = chunk[j];
                chunk[j] += chunk[j + s / 2];
                chunk[j + s / 2] = a - chunk[j + s / 2];
            }
        });
    }
}
But even this runs only slightly faster on 6 cores / 12 threads than a trivial sequential Rust implementation.
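For reference, the collapse(2) clause in the C version flattens the two loops into a single parallel index space over all 2^(n-1) butterflies of a stage; written out by hand (a sketch with the same semantics as FWHT32 above), the flattening looks like this, and it is the decomposition a Rayon port would have to reproduce:

#include <stdint.h>

void FWHT32_flat(int32_t* array, int n)
{
    for (int d = n; d > 0; d--) {
        uint32_t D = (uint32_t)1 << d;
        uint32_t half = D >> 1;
        uint32_t total = (uint32_t)1 << (n - 1);  // butterflies per stage
        #pragma omp parallel for
        for (uint32_t k = 0; k < total; k++) {
            uint32_t j = k / half;                // which block
            uint32_t i = k % half;                // offset inside the block
            int32_t a = array[D*j + i];
            int32_t b = array[D*j + half + i];
            array[D*j + i]        = a + b;        // same butterfly as above
            array[D*j + half + i] = a - b;
        }
    }
}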
Is Rayon not the right parallel framework for this?
How can one parallelize the inner loop?
Related
I have a problem in my code. I want to multiply 2 matrices using dgemv from CBLAS, but I want to share the operations among the threads I have. I also used dgemv to multiply the matrices in a previous exercise where no parallelism was needed. Any idea what I should do?
The code:
for (it = 0; it < itime; it++) {
    cblas_dgemv(CblasColMajor, CblasNoTrans, n, n, 1, sigma, n, u, 1, 0.0, d, 1);
    #pragma omp parallel for private(i,j,sum) schedule(static)
    for (i = 0; i < n; i++) {
        sum = 0.0;
        uplus[i] = u[i] + dtmu - dt * u[i];
        #pragma omp simd reduction(+:sum)
        for (j = 0; j < n; j++) {
            sum += sigma[i*n+j] * u[j];
        }
        sum = sum - u[i] * m[i];
        uplus[i] += dtdiv * sum;
        if (uplus[i] > uth) {
            uplus[i] = 0.0;
            if (it >= ttransient) {
                omega1[i] += 1.0;
            }
        }
    }
    t = u;
    u = uplus;
    uplus = t;
}
I want to get the dgemv call into the parallel region and somehow share the multiplications among the threads I have.
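What I have in mind is something like this (a sketch only; dgemv_row_blocks is my name, not a CBLAS routine): give each thread a contiguous block of rows and let it call cblas_dgemv on the corresponding sub-matrix. For a column-major matrix, the row block starting at r0 begins at sigma + r0 with the same leading dimension n.

#include <omp.h>
#include <cblas.h>

// Sketch (hypothetical helper): compute d = sigma * u in row blocks, one
// contiguous block per thread; sigma is n x n, column-major, as above.
static void dgemv_row_blocks(int n, const double *sigma, const double *u, double *d)
{
    #pragma omp parallel
    {
        int nth  = omp_get_num_threads();
        int tid  = omp_get_thread_num();
        int r0   = (int)((long long)n * tid / nth);         // first row of my block
        int rows = (int)((long long)n * (tid + 1) / nth) - r0;
        // column-major sub-matrix of rows r0 .. r0+rows-1 starts at sigma + r0,
        // same leading dimension n
        if (rows > 0)
            cblas_dgemv(CblasColMajor, CblasNoTrans, rows, n,
                        1.0, sigma + r0, n, u, 1, 0.0, d + r0, 1);
    }
}

Each thread then writes a disjoint slice of d, so no reduction or synchronization beyond the implicit barrier is needed.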
I'm rewriting C code in Rust which relies heavily on u32 variables and on wrapping them around. For example, I have a loop defined like this:
#define NWORDS 24
#define ZERO_WORDS 11

int main()
{
    unsigned int i, j;
    for (i = 0; i < NWORDS; i++) {
        for (j = 0; j < i; j++) {
            if (j < (i - ZERO_WORDS + 1)) {
            }
        }
    }
    return 0;
}
Now, the if statement will need to wrap around u32 for a few values, since initially i = 0. I came across the wrapping_neg method, but it seems to just compute -self. Is there any more flexible way to work with u32 in Rust that also allows wrapping?
As mentioned in the comments, the literal answer to your question is to use u32::wrapping_sub and u32::wrapping_add:
const NWORDS: u32 = 24;
const ZERO_WORDS: u32 = 11;

fn main() {
    for i in 0..NWORDS {
        for j in 0..i {
            if j < i.wrapping_sub(ZERO_WORDS).wrapping_add(1) {}
        }
    }
}
However, I'd advocate avoiding wrapping operations unless you are performing hashing / cryptography / compression / something similar. Wrapping operations are non-intuitive. For example, j < i-ZERO_WORDS+1 doesn't give the same results as j+ZERO_WORDS < i+1.
Even better would be to rewrite the logic. I can't even tell in which circumstances that if expression will be true without spending a lot of time thinking about it!
It turns out that the condition is true for i=9, j=8, but not for i=10, j=0: for i=9, i.wrapping_sub(ZERO_WORDS) wraps around to a huge value, so the comparison holds for every j, while for i=10 the subtraction wraps and the wrapping_add(1) brings it back to exactly 0, so j < 0 never holds. Perhaps all of this is clearer in the real code, but devoid of context it's very confusing.
This appears to have the same logic, but seems much more understandable to me:
i < ZERO_WORDS - 1 || i - j > ZERO_WORDS - 1;
Compare:
j < i.wrapping_sub(ZERO_WORDS).wrapping_add(1);
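If you want to double-check that equivalence mechanically, unsigned arithmetic in C++ wraps by definition, so a throwaway loop over the whole domain will do (a sketch, not from the original post):

#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t NWORDS = 24, ZERO_WORDS = 11;
    for (uint32_t i = 0; i < NWORDS; i++) {
        for (uint32_t j = 0; j < i; j++) {
            bool wrapped  = j < i - ZERO_WORDS + 1;                       // wraps like wrapping_sub/wrapping_add
            bool readable = i < ZERO_WORDS - 1 || i - j > ZERO_WORDS - 1; // the rewrite
            if (wrapped != readable)
                printf("mismatch at i=%u j=%u\n", i, j);
        }
    }
    puts("no output above means the two conditions agree");
    return 0;
}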
I've got a serial version of BML (the Biham–Middleton–Levine traffic model) and I'm trying to write a parallel one with OpenMP. Basically my code has a main that, within a loop, calls two functions for horizontal and vertical moves, like this:
for (s = 0; s < nmovss; s++) {
    horizontal_movs(grid, N);
    copy_sides(grid, N);
    cur = 1-cur;
    vertical_movs(grid, N);
    copy_sides(grid, N);
    cur = 1-cur;
}
Where cur is the current grid. The horizontal and vertical functions are similar and have a nested loop:
for(i = 1; i <= n; i++) {
    for(j = 1; j <= n+1; j++) {
        if(grid[cur][i][j-1] == LR && grid[cur][i][j] == EMPTY) {
            grid[1-cur][i][j-1] = EMPTY;
            grid[1-cur][i][j] = LR;
        }
        else {
            grid[1-cur][i][j] = grid[cur][i][j];
        }
    }
}
The code produces a PPM image at every step, and with a certain input the serial version produces output that we can assume is good. But using #pragma omp parallel for inside the two functions H and V, the PPM file comes out split into as many zones as there are threads (i.e. 4).
I suppose the problem is that every thread should do both functions in sequence before terminating, because the movements are strictly connected. I don't know how to do that. If I set the pragma at a higher level, like before the main loop, there is no speed-up. Obviously the PPM file must not be sliced like in the image.
Going on, I tried this solution, which gives me a result identical to the serial code, but I don't exactly understand why:
# pragma omp parallel num_threads(thread_count) default(none) \
        shared(grid, n, cur) private(i, j)
{
    for(i = 1; i <= n+1; i++) {
        # pragma omp for
        for(j = 1; j <= n; j++) {
            if(grid[cur][i-1][j] == TB && grid[cur][i][j] == EMPTY) {
                grid[1-cur][i-1][j] = EMPTY;
                grid[1-cur][i][j] = TB;
            }
            else {
                grid[1-cur][i][j] = grid[cur][i][j];
            }
        }
    }
}
Moreover, if I use just one thread more than the available cores (4), the execution time "explodes" instead of remaining roughly the same.
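What I imagine the higher-level structure would have to look like is a single parallel region around the whole step loop, with each phase work-shared along its independent dimension (rows for horizontal moves, columns for vertical ones) and the serial bookkeeping done by one thread. A sketch, where EMPTY/LR/TB, grid, and copy_sides are simplified stand-ins for my real definitions:

#include <omp.h>

// Structural sketch only: the constants, grid, and copy_sides below are
// simplified stand-ins for the real definitions.
enum { EMPTY = 0, LR = 1, TB = 2 };
const int N = 64;
int grid[2][N + 2][N + 2];
void copy_sides(int cur) { /* wrap-around edge copy, omitted */ }

void run(int nmovss, int thread_count)
{
    int n = N, cur = 0;
    #pragma omp parallel num_threads(thread_count) shared(cur) firstprivate(n, nmovss)
    {
        for (int s = 0; s < nmovss; s++) {
            // horizontal phase: a move only touches columns j-1 and j of the
            // same row, so rows can be distributed across threads; the implicit
            // barrier at the end of the omp for orders it before the next phase
            #pragma omp for
            for (int i = 1; i <= n; i++)
                for (int j = 1; j <= n + 1; j++)
                    if (grid[cur][i][j-1] == LR && grid[cur][i][j] == EMPTY) {
                        grid[1-cur][i][j-1] = EMPTY;
                        grid[1-cur][i][j]   = LR;
                    } else {
                        grid[1-cur][i][j] = grid[cur][i][j];
                    }
            #pragma omp single
            {   // one thread does the serial bookkeeping; implicit barrier after
                copy_sides(cur);
                cur = 1 - cur;
            }
            // vertical phase: moves touch rows i-1 and i, so parallelize over
            // columns instead, exactly as in the version quoted above
            for (int i = 1; i <= n + 1; i++) {
                #pragma omp for
                for (int j = 1; j <= n; j++)
                    if (grid[cur][i-1][j] == TB && grid[cur][i][j] == EMPTY) {
                        grid[1-cur][i-1][j] = EMPTY;
                        grid[1-cur][i][j]   = TB;
                    } else {
                        grid[1-cur][i][j] = grid[cur][i][j];
                    }
            }
            #pragma omp single
            {
                copy_sides(cur);
                cur = 1 - cur;
            }
        }
    }
}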
I want to calculate the determinant of a matrix with a thread, but I get the error "term does not evaluate to a function taking 0 arguments". I want to handle a big matrix with threads by splitting the matrix. What can I do?
int determinant(int f[1000][1000], int x)
{
    int pr, c[1000], d = 0, b[1000][1000], j, p, q, t;
    if (x == 2)
    {
        d = 0;
        d = (f[1][1] * f[2][2]) - (f[1][2] * f[2][1]);
        return(d);
    }
    else
    {
        for (j = 1; j <= x; j++)
        {
            int r = 1, s = 1;
            for (p = 1; p <= x; p++)
            {
                for (q = 1; q <= x; q++)
                {
                    if (p != 1 && q != j)
                    {
                        b[r][s] = f[p][q];
                        s++;
                        if (s > x - 1)
                        {
                            r++;
                            s = 1;
                        }
                    }
                }
            }
            for (t = 1, pr = 1; t <= (1 + j); t++)
                pr = (-1)*pr;
            c[j] = pr*determinant(b, x - 1);
        }
        for (j = 1, d = 0; j <= x; j++)
        {
            d = d + (f[1][j] * c[j]);
        }
        return(d);
    }
}

int main()
{
    srand(time_t(NULL));
    int i, j;
    printf("\n\nEnter order of matrix : ");
    scanf_s("%d", &m);
    printf("\nEnter the elements of matrix\n");
    for (i = 1; i <= m; i++)
    {
        for (j = 1; j <= m; j++)
        {
            a[i][j] = rand() % 10;
        }
    }
    thread t(determinant(a, m));
    t.join();
    printf("\n Determinant of Matrix A is %d .", determinant(a, m));
}
The immediate problem is that here: thread t(determinant(a, m)); you pass the result of calling determinant(a, m) as the function to execute, with zero arguments to call it with; but an int is not a function or other callable object, which is what the error you got complains about.
std::thread's constructor takes the function to run and its arguments separately, so you would need to call std::thread(determinant, a, m).
Now we have another problem: std::thread doesn't provide a way to retrieve the return value, so you calculate it again here: printf("\n Determinant of Matrix A is %d .", determinant(a, m));.
To fix this, we can use std::async from the <future> header, which will manage the thread handling for us, and lets us retrieve the result later:
auto result = std::async(std::launch::async, determinant, a, m);
int det = result.get();
This will run determinant(a,m) on a new thread, and return a std::future<int> into which the return value may eventually be placed.
We can then try to retrieve that value with std::future::get(), which will block until the value can be retrieved (or until an exception occurs in the thread).
In this example, we still execute determinant in a pretty serial fashion, since we delegate the work to a thread, then wait for that thread to finish its work before continuing.
However we are now free to store the future, and defer calling std::future::get() until we actually need the value, potentially much later in your program.
There are a few other problems in the rest of your code:
all your array indexing is off by one (array indices run from 0 to N-1 in C and C++)
a few of the variables you're using don't exist (like a and m)
C-arrays are passed by pointer, so if you ever change the code not to block on the thread right there, the array will go out of scope and your thread may read garbage through the dangling pointer. If you use a proper container like std::array or std::vector, you can pass it by value so that your thread owns the data it operates on for its entire lifetime (see the sketch below).
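Putting those pieces together, here is a sketch of what that could look like, with a vector-based determinant (0-based Laplace expansion along the first row, rather than your fixed-size arrays):

#include <future>
#include <iostream>
#include <vector>

// 0-based Laplace expansion along the first row; the vector is taken by
// value, so the asynchronous task owns its own copy of the data.
int determinant(std::vector<std::vector<int>> f)
{
    const int n = static_cast<int>(f.size());
    if (n == 1) return f[0][0];
    int d = 0, sign = 1;
    for (int j = 0; j < n; j++, sign = -sign) {
        // build the minor obtained by deleting row 0 and column j
        std::vector<std::vector<int>> minor(n - 1, std::vector<int>(n - 1));
        for (int p = 1; p < n; p++)
            for (int q = 0, s = 0; q < n; q++)
                if (q != j) minor[p - 1][s++] = f[p][q];
        d += sign * f[0][j] * determinant(minor);
    }
    return d;
}

int main()
{
    std::vector<std::vector<int>> a = {{1, 2}, {3, 4}};
    // run on another thread; the future receives the return value
    auto result = std::async(std::launch::async, determinant, a);
    std::cout << "Determinant of Matrix A is " << result.get() << ".\n";  // prints -2
}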
I'm trying to use Thrust to detect if each element of an array can be found in another array and where (both arrays are sorted). I came across the vectorized search routines (lower_bound and binary_search).
lower_bound will return for each value the index where it could be inserted in a list respecting its ordering.
I also need to know if the value is found or not (which can be done with binary_search), not just its position.
Is it possible to achieve both efficiently without making two searches (calling binary_search and then lower_bound)?
I know that in the scalar case lower_bound will return a pointer to the end of the array if a value cannot be found, but this does not happen in the vectorized version.
Thanks!
You can check that the element that lower_bound returns is the same as the one you searched for. E.g. given a = {1,3,5} and searching for b = {1,4}, the result will be c = {0,2}. We have a[c[0]] == b[0], so b[0] is in a, but a[c[1]] != b[1] so b[1] is not in a.
(Note that you will need to ensure that you don't make any out-of-bounds memory accesses, since lower_bound can return an index that is beyond the end of the array.)
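For example, a sketch of that check with one vectorized thrust::lower_bound followed by a single thrust::transform (found_at is just a name made up for this sketch):

#include <thrust/device_vector.h>
#include <thrust/binary_search.h>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

// functor: given (insertion index, searched value), report whether the value
// is actually present, guarding against the past-the-end index
struct found_at
{
    const int* a;
    int n;
    found_at(const int* a_, int n_) : a(a_), n(n_) {}
    __host__ __device__
    bool operator()(const thrust::tuple<int, int>& t) const
    {
        int idx = thrust::get<0>(t);
        int val = thrust::get<1>(t);
        return idx < n && a[idx] == val;
    }
};

int main()
{
    int a_host[] = {1, 3, 5};
    int b_host[] = {1, 4};
    thrust::device_vector<int> a(a_host, a_host + 3), b(b_host, b_host + 2);
    thrust::device_vector<int> pos(b.size());
    thrust::device_vector<bool> present(b.size());
    // one vectorized search: where each b[i] would be inserted into a
    thrust::lower_bound(a.begin(), a.end(), b.begin(), b.end(), pos.begin());
    // membership check without a second search
    thrust::transform(
        thrust::make_zip_iterator(thrust::make_tuple(pos.begin(), b.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(pos.end(), b.end())),
        present.begin(),
        found_at(thrust::raw_pointer_cast(a.data()), (int)a.size()));
    // pos is now {0, 2} and present is {true, false}
    return 0;
}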
@tat0: you can also play around with ArrayFire:
vectorized search using lower_bound() does not give you the answer immediately,
while with setintersect() in ArrayFire you get the "intersection" of two arrays directly:
float A_host[] = {3,22,4,5,2,9,234,11,6,17,7,873,23,45,454};
int szA = sizeof(A_host) / sizeof(float);
float B_host[] = {345,5,55,6,7,8,19,2,63};
int szB = sizeof(B_host) / sizeof(float);
// initialize arrays from host data
array A(szA, 1, A_host);
array B(szB, 1, B_host);
array U = setintersect(A, B); // compute intersection of 2 arrays
int n_common = U.elements();
std::cout << "common: ";
print(U);
the output is:
common: U = 2.0000
5.0000
6.0000
7.0000
to get the actual locations of these elements in array A, you can use the following
construct (provided that the elements in A are unique): (A == U(i)) is a 0/1 mask, so
multiplying it elementwise by the index sequence seq(szA) and summing yields the
position of the match:
int n_common = U.elements();
array loc = zeros(n_common); // array of locations
gfor(array i, n_common)      // parallel for loop
    loc(i) = sum((A == U(i)) * seq(szA));
print(loc);
then: loc =
4.0000
3.0000
8.0000
10.0000
Furthermore, thrust::lower_bound() seems to be slower than setintersect();
I benchmarked it with the following program:
int *g_data = 0;
int g_N = 0;

void thrust_test() {
    thrust::device_ptr<int> A = thrust::device_pointer_cast((int *)g_data),
                            B = thrust::device_pointer_cast((int *)g_data + g_N);
    thrust::device_vector<int> output(g_N);
    thrust::lower_bound(A, A + g_N, B, B + g_N,
                        output.begin(),
                        thrust::less<int>());
    std::cout << "thrust: " << output.size() << "\n";
}

void af_test()
{
    array A(g_N, 1, g_data, afDevicePointer);
    array B(g_N, 1, g_data + g_N, afDevicePointer);
    array U = setintersect(A, B);
    std::cout << "intersection sz: " << U.elements() << "\n";
}

int main()
{
    g_N = 3e6; // 3M entries
    thrust::host_vector< int > input(g_N*2);
    for (int i = 0; i < g_N*2; i++) { // generate some input
        if (i & 1)
            input[i] = (i*i) % 1131;
        else
            input[i] = (i*i*i - 1) % 1223;
    }
    thrust::device_vector< int > dev_input = input;
    // sort the vector A
    thrust::sort(dev_input.begin(), dev_input.begin() + g_N);
    // sort the vector B
    thrust::sort(dev_input.begin() + g_N, dev_input.begin() + g_N*2);
    g_data = thrust::raw_pointer_cast(dev_input.data());
    try {
        info();
        printf("thrust: %.5f seconds\n", timeit(thrust_test));
        printf("af: %.5f seconds\n", timeit(af_test));
    } catch (af::exception& e) {
        fprintf(stderr, "%s\n", e.what());
    }
    return 0;
}
and the results:
CUDA toolkit 4.2, driver 295.59
GPU0 GeForce GT 650M, 2048 MB, Compute 3.0 (single,double)
Memory Usage: 1937 MB free (2048 MB total)
thrust: 0.13008 seconds
arrayfire: 0.06702 seconds