This is a huge do loop and I have privatized many variables to avoid race conditions. However, the values don't match the serial code, and when I print proc_num I see that only 2 of the 8 threads (0 and 1) are ever active. I want to paste the entire code, but it's too long.
OUTPUT: the code runs, but the values are wrong and only 0 and 1 are ever printed for proc_num.
thread_num = 8
!$ call omp_set_num_threads (thread_num)
.
.
.
.
!$OMP parallel do private(A_inc,T_star_tr,temp33,B_alpha,C_alpha,C_alpha_vec,&
!$OMP del_CRSS,iter_out,iter_in,del_T_star_tau,RSS_alpha,&
!$OMP del_gam_tau,G_n,Schmid_dum,J_n, J_n_inv,correction,&
!$OMP check,EXP_N,CRSS_c_tau,temp12,checkouter,flag,Fp_det,Fe_det)
do vk=1,npts3
do vj=1,npts2
do vi=1,npts1
A_inc = 0.d0
do vii = 1,3
do vjj = 1,3
do vmm = 1,3
do vpp = 1,3
do voo = 1,3
A_inc(vii,vjj)= A_inc(vii,vjj) + Fp_inv_t(vmm,vii,vi,vj,vk)* &
F_tau(vpp,vmm,vi,vj,vk)*F_tau(vpp,voo,vi,vj,vk)*Fp_inv_t(voo,vjj,vi,vj,vk)
end do
end do
end do
end do
end do
!$OMP critical
!$ proc_num = omp_get_thread_num()
print *, proc_num, A_inc
!$OMP end critical
T_star_tr = 0.d0
do vii = 1,3
do vjj = 1,3
do vkk = 1,3
do vll = 1,3
T_star_tr(vii,vjj) = T_star_tr(vii,vjj) +&
Stiff_sam(vii,vjj,vkk,vll,vi,vj,vk)*0.5*(A_inc(vkk,vll) - IDEN_2(vkk,vll))
end do
end do
end do
end do
.
.
.
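One sanity check worth doing here (a sketch of my own, not taken from the post above): ask the OpenMP runtime directly how many threads are in the team and whether dynamic thread adjustment is enabled, since omp_get_thread_num() alone only shows which threads happened to reach the print:

! standalone diagnostic sketch; compile with OpenMP enabled (e.g. -fopenmp or -qopenmp)
program check_threads
  use omp_lib
  implicit none
  call omp_set_num_threads(8)
  print *, 'dynamic adjustment enabled: ', omp_get_dynamic()
  print *, 'max threads before region : ', omp_get_max_threads()
  !$omp parallel
  !$omp single
  print *, 'threads in the team       : ', omp_get_num_threads()
  !$omp end single
  !$omp end parallel
end program check_threads

Also note that the parallel do above only distributes the outermost vk loop; if npts3 happens to be small (say 2), only that many threads can ever receive work, and a collapse(3) clause (the three outer loops are perfectly nested) would give the runtime npts1*npts2*npts3 iterations to hand out.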
Related
Testing an atomic example, I got a strange result.
program atomic
use omp_lib
implicit none
integer, parameter :: num_threads = 4, m = 1000000
integer :: thread_num
integer :: i, j, sum1 = 0, sum2 = 0, tic,toc, rate
real:: time
integer, external :: increment
thread_num = 0
!$ call omp_set_num_threads(num_threads)
!////////// ATOMIC ////////////////////////////////////////////////////////////
CALL system_clock(count_rate=rate)
call system_clock(tic)
!$omp parallel do private(thread_num, j) &
!$omp shared(sum1, sum2)
do i = 0 , m-1
!$ thread_num = omp_get_thread_num()
!$omp atomic
sum1 = sum1 + i
sum2 = sum2 + increment(thread_num, i)
end do
!$omp end parallel do
print*, "sum 1 = ", sum1
print*, "sum 2 = ", sum2
call system_clock(toc)
time = real(toc-tic)/real(rate)
print*, "Time atomic: ", time, 's'
!////////// CRITICAL ////////////////////////////////////////////////////////////
sum1=0; sum2=0
CALL system_clock(count_rate=rate)
call system_clock(tic)
!$omp parallel do private(thread_num, j) &
!$omp shared(sum1, sum2)
do i = 0 , m-1
!$ thread_num = omp_get_thread_num()
!$omp critical
sum1 = sum1 + i
sum2 = sum2 + increment(thread_num, i)
!$omp end critical
end do
!$omp end parallel do
print*, "sum 1 = ", sum1
print*, "sum 2 = ", sum2
call system_clock(toc)
time = real(toc-tic)/real(rate)
print*, "Time critical: ", time, 's'
end program atomic
integer function increment (thread_num, j)
implicit none
integer, intent(in) :: thread_num, j
! print*, "Function increment run by thread number: ", thread_num
increment = j
end function increment
Using 'm = 10000000' (7 zeros) I get:
sum 1 = -2014260032
sum 2 = -1146784608
Time atomic: 1.13900006 s
sum 1 = -2014260032
sum 2 = -2014260032
Time critical: 4.09000015 s
Using 'm=1000000' (6 zeros) I get:
sum 1 = 1783293664
sum 2 = 1576859165
Time atomic: 0.123999998 s
sum 1 = 1783293664
sum 2 = 1783293664
Time critical: 0.133000001 s
I have two questions:
Why do I get a negative output in the first case?
Why is sum1 not equal to sum2 in the atomic output?
It was compiled using:
gfortran -Wall -Wextra -fopenmp -O2 -Wall -o prog.exe prueba.f90
./prog.exe
Why do I get a negative output in the first case?
Because the sum operation overflows. From this source one can read:
In computer programming, an integer overflow occurs when an arithmetic
operation attempts to create a numeric value that is outside of the
range that can be represented with a given number of digits – either
higher than the maximum or lower than the minimum representable value
For m = 10000000 the exact result is 49999995000000, which is larger than the maximum value representable by a default (32-bit) integer in Fortran, 2147483647.
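To make the overflow concrete, here is a small sketch of my own (not part of the original answer) that compares the exact result with the largest value a default integer can hold:

program show_overflow
  use iso_fortran_env, only: int64
  implicit none
  integer(int64) :: m, exact
  m = 10000000_int64
  exact = m*(m - 1)/2                  ! sum of 0..m-1 = 49999995000000
  print *, 'exact sum = ', exact
  print *, 'huge(0)   = ', huge(0)     ! 2147483647 for a default 32-bit integer
end program show_overflow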
The second question
Why is sum1 not equal to sum2 in the atomic output?
Because the atomic construct is applied only to the single statement that immediately follows it:
sum1 = sum1 + i
The first problem can be solved by using a data type that can represent a wider range of numbers. The second problem can be solved as follows:
!$omp atomic
sum1 = sum1 + i
!$omp atomic
sum2 = sum2 + increment(thread_num, i)
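Alternatively (my own sketch, not something the original answer requires): both issues go away if the sums are 64-bit integers and the accumulation is done with a reduction clause instead of per-iteration atomic updates. The names below mirror the question's code; the call to increment is replaced by the value it returns:

program sums_reduction
  use iso_fortran_env, only: int64
  implicit none
  integer, parameter :: m = 10000000
  integer :: i
  integer(int64) :: sum1, sum2

  sum1 = 0; sum2 = 0
  !$omp parallel do reduction(+:sum1, sum2)
  do i = 0, m - 1
     sum1 = sum1 + i
     sum2 = sum2 + i     ! stands in for increment(thread_num, i), which just returns i
  end do
  !$omp end parallel do

  print *, 'sum 1 = ', sum1
  print *, 'sum 2 = ', sum2
end program sums_reduction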
So I'm trying to wrap my head around Julia's parallelization options. I'm modelling stochastic processes as Markov chains. Since the chains are independent replicates, the outer loops are independent - making the problem embarrassingly parallel.
I tried to implement both a @distributed and a @threads solution; both seem to run fine, but neither is any faster than the sequential version.
Here's a simplified version of my code (sequential):
function dummy(steps = 10000, width = 100, chains = 4)
out_N = zeros(steps, width, chains)
initial = zeros(width)
for c = 1:chains
# print("c=$c\n")
N = zeros(steps, width)
state = copy(initial)
N[1,:] = state
for i = 1:steps
state = state + rand(width)
N[i,:] = state
end
out_N[:,:,c] = N
end
return out_N
end
What would be the correct way of parallelizing this problem to increase performance?
Here is the correct way to do it (at the time of writing this answer the other answer does not work - see my comment).
I will use a slightly less complex example than the one in the question (though a very similar one).
1. Not parallelized version (baseline scenario)
using Random
const m = MersenneTwister(0);
function dothestuff!(out_N, N, ic, m)
out_N[:, ic] .= rand(m, N)
end
function dummy_base(m=m, N=100_000,c=256)
out_N = Array{Float64}(undef,N,c)
for ic in 1:c
dothestuff!(out_N, N, ic, m)
end
out_N
end
Testing:
julia> using BenchmarkTools; @btime dummy_base();
106.512 ms (514 allocations: 390.64 MiB)
2. Parallelize with threads
#remember to run before starting Julia:
# set JULIA_NUM_THREADS=4
# OR (Linux)
# export JULIA_NUM_THREADS=4
using Random
const mt = MersenneTwister.(1:Threads.nthreads());
# one RNG per thread; required for older Julia versions, still fine in later versions :-)
function dothestuff!(out_N, N, ic, m)
out_N[:, ic] .= rand(m, N)
end
function dummy_threads(mt=mt, N=100_000,c=256)
out_N = Array{Float64}(undef,N,c)
Threads.@threads for ic in 1:c
dothestuff!(out_N, N, ic, mt[Threads.threadid()])
end
out_N
end
Let us test the performance:
julia> using BenchmarkTools; @btime dummy_threads();
46.775 ms (535 allocations: 390.65 MiB)
3. Parallelize with processes (on a single machine)
using Distributed
addprocs(4)
using Random, SharedArrays
@everywhere using Random, SharedArrays, Distributed
@everywhere Random.seed!(myid())
@everywhere function dothestuff!(out_N, N, ic)
out_N[:, ic] .= rand(N)
end
function dummy_distr(N=100_000,c=256)
out_N = SharedArray{Float64}(N,c)
@sync @distributed for ic in 1:c
dothestuff!(out_N, N, ic)
end
out_N
end
Performance (note that inter-process communication takes some time, and hence for small computations threads will usually be better):
julia> using BenchmarkTools; @btime dummy_distr();
62.584 ms (1073 allocations: 45.48 KiB)
You can use the @distributed macro to run processes in parallel:
@everywhere using Distributed, SharedArrays
addprocs(4)
@everywhere function inner_loop!(out_N, chain_number,steps,width)
N = zeros(steps, width)
state = zeros(width)
for i = 1:steps
state .+= rand(width)
N[i,:] .= state
end
out_N[:,:,chain_number] .= N
nothing
end
function dummy(steps = 10000, width = 100, chains = 4)
out_N = SharedArray{Float64}((steps, width, chains); pids = collect(1:4))
@sync for c = 1:chains
# print("c=$c\n")
@spawnat :any inner_loop!(out_N, c, steps,width)
end
sdata(out_N)
end
I have a loop in QB64 and a question concerning loop optimization. Is this:
DIM N AS DOUBLE, X(100000000) AS DOUBLE
T! = TIMER
FOR N = 1 to 100000000
IF X(N) THEN
PRINT X(N)
EXIT FOR
END IF
NEXT
PRINT TIMER - T!
any faster than this:
DIM N AS DOUBLE, X(100000000) AS DOUBLE
T! = TIMER
FOR N = 1 to 100000000
IF X(N) <> 0 THEN
PRINT X(N)
EXIT FOR
END IF
NEXT
PRINT TIMER - T!
EDITED: 09-18-2018 to include variable types
I wrote this code to evaluate your test:
REM Delete REM to enable console runs
REM $CONSOLE:ONLY
REM _DEST _CONSOLE
DIM SHARED N AS DOUBLE, X(100000000) AS DOUBLE
S# = 0: ZC% = 0
T% = 10
IF COMMAND$ <> "" THEN
T% = VAL(COMMAND$)
END IF
IF T% > 999 THEN T% = 999
FOR I% = 1 TO T%
A# = TRYA
B# = TRYB
D# = A# - B#
PRINT USING "Case A ... : #.########"; A#
PRINT USING "Case B ... : #.########"; B#
PRINT USING "Diff ..... : #.########"; D#;
A$ = ""
IF ABS(D#) < 0.00000001 THEN
ZC% = ZC% + 1
A$ = "*"
END IF
S# = S# + A# - B#
PRINT A$
PRINT
REM INKEY$ doesn't work in console mode!
A$ = INKEY$
IF A$ = CHR$(27) THEN
I% = I% + 1: EXIT FOR
END IF
NEXT
PRINT USING "Avrg A - B : #.########"; S# / (I% - 1)
PRINT USING "0 diff:### on ### tryes"; ZC%, (I% - 1)
PRINT
PRINT "Hit a key to exit!"
REM INPUT$ doesn't work in console mode!
A$ = INPUT$(1)
SYSTEM
FUNCTION TRYA#
T# = TIMER
FOR N = 1 TO 100000000
IF X(N) THEN
PRINT X(N)
EXIT FOR
END IF
NEXT
A# = TIMER - T#
TRYA = A#
END FUNCTION
FUNCTION TRYB#
T# = TIMER
FOR N = 1 TO 100000000
IF X(N) <> 0 THEN
PRINT X(N)
EXIT FOR
END IF
NEXT
A# = TIMER - T#
TRYB = A#
END FUNCTION
The two different routines are inserted into two functions: TRYA and TRYB.
I launched this program with a loop that runs the functions 999 times, and the result is:
Avrg. A - B: 0.00204501
0 diff:359 on 999 tryes
Then I launched it with a loop of 10 iterations, and the result is:
Avrg. A - B: -.01640625
0 diff: 1 on 10 tryes
Then I launched it with a loop of 15 iterations, and the result is:
Avrg. A - B: 0.00026042
0 diff: 5 on 15 tryes
Because we run the program in a multitasking environment I don't believe this is a very good test, but here are some results:
In two cases the runs with no measurable difference (0 diff) make up about a third of all loops.
In two cases the function TRYA seems to be slower.
In one case the function TRYB seems to be slower.
Looking at these results, I think we may consider the two functions equivalent!
You can run more than 10 loops by launching the code from the command line (or by changing the COMMAND$ parameter in the QB64 menu) as:
# ./test n
Where n is the number of loops you desire.
The program was compiled using gcc with the -O3 optimization option. (To do this you have to modify the file [/opt/]qb64/internal/c/makeline_lnx.txt.)
I found some NaNs in a data array in my Fortran code, and I may have isolated the issue to an OpenMP do loop. When this loop runs, NaNs appear in sp%ptl(i)%ph(6):
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
if(sp%ptl(i)%ph(3) >= 2*pi .or. sp%ptl(i)%ph(3) < 0D0 ) then
sp%ptl(i)%ph(3) = modulo(sp%ptl(i)%ph(3), 2*pi)
endif
enddo
enddo
But if I run the same loop with one extra line that adds 0D0 to a dummy variable, the NaNs in sp%ptl(i)%ph(6) go away:
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
if(sp%ptl(i)%ph(3) >= 2*pi .or. sp%ptl(i)%ph(3) < 0D0 ) then
sp%ptl(i)%ph(3) = modulo(sp%ptl(i)%ph(3), 2*pi)
endif
tmp = tmp + 0D0
enddo
enddo
Of course, there is more to the real code and this is not a minimal working example. My question, though, is: why does adding any line to the do loop keep sp%ptl(i)%ph(6) from ever getting NaNs? Is having only an if-statement in the inner OpenMP do loop a bad idea? How this works is confusing to me right now.
UPDATE: Here is a minimal example. It doesn't quite behave like the larger code yet: it doesn't produce NaNs, but rather large numbers at random points of the array ptl. At the very least it shows the basic workflow. I compile it the same way as the larger codebase (Intel compiler, 18.0.1.163), then run it with srun -n1 -c24.
I did some further tests with the larger codebase and found that recompiling the subroutines represented here by "mymod" without optimization (i.e. -O0 -g -C) makes the NaNs go away.
UPDATE 2: Never mind about the large numbers; I had simply forgotten to initialize ptl(i)%ph (now added at the beginning of push). With that added I never get NaNs or large numbers in this minimal example (they are still there in the larger code with optimization on).
module mymod
integer, parameter :: ptl_nphase=8
integer, parameter :: num=1000
integer, parameter :: sml_nthreads=24
type ptl_type
real(8) :: ph(ptl_nphase)
end type ptl_type
contains
logical function is_nan(a)
implicit none
real (8) :: a
! any ordinary number satisfies at least one of the two comparisons; only a NaN fails both
is_nan = .not. ( a > 1D0 .or. a < 2D0 )
end function is_nan
subroutine split_indices(total,num_pieces,ibeg,iend)
implicit none
integer :: total
integer :: num_pieces
integer :: ibeg(num_pieces), iend(num_pieces)
integer :: itmp1, itmp2, ioffset, i
if (num_pieces > 0) then
itmp1 = total/num_pieces
itmp2 = mod(total,num_pieces)
ioffset = 0
do i=1,itmp2
ibeg(i) = ioffset + 1
iend(i) = ioffset + (itmp1+1)
ioffset = iend(i)
enddo
do i=itmp2+1,num_pieces
ibeg(i) = ioffset + 1
if (ibeg(i) > total) then
iend(i) = ibeg(i) - 1
else
iend(i) = ioffset + itmp1
ioffset = iend(i)
endif
enddo
endif
end subroutine split_indices
subroutine calc_source(ptl,icycle)
implicit none
type(ptl_type) :: ptl(num)
integer :: ith, i, i_beg(sml_nthreads), i_end(sml_nthreads)
integer :: icycle
call split_indices(num, sml_nthreads, i_beg, i_end)
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
ptl(i)%ph(6) = ptl(i)%ph(6) + 1D0
enddo
enddo
if (icycle==1) then
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
ptl(i)%ph(7) = ptl(i)%ph(6)
enddo
enddo
endif
end subroutine calc_source
subroutine push1(ptl)
implicit none
type(ptl_type) :: ptl(num)
integer :: ith, i, i_beg(sml_nthreads), i_end(sml_nthreads)
real(8) :: arr1(5)
call split_indices(num, sml_nthreads, i_beg, i_end)
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
call random_number(arr1)
ptl(i)%ph(1:5) = ptl(i)%ph(1:5) + arr1
enddo
enddo
end subroutine push1
subroutine push(ptl)
implicit none
type(ptl_type) :: ptl(num)
integer :: icycle
integer :: ith, i, i_beg(sml_nthreads), i_end(sml_nthreads)
call split_indices(num, sml_nthreads, i_beg, i_end)
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
ptl(i)%ph(:) = 0D0
enddo
enddo
do icycle=1,100
call calc_source(ptl,icycle)
call push1(ptl)
call split_indices(num, sml_nthreads, i_beg, i_end)
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
ptl(i)%ph(3) = modulo(ptl(i)%ph(3),6.28)
enddo
enddo
enddo
end subroutine push
end module mymod
program main
use mymod
implicit none
type(ptl_type) :: ptl(num)
integer :: ith, i, i_beg(sml_nthreads), i_end(sml_nthreads)
call push(ptl)
!check for nan
call split_indices(num, sml_nthreads, i_beg, i_end)
!$OMP PARALLEL DO &
!$OMP PRIVATE( ITH, I )
do ith=1, sml_nthreads
do i=i_beg(ith), i_end(ith)
! if (is_nan(ptl(i)%ph(6))) then
!print *,'is_nan',i
print *,ptl(i)%ph(6)
! endif
enddo
enddo
end program main
I found a solution, but still without understanding why, and still without being able to produce a minimal working example (my simple example never replicated the issue in the larger code). This has to do with optimization and how loops are inlined and vectorized, but when I looked through the optimization reports there wasn't a clear difference between compiling with and without the tmp = tmp + 0D0 line.
So I decided to start removing code. If I removed the call to calc_source, the NaNs went away. When I put it back and used an empty calc_source subroutine, the NaNs came back. I moved calc_source into the same file as push, and the NaNs went away.
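As an aside (my addition, not part of the original post): the hand-rolled is_nan in the minimal example relies on the fact that a NaN fails every comparison. Since Fortran 2003 the same check can be written with ieee_is_nan from the intrinsic ieee_arithmetic module, for example:

logical function is_nan(a)
  use, intrinsic :: ieee_arithmetic, only: ieee_is_nan
  implicit none
  real(8), intent(in) :: a
  is_nan = ieee_is_nan(a)   ! true only if a is a NaN
end function is_nan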
I would appreciate your point of view on where I might have gone wrong using OpenMP.
I parallelized this code fairly straightforwardly, yet even with a single thread (i.e., call omp_set_num_threads(1)) I get wrong results.
I have checked with Intel Inspector and I do not have a race condition, but the Inspector tool did warn that a thread might access another thread's stack (I get the same warning in other code of mine, which runs fine with OpenMP). I do not think this is the problem.
SUBROUTINE GR(NUMBER_D, RAD_D, RAD_CC, SPECT)
use TERM,only: DENSITY, TEMPERATURE, VISCOSITY, WATER_DENSITY, &
PRESSURE, D_HOR, D_VER, D_TEMP, QQQ, UMU
use SATUR,only: FF, A1, A2, AAA, BBB, SAT
use DELTA,only: DDM, DT
use CONST,only: PI, G
IMPLICIT NONE
INTEGER,INTENT(IN) :: NUMBER_D
DOUBLE PRECISION,INTENT(IN) :: RAD_CC(NUMBER_D), SPECT(NUMBER_D)
DOUBLE PRECISION,INTENT(INOUT) :: RAD_D(NUMBER_D)
DOUBLE PRECISION :: R3, DR3, C2, C0, P, Q, RAD_CR, SAT_CR, C4, A, &
C, D, CC, DD, CC2, DD2, RAD_ST, DRAA, DRA, DM, X1
INTEGER :: I
DDM = 0.0D0
!$OMP PARALLEL DO DEFAULT(SHARED) &
!$OMP PRIVATE(I,R3,DR3,C2,C0,P,Q,SAT,SAT_CR,C4,A) &
!$OMP PRIVATE (C,D,CC,DD,CC2,DD2,RAD_ST,DRAA,DRA,DM,RAD_CR,X1) &
!$OMP REDUCTION (+:DDM)
DO I=1,NUMBER_D
R3 = RAD_CC(I)**3
DR3 = RAD_D(I)**3-R3
IF(DR3.LT.1.0D-100) DR3 = 1.0D-100
C2 = -DSQRT(3.0D0*BBB*R3/AAA)
C0 = -R3
P = -0.3333333333D0*C2**2
Q = C0+0.074074074D0*C2**3
CALL CUBIC(P, Q, RAD_CR)
RAD_CR = RAD_CR - 0.3333333333D0*C2
SAT_CR = DEXP(AAA/RAD_CR-BBB*R3/(RAD_CR**3-R3))-1.0D0
DRA = DT*(SAT+1.0D0-DEXP(AAA/RAD_D(I)-BBB*R3/DR3))/ &
(FF*RAD_D(I))
IF(SAT.LT.SAT_CR) THEN
IF(DABS(SAT).LT.1.0D-10) THEN
P = -BBB*R3/AAA
Q = -R3
CALL CUBIC(P, Q, RAD_ST)
GO TO 22
END IF
C4 = DLOG(SAT+1.0D0)
A = -AAA/C4
C = (BBB-C4)*R3/C4
D = -A*R3
P = A*C-4.0D0*D
Q = -(A**2*D+C**2)
CALL CUBIC(P, Q, X1)
CC = DSQRT(A**2+4.D0*X1)
DD = DSQRT(X1**2-4.D0*D)
CC2 = 0.5D0*(A-CC)
IF(SAT.LT.0.0D0) THEN
DD2 = 0.5D0*(X1-DD)
RAD_ST = 0.5D0*(-CC2+DSQRT(CC2**2-4.0D0*DD2))
ELSE
DD2 = 0.5D0*(X1+DD)
RAD_ST = 0.5D0*(-CC2-DSQRT(CC2**2-4.0D0*DD2))
END IF
22 CONTINUE
DRAA = RAD_ST-RAD_D(I)
IF(ABS(DRAA).LT.ABS(DRA)) THEN
DRA = DRAA
DM = 1.3333333333333333D0*PI*WATER_DENSITY* &
(RAD_ST**3-RAD_D(I)**3)
ELSE
DM = 4.0D0*PI*WATER_DENSITY*RAD_D(I)**2*DRA
END IF
DDM = DDM+SPECT(I)*DM
RAD_D(I) = RAD_D(I) + DRA
ELSE
DM = 4.0D0*PI*WATER_DENSITY*RAD_D(I)**2*DRA
DDM = DDM+SPECT(I)*DM
RAD_D(I) = RAD_D(I) + DRA
END IF
END DO
!$OMP END PARALLEL DO
RETURN
END SUBROUTINE GR
SUBROUTINE CUBIC(P, Q, X)
IMPLICIT NONE
DOUBLE PRECISION,INTENT(IN) :: P, Q
DOUBLE PRECISION,INTENT(OUT) :: X
DOUBLE PRECISION :: DIS, PP, COSALFA,ALFA, QQ, U, V
DIS = (P/3.D0)**3+(0.5D0*Q)**2
IF(DIS.LT.0.0D0) THEN
PP = -P/3.0D0
COSALFA = -0.5D0*Q/DSQRT(PP**3)
ALFA = DACOS(COSALFA)
X = 2.0D0*DSQRT(PP)*DCOS(ALFA/3.0D0)
RETURN
ELSE
QQ = DSQRT(DIS)
U = -0.5D0*Q+QQ
V = -0.5D0*Q-QQ
IF(U.GE.0.0D0) THEN
U = U**0.333333333333333D0
ELSE
U = -(-U)**0.333333333333333D0
END IF
IF(V.GE.0.0D0) THEN
V = V**0.333333333333333D0
ELSE
V = -(-V)**0.333333333333333D0
END IF
X = U+V
END IF
RETURN
END SUBROUTINE CUBIC
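One thing worth double-checking (my observation, not an established answer in this thread): SAT comes from the SATUR module and is read inside the loop (in the DRA expression and in IF(SAT.LT.SAT_CR)) but never assigned there, yet it appears in the PRIVATE list. A PRIVATE copy is undefined on entry to the parallel region, which can give wrong results even with a single thread; FIRSTPRIVATE (or simply leaving a read-only variable shared) keeps the value set before the loop. A tiny illustration with made-up names:

program private_vs_firstprivate
  implicit none
  double precision :: sat, total
  integer :: i

  sat = 0.5d0
  total = 0.0d0

  ! with PRIVATE(sat) instead of FIRSTPRIVATE(sat), each thread's copy of sat
  ! would start out undefined and total would be garbage
  !$omp parallel do firstprivate(sat) reduction(+:total)
  do i = 1, 100
     total = total + sat
  end do
  !$omp end parallel do

  print *, total   ! 50.0 when sat keeps its initial value of 0.5
end program private_vs_firstprivate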