My CentOS 6 VM shows four cores in /proc/cpuinfo, and /sys/devices/system/cpu/online shows 0-3.
I am trying to run the following code on cores 2 and 3 using KMP_AFFINITY="explicit,proclist=[2-3]":
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>

int main (int argc, char *argv[]) {
    int nthreads, tid, cid;
    #pragma omp parallel private(nthreads, tid, cid)
    {
        tid = omp_get_thread_num();
        cid = sched_getcpu();   /* CPU the thread is currently running on */
        printf("Hello from thread %d on core %d\n", tid, cid);
        if (tid == 0) {
            nthreads = omp_get_num_threads();
            printf("Number of threads = %d\n", nthreads);
        }
    }
    return EXIT_SUCCESS;
}
When compiled with icc (ICC) 16.0.1 20151021, it fails to detect the available cores and executes everything on core 0.
$ OMP_NUM_THREADS=4 ./a.out
OMP: Warning #123: Ignoring invalid OS proc ID 2.
OMP: Warning #123: Ignoring invalid OS proc ID 3.
OMP: Warning #124: No valid OS proc IDs specified - not using affinity.
Hello from thread 0 on core 0
Number of threads = 1
Hello from thread 0 on core 0
Number of threads = 1
Hello from thread 0 on core 0
Number of threads = 1
Hello from thread 0 on core 0
Number of threads = 1
Whereas gcc (GCC) 4.4.7 20120313, with GOMP_CPU_AFFINITY="2-3", executes properly on cores 2 and 3, as configured.
I used strace to check what's going on under the hood, and I noticed something strange:
[...]
open("/sys/devices/system/cpu/online", O_RDONLY|O_CLOEXEC) = 3
read(3, "0-3\n", 8192) = 4
[...]
sched_getaffinity(0, 1048576, { 1 }) = 8
sched_setaffinity(0, 8, { 4521c26fbb1c38c1 }) = -1 EFAULT (Bad address)
[...]
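Also, if I read the strace output right, the sched_getaffinity line reports a mask of { 1 }, which would mean only CPU 0 is in the process affinity mask to begin with. A hedged way to double-check that from the shell is the util-linux taskset command (here querying the current shell's mask):

$ taskset -pc $$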
Could this be a bug in Intel's OpenMP implementation?
I cannot upgrade my compiler to fix it in this case. Is it possible to use the GCC OpenMP library instead of the Intel one when compiling with icc?
Update:
I managed to compile the code with gcc and link it against the Intel OpenMP runtime (iomp5) using the following command:
gcc omp.c -L/opt/intel/compilers_and_libraries_2016/linux/lib/intel64_lin/ -liomp5
The execution outputs no warnings but is still not correct:
$ OMP_NUM_THREADS=4 ./a.out
Hello from thread 0 on core 0
Number of threads = 1
The same sched_setaffinity error as previously shown appears.
Related
I am running a code on a 24-core architecture and would like to use one MPI rank for each set of three cores bound to an L3 cache block; so, 8 MPI ranks per socket, 16 per node, with 3 threads per rank. I think the following command line should apply:
mpirun --bind-to l3 -np 16 gmx_mpi mdrun -nt 3
with --bind-to binding the MPI ranks to each L3 cache block, -np allocating 16 MPI ranks per node, and -nt setting 3 threads per MPI rank. Is this the correct approach?
If the cores are capable of multithreading (2 hardware threads each), is it right to write
mpirun --bind-to l3 -np 16 gmx_mpi mdrun -nt 6
--bind-to core, I assume, binds one MPI rank per core, either with no spanning into hardware threads, or spanning into 2 threads per core to exploit multithreading, e.g.
mpirun --bind-to core -np 48 gmx_mpi mdrun -nt 2
with 48 ranks, one per core, on a 2-socket platform and 2 threads per core (MT).
Would you confirm?
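To check what actually gets bound, I assume Open MPI's --report-bindings option can be added to the same command line to print each rank's binding mask at launch, e.g.:

mpirun --bind-to l3 --report-bindings -np 16 gmx_mpi mdrun -nt 3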
I always use this piece of code, which I inherited from somewhere many years ago, to print out bindings at runtime. For example, on my 4-core laptop:
dsh@e7390dh:binding$ mpicc -o bind bind.c utilities.c
dsh@e7390dh:binding$ mpirun -n 4 ./bind
Rank 2 on core 2,6 of node <e7390dh>
Rank 3 on core 3,7 of node <e7390dh>
Rank 0 on core 0,4 of node <e7390dh>
Rank 1 on core 1,5 of node <e7390dh>
i.e. each process is bound to one physical core but can run on either hypercore. If there is no binding you get a range, e.g. "on core [0-7]".
Hope this is useful.
bind.c:
#include <stdio.h>
#include <mpi.h>

void printlocation();

int main(void)
{
    MPI_Init(NULL, NULL);
    printlocation();
    MPI_Finalize();
    return 0;
}
utilities.c:
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sched.h>
#include <mpi.h>
/* Borrowed from util-linux-2.13-pre7/schedutils/taskset.c */
static char *cpuset_to_cstr(cpu_set_t *mask, char *str)
{
    char *ptr = str;
    int i, j, entry_made = 0;

    for (i = 0; i < CPU_SETSIZE; i++) {
        if (CPU_ISSET(i, mask)) {
            int run = 0;
            entry_made = 1;
            for (j = i + 1; j < CPU_SETSIZE; j++) {
                if (CPU_ISSET(j, mask)) run++;
                else break;
            }
            if (!run)
                sprintf(ptr, "%d,", i);
            else if (run == 1) {
                sprintf(ptr, "%d,%d,", i, i + 1);
                i++;
            } else {
                sprintf(ptr, "%d-%d,", i, i + run);
                i += run;
            }
            while (*ptr != 0) ptr++;
        }
    }
    ptr -= entry_made;   /* drop the trailing comma, if any entry was written */
    *ptr = 0;
    return(str);
}

void printlocation()
{
    int rank, namelen;
    char hnbuf[MPI_MAX_PROCESSOR_NAME];

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    memset(hnbuf, 0, sizeof(hnbuf));
    MPI_Get_processor_name(hnbuf, &namelen);

    cpu_set_t coremask;
    char clbuf[7 * CPU_SETSIZE];
    memset(clbuf, 0, sizeof(clbuf));
    (void)sched_getaffinity(0, sizeof(coremask), &coremask);
    cpuset_to_cstr(&coremask, clbuf);

    printf("Rank %d on core %s of node <%s>\n", rank, clbuf, hnbuf);
}
The exact option seems to be --bind-to l3cache:
mpirun --bind-to l3cache -np 16 gmx_mpi mdrun -nt 6
I am learning the first steps with OpenMP and got stuck a little bit. Why does my code not use all available threads? OMP_NUM_THREADS=6 has been set as an environment variable.
#include <omp.h>
#include <iostream>

int main() {
    int max = omp_get_max_threads();
    std::cout << "Max threads: " << max << std::endl;
    #pragma omp parallel
    {
        int n = omp_get_num_threads();
        int tid = omp_get_thread_num();
        std::cout << "There are threads:" << n << " Hello from thread: " << tid << std::endl;
    }
    /* end of parallel section */
    std::cout << "Hello from the master thread\n";
    return 0;
}
Output:
Max threads: 6
There are threads:1 Hello from thread: 0
Hello from the master thread
Update: I also tried omp_set_dynamic(0); with no success.
Update: it was solved with a compiler flag:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
Answer:
It was a problem with the compiler flags in CMakeLists.txt; I needed to set this flag:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
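For what it's worth, a more portable sketch of the same fix (assuming CMake 3.9 or newer and a hypothetical target named my_app) lets CMake pick the right OpenMP flag instead of hard-coding the MSVC-specific /openmp:

# Hypothetical target name "my_app"; assumes CMake >= 3.9.
find_package(OpenMP REQUIRED)
target_link_libraries(my_app PRIVATE OpenMP::OpenMP_CXX)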
This question is related to:
c++ std::async : faster on 4 cores compared to 8 cores
In the previous question, I was wondering why some code would run faster on 4 cores rather than 8 (answer: my CPU had 4 cores and 8 threads).
Now I am discovering that code is more or less the same speed independently of the number of cores used.
I am on Ubuntu 16.06, C++11, Intel® Core™ i7-8550U CPU @ 1.80GHz × 8.
Here is the code for benchmarking computation time against the number of cores used:
#include <math.h>
#include <future>
#include <ctime>
#include <vector>
#include <iostream>
#define NB_JOBS 2000.0
#define MAX_CORES 8
// no special meaning to this function,
// just uses some CPU
static bool _expensive(int nb_jobs){
    for(int job = 0; job < nb_jobs; job++){
        float x = 0.6;
        bool b = true;
        double f = 1;
        for(int i = 0; i < 1000; i++){
            if(!b) f = -1;
            for(double j = 1; j < 2.0; j += 0.01) x += f * pow(1.0 / sin(x), j);
            b = !b;
        }
    }
    return true;
}

static double _duration(int nb_cores){
    std::clock_t begin = clock();
    int nb_jobs_per_core = rint( NB_JOBS / (float)nb_cores );
    std::vector< std::future<bool> > futures;
    for(int i = 0; i < nb_cores; i++){
        futures.push_back( std::async(std::launch::async, _expensive, nb_jobs_per_core) );
    }
    for (auto &e : futures) {
        bool foo = e.get();
    }
    std::clock_t end = clock();
    double duration = double(end - begin) / CLOCKS_PER_SEC;
    return duration;
}

int main(){
    for(int nb_cores = 1; nb_cores <= MAX_CORES; nb_cores++){
        double duration = _duration(nb_cores);
        std::cout << nb_cores << " threads: " << duration << "\n";
    }
    return 0;
}
Here is the output:
1 threads: 8.55817
2 threads: 8.76621
3 threads: 7.90191
4 threads: 8.4656
5 threads: 10.5494
6 threads: 11.6175
7 threads: 21.697
8 threads: 24.3621
Using more cores seems to have only a marginal impact.
What troubles me is that the CPU has 4 cores. So I was expecting the program to run (around) 4 times faster when using 4 threads. It does not.
Note: "htop" shows usage of virtual cores as expected by the program, i.e. first one core used at 100%, then 2, ..., and at the end 8.
If I replace:
futures.push_back( std::async(std::launch::async,[...]
by :
futures.push_back( std::async(std::launch::async|std::launch::deferred,[...]
then I get:
1 threads: 8.6459
2 threads: 8.69905
3 threads: 10.7763
4 threads: 11.4505
5 threads: 11.8426
6 threads: 10.4282
7 threads: 9.55181
8 threads: 9.05565
and htop shows only 1 virtual core being used 100% during the full duration.
Is there anything I am doing wrong?
Note: I tried on several desktops, all with various specs (number of cores and threads), and observed something similar.
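For comparison, here is a hedged wall-clock variant of _duration (std::clock() as used above accumulates CPU time over all threads, so it may not reflect elapsed time). It reuses _expensive and NB_JOBS from the code above and additionally needs #include <chrono>:

#include <chrono>

// Hedged sketch: same benchmark loop as _duration, but timed with the
// monotonic wall clock instead of std::clock().
static double _wall_duration(int nb_cores){
    auto begin = std::chrono::steady_clock::now();
    int nb_jobs_per_core = rint( NB_JOBS / (float)nb_cores );
    std::vector< std::future<bool> > futures;
    for(int i = 0; i < nb_cores; i++){
        futures.push_back( std::async(std::launch::async, _expensive, nb_jobs_per_core) );
    }
    for (auto &e : futures) {
        e.get();
    }
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(end - begin).count();   // elapsed seconds
}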
I'm at the very beginning with OpenMP; I just compiled the following piece of code with gcc -fopenmp openmp_c_helloworld.c:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int main (int argc, char *argv[]) {
    int th_id, nthreads;
    #pragma omp parallel private(th_id)
    {
        th_id = omp_get_thread_num();
        printf("Hello World from thread %d\n", th_id);
        #pragma omp barrier
        if ( th_id == 0 ) {
            nthreads = omp_get_num_threads();
            printf("There are %d threads\n", nthreads);
        }
    }
    return EXIT_SUCCESS;
}
I just ran the executable on a quad-core Intel CPU with Hyper-Threading and obtained the following output:
Hello World from thread 2
Hello World from thread 0
Hello World from thread 3
Hello World from thread 1
There are 4 threads
Technically speaking I have 8 hardware threads available on my CPU and 4 CPU cores, so why does OpenMP show me only 4 threads?
To put it simply, I think it's because OpenMP looks at the number of CPUs (cores) rather than the number of processor threads.
See this page:
Implementation default - usually the number of CPUs on a node, though
it could be dynamic (see next bullet).
Something you could try out is setting the number of threads in your program to be equal to the number of processor threads and seeing if there's a performance improvement (you'll have to create your own benchmarking program).
In parallel programming, good performance is obtained when the number of worker threads is equal to the number of processor threads. You can keep a thread or two extra for I/O as well.
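As a minimal sketch of that suggestion (the value 8 is hard-coded here because it is the hardware thread count reported in the question, not queried from the system):

#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_num_threads(8);   /* request one worker per hardware thread */
    #pragma omp parallel
    {
        printf("Hello from thread %d of %d\n",
               omp_get_thread_num(), omp_get_num_threads());
    }
    return 0;
}

Setting OMP_NUM_THREADS=8 in the environment has the same effect without recompiling.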
I've managed to get my pthreads program sort of working. Basically I am trying to manually set the affinity of 4 threads such that thread 1 runs on CPU 1, thread 2 runs on CPU 2, thread 3 runs on CPU 3, and thread 4 runs on CPU 4.
After compiling, my code works for a few threads but not others (it seems like thread 1 never works), and running the same compiled program a couple of different times gives me different results.
For example:
hao@Gorax:~/Desktop$ ./a.out
Thread 3 is running on CPU 3
pthread_setaffinity_np: Invalid argument
Thread Thread 2 is running on CPU 2
hao@Gorax:~/Desktop$ ./a.out
Thread 2 is running on CPU 2
pthread_setaffinity_np: Invalid argument
pthread_setaffinity_np: Invalid argument
Thread 3 is running on CPU 3
Thread 3 is running on CPU 3
hao@Gorax:~/Desktop$ ./a.out
Thread 2 is running on CPU 2
pthread_setaffinity_np: Invalid argument
Thread 4 is running on CPU 4
Thread 4 is running on CPU 4
hao@Gorax:~/Desktop$ ./a.out
pthread_setaffinity_np: Invalid argument
My question is "Why does this happen? Also, why does the message sometimes print twice?"
Here is the code:
#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <sched.h>
#include <errno.h>
#define handle_error_en(en, msg) \
do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)
void *thread_function(char *message)
{
    int s, j, number;
    pthread_t thread;
    cpu_set_t cpuset;

    number = (int)message;
    thread = pthread_self();
    CPU_SET(number, &cpuset);
    s = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (s != 0)
    {
        handle_error_en(s, "pthread_setaffinity_np");
    }
    printf("Thread %d is running on CPU %d\n", number, sched_getcpu());
    exit(EXIT_SUCCESS);
}

int main()
{
    pthread_t thread1, thread2, thread3, thread4;
    int thread1Num = 1;
    int thread2Num = 2;
    int thread3Num = 3;
    int thread4Num = 4;
    int thread1Create, thread2Create, thread3Create, thread4Create, i, temp;

    thread1Create = pthread_create(&thread1, NULL, (void *)thread_function, (char *)thread1Num);
    thread2Create = pthread_create(&thread2, NULL, (void *)thread_function, (char *)thread2Num);
    thread3Create = pthread_create(&thread3, NULL, (void *)thread_function, (char *)thread3Num);
    thread4Create = pthread_create(&thread4, NULL, (void *)thread_function, (char *)thread4Num);

    pthread_join(thread1, NULL);
    pthread_join(thread2, NULL);
    pthread_join(thread3, NULL);
    pthread_join(thread4, NULL);

    return 0;
}
The first CPU is CPU 0 not CPU 1. So you'll want to change your threadNums:
int thread1Num = 0;
int thread2Num = 1;
int thread3Num = 2;
int thread4Num = 3;
You should initialize cpuset with the CPU_ZERO() macro this way:
CPU_ZERO(&cpuset);
CPU_SET(number, &cpuset);
Also don't call exit() from a thread as it will stop the whole process with all its threads:
exit(EXIT_SUCCESS);
return 0; // Use this instead or call pthread_exit()
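Putting those fixes together, a hedged sketch of the corrected thread function could look like this (it assumes the question's headers and handle_error_en macro, plus <stdint.h> for the pointer-to-int cast):

void *thread_function(void *arg)
{
    int number = (int)(intptr_t)arg;   /* CPU index passed through the arg pointer */
    cpu_set_t cpuset;

    CPU_ZERO(&cpuset);                 /* clear the set before adding a CPU */
    CPU_SET(number, &cpuset);          /* number must be 0..3 on a 4-CPU machine */

    int s = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (s != 0)
        handle_error_en(s, "pthread_setaffinity_np");

    printf("Thread %d is running on CPU %d\n", number, sched_getcpu());
    return NULL;                       /* return instead of exit(), so the other threads keep running */
}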