cudaStreamSynchronize behavior under multiple threads - multithreading

What is the behavior of cudaStreamSynchronize in the following case?
ThreadA pseudocode:
    while (true):
        submit new CUDA kernel to cudaStreamX
ThreadB pseudocode:
    call cudaStreamSynchronize(cudaStreamX)
My question is: when will ThreadB return? Since ThreadA keeps pushing new kernels, cudaStreamX never drains.

The API documentation isn't directly explicit about this; however, the CUDA C programming guide is:
cudaStreamSynchronize() takes a stream as a parameter and waits until all preceding commands in the given stream have completed
Furthermore, two things should be sensible:
cudaStreamSynchronize() cannot reasonably take into account work issued to the stream after the cudaStreamSynchronize() call; that would more or less require it to know the future.
cudaStreamSynchronize() should reasonably be expected to return once all work issued to that stream before the call is complete.
Putting together an experimental test app, the behavior described above is what I observe:
$ cat t396.cu
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>

const int PTHREADS=2;
const int TRIGGER1=5;

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

#define DELAY_T 1000000000ULL
template <int type>
__global__ void delay_kern(int i){
  unsigned long long time = clock64();
#ifdef DEBUG
  printf("hello %d\n", type);
#endif
  while (clock64() < time+(i*DELAY_T));
}

volatile static int flag, flag0, loop_cnt;

// The thread configuration structure.
typedef struct
{
  int my_thread_ordinal;
  pthread_t thread;
  cudaError_t status;
  cudaStream_t stream;
  int delay_usec;
}
config_t;

// The function executed by each thread assigned to the CUDA device.
void *thread_func(void *arg)
{
  // Unpack the config structure.
  config_t *config = (config_t *)arg;
  int my_thread = config->my_thread_ordinal;
  cudaError_t cuda_status = cudaSuccess;
  cuda_status = cudaSetDevice(0);
  if (cuda_status != cudaSuccess) {
    fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
            0, cuda_status);
    config->status = cuda_status;
    pthread_exit(NULL);
  }
  printf("thread %d initialized\n", my_thread);
  switch(config->my_thread_ordinal){
    case 0:
      // master thread: keep issuing kernels until told to stop
      while (flag0) {
        delay_kern<0><<<1,1,0,config->stream>>>(1);
        if (loop_cnt++ > TRIGGER1) flag = 1;
        printf("master thread loop: %d\n", loop_cnt);
        usleep(config->delay_usec);
      }
      break;
    default:
      // slave thread: wait for the trigger, then synchronize on the stream
      while (!flag);
      printf("slave thread issuing stream sync at loop count: %d\n", loop_cnt);
      cudaStreamSynchronize(config->stream);
      flag0 = 0;
      printf("slave thread set trigger and exit\n");
      break;
  }
  cudaCheckErrors("thread CUDA error");
  printf("thread %d complete\n", my_thread);
  config->status = cudaSuccess;
  return NULL;
}

int main(int argc, char* argv[])
{
  int mydelay_usec = 1;
  if (argc > 1) mydelay_usec = atoi(argv[1]);
  if ((mydelay_usec < 1) || (mydelay_usec > 10000000)) {
    printf("invalid delay time specified\n");
    return -1;
  }
  flag = 0; flag0 = 1; loop_cnt = 0;
  const int nthreads = PTHREADS;
  // Create worker configs. This data will be passed as
  // the argument to thread_func.
  config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
  cudaSetDevice(0);
  cudaStream_t str;
  cudaStreamCreate(&str);
  // Create a separate thread for each config
  // and execute thread_func in it.
  for (int i = 0; i < nthreads; i++) {
    config_t *config = configs + i;
    config->my_thread_ordinal = i;
    config->stream = str;
    config->delay_usec = mydelay_usec;
    int status = pthread_create(&config->thread, NULL, thread_func, config);
    if (status) {
      fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
              i, status);
    }
  }
  // Wait for thread completion and check error status.
  int status = 0;
  for (int i = 0; i < nthreads; i++) {
    pthread_join(configs[i].thread, NULL);
    status += configs[i].status;
  }
  if (status)
    return status;
  free(configs);
  return 0;
}
$ nvcc -arch=sm_61 -o t396 t396.cu -lpthread
$ time ./t396 100000
thread 0 initialized
thread 1 initialized
master thread loop: 1
master thread loop: 2
master thread loop: 3
master thread loop: 4
master thread loop: 5
master thread loop: 6
slave thread issuing stream sync at loop count: 7
master thread loop: 7
master thread loop: 8
master thread loop: 9
master thread loop: 10
master thread loop: 11
master thread loop: 12
master thread loop: 13
master thread loop: 14
master thread loop: 15
master thread loop: 16
master thread loop: 17
master thread loop: 18
master thread loop: 19
master thread loop: 20
master thread loop: 21
master thread loop: 22
master thread loop: 23
master thread loop: 24
master thread loop: 25
master thread loop: 26
master thread loop: 27
master thread loop: 28
master thread loop: 29
master thread loop: 30
master thread loop: 31
master thread loop: 32
master thread loop: 33
master thread loop: 34
master thread loop: 35
master thread loop: 36
master thread loop: 37
master thread loop: 38
master thread loop: 39
slave thread set trigger and exit
thread 1 complete
thread 0 complete
real 0m5.416s
user 0m2.990s
sys 0m1.623s
$
This requires some careful thought to understand, but in a nutshell: one thread issues kernels that each execute roughly a 0.7s delay before returning (DELAY_T is 10^9 device clocks, which at a GPU clock around 1.4 GHz works out to about 0.7s), and the other thread waits for a small number of kernels to be issued, then issues a cudaStreamSynchronize() call. The overall time measurement for the application tells us when that call returned.
As long as you keep the command-line parameter (the host delay between kernel launches) below about 0.5s, the app reliably exits in about 5.4s (this will vary depending on which GPU you run on, but the overall app execution time stays constant up to a fairly large value of the host delay parameter). If you specify a command-line parameter larger than the kernel duration on your machine, then the overall app execution time will be approximately 5 times your command-line parameter (in microseconds), since the trigger point for the cudaStreamSynchronize() call is 5.
In my case, I compiled and ran this with CUDA 8.0.61 on Ubuntu 14.04 with a Pascal Titan X.
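As an aside: if ThreadB only needs to wait for the work issued up to a particular point, even while ThreadA keeps issuing new kernels, a CUDA event gives you an explicit marker for that point. A minimal sketch of the pattern, reusing delay_kern from the test app above (the fragment and its arrangement are my own illustration, not part of the original test):

cudaStream_t stream;
cudaEvent_t ev;
cudaStreamCreate(&stream);
cudaEventCreate(&ev);
delay_kern<0><<<1,1,0,stream>>>(1);  // work issued "so far"
cudaEventRecord(ev, stream);         // marker placed after that work
delay_kern<0><<<1,1,0,stream>>>(1);  // later work, issued after the marker
cudaEventSynchronize(ev);            // returns once the first kernel is done;
                                     // it does not wait for the second
cudaEventDestroy(ev);
cudaStreamDestroy(stream);

cudaEventSynchronize() may also be called from a different thread than the one that recorded the event, which matches the two-thread scenario in the question.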

Related

Why this simple program on shared variable does not scale? (no lock)

I'm new to concurrent programming. I implemented a CPU-intensive workload and measured how much speedup I could gain. However, I cannot get any speedup as I increase the number of threads.
The program does the following task:
There's a shared counter to count from 1 to 1000001.
Each thread does the following until the counter reaches 1000001:
increments the counter atomically, then
runs a loop for 10000 times.
There are 1000001*10000 = about 10^10 operations in total to perform, so I should be able to get a good speedup as I increase the number of threads.
Here's how I implemented it:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdatomic.h>

pthread_t workers[8];
atomic_int counter; // a shared counter

void *runner(void *param);

int main(int argc, char *argv[]) {
    if (argc != 2) {
        printf("Usage: ./thread thread_num\n");
        return 1;
    }
    int NUM_THREADS = atoi(argv[1]);
    pthread_attr_t attr;
    counter = 1; // initialize shared counter
    pthread_attr_init(&attr);
    const clock_t begin_time = clock(); // begin timer
    for (int i = 0; i < NUM_THREADS; i++)
        pthread_create(&workers[i], &attr, runner, NULL);
    for (int i = 0; i < NUM_THREADS; i++)
        pthread_join(workers[i], NULL);
    const clock_t end_time = clock(); // end timer
    printf("Thread number = %d, execution time = %lf s\n", NUM_THREADS,
           (double)(end_time - begin_time)/CLOCKS_PER_SEC);
    return 0;
}

void *runner(void *param) {
    int temp = 0;
    while (temp < 1000001) {
        temp = atomic_fetch_add_explicit(&counter, 1, memory_order_relaxed);
        for (int i = 1; i < 10000; i++)
            temp%i; // do some CPU intensive work
    }
    pthread_exit(0);
}
However, when I run my program, I cannot get better performance than sequential execution!
gcc-4.9 -std=c11 -pthread -o my_program my_program.c
for i in 1 2 3 4 5 6 7 8; do \
./my_program $i; \
done
Thread number = 1, execution time = 19.235998 s
Thread number = 2, execution time = 20.575237 s
Thread number = 3, execution time = 25.161116 s
Thread number = 4, execution time = 28.278671 s
Thread number = 5, execution time = 28.185605 s
Thread number = 6, execution time = 28.050380 s
Thread number = 7, execution time = 28.286925 s
Thread number = 8, execution time = 28.227132 s
I run the program on a 4-core machine.
Does anyone have suggestions to improve the program? Or any clue why I cannot get speedup?
The only work here that can be done in parallel is the loop:
for(int i=1;i<10000;i++)
    temp%i; // do some CPU intensive work
gcc, even at the minimal optimisation level, will not emit any code for the void expression temp%i; (disassemble it and see), so this essentially becomes an empty loop, which executes very fast. The execution time in the case with multiple threads running on different cores is then dominated by the cacheline containing your atomic variable ping-ponging between the different cores.
You need to make this loop actually do a significant amount of work before you'll see a speed-up.
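For example, a minimal adjustment (my own sketch, not from the question; sink is a hypothetical global introduced for illustration) is to accumulate the result and publish it somewhere the compiler cannot prove dead, so the loop body survives optimisation:

volatile long long sink; // observable destination so gcc keeps the work

void *runner(void *param) {
    int temp = 0;
    long long acc = 0;
    while (temp < 1000001) {
        temp = atomic_fetch_add_explicit(&counter, 1, memory_order_relaxed);
        for (int i = 1; i < 10000; i++)
            acc += temp % i;  // the result is now used
        sink = acc;           // side effect the compiler must preserve
    }
    pthread_exit(0);
}

With the loop doing real work, the time between atomic increments grows and the cacheline ping-pong amortises. Note also that clock() sums CPU time across all threads, so a wall-clock timer would show any speedup more directly.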

setitimer and signal count on Linux. Is signal count directly proportional to run time?

There is a test program that exercises setitimer on Linux (kernel 2.6; HZ=100). It sets various itimers to send a signal every 10 ms (the interval is actually set to 9 ms, but the timeslice is 10 ms). The program then runs for some fixed time (e.g. 30 sec) and counts signals.
Is it guaranteed that the signal count will be proportional to the running time? Will the count be the same in every run and with every timer type (-r -p -v)?
Note: there should be no other CPU-active processes on the system, and the question is about a fixed-HZ kernel.
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/time.h>

/* Use 9 ms timer */
#define usecs 9000

volatile sig_atomic_t events = 0; /* written from a signal handler */

void count(int a) {
    events++;
}

int main(int argc, char **argv)
{
    int timer, j, i, k = 0;
    struct itimerval timerval = {
        .it_interval = {.tv_sec = 0, .tv_usec = usecs},
        .it_value    = {.tv_sec = 0, .tv_usec = usecs}
    };
    if ((argc != 2) || (argv[1][0] != '-')) {
        printf("Usage: %s -[rpv]\n -r - ITIMER_REAL\n -p - ITIMER_PROF\n -v - ITIMER_VIRTUAL\n", argv[0]);
        exit(0);
    }
    switch (argv[1][1]) {
    case 'r':
        timer = ITIMER_REAL;
        break;
    case 'p':
        timer = ITIMER_PROF;
        break;
    case 'v':
        timer = ITIMER_VIRTUAL;
        break;
    default: /* avoid using timer uninitialized on a bad flag */
        printf("Unknown timer type: %c\n", argv[1][1]);
        exit(1);
    }
    signal(SIGALRM, count);
    signal(SIGPROF, count);
    signal(SIGVTALRM, count);
    setitimer(timer, &timerval, NULL);
    /* constants should be tuned to some huge value */
    for (j = 0; j < 4; j++)
        for (i = 0; i < 2000000000; i++)
            k += k*argc + 5*k + argc*3;
    printf("%d events\n", events);
    return 0;
}
Is it guaranteed that signal count will be proportional to running time?
Yes. In general, for all three timers, the longer the code runs, the more signals are received.
Will count be the same in every run and with every timer type (-r -p -v)?
No.
When the timer is set using ITIMER_REAL, the timer decrements in real (wall-clock) time.
When it is set using ITIMER_VIRTUAL, the timer decrements only while the process is executing in user space. So it doesn't decrement while the process is in a system call or during interrupt service routines.
So we can expect #real_signals > #virtual_signals.
An ITIMER_PROF timer decrements both during user-space execution of the process and while the OS is executing on behalf of the process, i.e. during system calls.
So #prof_signals > #virtual_signals.
ITIMER_PROF doesn't decrement when the OS is not executing on behalf of the process, so #real_signals > #prof_signals.
To summarise: #real_signals > #prof_signals > #virtual_signals.
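Since the three itimers are independent, you can arm two of them at once and observe the ordering directly. A small self-contained sketch along those lines (my own illustration, not from the question); the syscall-heavy loop accrues system time, so the PROF count should pull ahead of the VIRTUAL count:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/syscall.h>

static volatile sig_atomic_t prof_events, virt_events;

static void on_prof(int sig) { prof_events++; }
static void on_virt(int sig) { virt_events++; }

int main(void)
{
    struct itimerval tv = {
        .it_interval = {.tv_sec = 0, .tv_usec = 9000},
        .it_value    = {.tv_sec = 0, .tv_usec = 9000}
    };
    signal(SIGPROF, on_prof);
    signal(SIGVTALRM, on_virt);
    setitimer(ITIMER_PROF, &tv, NULL);    /* decrements on user + system time */
    setitimer(ITIMER_VIRTUAL, &tv, NULL); /* decrements on user time only */
    /* Force a real system call on each iteration so system time accrues. */
    for (long i = 0; i < 10000000; i++)
        syscall(SYS_getpid);
    printf("prof=%d virtual=%d\n", (int)prof_events, (int)virt_events);
    return 0;
}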

Why sleep() after acquiring a pthread_mutex_lock will block the whole program?

In my test program, I start two threads, each of which just does the following logic:
1) pthread_mutex_lock()
2) sleep(1)
3) pthread_mutex_unlock()
However, I find that after some time, one of the two threads will block on pthread_mutex_lock() forever, while the other thread works normally. This is very strange behavior, and I think it may be a potentially serious issue. By the Linux manual, sleep() is not prohibited while a pthread_mutex_t is held. So my question is: is this a real problem, or is there a bug in my code?
The following is the test program. In the code, the 1st thread's output is directed to stdout, while the 2nd's is directed to stderr, so we can compare the two outputs to see whether a thread is blocked.
I have tested it on Linux kernels 2.6.31 and 2.6.9. Both results are the same.
//======================= Test Program ===========================
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <pthread.h>

#define THREAD_NUM 2

static int data[THREAD_NUM];
static int sleepFlag = 1;
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

static void * threadFunc(void *arg)
{
    int* idx = (int*) arg;
    FILE* fd = NULL;
    if (*idx == 0)
        fd = stdout;
    else
        fd = stderr;
    while (1) {
        fprintf(fd, "\n[%d]Before pthread_mutex_lock is called\n", *idx);
        if (pthread_mutex_lock(&mutex) != 0) {
            exit(1);
        }
        fprintf(fd, "[%d]pthread_mutex_lock is finished. Sleep some time\n", *idx);
        if (sleepFlag == 1)
            sleep(1);
        fprintf(fd, "[%d]sleep done\n\n", *idx);
        fprintf(fd, "[%d]Before pthread_mutex_unlock is called\n", *idx);
        if (pthread_mutex_unlock(&mutex) != 0) {
            exit(1);
        }
        fprintf(fd, "[%d]pthread_mutex_unlock is finished.\n", *idx);
    }
}

// 1. compile
//    gcc -o pthread pthread.c -lpthread
// 2. run
//    1) ./pthread sleep 2> /tmp/error.log   # Each thread sleeps 1 second after it acquires pthread_mutex_lock
//       ==> We can find that /tmp/error.log does not grow.
//    or
//    2) ./pthread nosleep 2> /tmp/error.log # No sleep is done when each thread acquires pthread_mutex_lock
//       ==> We can find that both stdout and /tmp/error.log grow.
int main(int argc, char *argv[]) {
    if ((argc == 2) && (strcmp(argv[1], "nosleep") == 0))
    {
        sleepFlag = 0;
    }
    pthread_t t[THREAD_NUM];
    int i;
    for (i = 0; i < THREAD_NUM; i++) {
        data[i] = i;
        int ret = pthread_create(&t[i], NULL, threadFunc, &data[i]);
        if (ret != 0) {
            perror("pthread_create error\n");
            exit(-1);
        }
    }
    for (i = 0; i < THREAD_NUM; i++) {
        int ret = pthread_join(t[i], (void*)0);
        if (ret != 0) {
            perror("pthread_join error\n");
            exit(-1);
        }
    }
    exit(0);
}
This is the output:
On the terminal where the program is started:
root@skyscribe:~# ./pthread sleep 2> /tmp/error.log
[0]Before pthread_mutex_lock is called
[0]pthread_mutex_lock is finished. Sleep some time
[0]sleep done
[0]Before pthread_mutex_unlock is called
[0]pthread_mutex_unlock is finished.
...
On another terminal, watching the file /tmp/error.log:
root@skyscribe:~# tail -f /tmp/error.log
[1]Before pthread_mutex_lock is called
And no new lines are output to /tmp/error.log.
This is a wrong way to use mutexes. A thread should not hold a mutex for more time than it spends not holding it, and particularly not if it sleeps while holding the mutex. There is no FIFO guarantee for locking a mutex (for efficiency reasons).
More specifically, if thread 1 unlocks the mutex while thread 2 is waiting for it, that makes thread 2 runnable, but it does not force the scheduler to preempt thread 1 or to run thread 2 immediately. Most likely it will not, because thread 1 has recently slept. When thread 1 subsequently reaches the pthread_mutex_lock() call, it will generally be allowed to lock the mutex immediately, even though there is a thread waiting (and the implementation can know it). When thread 2 wakes up after that, it will find the mutex already locked and go back to sleep.
The best solution is not to hold a mutex for that long. If that is not possible, consider moving the lock-needing operations to a single thread (removing the need for the lock) or waking up the correct thread using condition variables.
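For the test program above, the simplest repair along those lines is to keep only the lock-protected work inside the critical section and sleep outside it. A minimal rearrangement of the question's loop (my own sketch, reusing its variables):

while (1) {
    // Critical section: only touch shared state while holding the mutex.
    if (pthread_mutex_lock(&mutex) != 0)
        exit(1);
    fprintf(fd, "[%d]holding the lock briefly\n", *idx);
    if (pthread_mutex_unlock(&mutex) != 0)
        exit(1);
    // Sleep while NOT holding the mutex, so the other thread
    // can acquire it in the meantime.
    if (sleepFlag == 1)
        sleep(1);
}

With this change both threads make steady progress, because neither one parks the mutex for a full second per iteration.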
There's neither a problem nor a bug in your code, but a combination of buffering and scheduling effects. Add an fflush here:
fprintf(fd, "[%d]pthread_mutex_unlock is finished.\n", *idx);
fflush(fd);
and run
./a.out 1> 1.log 2> 2.log &
and you'll see rather equal progress made by the two threads.
EDIT: and as @jilles said above, a mutex is supposed to be a short-wait lock, as opposed to long waits like a condition-variable wait, waiting for I/O, or sleeping. That's why a mutex is not a cancellation point, either.

Does a call to MPI_Barrier affect every thread in an MPI process?

Does a call to MPI_Barrier affect every thread in an MPI process, or only the thread that makes the call?
For your information, my MPI application will run with MPI_THREAD_MULTIPLE.
Thanks.
The way to think of this is that MPI_Barrier (and the other collectives) are blocking function calls, which block until all processes in the communicator have completed the function. That, I think, makes it a little easier to figure out what should happen: the function blocks the calling thread, but other threads continue on their way unimpeded.
So consider the following chunk of code (a shared done flag being flushed to communicate between threads is not how you should be doing thread communication, so please don't use this as a template for anything):
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv) {
    int ierr, size, rank;
    int provided;
    volatile int done = 0;
    MPI_Comm comm;

    ierr = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (provided == MPI_THREAD_SINGLE) {
        fprintf(stderr, "Could not initialize with thread support\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    comm = MPI_COMM_WORLD;
    ierr = MPI_Comm_size(comm, &size);
    ierr = MPI_Comm_rank(comm, &rank);

    if (rank == 1) sleep(10);

    #pragma omp parallel num_threads(2) default(none) shared(rank,comm,done)
    {
        #pragma omp single
        {
            /* spawn off one thread to do the barrier,... */
            #pragma omp task
            {
                MPI_Barrier(comm);
                printf("%d -- thread done Barrier\n", rank);
                done = 1;
                #pragma omp flush
            }
            /* and another to do some printing while we're waiting */
            #pragma omp task
            {
                volatile int *p = &done;
                while (!(*p)) {
                    printf("%d -- thread waiting\n", rank);
                    sleep(1);
                }
            }
        }
    }
    MPI_Finalize();
    return 0;
}
Rank 1 sleeps for 10 seconds, and all the ranks start a barrier in one thread. If you run this with mpirun -np 2, you'd expect the first of rank 0's threads to hit the barrier, and the other to cycle around printing and waiting; and sure enough, that's what happens:
$ mpirun -np 2 ./threadbarrier
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
0 -- thread waiting
1 -- thread waiting
0 -- thread done Barrier
1 -- thread done Barrier
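If you do want a completion flag like done in real code, OpenMP's atomic directives are a cleaner way to publish it than a volatile variable plus flush. A sketch of just the flag handling (my own substitution, with done as a plain shared int):

/* Writer task: publish completion. */
#pragma omp atomic write
done = 1;

/* Reader task: poll the flag. */
int finished = 0;
while (!finished) {
    #pragma omp atomic read
    finished = done;
    if (!finished) {
        printf("%d -- thread waiting\n", rank);
        sleep(1);
    }
}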

pthreads_setaffinity_np: Invalid argument?

I've managed to get my pthreads program sort of working. Basically I am trying to manually set the affinity of 4 threads such that thread 1 runs on CPU 1, thread 2 runs on CPU 2, thread 3 runs on CPU 3, and thread 4 runs on CPU 4.
After compiling, my code works for a few threads but not others (it seems like thread 1 never works), and running the same compiled program several times gives me different results.
For example:
hao@Gorax:~/Desktop$ ./a.out
Thread 3 is running on CPU 3
pthread_setaffinity_np: Invalid argument
Thread Thread 2 is running on CPU 2
hao@Gorax:~/Desktop$ ./a.out
Thread 2 is running on CPU 2
pthread_setaffinity_np: Invalid argument
pthread_setaffinity_np: Invalid argument
Thread 3 is running on CPU 3
Thread 3 is running on CPU 3
hao@Gorax:~/Desktop$ ./a.out
Thread 2 is running on CPU 2
pthread_setaffinity_np: Invalid argument
Thread 4 is running on CPU 4
Thread 4 is running on CPU 4
hao@Gorax:~/Desktop$ ./a.out
pthread_setaffinity_np: Invalid argument
My question is: why does this happen? Also, why does the message sometimes print twice?
Here is the code:
#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <sched.h>
#include <errno.h>

#define handle_error_en(en, msg) \
    do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)

void *thread_function(char *message)
{
    int s, j, number;
    pthread_t thread;
    cpu_set_t cpuset;
    number = (int)message;
    thread = pthread_self();
    CPU_SET(number, &cpuset);
    s = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (s != 0)
    {
        handle_error_en(s, "pthread_setaffinity_np");
    }
    printf("Thread %d is running on CPU %d\n", number, sched_getcpu());
    exit(EXIT_SUCCESS);
}

int main()
{
    pthread_t thread1, thread2, thread3, thread4;
    int thread1Num = 1;
    int thread2Num = 2;
    int thread3Num = 3;
    int thread4Num = 4;
    int thread1Create, thread2Create, thread3Create, thread4Create, i, temp;
    thread1Create = pthread_create(&thread1, NULL, (void *)thread_function, (char *)thread1Num);
    thread2Create = pthread_create(&thread2, NULL, (void *)thread_function, (char *)thread2Num);
    thread3Create = pthread_create(&thread3, NULL, (void *)thread_function, (char *)thread3Num);
    thread4Create = pthread_create(&thread4, NULL, (void *)thread_function, (char *)thread4Num);
    pthread_join(thread1, NULL);
    pthread_join(thread2, NULL);
    pthread_join(thread3, NULL);
    pthread_join(thread4, NULL);
    return 0;
}
The first CPU is CPU 0, not CPU 1, so you'll want to change your threadNums:
int thread1Num = 0;
int thread2Num = 1;
int thread3Num = 2;
int thread4Num = 3;
You should initialize cpuset with the CPU_ZERO() macro this way:
CPU_ZERO(&cpuset);
CPU_SET(number, &cpuset);
Also don't call exit() from a thread as it will stop the whole process with all its threads:
exit(EXIT_SUCCESS);
return 0; // Use this instead or call pthread_exit()
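Putting those three fixes together, a corrected thread function might look like this (a sketch based on the question's code, not a verbatim quote):

void *thread_function(void *message)
{
    int s, number;
    cpu_set_t cpuset;
    number = (int)(long)message;  /* 0-based CPU index passed through the arg pointer */
    CPU_ZERO(&cpuset);            /* clear the set before setting one bit */
    CPU_SET(number, &cpuset);
    s = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (s != 0)
        handle_error_en(s, "pthread_setaffinity_np");
    printf("Thread %d is running on CPU %d\n", number, sched_getcpu());
    return NULL;                  /* not exit(), which would kill the whole process */
}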
