How many threads per core? - multithreading

I am running a multi-threaded program on my computer which has 4 cores. I am creating threads that run with SCHED_FIFO, SCHED_OTHER, and SCHED_RR priorities. What is the maximum number of each type of thread that can run simultaneously?
For example,
I'm pretty sure only four SCHED_FIFO threads can run at a time (one per core)
but I'm not sure about the other two.
edit my code, as asked (it's long, but most of it is for testing how long each thread completes a delay task)
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
void *ThreadRunner(void *vargp);
void DisplayThreadSchdStats(void);
void delayTask(void);
int threadNumber = 0;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
#define NUM_THREADS 9
//used to store the information of each thread
typedef struct{
pthread_t threadID;
int policy;
struct sched_param param;
long startTime;
long taskStartTime;
long endTime1;
long endTime2;
long endTime3;
long runTime;
char startDate[30];
char endDate[30];
}ThreadInfo;
ThreadInfo myThreadInfo[NUM_THREADS];
//main function
int main(void){
printf("running...\n");
int fifoPri = 60;
int rrPri = 30;
//create the 9 threads and assign their scheduling policies
for(int i=0; i<NUM_THREADS; i++){
if(i%3 == SCHED_OTHER){
myThreadInfo[i].policy = SCHED_OTHER;
myThreadInfo[i].param.sched_priority = 0;
}
else if (i%3 == SCHED_FIFO){
myThreadInfo[i].policy = SCHED_RR;
myThreadInfo[i].param.sched_priority = rrPri++;
}
else{
myThreadInfo[i].policy = SCHED_FIFO;
myThreadInfo[i].param.sched_priority = fifoPri++;
}
pthread_create( &myThreadInfo[i].threadID, NULL, ThreadRunner, &myThreadInfo[i]);
pthread_cond_wait(&cond, &mutex);
}
printf("\n\n");
//join each thread
for(int g = 0; g < NUM_THREADS; g++){
pthread_join(myThreadInfo[g].threadID, NULL);
}
//print out the stats for each thread
DisplayThreadSchdStats();
return 0;
}
//used to print out all of the threads, along with their stats
void DisplayThreadSchdStats(void){
int otherNum = 0;
long task1RR = 0;
long task2RR = 0;
long task3RR = 0;
long task1FIFO = 0;
long task2FIFO = 0;
long task3FIFO = 0;
long task1OTHER = 0;
long task2OTHER = 0;
long task3OTHER = 0;
for(int g = 0; g < threadNumber; g++){
printf("\nThread# [%d] id [0x%x] exiting...\n", g + 1, (int) myThreadInfo[g].threadID);
printf("DisplayThreadSchdStats:\n");
printf(" threadID = 0x%x \n", (int) myThreadInfo[g].threadID);
if(myThreadInfo[g].policy == 0){
printf(" policy = SHED_OTHER\n");
task1OTHER += (myThreadInfo[g].endTime1 - myThreadInfo[g].taskStartTime);
task2OTHER += (myThreadInfo[g].endTime2 - myThreadInfo[g].endTime1);
task3OTHER += (myThreadInfo[g].endTime3 - myThreadInfo[g].endTime2);
otherNum++;
}
if(myThreadInfo[g].policy == 1){
printf(" policy = SHED_FIFO\n");
task1FIFO += (myThreadInfo[g].endTime1 - myThreadInfo[g].taskStartTime);
task2FIFO += (myThreadInfo[g].endTime2 - myThreadInfo[g].endTime1);
task3FIFO += (myThreadInfo[g].endTime3 - myThreadInfo[g].endTime2);
}
if(myThreadInfo[g].policy == 2){
printf(" policy = SHED_RR\n");
task1RR+= (myThreadInfo[g].endTime1 - myThreadInfo[g].taskStartTime);
task2RR += (myThreadInfo[g].endTime2 - myThreadInfo[g].endTime1);
task3RR += (myThreadInfo[g].endTime3 - myThreadInfo[g].endTime2);
}
printf(" priority = %d \n", myThreadInfo[g].param.sched_priority);
printf(" startTime = %s\n", myThreadInfo[g].startDate);
printf(" endTime = %s\n", myThreadInfo[g].endDate);
printf(" Task start TimeStamp in micro seconds [%ld]\n", myThreadInfo[g].taskStartTime);
printf(" Task end TimeStamp in micro seconds [%ld] Delta [%lu]us\n", myThreadInfo[g].endTime1 , (myThreadInfo[g].endTime1 - myThreadInfo[g].taskStartTime));
printf(" Task end Timestamp in micro seconds [%ld] Delta [%lu]us\n", myThreadInfo[g].endTime2, (myThreadInfo[g].endTime2 - myThreadInfo[g].endTime1));
printf(" Task end Timestamp in micro seconds [%ld] Delta [%lu]us\n\n\n", myThreadInfo[g].endTime3, (myThreadInfo[g].endTime3 - myThreadInfo[g].endTime2));
printf("\n\n");
}
printf("Analysis: \n");
printf(" for SCHED_OTHER, task 1 took %lu, task2 took %lu, and task 3 took %lu. (average = %lu)\n", (task1OTHER/otherNum), (task2OTHER/otherNum), (task3OTHER/otherNum), (task1OTHER/otherNum + task2OTHER/otherNum + task3OTHER/otherNum)/3 );
printf(" for SCHED_RR, task 1 took %lu, task2 took %lu, and task 3 took %lu. (average = %lu)\n", (task1RR/otherNum), (task2RR/otherNum), (task3RR/otherNum), (task1RR/otherNum + task2RR/otherNum + task3RR/otherNum)/3 );
printf(" for SCHED_FIFO, task 1 took %lu, task2 took %lu, and task 3 took %lu. (average = %lu)\n", (task1FIFO/otherNum), (task2FIFO/otherNum), (task3FIFO/otherNum) , (task1FIFO/otherNum + task2FIFO/otherNum + task3FIFO/otherNum)/3);
}
//the function that runs the threads
void *ThreadRunner(void *vargp){
pthread_mutex_lock(&mutex);
char date[30];
struct tm *ts;
size_t last;
time_t timestamp = time(NULL);
ts = localtime(&timestamp);
last = strftime(date, 30, "%c", ts);
threadNumber++;
ThreadInfo* currentThread;
currentThread = (ThreadInfo*)vargp;
//set the start time
struct timeval tv;
gettimeofday(&tv, NULL);
long milltime0 = (tv.tv_sec) * 1000 + (tv.tv_usec) / 1000;
currentThread->startTime = milltime0;
//set the start date
strcpy(currentThread->startDate, date);
if(pthread_setschedparam(pthread_self(), currentThread->policy,(const struct sched_param *) &(currentThread->param))){
perror("pthread_setschedparam failed");
pthread_exit(NULL);
}
if(pthread_getschedparam(pthread_self(), &currentThread->policy,(struct sched_param *) &currentThread->param)){
perror("pthread_getschedparam failed");
pthread_exit(NULL);
}
gettimeofday(&tv, NULL);
long startTime = (tv.tv_sec) * 1000 + (tv.tv_usec) / 1000;
currentThread->taskStartTime = startTime;
//delay task #1
delayTask();
//set the end time of task 1
gettimeofday(&tv, NULL);
long milltime1 = (tv.tv_sec) * 1000 + (tv.tv_usec) / 1000;
currentThread->endTime1 = milltime1;
//delay task #2
delayTask();
//set the end time of task 2
gettimeofday(&tv, NULL);
long milltime2 = (tv.tv_sec) * 1000 + (tv.tv_usec) / 1000;
currentThread->endTime2 = milltime2;
//delay task #3
delayTask();
//set the end time of task 3
gettimeofday(&tv, NULL);
long milltime3 = (tv.tv_sec) * 1000 + (tv.tv_usec) / 1000;
currentThread->endTime3 = milltime3;
//set the end date
timestamp = time(NULL);
ts = localtime(&timestamp);
last = strftime(date, 30, "%c", ts);
strcpy(currentThread->endDate, date);
//set the total run time of the thread
long runTime = milltime3 - milltime0;
currentThread->runTime = runTime;
//unlock mutex
pthread_mutex_unlock(&mutex);
pthread_cond_signal(&cond);
pthread_exit(NULL);
}
//used to delay each thread
void delayTask(void){
for(int i = 0; i < 5000000; i++){
printf("%d", i % 2);
}
}

In short: no guarantees how many threads will be run parallelly, but all of them will run concurrently.
No matter how many threads you start in an application controlled by a general-purpose operating system, they all will run concurrently. That is, each thread will be provided with some non-zero time to run, and no particular execution order of execution of threads' sections outside OS-defined synchronization primitives (waiting on mutexes, locks etc.) is guaranteed. The only limit on thread number may be imposed by OS'es policies.
How many of your threads will be chosen to run parallelly at any given moment of time is not defined. The number cannot obviously exceed number of logical processors visible to an OS (remember that the OS itself may be run inside a virtual machine, and there are hardware tricks like SMT), and your threads will be competing with other threads present in the same system. OSes do offer APIs to query which threads/processes are currently in running state and which are blocked or ready but not scheduled, otherwise writing programs like top would become problematic.
Explicitly setting priorities to threads may affect the operating system's choices and increase average number of your threads being executed parallelly. Note that it can either help or hurt if used without thinking. Still, it will never be strictly equal to four inside a multitasking OS as long as there are other processes. The only way to make sure 100% of CPU's hardware is dedicated to your threads 100% of the time is to run a barebone application, outside of any OS outside of any hypervisor (and even then there are peculiarities, see "Intel System Management Mode").
Inside a mostly idle general purpose OS, if your threads are compute-intensive, I would guess the average parallel utilization ratio would be 3.9 — 4.0. But a slightest perturbation — and all bets are off.

Related

Copying Memory Pages in SLES Linux is much faster then in Ubuntu

I have two virtual machines running on the exact same Hardware. One of them is SLES12SP5 (Kernel 4.12) (I have similar results with SLES15SP2, but got no at hand at the moment) and one of them is Ubuntu 20.04 (Kernel 5.4). I have a really simple C-Program which measures writing into allocated memory pages and copying them. This program is WAY faster on SLES then on Ubuntu and I can't figure out why. Unfortunately I can't activate the performance counters on the esx Host because of the cluster configuration, maybe that would help to find out whats happening here.
Okay, so here is the Programm:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef __linux__
#include <sys/time.h>
#elif _WIN32
#include <sys/timeb.h>
#endif
#ifdef __linux__
double get_time_insec() {
struct timeval tstruct;
gettimeofday(&tstruct, NULL);
long long milliseconds = tstruct.tv_sec*1000LL + tstruct.tv_usec/1000;
return (double)milliseconds/1000;
}
#elif _WIN32
double get_time_insec() {
struct timeb tstruct;
ftime(&tstruct);
return (double)tstruct.time + ((double)tstruct.millitm / (double)1000);
}
#endif
#define PAGESIZE 8 * 1024
int main(int argc, char *argv[]) {
int i = 0;
int x = 0;
double startval = 0;
int pagenum = 0;
int offset = 0;
int numOfPages = 0;
int numOfTimes = 0;
size_t allocBytes = 0;
if (argc != 3) {
printf("Usage %s numOfPages numOfTimes\n", argv[0]);
return -1;
}
numOfPages = atoi(argv[1]);
numOfTimes = atoi(argv[2]);
allocBytes = numOfPages*PAGESIZE*2;
// Allocate memory Pages
printf("Allocating %ld Bytes\n", allocBytes);
char *mymem = malloc(allocBytes);
if (mymem == NULL) {
printf("Allocation failed!\n");
return -1;
}
// Fill the first half of Pages with text
printf("Filling %d Pages %d times...", numOfPages, numOfTimes);
fflush(stdout);
startval = get_time_insec();
for (x = 0; x < numOfTimes; x++) {
for (pagenum = 0; pagenum < numOfPages; pagenum++) {
for (i = 0; i < PAGESIZE; i += 32) {
offset = (pagenum * PAGESIZE) + i;
memcpy(mymem + offset, &"ABCDEFGHIJKLMNOPQRSTUVWXYZ123456", 32);
}
}
}
printf("Time taken: %.6f sec\n", get_time_insec() - startval);
// And now copy them to the next half of pages
printf("Copying %d Pages %d times...", numOfPages/2, numOfTimes);
fflush(stdout);
startval = get_time_insec();
for (x = 0; x < numOfTimes; x++) {
for (pagenum = 0; pagenum < numOfPages; pagenum++) {
memcpy((mymem + (numOfPages/2) * PAGESIZE) + (pagenum * PAGESIZE), mymem + (pagenum * PAGESIZE), PAGESIZE);
}
}
printf("Time taken: %.6f sec\n", get_time_insec() - startval);
free(mymem);
return 0;
}
And this are the results for running it with "2000 200":
Ubuntu:
Filling 2000 Pages 200 times...Time taken: 0.606000 sec
Copying 1000 Pages 200 times...Time taken: 0.921000 sec
SLES:
Filling 2000 Pages 200 times...Time taken: 0.513000 sec
Copying 1000 Pages 200 times...Time taken: 0.479000 sec
I can't get my head around why SLES is twice as fast then Ubuntu.
I thought that writing into memory is a operation where the os doesn't do anything, apart from allocating the pages in the MMU beforehand.
So why exactly could it be that SLES is twice as fast?
Any kernel parameter? Any parameter in the virtual memory subsystem?
I would love to solve this which is bugging me for weeks now!
Thomas
you are absolutely right. After compiling the source directly on the platform and activating the gcc Optimizations the values are very close now:
Ubuntu:
Filling 2000 Pages 200 times...Time taken: 0.208000 sec
Copying 1000 Pages 200 times...Time taken: 0.484000 sec
SLES:
Filling 2000 Pages 200 times...Time taken: 0.205000 sec
Copying 1000 Pages 200 times...Time taken: 0.470000 sec
I think I can live with that. Of course Ubuntu is still slower then SLES every time, but it's such a minimal difference that it won't matter much!
Thanks a lot for your help !
Thomas

Pthreads program is slower than the serial program - Linux

Thank you for being generous with your time and helping me in this matter. I am trying to calculate the sum of the squared numbers using pthread. However, it seems that it is even slower than the serial implementation. Moreover, when I increase the number of threads the program becomes even slower. I made sure that each thread is running on a different core (I have 6 cores assigned to the virtual machine)
This is the serial program:
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
int main(int argc, char *argv[]) {
struct timeval start, end;
gettimeofday(&start, NULL); //start time of calculation
int n = atoi(argv[1]);
long int sum = 0;
for (int i = 1; i < n; i++){
sum += (i * i);
}
gettimeofday(&end, NULL); //end time of calculation
printf("The sum of squares in [1,%d): %ld | Time Taken: %ld mirco seconds \n",n,sum,
((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)));
return 0;
}
This the Pthreads program:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/time.h>
#include <time.h>
void *Sum(void *param);
// structure for thread arguments
struct thread_args {
int tid;
int a; //start
int b; //end
long int result; // partial results
};
int main(int argc, char *argv[])
{
struct timeval start, end;
gettimeofday(&start, NULL); //start time of calculation
int numthreads;
int number;
double totalSum=0;
if(argc < 3 ){
printf("Usage: ./sum_pthreads <numthreads> <number> ");
return 1;
}
numthreads = atoi(argv[1]);
number = atoi(argv[2]);;
pthread_t tid[numthreads];
struct thread_args targs[numthreads];
printf("I am Process | range: [%d,%d)\n",1,number);
printf("Running Threads...\n\n");
for(int i=0; i<numthreads;i++ ){
//Setting up the args
targs[i].tid = i;
targs[i].a = (number)*(targs[i].tid)/(numthreads);
targs[i].b = (number)*(targs[i].tid+1)/(numthreads);
if(i == numthreads-1 ){
targs[i].b = number;
}
pthread_create(&tid[i],NULL,Sum, &targs[i]);
}
for(int i=0; i< numthreads; i++){
pthread_join(tid[i],NULL);
}
printf("Threads Exited!\n");
printf("Process collecting information...\n");
for(int i=0; i<numthreads;i++ ){
totalSum += targs[i].result;
}
gettimeofday(&end, NULL); //end time of calculation
printf("Total Sum is: %.2f | Taken Time: %ld mirco seconds \n",totalSum,
((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)));
return 0;
}
void *Sum(void *param) {
int start = (*(( struct thread_args*) param)).a;
int end = (*((struct thread_args*) param)).b;
int id = (*((struct thread_args*)param)).tid;
long int sum =0;
printf("I am thread %d | range: [%d,%d)\n",id,start,end);
for (int i = start; i < end; i++){
sum += (i * i);
}
(*((struct thread_args*)param)).result = sum;
printf("I am thread %d | Sum: %ld\n\n", id ,(*((struct thread_args*)param)).result );
pthread_exit(0);
}
Results:
hamza#hamza:~/Desktop/lab4$ ./sum_serial 10
The sum of squares in [1,10): 285 | Time Taken: 7 mirco seconds
hamza#hamza:~/Desktop/lab4$ ./sol 2 10
I am Process | range: [1,10)
Running Threads...
I am thread 0 | range: [0,5)
I am thread 0 | Sum: 30
I am thread 1 | range: [5,10)
I am thread 1 | Sum: 255
Threads Exited!
Process collecting information...
Total Sum is: 285.00 | Taken Time: 670 mirco seconds
hamza#hamza:~/Desktop/lab4$ ./sol 3 10
I am Process | range: [1,10)
Running Threads...
I am thread 0 | range: [0,3)
I am thread 0 | Sum: 5
I am thread 1 | range: [3,6)
I am thread 1 | Sum: 50
I am thread 2 | range: [6,10)
I am thread 2 | Sum: 230
Threads Exited!
Process collecting information...
Total Sum is: 285.00 | Taken Time: 775 mirco seconds
hamza#hamza:~/Desktop/lab4$
The two programs do very different things. For example, the threaded program produces much more text output and creates a bunch of threads. You're comparing very short runs (less than a thousandth of a second) so the overhead of those additional things is significant.
You have to test with much longer runs such that the cost of producing additional output and creating and synchronizing threads is lost.
To use an analogy, one person can tighten three screws faster than three people can because of the overhead of getting a tool to each person, deciding who will tighten which screw, and so on. But if you have 500 screws to tighten, then three people will get it done faster.

Large overhead in CUDA kernel launch outside GPU execution

I am measuring the running time of kernels, as seen from a CPU thread, by measuring the interval from before launching a kernel to after a cudaDeviceSynchronize (using gettimeofday). I have a cudaDeviceSynchronize before I start recording the interval. I also instrument the kernels to record the timestamp on the GPU (using clock64) at the start of the kernel by thread(0,0,0) of each block from block(0,0,0) to block(occupancy-1,0,0) to an array of size equal to number of SMs. Every thread at the end of the kernel code, updates the timestamp to another array (of the same size) at the index equal to the index of the SM it runs on.
The intervals calculated from the two arrays are 60-70% of that measured from the CPU thread.
For example, on a K40, while gettimeofday gives an interval of 140ms, the avg of intervals calculated from GPU timestamps is only 100ms. I have experimented with many grid sizes (15 blocks to 6K blocks) but have found similar behavior so far.
__global__ void some_kernel(long long *d_start, long long *d_end){
if(threadIdx.x==0){
d_start[blockIdx.x] = clock64();
}
//some_kernel code
d_end[blockIdx.x] = clock64();
}
Does this seem possible to the experts?
Does this seem possible to the experts?
I suppose anything is possible for code you haven't shown. After all, you may just have a silly bug in any of your computation arithmetic. But if the question is "is it sensible that there should be 40ms of unaccounted-for time overhead on a kernel launch, for a kernel that takes ~140ms to execute?" I would say no.
I believe the method I outlined in the comments is reasonably accurate. Take the minimum clock64() timestamp from any thread in the grid (but see note below regarding SM restriction). Compare it to the maximum time stamp of any thread in the grid. The difference will be comparable to the reported execution time of gettimeofday() to within 2 percent, according to my testing.
Here is my test case:
$ cat t1040.cu
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define LS_MAX 2000000000U
#define MAX_SM 64
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
__device__ int result;
__device__ unsigned long long t_start[MAX_SM];
__device__ unsigned long long t_end[MAX_SM];
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__device__ __inline__ uint32_t __mysmid(){
uint32_t smid;
asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
return smid;}
__global__ void kernel(unsigned ls){
unsigned long long int ts = clock64();
unsigned my_sm = __mysmid();
atomicMin(t_start+my_sm, ts);
// junk code to waste time
int tv = ts&0x1F;
for (unsigned i = 0; i < ls; i++){
tv &= (ts+i);}
result = tv;
// end of junk code
ts = clock64();
atomicMax(t_end+my_sm, ts);
}
// optional command line parameter 1 = kernel duration, parameter 2 = number of blocks, parameter 3 = number of threads per block
int main(int argc, char *argv[]){
unsigned ls;
if (argc > 1) ls = atoi(argv[1]);
else ls = 1000000;
if (ls > LS_MAX) ls = LS_MAX;
int num_sms = 0;
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0);
cudaCheckErrors("cuda get attribute fail");
int gpu_clk = 0;
cudaDeviceGetAttribute(&gpu_clk, cudaDevAttrClockRate, 0);
if ((num_sms < 1) || (num_sms > MAX_SM)) {printf("invalid sm count: %d\n", num_sms); return 1;}
unsigned blks;
if (argc > 2) blks = atoi(argv[2]);
else blks = num_sms;
if ((blks < 1) || (blks > 0x3FFFFFFF)) {printf("invalid blocks: %d\n", blks); return 1;}
unsigned ntpb;
if (argc > 3) ntpb = atoi(argv[3]);
else ntpb = 256;
if ((ntpb < 1) || (ntpb > 1024)) {printf("invalid threads: %d\n", ntpb); return 1;}
kernel<<<1,1>>>(100); // warm up
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
unsigned long long *h_start, *h_end;
h_start = new unsigned long long[num_sms];
h_end = new unsigned long long[num_sms];
for (int i = 0; i < num_sms; i++){
h_start[i] = 0xFFFFFFFFFFFFFFFFULL;
h_end[i] = 0;}
cudaMemcpyToSymbol(t_start, h_start, num_sms*sizeof(unsigned long long));
cudaMemcpyToSymbol(t_end, h_end, num_sms*sizeof(unsigned long long));
unsigned long long htime = dtime_usec(0);
kernel<<<blks,ntpb>>>(ls);
cudaDeviceSynchronize();
htime = dtime_usec(htime);
cudaMemcpyFromSymbol(h_start, t_start, num_sms*sizeof(unsigned long long));
cudaMemcpyFromSymbol(h_end, t_end, num_sms*sizeof(unsigned long long));
cudaCheckErrors("some error");
printf("host elapsed time (ms): %f \n device sm clocks:\n start:", htime/1000.0f);
unsigned long long max_diff = 0;
for (int i = 0; i < num_sms; i++) {printf(" %12lu ", h_start[i]);}
printf("\n end: ");
for (int i = 0; i < num_sms; i++) {printf(" %12lu ", h_end[i]);}
for (int i = 0; i < num_sms; i++) if ((h_start[i] != 0xFFFFFFFFFFFFFFFFULL) && (h_end[i] != 0) && ((h_end[i]-h_start[i]) > max_diff)) max_diff=(h_end[i]-h_start[i]);
printf("\n max diff clks: %lu\nmax diff kernel time (ms): %f\n", max_diff, max_diff/(float)(gpu_clk));
return 0;
}
$ nvcc -o t1040 t1040.cu -arch=sm_35
$ ./t1040 1000000 1000 128
host elapsed time (ms): 2128.818115
device sm clocks:
start: 3484744 3484724
end: 2219687393 2228431323
max diff clks: 2224946599
max diff kernel time (ms): 2128.117432
$
Notes:
This code can only be run on a cc3.5 or higher GPU due to the use of 64-bit atomicMin and atomicMax.
I've run it on a variety of grid configurations, on both a GT640 (very low end cc3.5 device) and K40c (high end) and the timing results between host and device agree to within 2% (for reasonably long kernel execution times. If you pass 1 as the command line parameter, with very small grid sizes, the kernel execution time will be very short (nanoseconds) whereas the host will see about 10-20us. This is kernel launch overhead being measured. So the 2% number is for kernels that take much longer than 20us to execute).
It accepts 3 (optional) command line parameters, the first of which varies the amount of time the kernel will execute.
My timestamping is done on a per-SM basis, because the clock64() resource is indicated to be a per-SM resource. The sm clocks are not guaranteed to be synchronized between SMs.
You can modify the grid dimensions. The second optional command line parameter specifies the number of blocks to launch. The third optional command line parameter specifies the number of threads per block. The timing methodology I have shown here should not be dependent on number of blocks launched or number of threads per block. If you specify fewer blocks than SMs, the code should ignore "unused" SM data.

pthreads code not scaling up

I wrote the following very simple pthread code to test how it scales up. I am running the code on a machine with 8 logical processors and at no time do I create more than 8 threads (to avoid context switching).
With increasing number of threads, each thread has to do lesser amount of work. Also, it is evident from the code that there are no shared Data structures between the threads which might be a bottleneck. But still, my performance degrades as I increase the number of threads.
Can somebody tell me what am I doing wrong here.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int NUM_THREADS = 3;
unsigned long int COUNTER = 10000000000000;
unsigned long int LOOP_INDEX;
void* addNum(void *data)
{
unsigned long int sum = 0;
for(unsigned long int i = 0; i < LOOP_INDEX; i++) {
sum += 100;
}
return NULL;
}
int main(int argc, char** argv)
{
NUM_THREADS = atoi(argv[1]);
pthread_t *threads = (pthread_t*)malloc(sizeof(pthread_t) * NUM_THREADS);
int rc;
clock_t start, diff;
LOOP_INDEX = COUNTER/NUM_THREADS;
start = clock();
for (int t = 0; t < NUM_THREADS; t++) {
rc = pthread_create((threads + t), NULL, addNum, NULL);
if (rc) {
printf("ERROR; return code from pthread_create() is %d", rc);
exit(-1);
}
}
void *status;
for (int t = 0; t < NUM_THREADS; t++) {
rc = pthread_join(threads[t], &status);
}
diff = clock() - start;
int sec = diff / CLOCKS_PER_SEC;
printf("%d",sec);
}
Note: All the answers I found online said that the overhead of creating the threads is more than the work they are doing. To test it, I commented out everything in the "addNum()" function. But then, after doing that no matter how many threads I create, the time taken by the code is 0 seconds. So there is no overhead as such, I think.
clock() counts CPU time used, across all threads. So all that's telling you is that you're using a little bit more total CPU time, which is exactly what you would expect.
It's the total wall clock elapsed time which should be going down if your parallelisation is effective. Measure that with clock_gettime() specifying the CLOCK_MONOTONIC clock instead of clock().

Why this simple program on shared variable does not scale? (no lock)

I'm new to concurrent programming. I implement a CPU intensive work and measure how much speedup I could gain. However, I cannot get any speedup as I increase #threads.
The program does the following task:
There's a shared counter to count from 1 to 1000001.
Each thread does the following until the counter reaches 1000001:
increments the counter atomically, then
run a loop for 10000 times.
There're 1000001*10000 = 10^10 operations in total to be perform, so I should be able to get good speedup as I increment #threads.
Here's how I implemented it:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdatomic.h>
pthread_t workers[8];
atomic_int counter; // a shared counter
void *runner(void *param);
int main(int argc, char *argv[]) {
if(argc != 2) {
printf("Usage: ./thread thread_num\n");
return 1;
}
int NUM_THREADS = atoi(argv[1]);
pthread_attr_t attr;
counter = 1; // initialize shared counter
pthread_attr_init(&attr);
const clock_t begin_time = clock(); // begin timer
for(int i=0;i<NUM_THREADS;i++)
pthread_create(&workers[i], &attr, runner, NULL);
for(int i=0;i<NUM_THREADS;i++)
pthread_join(workers[i], NULL);
const clock_t end_time = clock(); // end timer
printf("Thread number = %d, execution time = %lf s\n", NUM_THREADS, (double)(end_time - begin_time)/CLOCKS_PER_SEC);
return 0;
}
void *runner(void *param) {
int temp = 0;
while(temp < 1000001) {
temp = atomic_fetch_add_explicit(&counter, 1, memory_order_relaxed);
for(int i=1;i<10000;i++)
temp%i; // do some CPU intensive work
}
pthread_exit(0);
}
However, as I run my program, I cannot get better performance than sequential execution!!
gcc-4.9 -std=c11 -pthread -o my_program my_program.c
for i in 1 2 3 4 5 6 7 8; do \
./my_program $i; \
done
Thread number = 1, execution time = 19.235998 s
Thread number = 2, execution time = 20.575237 s
Thread number = 3, execution time = 25.161116 s
Thread number = 4, execution time = 28.278671 s
Thread number = 5, execution time = 28.185605 s
Thread number = 6, execution time = 28.050380 s
Thread number = 7, execution time = 28.286925 s
Thread number = 8, execution time = 28.227132 s
I run the program on a 4-core machine.
Does anyone have suggestions to improve the program? Or any clue why I cannot get speedup?
The only work here that can be done in parallel is the loop:
for(int i=0;i<10000;i++)
temp%i; // do some CPU intensive work
gcc, even with the minimal optimisation level, will not emit any code for the temp%i; void expression (disassemble it and see), so this essentially becomes an empty loop, which will execute very fast - the execution time in the case with multiple threads running on different cores will be dominated by the cacheline containing your atomic variable ping-ponging between the different cores.
You need to make this loop actually do a significant amount of work before you'll see a speed-up.

Resources