CPU Affinity Masks (Putting Threads on different CPUs) - linux

I have 4 threads, and I am trying to set thread 1 to run on CPU 1, thread 2 on CPU 2, etc.
However, when I run my code below, the affinity masks are returning the correct values, but when I do a sched_getcpu() on the threads, they all return that they are running on CPU 4.
Anybody know what my problem here is?
Thanks in advance!
#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <sched.h>
#include <errno.h>
void *pthread_Message(char *message)
{
printf("%s is running on CPU %d\n", message, sched_getcpu());
}
int main()
{
pthread_t thread1, thread2, thread3, thread4;
pthread_t threadArray[4];
cpu_set_t cpu1, cpu2, cpu3, cpu4;
char *thread1Msg = "Thread 1";
char *thread2Msg = "Thread 2";
char *thread3Msg = "Thread 3";
char *thread4Msg = "Thread 4";
int thread1Create, thread2Create, thread3Create, thread4Create, i, temp;
CPU_ZERO(&cpu1);
CPU_SET(1, &cpu1);
temp = pthread_setaffinity_np(thread1, sizeof(cpu_set_t), &cpu1);
printf("Set returned by pthread_getaffinity_np() contained:\n");
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu1))
printf("CPU1: CPU %d\n", i);
CPU_ZERO(&cpu2);
CPU_SET(2, &cpu2);
temp = pthread_setaffinity_np(thread2, sizeof(cpu_set_t), &cpu2);
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu2))
printf("CPU2: CPU %d\n", i);
CPU_ZERO(&cpu3);
CPU_SET(3, &cpu3);
temp = pthread_setaffinity_np(thread3, sizeof(cpu_set_t), &cpu3);
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu3))
printf("CPU3: CPU %d\n", i);
CPU_ZERO(&cpu4);
CPU_SET(4, &cpu4);
temp = pthread_setaffinity_np(thread4, sizeof(cpu_set_t), &cpu4);
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu4))
printf("CPU4: CPU %d\n", i);
thread1Create = pthread_create(&thread1, NULL, (void *)pthread_Message, thread1Msg);
thread2Create = pthread_create(&thread2, NULL, (void *)pthread_Message, thread2Msg);
thread3Create = pthread_create(&thread3, NULL, (void *)pthread_Message, thread3Msg);
thread4Create = pthread_create(&thread4, NULL, (void *)pthread_Message, thread4Msg);
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
pthread_join(thread3, NULL);
pthread_join(thread4, NULL);
return 0;
}

You're trying to set the affinity of threads that you did not initialize.
Edit: Ok, let me give you some more info:
Don't mix thread handles (the thing you store in the pthread_t variable) and what they represent (a thread of execution that runs somewhere). What you were trying to do is to set a property of a thread before it starts, with an API that requires the thread object. As it happens pthread_create creates the object and starts the execution at the same time, so trying to use pthread_setaffinity_np is not the right way to go (this is useful if you want to change the affinity of a currently running thread).
But... pthread_create has an attribute parameter (you're passing NULL to it). This is storing the information of how you want the thread to be created.
Affinity is one of the attributes you can set through that parameter. See the man-page documentation for pthread_attr_init and pthread_attr_setaffinity_np for how exactly

Here's what you were looking for. I know it is a late answer, but this might help others.
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <sched.h>
#include <errno.h>
#include <unistd.h>
int getNumberOfCpus( void )
{
long nprocs = -1;
long nprocs_max = -1;
# ifdef _SC_NPROCESSORS_ONLN
nprocs = sysconf( _SC_NPROCESSORS_ONLN );
if ( nprocs < 1 )
{
//std::cout << "Could not determine number of CPUs on line. Error is " << strerror( errno ) << std::endl;
return 0;
}
nprocs_max = sysconf( _SC_NPROCESSORS_CONF );
if ( nprocs_max < 1 )
{
//std::cout << "Could not determine number of CPUs in host. Error is " << strerror( errno ) << std::endl;
return 0;
}
//std::cout << nprocs < " of " << nprocs_max << " online" << std::endl;
return nprocs;
#else
//std::cout << "Could not determine number of CPUs" << std::endl;
return 0;
#endif
}
void *pthread_Message( void *ptr )
{
sleep(10);
char *message;
message = (char *) ptr;
printf("%s \n", message);
cpu_set_t l_cpuSet;
int l_maxCpus;
int j;
unsigned long l_cpuBitMask;
CPU_ZERO( &l_cpuSet );
printf("get affinity %d\n",pthread_getaffinity_np(pthread_self() , sizeof( cpu_set_t ), &l_cpuSet ));
// printf("cpuset %d\n",l_cpuSet);
printf (" thread id %u\n", pthread_self());
if ( pthread_getaffinity_np(pthread_self() , sizeof( cpu_set_t ), &l_cpuSet ) == 0 )
for (int i = 0; i < 4; i++)
if (CPU_ISSET(i, &l_cpuSet))
printf("XXXCPU: CPU %d\n", i);
for (long i=0; i< 10000000000; ++i);
}
int main()
{
pthread_t thread1, thread2, thread3, thread4;
pthread_t threadArray[4];
cpu_set_t cpu1, cpu2, cpu3, cpu4;
const char *thread1Msg = "Thread 1";
const char *thread2Msg = "Thread 2";
const char *thread3Msg = "Thread 3";
const char *thread4Msg = "Thread 4";
int thread1Create, thread2Create, thread3Create, thread4Create, i, temp;
thread1Create = pthread_create(&thread1, NULL, &pthread_Message, (void*)thread1Msg);
sleep(1);
thread2Create = pthread_create(&thread2, NULL, &pthread_Message, (void*)thread2Msg);
sleep(1);
thread3Create = pthread_create(&thread3, NULL, &pthread_Message, (void*)thread3Msg);
sleep(1);
thread4Create = pthread_create(&thread4, NULL, &pthread_Message, (void*)thread4Msg);
CPU_ZERO(&cpu1);
CPU_SET(0, &cpu1);
temp = pthread_setaffinity_np(thread1, sizeof(cpu_set_t), &cpu1);
printf("setaffinity=%d\n", temp);
printf("Set returned by pthread_getaffinity_np() contained:\n");
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu1))
printf("CPU1: CPU %d\n", i);
CPU_ZERO(&cpu2);
CPU_SET(1, &cpu2);
temp = pthread_setaffinity_np(thread2, sizeof(cpu_set_t), &cpu2);
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu2))
printf("CPU2: CPU %d\n", i);
CPU_ZERO(&cpu3);
CPU_SET(2, &cpu3);
temp = pthread_setaffinity_np(thread3, sizeof(cpu_set_t), &cpu3);
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu3))
printf("CPU3: CPU %d\n", i);
CPU_ZERO(&cpu4);
CPU_SET(3, &cpu4);
temp = pthread_setaffinity_np(thread4, sizeof(cpu_set_t), &cpu4);
for (i = 0; i < CPU_SETSIZE; i++)
if (CPU_ISSET(i, &cpu4))
printf("CPU4: CPU %d\n", i);
// pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu1);
// pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu1);
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
pthread_join(thread3, NULL);
pthread_join(thread4, NULL);
return 0;
}

I think the easiest would be to give the CPU mask as a parameter to each thread and have the thread request given affinity itself, as in example here: pthread_setaffinity_np(3).

Related

Scheduling policy and priority using pthread does not make any difference

I am running a simple multi-threaded program using pthread. Considering using real-scheduler (SCHED_FIFO policy), lower priority threads won't be able to run until higher priority ones are finished. But, when I run two versions of this program (the only difference is priority 99->1) at the same time, they finish at almost the same time. I even changed the policy to SCHED_OTHER but still no difference.
# include <stdio.h>
# include <string.h>
# include <pthread.h>
# include <stdlib.h>
# include <unistd.h>
# include <math.h>
# define NUM_THREADS 128
pthread_t tid[NUM_THREADS];
int indexes[NUM_THREADS];
void* dummyThread(void *arg)
{
unsigned long i = 0;
pthread_t id = pthread_self();
float a, b = 5, c = 8;
printf("Thread %d started.\n", *(int*)arg + 1);
for(i = 0; i < 10000000; i++)
a = sin(b) + sqrt(b);
printf("Thread %d finished.\n", *(int*)arg + 1);
return NULL;
}
int main(void)
{
int i = 0;
pthread_attr_t attr;
struct sched_param schedParam;
struct timespec start, finish;
double elapsed;
pthread_attr_init(&attr);
pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
schedParam.sched_priority = 1;
pthread_attr_setschedparam(&attr, &schedParam);
pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = 0 ; i < NUM_THREADS; i++)
{
indexes[i] = i;
if (!pthread_create((void*)&tid[i], &attr, &dummyThread, &indexes[i]))
printf("Thread %d created successfully.\n", i + 1);
else
printf("Failed to create Thread %d.\n", i + 1);
}
for (i = 0 ; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
clock_gettime(CLOCK_MONOTONIC, &finish);
elapsed = (finish.tv_sec - start.tv_sec);
elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
printf("%lf\n", elapsed);
return 0;
}
Edit 1: Updated my code by adding pthread_attr_setschedparam and error checking. I don't get any errors when running it without sudo, and changing priority or scheduling policy still does not change the result.
Edit 2: I noticed that when I create threads with different priorities within the same process it works well. In the following code for threads with even index I assign priority 1 while for threads with an odd index I assign priority 99. It works well and odd threads finish first before even threads.
# include <stdio.h>
# include <string.h>
# include <pthread.h>
# include <stdlib.h>
# include <unistd.h>
# include <math.h>
# define NUM_THREADS 128
pthread_t tid[NUM_THREADS];
int indexes[NUM_THREADS];
void* dummyThread(void *arg)
{
unsigned long i = 0;
pthread_t id = pthread_self();
float a, b = 5, c = 8;
printf("Thread %d started.\n", *(int*)arg);
for(i = 0; i < 10000000; i++)
a = sin(b) + sqrt(b);
printf("Thread %d finished.\n", *(int*)arg);
return NULL;
}
int main(void)
{
int i = 0;
pthread_attr_t attr;
struct sched_param schedParam;
struct timespec start, finish;
double elapsed;
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = 0 ; i < NUM_THREADS; i++)
{
indexes[i] = i;
pthread_attr_init(&attr);
pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
schedParam.sched_priority = i % 2 == 0 ? 1 : 99;
pthread_attr_setschedparam(&attr, &schedParam);
pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
if (!pthread_create((void*)&tid[i], &attr, &dummyThread, &indexes[i]))
printf("Thread %d created successfully.\n", i);
else
printf("Failed to create Thread %d.\n", i);
}
for (i = 0 ; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
clock_gettime(CLOCK_MONOTONIC, &finish);
elapsed = (finish.tv_sec - start.tv_sec);
elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
printf("%lf\n", elapsed);
return 0;
}
Since threads from different processes are all sent to the same scheduler in the Kernel, I don't know why it does not work with different processes.
From man pthread_attr_setschedpolicy
In order for the policy setting made by
pthread_attr_setschedpolicy() to have effect when calling
pthread_create(3), the caller must use
pthread_attr_setinheritsched(3) to set the inherit-scheduler
attribute of the attributes object attr to
PTHREAD_EXPLICIT_SCHED.
You've neglected to do that, so your SCHED_FIFO didn't have any effect.
As soon as I add pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); call, pthread_create() starts failing with EPERM (since only root can create SCHED_FIFO threads).

Pass a returned value from a thread to another thread

I'm learning posix thread programming and I'm struggling in the following program. So I've 2 threads with the ids: tid1 and tid2. I use the thread with tid1 to enter the values I want to sum each other into the second thread with tid2 but the result of the sum is not correct. So I've tried to display in tid2 the values I entered in tid1 but I realized that these are not correct too (didn't correspond to the entered values). Can you pleas tell me where I'm doing the error? The way I pass the pointer returned from a thread with tid1 to thread with tid2, is that correct?
Note: The values returned from tid1 are already correct because I checked it as you can also see in the main (line 56).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
void *entryValues(void *p)
{
int nbVal = *(int*)p;
int i = 0;
int val = 0;
int *tab;
while(i < nbVal){
printf("value %d: ", i);
scanf("%d", &val);
tab[i] = val;
i++;
}
pthread_exit(tab);
}
void *sumValues(void *p)
{
int *values;
int sum = 0, i = 0;
memcpy(values, (int*)p, sizeof((int*)p));
printf("%d\n", values[0]);
for (i = 0; i < sizeof(values); i++)
{
//printf("value %d: %d\n", i, values[i]);
sum += values[i];
}
printf("The sum of the entered values is: %d\n", sum);
return NULL;
}
int main(int argc, char **argv)
{
pthread_t tid1;
pthread_t tid2;
int nbValues = 0;
int *arr;
printf("Please enter the number of values you want to sum each other: ");
scanf("%d", &nbValues);
if(pthread_create(&tid1, NULL, entryValues, &nbValues)!= 0)
{
fprintf(stderr, "creation of thread failed\n");
return -1;
}
pthread_join(tid1, (void**)&arr);
//printf("array: %d\n", arr[0]);
if(pthread_create(&tid2, NULL, sumValues, arr) != 0)
{
fprintf(stderr, "creation of thread failed\n");
return -1;
}
pthread_join(tid2, NULL);
/* Waiting for the termination of tid */
printf("End of main thread\n");
return EXIT_SUCCESS;
}

Is there an alternative way to sync

Is there an alternative way to be sure that the threads are ready to recieve the broadcast signal. I want to replace the Sleep(1) function in main.
#include <iostream>
#include <pthread.h>
#define NUM 4
using namespace std;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
pthread_t tid[NUM];
void *threads(void *arg){
int tid = (int)arg;
while(true){
pthread_mutex_lock(&mutex);
pthread_cond_wait(&cond,&mutex);
//do some work
cout<<"Thread: "<<tid<<endl;;
pthread_mutex_unlock(&mutex);
}
}
int main(){
for(int i=0;i<NUM;i++){
pthread_create(&tid[i],NULL,threads,(void*)i);
}
Sleep(1);
pthread_cond_broadcast(&cond);
Sleep(1);
pthread_cond_broadcast(&cond);
Sleep(1);
pthread_cond_broadcast(&cond);
return 0;
}
I tried memory barriers before pthread_cond_wait and i thought of using an counter, but nothing worked for me yet.
Condition variables are usually connected to a predicate. In the other threads, check if predicate is already fulfilled (check while holding the mutex protecting the predicate), if so, do not wait on the condition variable. In main, acquire mutex, change predicate while holding the mutex. Then release mutex and signal or broadcast on the condvar. Here is a similar question:
Synchronisation before pthread_cond_broadcast
Here is some example code:
#include <iostream>
#include <pthread.h>
#include <unistd.h>
#include <cassert>
#define NUM 4
#define SIZE 256
using std::cout;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
pthread_t tid[NUM];
int work_available;
void *threads(void *arg)
{
int tid = *((int*)arg);
while (1) {
pthread_mutex_lock(&mutex);
while (work_available == 0) {
// While loop since cond_wait can have spurious wakeups.
pthread_cond_wait(&cond, &mutex);
cout << "Worker " << tid << " woke up...\n";
cout << "Work available: " << work_available << '\n';
}
if (work_available == -1) {
cout << "Worker " << tid << " quitting\n";
pthread_mutex_unlock(&mutex); // Easy to forget, better to use C++11 RAII mutexes.
break;
}
assert(work_available > 0);
work_available--;
cout << "Worker " << tid << " took one item of work\n";
pthread_mutex_unlock(&mutex);
//do some work
sleep(2); // simulated work
pthread_mutex_lock(&mutex);
cout << "Worker " << tid << " done with one item of work.\n";
pthread_mutex_unlock(&mutex);
}
}
int main()
{
work_available = 0;
int args[NUM];
for (int i=0; i<NUM; i++) {
args[i] = i;
pthread_create(&tid[i], NULL, threads, (void*)&args[i]);
}
const int MAX_TIME = 10;
for (int i = 0; i < MAX_TIME; i++)
{
pthread_mutex_lock(&mutex);
work_available++;
cout << "Main thread, work available: " << work_available << '\n';
pthread_mutex_unlock(&mutex);
pthread_cond_broadcast(&cond);
sleep(1);
}
pthread_mutex_lock(&mutex);
cout << "Main signalling threads to quit\n";
work_available = -1;
pthread_mutex_unlock(&mutex);
pthread_cond_broadcast(&cond);
for (int i = 0; i < NUM; i++)
{
pthread_join(tid[i], NULL);
}
return 0;
}

Pthread Mutex Lock Linux

I created a simple program that shows the use of mutex lock. Here is the code...
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#define NUM_THREAD 2
pthread_mutex_t mutex;
int call_time;
void *makeCall(void *param)
{
call_time = 10;
pthread_mutex_lock(&mutex);
printf("Hi I'm thread #%u making a call\n", (unsigned int) pthread_self());
do{
printf("%d\n", call_time);
call_time--;
sleep(1);
}
while(call_time > 0);
pthread_mutex_unlock(&mutex);
return 0;
}
int main()
{
int i;
pthread_t thread[NUM_THREAD];
//init mutex
pthread_mutex_init(&mutex, NULL);
//create thread
for(i = 0; i < NUM_THREAD; i++)
pthread_create(&thread[i], NULL, makeCall, NULL);
//join thread
for(i = 0; i < NUM_THREAD; i++)
pthread_join(thread[i], NULL);
pthread_mutex_destroy(&mutex);
return 0;
}
The output is...
Hi I'm thread #3404384000 making a call
10
10
9
8
7
6
5
4
3
2
1
Hi I'm thread #3412776704 making a call
0
However, if I modify the function makeCall and transfer the variable call_time inside the mutex locks...
pthread_mutex_lock(&mutex);
call_time = 10;
/*
*
*
*
*/
pthread_mutex_unlock(&mutex);
The program now gives me the correct output where each of the thread counts down from 10 to 0. I don't understand the difference it makes transferring the variable call_time inside the locks. I hope someone can make me understand this behavior of my program. Cheers!
call_time is a shared variable that is accessed from 2 threads and so must be protected. What is happening is that the first thread starts, sets call_time to 10 and prints the first round.Then the second thread starts, resets call_time back to 10 and waits for the mutex. The first thread now comes back and keeps running with call_time reset to 10. After it is done and frees the mutex, the second thread can now run. call_time is now 0 since the first thread left it at 0, and so it just prints the last round.
Try this program, I think it will demonstrate threads better:
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#define NUM_THREAD 2
pthread_mutex_t mutex;
int call_time;
void *makeCall(void *param)
{
int temp;
do{
pthread_mutex_lock(&mutex);
printf("Hi I'm thread #%u making a call\n", (unsigned int) pthread_self());
printf("%d\n", call_time);
temp = call_time--;
pthread_mutex_unlock(&mutex);
//sleep(1); //try with and without this line and see the difference.
}
while(temp > 0);
return 0;
}
int main()
{
int i;
call_time = 100;
pthread_t thread[NUM_THREAD];
//init mutex
pthread_mutex_init(&mutex, NULL);
//create thread
for(i = 0; i < NUM_THREAD; i++)
pthread_create(&thread[i], NULL, makeCall, NULL);
//join thread
for(i = 0; i < NUM_THREAD; i++)
pthread_join(thread[i], NULL);
pthread_mutex_destroy(&mutex);
return 0;
}

mutex and its effect on execution time (and cpu usage)

I wrote a very simple test program to examine efficiency of pthread mutex. But I'm not able to analyse the results I get. (I can see 4 CPUs in Linux System Monitor and that's why I have at least 4 active threads, because I want to keep all of them busy.) The existence of mutex is not necessary in the code.
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
pthread_mutex_t lock1, lock2, lock3, lock4;
void do_sth() { /* just open a files, read it and copy to another file */
int i;
for (i = 0; i < 1; i++) {
FILE* fp = fopen("(2) Catching Fire.txt", "r");
if (fp == NULL) {
fprintf(stderr, "could not open file\n");
exit(1);
}
char filename[20];
sprintf(filename, "a%d", (int)pthread_self());
FILE* wfp = fopen(filename, "w");
if (wfp == NULL) {
fprintf(stderr, "could not open file for write\n");
exit(1);
}
int c;
while (c = fgetc(fp) != EOF) {
c++;
fputc(c, wfp);
}
close(fp);
close(wfp);
}
}
void* routine1(void* param) {
pthread_mutex_lock(&lock1);
do_sth();
pthread_mutex_unlock(&lock1);
}
void* routine2(void* param) {
pthread_mutex_lock(&lock2);
do_sth();
pthread_mutex_unlock(&lock2);
}
void* routine3(void* param) {
pthread_mutex_lock(&lock3);
do_sth();
pthread_mutex_unlock(&lock3);
}
void* routine4(void* param) {
pthread_mutex_lock(&lock4);
do_sth();
pthread_mutex_unlock(&lock4);
}
int main(int argc, char** argv) {
int i ;
pthread_mutex_init(&lock1, 0);
pthread_mutex_init(&lock2, 0);
pthread_mutex_init(&lock3, 0);
pthread_mutex_init(&lock4, 0);
pthread_t thread1[4];
pthread_t thread2[4];
pthread_t thread3[4];
pthread_t thread4[4];
for (i = 0; i < 4; i++)
pthread_create(&thread1[i], NULL, routine1, NULL);
for (i = 0; i < 4; i++)
pthread_create(&thread2[i], NULL, routine2, NULL);
for (i = 0; i < 4; i++)
pthread_create(&thread3[i], NULL, routine3, NULL);
for (i = 0; i < 4; i++)
pthread_create(&thread4[i], NULL, routine4, NULL);
for (i = 0; i < 4; i++)
pthread_join(thread1[i], NULL);
for (i = 0; i < 4; i++)
pthread_join(thread2[i], NULL);
for (i = 0; i < 4; i++)
pthread_join(thread3[i], NULL);
for (i = 0; i < 4; i++)
pthread_join(thread4[i], NULL);
printf("Hello, World!\n");
}
I execute this program in two ways, with and without all the mutex. and I measure time of execution (using time ./a.out) and average cpu load (using htop). here is the results:
first: when I use htop, I can see that loadavg of the system considerably increases when I do not use any mutex in the code. I have no idea why this happens. (is 4 active threads not enough to get the most out of 4 CPUs?)
second: It takes (a little) less time for the program to execute with all those mutex than without it. why does it happen? I mean, it should take some time to sleep and wake up a thread.
edit: I guess, when I use locks I put other threads to sleep and it eliminates a lot of context-switch (saving some time), could this be the reason?
You are using one lock per thread, so that's why when you use all the mutexes you don't see an increase in the execution time of the application: dosth() is not actually being protected from concurrent execution.
Since all the threads are working on the same file, all they should be accessing it using the same lock (otherwise you will have incorrect results: all the threads trying to modify the file at the same time).
Try running again the experiments using just one global lock.

Resources