Has anyone thought about this? I am looking for an OpenMP feature that lets you give threads more CPU "muscle" for the heavy lifting. From what I have found, OpenMP offers no way to set the thread priority used to execute a block of code. The only way I know of is to create the threads yourself (with _beginthreadex or CreateThread) and then set the priority on the resulting handles.
Here is some code for this issue; this is the manual approach:
int numberOfCores = /* e.g. run __cpuid (or GetSystemInfo) to obtain the number of cores on your CPU */;
HANDLE* hThreads = new HANDLE[ numberOfCores ];
// _beginthreadex returns a uintptr_t, so cast it to HANDLE
hThreads[0] = (HANDLE)_beginthreadex( NULL, 0, someThreadFunc, NULL, 0, NULL );
// SetThreadPriority expects a THREAD_PRIORITY_* value (HIGH_PRIORITY_CLASS is a process priority class)
SetThreadPriority( hThreads[0], THREAD_PRIORITY_HIGHEST );
WaitForMultipleObjects(...);
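For reference, here is a more complete sketch of the manual approach (someThreadFunc is a placeholder for the actual worker; GetSystemInfo is used here instead of __cpuid to obtain the core count):
#include <windows.h>
#include <process.h>
#include <stdlib.h>

unsigned __stdcall someThreadFunc(void* arg) {
    (void)arg;
    /* heavy work goes here */
    return 0;
}

int main(void) {
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    int numberOfCores = (int)si.dwNumberOfProcessors;

    HANDLE* hThreads = (HANDLE*)malloc(numberOfCores * sizeof(HANDLE));
    for (int i = 0; i < numberOfCores; ++i) {
        hThreads[i] = (HANDLE)_beginthreadex(NULL, 0, someThreadFunc, NULL, 0, NULL);
        SetThreadPriority(hThreads[i], THREAD_PRIORITY_HIGHEST);
    }

    /* note: WaitForMultipleObjects accepts at most MAXIMUM_WAIT_OBJECTS handles */
    WaitForMultipleObjects(numberOfCores, hThreads, TRUE, INFINITE);

    for (int i = 0; i < numberOfCores; ++i)
        CloseHandle(hThreads[i]);
    free(hThreads);
    return 0;
}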
Here is what I would like to be able to write instead:
#pragma omp parallel
{
#pragma omp for ( threadpriority:HIGH_PRIORITY_CLASS )
for( ;; ) { ... }
}
Or
#pragma omp parallel
{
// A function like this would be greatly appreciated.
_omp_set_priority( HIGH_PRIORITY_CLASS );
#pragma omp for
for( ;; ) { ... }
}
I don't know whether there is a way to set thread priority with OpenMP; if there is, please let us know.
You can do SetThreadPriority in the body of the loop without requiring special support from OpenMP:
for (...)
{
DWORD priority=GetThreadPriority(...);
SetThreadPriority(...);
// stuff
SetThreadPriority(priority);
}
Simple test reveals unexpected results:
I have run a simple test in Visual Studio 2010 (Windows 7):
#include <stdio.h>
#include <omp.h>
#include <windows.h>
int main()
{
int tid, nthreads;
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
#pragma omp parallel private(tid) num_threads(4)
{
tid = omp_get_thread_num();
printf("Thread %d: Priority = %d\n", tid, GetThreadPriority(GetCurrentThread()));
}
printf("\n");
#pragma omp parallel private(tid) shared(nthreads) num_threads(4)
{
tid = omp_get_thread_num();
#pragma omp master
{
printf("Master Thread %d: Priority = %d\n", tid, GetThreadPriority(GetCurrentThread()));
}
}
#pragma omp parallel num_threads(4)
{
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
}
printf("\n");
#pragma omp parallel private(tid) num_threads(4)
{
tid = omp_get_thread_num();
printf("Thread %d: Priority = %d\n", tid, GetThreadPriority(GetCurrentThread()));
}
return 0;
}
The output is:
Thread 1: Priority = 0
Thread 0: Priority = 1
Thread 2: Priority = 0
Thread 3: Priority = 0
Master Thread 0: Priority = 1
Thread 0: Priority = 1
Thread 1: Priority = 1
Thread 3: Priority = 1
Thread 2: Priority = 1
Explanation:
The OpenMP master thread executes with the thread priority of the main thread.
The other OpenMP threads are left at normal priority.
When the priority of an OpenMP thread is set manually, the thread keeps that priority in subsequent parallel regions.
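So, based on this behaviour, if all worker threads should run the hot loop at elevated priority, each thread can simply raise its own priority at the top of the parallel region (a sketch; N and the loop body are placeholders):
#pragma omp parallel
{
    // each team member raises its own priority once; as shown above,
    // the threads keep this priority in later parallel regions too
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);

    #pragma omp for
    for (int i = 0; i < N; i++) {
        /* work */
    }
}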
Related
In my experience, when I update a variable in one task, the variable is not updated in the other tasks, even if the task that updated the variable is done executing. For example, given the code,
int nThreads = atoi(argv[1]);
omp_set_num_threads(nThreads);
int currentInt = 0;
int numEdges = 1000000;
#pragma omp parallel shared(currentInt)
{
#pragma omp single
{
#pragma omp task shared(currentInt)
{
printf("I am doing kruskals: Thread %d\n", omp_get_thread_num());
while(currentInt < numEdges)
{
currentInt++;
}
printf("Kruskals Done! %d\n", currentInt);
#pragma omp shared(currentInt)
{
for(int i = 0; i < 10000000; i++){
}
printf("Helper: Current Int %d Thread %d \n", currentInt, omp_get_thread_num());
}
}
#pragma omp taskwait
}
}
It will always print currentInt 0, even if the first task finishes before the second. I need this because I am trying to parallelize an algorithm where I have a sequential task going through a large array and many parallel tasks executing simultaneously on parts of that array; once the sequential task reaches the portion of the array that a parallel task is working on, the parallel task can stop itself because it is no longer needed. The parallel and sequential tasks share no dependencies, so that is not a problem.
Any help will be appreciated.
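For illustration, here is a minimal sketch of the pattern described above (not taken from the thread), assuming the shared counter is accessed with atomic reads and writes so that updates become visible across tasks:
#include <stdio.h>
#include <omp.h>

int main(void)
{
    int numEdges = 1000000;
    int currentInt = 0;                       /* shared progress counter */

    #pragma omp parallel shared(currentInt, numEdges)
    #pragma omp single
    {
        #pragma omp task shared(currentInt)   /* the sequential sweep */
        {
            for (int e = 0; e < numEdges; e++) {
                #pragma omp atomic write
                currentInt = e + 1;           /* publish progress */
            }
        }

        #pragma omp task shared(currentInt)   /* a helper that checks progress */
        {
            int seen;
            #pragma omp atomic read
            seen = currentInt;
            printf("Helper sees progress %d on thread %d\n",
                   seen, omp_get_thread_num());
        }

        #pragma omp taskwait
    }
    return 0;
}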
I am trying to synchronize threads using pthread_barrier_wait(), but the threads do not stay synchronized when the MAIN for LOOP runs more than one iteration. Here (s<2) is written, so there are 2 iterations; I have included the output below, which is not synchronized.
Thank you.
Here is my code:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <iostream>
#define ARRAYSIZE 6
#define NUMTHREADS 3
using namespace std;
// Barrier variable
pthread_barrier_t barr;
unsigned int count = NUMTHREADS;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
struct ThreadData {
int start, stop, tid;
};
void* squarer(void* td) {
struct ThreadData* data=(struct ThreadData*) td;
int start=data->start;
int stop=data->stop;
int tid = data ->tid;
int i,s;
//MAIN LOOP
for (s=0; s<2; s++){
for (i=start; i<stop; i++) {
printf("thread no. %d is writing: \n", tid);
}
// Synchronization point
int rc = pthread_barrier_wait(&barr);
if(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD){
printf("Could not wait on barrier\n");
exit(-1);
}
for (i=start; i<stop; i++) {
printf("thread no. %d is executing: \n", tid);
}
}
return NULL;
}
int main(void) {
pthread_t thread[NUMTHREADS];
struct ThreadData data[NUMTHREADS];
// Barrier initialization
if(pthread_barrier_init(&barr, NULL, NUMTHREADS)){
printf("Could not create a barrier\n");
return -1;
}
int i;
int tasksPerThread=(ARRAYSIZE+NUMTHREADS-1)/NUMTHREADS;
/* Divide work for threads, prepare parameters */
for (i=0; i<NUMTHREADS; i++) {
data[i].start=i*tasksPerThread;
data[i].stop=(i+1)*tasksPerThread;
data[i].tid = i;
}
/* the last thread must not go past the end of the array */
data[NUMTHREADS-1].stop=ARRAYSIZE;
/* Launch Threads */
for (i=0; i<NUMTHREADS; i++) {
pthread_create(&thread[i], NULL, squarer, &data[i]);
}
/* Wait for Threads to Finish */
for (i=0; i<NUMTHREADS; i++) {
pthread_join(thread[i], NULL);
}
return 0;
}
Here is the output:
thread no. 2 is writing:
thread no. 2 is writing:
thread no. 0 is writing:
thread no. 0 is writing:
thread no. 1 is writing:
thread no. 1 is writing:
thread no. 1 is executing:
thread no. 1 is executing:
thread no. 1 is writing:
thread no. 1 is writing:
thread no. 0 is executing:
thread no. 0 is executing:
thread no. 0 is writing:
thread no. 0 is writing:
thread no. 2 is executing:
thread no. 2 is executing:
thread no. 2 is writing:
thread no. 2 is writing:
thread no. 2 is executing:
thread no. 2 is executing:
thread no. 0 is executing:
thread no. 0 is executing:
thread no. 1 is executing:
thread no. 1 is executing:
That output is synchronised fine given the code you have written. No thread starts the 'executing' step until all threads have finished the 'writing' step.
If, in addition, you want no thread to start the next 'writing' step until all threads have finished the previous 'executing' step, you need an additional call to pthread_barrier_wait() after the 'executing' step.
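In other words, the MAIN loop would contain two barriers, something like this (a sketch of the change, with error checking omitted):
for (s = 0; s < 2; s++) {
    for (i = start; i < stop; i++)
        printf("thread no. %d is writing: \n", tid);

    /* all threads finish 'writing' before any starts 'executing' */
    pthread_barrier_wait(&barr);

    for (i = start; i < stop; i++)
        printf("thread no. %d is executing: \n", tid);

    /* all threads finish 'executing' before any starts the next 'writing' */
    pthread_barrier_wait(&barr);
}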
I want to map tasks to three threads as follows:
1. Each of taskA, taskB, and taskC must be executed by separate threads.
2. taskA has subtasks task(1), task(2), and task(3).
3. taskB has subtasks task(11), task(12), and task(13).
4. taskC has subtasks task(21), task(22), and task(23).
5. If any one of taskA, taskB, and taskC finishes and there is at least one unstarted subtask of another task, the thread associated with the finished task should steal the unstarted subtask.
I was not able to achieve this setup. All I was able to do is the following MWE, in which the threads do not obey rules 2, 3, and 4.
Here is my MWE:
#include <stdio.h>
#include <omp.h>

double task(int taskid) {
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
printf("%d/%d: taskid=%d\n", tid, nthreads, taskid);
int i;
double t = 1.1;
for(i = 0; i < 10000000*taskid; i++) {
t *= t/i;
}
return t;
}
double taskA() {
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
printf("%s %d/%d\n", __FUNCTION__, tid, nthreads);
double a, b, c;
//#pragma omp parallel
//#pragma omp single
{
#pragma omp task untied shared(a)
a=task(1);
#pragma omp task untied shared(b)
b=task(2);
#pragma omp task untied shared(c)
c=task(3);
}
return a+b+c;
}
double taskB() {
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
printf("%s %d/%d\n", __FUNCTION__, tid, nthreads);
double a, b, c;
//#pragma omp parallel
//#pragma omp single
{
#pragma omp task untied shared(a)
a=task(11);
#pragma omp task untied shared(b)
b=task(12);
#pragma omp task untied shared(c)
c=task(13);
}
return a+b+c;
}
double taskC() {
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
printf("%s %d/%d\n", __FUNCTION__, tid, nthreads);
double a, b, c;
//#pragma omp parallel
//#pragma omp single
{
#pragma omp task untied shared(a)
a=task(21);
#pragma omp task untied shared(b)
b=task(22);
#pragma omp task untied shared(c)
c=task(23);
}
return a+b+c;
}
int main() {
omp_set_num_threads(3);
double a,b,c;
#pragma omp parallel
#pragma omp single
{
#pragma omp task untied
a=taskA();
#pragma omp task untied
b=taskB();
#pragma omp task untied
c=taskC();
}
#pragma omp taskwait
printf("%g %g %g\n", a, b, c);
return 0;
}
Compiled as:
icpc -Wall -fopenmp -O2 -o nestedomp nestedomp.c
Output:
taskC 1/3
1/3: taskid=21
taskA 2/3
taskB 0/3
0/3: taskid=23
2/3: taskid=22
1/3: taskid=1
1/3: taskid=2
2/3: taskid=3
0/3: taskid=11
1/3: taskid=12
2/3: taskid=13
Here, thread 0 starts processing task 23; however, it should start processing task 1 or 11.
You could use thread id to structure work distribution:
#pragma omp parallel num_threads(3)
{
    int tid = omp_get_thread_num();
    if (tid == 0) {
        // Task 0
    } else if (tid == 1) {
        // Task 1
    } else {
        // Task 2
    }
}
You can set the number of threads according to your needs and introduce nesting at the task level.
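As a sketch of that combination (not a complete solution): each thread queues the subtasks of "its" top-level task as OpenMP tasks, so a thread that finishes early can execute the leftover tasks of the others at the implicit barrier closing the parallel region. The results array and the ids table are just illustrative names; task() is the function from the question:
#include <stdio.h>
#include <omp.h>

double task(int taskid);   /* as defined in the question above */

int main(void) {
    double results[3][3] = {{0}};   /* results[t][s]: subtask s of top-level task t */
    int ids[3][3] = { {1, 2, 3}, {11, 12, 13}, {21, 22, 23} };

    #pragma omp parallel num_threads(3)
    {
        int tid = omp_get_thread_num();
        /* each thread queues the subtasks of its own top-level task */
        for (int s = 0; s < 3; s++) {
            #pragma omp task firstprivate(tid, s) shared(results, ids)
            results[tid][s] = task(ids[tid][s]);
        }
        /* at the implicit barrier, a thread that is done with its own
           subtasks may execute tasks queued by the other threads */
    }

    for (int t = 0; t < 3; t++)
        printf("%g ", results[t][0] + results[t][1] + results[t][2]);
    printf("\n");
    return 0;
}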
I have an OpenMP parallelized program that looks like that:
[...]
#pragma omp parallel
{
//initialize threads
#pragma omp for
for(...)
{
//Work is done here
}
}
Now I'm adding MPI support. What I will need is a thread that handles the communication; in my case, it calls GatherAll all the time and fills/empties a linked list for receiving/sending data from the other processes. That thread should send/receive until a flag is set. There is no MPI stuff in the example yet; my question is about how to implement that routine in OpenMP.
How do I implement such a thread? For example, I tried to introduce a single directive here:
[...]
int kill = 0;
#pragma omp parallel shared(kill)
{
//initialize threads
#pragma omp single nowait
{
while(!kill)
send_receive();
}
#pragma omp for
for(...)
{
//Work is done here
}
kill = 1;
}
but in this case the program gets stuck because the implicit barrier after the for-loop waits for the thread in the while-loop above.
Thank you, rugermini.
You could try adding a nowait clause to your single construct:
EDIT: responding to the first comment
If you enable nested parallelism for OpenMP, you might be able to achieve what you want by making two levels of parallelism. In the top level, you have two concurrent parallel sections, one for the MPI communications, the other for local computation. This last section can itself be parallelized, which gives you a second level of parallelisation. Only threads executing this level will be affected by barriers in it.
#include <iostream>
#include <omp.h>
int main()
{
int kill = 0;
omp_set_nested(1);   // nested parallelism must be enabled for the inner parallel region
#pragma omp parallel sections
{
#pragma omp section
{
while (kill == 0){
/* manage MPI communications */
}
}
#pragma omp section
{
#pragma omp parallel
#pragma omp for
for (int i = 0; i < 10000 ; ++i) {
/* your workload */
}
kill = 1;
}
}
}
However, you must be aware that your code is going to break if you don't have at least two threads, which means you're breaking the assumption that the sequential and parallelized versions of the code should do the same thing.
It would be much cleaner to wrap your OpenMP kernel inside a more global MPI communication scheme (potentially using asynchronous communications to overlap communications with computations).
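For instance, a rough sketch of that structure (the buffers, the peer rank, and compute_with_openmp are placeholders for whatever your application actually does):
#include <mpi.h>

void compute_with_openmp(double *interior, int n);   /* contains the #pragma omp parallel for */

void step(double *sendbuf, double *recvbuf, int count, int peer,
          double *interior, int n)
{
    MPI_Request reqs[2];

    /* post the asynchronous communications first ... */
    MPI_Irecv(recvbuf, count, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Isend(sendbuf, count, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[1]);

    /* ... overlap them with the OpenMP-parallel work on local data ... */
    compute_with_openmp(interior, n);

    /* ... and complete the exchange before using the received data */
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
}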
You have to be careful, because you can't just have your MPI calling thread "skip" the omp for loop; all threads in the thread team have to go through the for loop.
There's a couple of ways you could do this: with nested parallelism and tasks, you could launch one task to do the message passing and another to call a work routine which has an omp parallel for in it:
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
void work(int rank) {
const int n=14;
#pragma omp parallel for
for (int i=0; i<n; i++) {
int tid = omp_get_thread_num();
printf("%d:%d working on item %d\n", rank, tid, i);
}
}
void sendrecv(int rank, int sneighbour, int rneighbour, int *data) {
const int tag=1;
MPI_Sendrecv(&rank, 1, MPI_INT, sneighbour, tag,
data, 1, MPI_INT, rneighbour, tag,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
int main(int argc, char **argv) {
int rank, size;
int sneighbour;
int rneighbour;
int data;
int got;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &got);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
omp_set_nested(1);
sneighbour = rank + 1;
if (sneighbour >= size) sneighbour = 0;
rneighbour = rank - 1;
if (rneighbour <0 ) rneighbour = size-1;
#pragma omp parallel
{
#pragma omp single
{
#pragma omp task
{
sendrecv(rank, sneighbour, rneighbour, &data);
printf("Got data from %d\n", data);
}
#pragma omp task
work(rank);
}
}
MPI_Finalize();
return 0;
}
Alternately, you could make your omp for loop schedule(dynamic) so that the other threads can pick up some of the slack while the master thread is sending, and the master thread can pick up some work when it's done:
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
void sendrecv(int rank, int sneighbour, int rneighbour, int *data) {
const int tag=1;
MPI_Sendrecv(&rank, 1, MPI_INT, sneighbour, tag,
data, 1, MPI_INT, rneighbour, tag,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
int main(int argc, char **argv) {
int rank, size;
int sneighbour;
int rneighbour;
int data;
int got;
const int n=14;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &got);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
omp_set_nested(1);
sneighbour = rank + 1;
if (sneighbour >= size) sneighbour = 0;
rneighbour = rank - 1;
if (rneighbour <0 ) rneighbour = size-1;
#pragma omp parallel
{
#pragma omp master
{
sendrecv(rank, sneighbour, rneighbour, &data);
printf("Got data from %d\n", data);
}
#pragma omp for schedule(dynamic)
for (int i=0; i<n; i++) {
int tid = omp_get_thread_num();
printf("%d:%d working on item %d\n", rank, tid, i);
}
}
MPI_Finalize();
return 0;
}
Hmmm. If you are indeed adding MPI 'support' to your program, then you ought to be using mpi_allgather, as mpi_gatherall does not exist. Note that mpi_allgather is a collective operation, that is, all processes in the communicator call it. You can't have one process gathering data while the other processes do whatever it is they do. What you could do is use MPI single-sided communications to implement your idea; this will be a little tricky, but no more than that, if one process only reads the memory of other processes.
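A minimal sketch of the single-sided idea (each process exposes one int in an RMA window and a neighbour reads it without the owner's participation; the exact synchronization you need will depend on your algorithm):
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, local, remote = -1;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    local = rank * 100;   /* the value this process exposes */

    /* expose one int of this process's memory to the others */
    MPI_Win_create(&local, sizeof(int), sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    /* read the exposed int of the next process without its participation */
    int peer = (rank + 1) % size;
    MPI_Win_lock(MPI_LOCK_SHARED, peer, 0, win);
    MPI_Get(&remote, 1, MPI_INT, peer, 0, 1, MPI_INT, win);
    MPI_Win_unlock(peer, win);

    printf("Rank %d read %d from rank %d\n", rank, remote, peer);

    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}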
I'm puzzled by your use of the term 'thread' wrt MPI. I fear that you are confusing OpenMP and MPI, one of whose variants is called OpenMPI. Despite this name it is as different from OpenMP as chalk from cheese. MPI programs are written in terms of processes, not threads. The typical OpenMP implementation does indeed use threads, though the details are generally well-hidden from the programmer.
I'm seriously impressed that you are trying, or seem to be trying, to use MPI 'inside' your OpenMP code. This is exactly the opposite of work I do, and see others do on some seriously large computers. The standard mode for such 'hybrid' parallelisation is to write MPI programs which call OpenMP code. Many of today's very large computers comprise collections of what are, in effect, multicore boxes. A typical approach to programming one of these is to have one MPI process running on each box, and for each of those processes to use one OpenMP thread for each core in the box.
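The usual skeleton of such a hybrid code looks something like this (a generic sketch: one MPI process per box, an OpenMP thread per core inside it):
#include <mpi.h>
#include <omp.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int provided, rank;

    /* one MPI process per node; MPI_THREAD_FUNNELED is enough when
       only the master thread makes MPI calls */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* an OpenMP thread per core inside each process */
    #pragma omp parallel
    {
        printf("rank %d, thread %d of %d\n",
               rank, omp_get_thread_num(), omp_get_num_threads());
    }

    /* MPI communication happens outside (or funneled through) the OpenMP regions */
    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}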
I tried to find a solution in order to keep the number of working threads constant under linux in C using pthreads, but I seem to be unable to fully understand what's wrong with the following code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#define MAX_JOBS 50
#define MAX_THREADS 5
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
int jobs = MAX_JOBS;
int worker = 0;
int counter = 0;
void *functionC() {
pthread_mutex_lock(&mutex1);
worker++;
counter++;
printf("Counter value: %d\n",counter);
pthread_mutex_unlock(&mutex1);
// Do something...
sleep(4);
pthread_mutex_lock(&mutex1);
jobs--;
worker--;
printf(" >>> Job done: %d\n",jobs);
pthread_mutex_unlock(&mutex1);
return NULL;
}
int main(int argc, char *argv[]) {
int i=0, j=0;
pthread_t thread[MAX_JOBS];
// Create threads if the number of working threads doesn't exceed MAX_THREADS
while (1) {
if (worker > MAX_THREADS) {
printf(" +++ In queue: %d\n", worker);
sleep(1);
} else {
//printf(" +++ Creating new thread: %d\n", worker);
pthread_create(&thread[i], NULL, &functionC, NULL);
//printf("%d",worker);
i++;
}
if (i == MAX_JOBS) break;
}
// Wait all threads to finish
for (j=0;j<MAX_JOBS;j++) {
pthread_join(thread[j], NULL);
}
return(0);
}
A while (1) loop keeps creating threads if the number of working threads is under a certain threshold. A mutex is supposed to lock the critical sections every time the global counter of the working threads is incremented (thread creation) and decremented (job is done). I thought it could work fine and for the most part it does, but weird things happen...
For instance, if I comment out the printf (as in this snippet: //printf(" +++ Creating new thread: %d\n", worker);), the while (1) loop seems to create a seemingly random number of threads at once (18-25 in my experience; functionC prints "Counter value:" from 1 up to 18-25) instead of respecting the if condition inside the loop. If I include the printf, the loop behaves "almost" correctly. This seems to hint that there is a missing mutex/condition that I should add to the loop in main() to effectively block thread creation once MAX_THREADS is reached, but after changing this code a lot over the past few days I'm a bit lost now. What am I missing?
Please let me know what I should change in order to keep the number of threads constant; it doesn't seem that I'm too far from the solution... Hopefully... :-)
Thanks in advance!
Your problem is that worker is not incremented until the new thread actually starts and gets to run - in the meantime, the main thread loops around, checks worker, finds that it hasn't changed, and starts another thread. It can repeat this many times, creating far too many threads.
So, you need to increment worker in the main thread, when you've decided to create a new thread.
You have another problem - you should be using condition variables to let the main thread sleep until it should start another thread, not using a busy-wait loop with a sleep(1); in it. The complete fixed code would look like:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#define MAX_JOBS 50
#define MAX_THREADS 5
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond1 = PTHREAD_COND_INITIALIZER;
int jobs = MAX_JOBS;
int workers = 0;
int counter = 0;
void *functionC() {
pthread_mutex_lock(&mutex1);
counter++;
printf("Counter value: %d\n",counter);
pthread_mutex_unlock(&mutex1);
// Do something...
sleep(4);
pthread_mutex_lock(&mutex1);
jobs--;
printf(" >>> Job done: %d\n",jobs);
/* Worker is about to exit, so decrement count and wakeup main thread */
workers--;
pthread_cond_signal(&cond1);
pthread_mutex_unlock(&mutex1);
return NULL;
}
int main(int argc, char *argv[]) {
int i=0, j=0;
pthread_t thread[MAX_JOBS];
// Create threads if the number of working threads doesn't exceed MAX_THREADS
while (i < MAX_JOBS) {
/* Block on condition variable until there are insufficient workers running */
pthread_mutex_lock(&mutex1);
while (workers >= MAX_THREADS)
pthread_cond_wait(&cond1, &mutex1);
/* Another worker will be running shortly */
workers++;
pthread_mutex_unlock(&mutex1);
pthread_create(&thread[i], NULL, &functionC, NULL);
i++;
}
// Wait all threads to finish
for (j=0;j<MAX_JOBS;j++) {
pthread_join(thread[j], NULL);
}
return(0);
}
Note that even though this works, it isn't ideal - it's best to create the number of threads you want up-front, and have them loop around, waiting for work. This is because creating and destroying threads has significant overhead, and because it often simplifies resource management. A version of your code rewritten to work like this would look like:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#define MAX_JOBS 50
#define MAX_THREADS 5
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
int jobs = MAX_JOBS;
int counter = 0;
void *functionC()
{
int running_job;
pthread_mutex_lock(&mutex1);
counter++;
printf("Counter value: %d\n",counter);
while (jobs > 0) {
running_job = jobs--;
pthread_mutex_unlock(&mutex1);
printf(" >>> Job starting: %d\n", running_job);
// Do something...
sleep(4);
printf(" >>> Job done: %d\n", running_job);
pthread_mutex_lock(&mutex1);
}
pthread_mutex_unlock(&mutex1);
return NULL;
}
int main(int argc, char *argv[]) {
int i;
pthread_t thread[MAX_THREADS];
for (i = 0; i < MAX_THREADS; i++)
pthread_create(&thread[i], NULL, &functionC, NULL);
// Wait all threads to finish
for (i = 0; i < MAX_THREADS; i++)
pthread_join(thread[i], NULL);
return 0;
}