Multithreaded Monte Carlo - montecarlo

I am trying to parallelize a loop in my Monte Carlo program, which aims to simulate the magnetic properties of manganites. This loop calculates the dipolar magnetic interaction in the lattice. I am new to multithreading and this is my first test. It doesn't work. The code is below; please tell me where the mistake is. Thanks in advance.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <pthread.h>
#include <unistd.h>
.
.
.
#define CORES 4 // number of threads
// Guards the accumulation into dU at the end of Dipolar_Interaction.
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
// NOTE(review): everything below has file scope, so all threads of the
// process see one single copy of each of these variables.
int rc,n,N,i,j,p,l,NTOT = ILIGNE*ICOLONE*ICOUCHE,Nc;
double CTEC; // current thread contribution to energy change
double V,G,E,F,dU;
// Stream that Dipolar_Interaction reads pair records from; it starts as NULL,
// so it is presumably opened in the part of the program not shown here.
FILE *voisins = NULL;
float r[3],d,w[3];
void *Dipolar_Interaction(){
float spin[3*NTOT];
n = 1;
while(n < Nc/CORES){
printf("%f\n",(float)n/(float)Nc);
fscanf(voisins,"%d%d%f%f%f%f",&i,&j,&r[0],&r[1],&r[2],&d);
V = 0.0;E = 0.0;F = 0.0;
for(p = 0;p < 3;p++){
V += (D/pow(d,3.0))*(spin[3*i-3+p]-w[p])*spin[3*j-3+p];
E += (spin[3*i-3+p]-w[p])*r[p];
F += spin[3*j-3+p]*r[p];
}
G = -3*(D/pow(d,5.0))*E*F;
CTEC += (V+G);
n++;
}
rc = pthread_mutex_lock(&mutex);
dU += CTEC;
rc = pthread_mutex_unlock(&mutex);
pthread_exit(NULL);
}
/*
 * Program entry point (the "." lines stand for code omitted from the post).
 * Starts CORES threads running Dipolar_Interaction and waits for all of
 * them before continuing with the rest of the program.
 */
int main(void){
    int th;
.
.
.
    pthread_t thread[CORES];
.
.
.
    /* thread[] has CORES elements, so the valid indices are 0 .. CORES-1. */
    for( th = 0; th < CORES; th++ ){
        int err = pthread_create(&thread[th], NULL, Dipolar_Interaction, (void *)(long)th);
        if( err != 0 ){
            fprintf(stderr, "pthread_create failed for thread %d (error %d)\n", th, err);
            exit(EXIT_FAILURE);
        }
    }
    /* The loop condition must test th, the variable the loop increments. */
    for( th = 0; th < CORES; th++ )
        pthread_join(thread[th], NULL);
    /* No pthread_exit() here: it would terminate the main thread at this
     * point and the statements that follow would never run. */
.
.
.
    return 0;
}
Some of the code is missing. I can't post the whole program because it is too long (532 lines) and it is part of research that may be published.

Related

I'm trying to create a string of n characters by allocating memory with malloc, but I have a problem.

#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
// Reads a length n and then a line of text, and prints str[0] .. str[n-1],
// one character per line.
int main(void)
{
int n;
printf("Length? ");
scanf("%d", &n);
getchar(); // consumes one character after the number (normally the newline)
char* str = (char*)malloc(sizeof(char) * (n+1));
// NOTE(review): str is a char*, so sizeof(str) is the size of a pointer,
// not the n+1 bytes allocated above; fgets stores at most sizeof(str)-1 characters.
fgets(str,sizeof(str),stdin);
for (int i = 0; i < n; i++)
printf("%c\n", str[i]);
free(str);
}
Process results like this!
Length? 5
abcde
a
b
c
?
(I wanted to upload an image of the result, but I was not allowed to because I don't have 10 reputation points.)
I can't figure out why 'd' and 'e' are not shown in the results.
What is the problem with my code?
(Welcome to Stack Overflow :) (update #1)
str is a pointer to char instead of a character array therefore sizeof(str) is always 8 on 64-bit or 4 on 32-bit machines, no matter how much space you have allocated.
Demo (compilation succeeds only if X in static_assert(X) holds):
#include <assert.h>
#include <stdlib.h>
/*
 * Compile-time demonstration that sizeof applied to a pointer yields the
 * size of the pointer itself, whereas sizeof applied to an array yields the
 * size of the whole array. The program only builds if every static_assert
 * condition holds.
 */
int main(void){
    // Pointer to char: sizeof reports the pointer width, not the 1024 bytes allocated.
    char *str = malloc(1024);
    static_assert(sizeof(str) == sizeof(char *), "sizeof a pointer is the pointer size");
    // _WIN64 and __LP64__ are the predefined macros for targets with 64-bit pointers.
#if defined _WIN64 || defined __LP64__
    static_assert(sizeof(str) == 8, "64-bit target: pointers are 8 bytes");
#else
    static_assert(sizeof(str) == 4, "32-bit target: pointers are 4 bytes");
#endif
    free(str);
    // Character array: sizeof reports the size of the whole array.
    char arr[1024];
    static_assert(sizeof(arr) == 1024, "sizeof an array is its full size");
    return 0;
}
fgets(char *str, int num, FILE *stream) reads until (num-1) characters have been read, or until a newline or end-of-file is reached, whichever comes first.
Instead of fgets(str,sizeof(str),stdin), please use fgets(str,n+1,stdin).
Fixed version:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Reads a length n and then a line of text, and prints up to the first n
 * characters of that text, one per line.
 *
 * Returns EXIT_FAILURE when the length is missing or negative, when the
 * buffer cannot be allocated, or when no text can be read; 0 otherwise.
 */
int main(void){
    int n = 0;
    printf("Length? ");
    if (scanf("%d", &n) != 1 || n < 0) {
        fprintf(stderr, "Invalid length\n");
        return EXIT_FAILURE;
    }
    getchar();  // consume the character after the number (normally the newline)

    // n characters plus the terminating '\0'.
    char *str = calloc((size_t)n + 1, sizeof *str);
    if (str == NULL) {
        fprintf(stderr, "Out of memory\n");
        return EXIT_FAILURE;
    }

    // str is a pointer, so sizeof(str) is the pointer size and says nothing
    // about the allocation; fgets must be given the real buffer size, n+1.
    static_assert(
        sizeof(str) == sizeof(char *) && (
            sizeof(str) == 4 ||  // 32-bit machine
            sizeof(str) == 8     // 64-bit machine
        ),
        "str is a pointer, not an array"
    );
    if (fgets(str, n + 1, stdin) == NULL) {
        fprintf(stderr, "No input\n");
        free(str);
        return EXIT_FAILURE;
    }

    // Stop at the terminator when fewer than n characters were read.
    for (int i = 0; i < n && str[i] != '\0'; ++i)
        printf("%c\n", str[i]);

    free(str);
    str = NULL;
    return 0;
}
Length? 5
abcde
a
b
c
d
e

Pthreads program is slower than the serial program - Linux

Thank you for being generous with your time and helping me in this matter. I am trying to calculate the sum of the squared numbers using pthread. However, it seems that it is even slower than the serial implementation. Moreover, when I increase the number of threads the program becomes even slower. I made sure that each thread is running on a different core (I have 6 cores assigned to the virtual machine)
This is the serial program:
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
/*
 * Serial version: computes the sum of i*i for i in [1, n), where n is the
 * first command-line argument, and prints the result together with the
 * elapsed wall-clock time in microseconds.
 *
 * Returns 1 when the argument is missing, 0 otherwise.
 */
int main(int argc, char *argv[]) {
    struct timeval start, end;
    gettimeofday(&start, NULL); //start time of calculation

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <number>\n", argc > 0 ? argv[0] : "sum_serial");
        return 1;
    }
    int n = atoi(argv[1]);

    long int sum = 0;
    for (int i = 1; i < n; i++){
        // Widen before multiplying: i * i computed in int overflows once i exceeds 46340.
        sum += (long int)i * i;
    }

    gettimeofday(&end, NULL); //end time of calculation
    long elapsed_us = (long)(end.tv_sec - start.tv_sec) * 1000000L
                    + (long)(end.tv_usec - start.tv_usec);
    printf("The sum of squares in [1,%d): %ld | Time Taken: %ld mirco seconds \n", n, sum, elapsed_us);
    return 0;
}
This is the Pthreads program:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/time.h>
#include <time.h>
// Thread entry point; param points to the struct thread_args describing the work.
void *Sum(void *param);
// structure for thread arguments
struct thread_args {
int tid; // thread index, set by main
int a; //start of the half-open range [a, b) summed by the thread
int b; //end (exclusive) of that range
long int result; // partial results, written by Sum
};
/*
 * Pthreads version: splits [0, number) into numthreads contiguous ranges,
 * lets one thread sum the squares of each range, adds up the partial
 * results and prints the total together with the elapsed wall-clock time.
 *
 * argv[1]: number of threads (must be at least 1)
 * argv[2]: exclusive upper bound of the range
 *
 * Returns 1 on a usage error or when a thread cannot be created, 0 otherwise.
 */
int main(int argc, char *argv[])
{
    struct timeval start, end;
    gettimeofday(&start, NULL); //start time of calculation

    if(argc < 3 ){
        printf("Usage: ./sum_pthreads <numthreads> <number> ");
        return 1;
    }
    int numthreads = atoi(argv[1]);
    int number = atoi(argv[2]);
    // The thread count sizes the arrays below and is used as a divisor.
    if(numthreads < 1 ){
        fprintf(stderr, "numthreads must be at least 1\n");
        return 1;
    }

    pthread_t tid[numthreads];
    struct thread_args targs[numthreads];
    printf("I am Process | range: [%d,%d)\n",1,number);
    printf("Running Threads...\n\n");

    int started = 0;
    for(int i=0; i<numthreads; i++ ){
        //Setting up the args
        targs[i].tid = i;
        // 64-bit intermediates: number * (i + 1) can exceed the range of int.
        targs[i].a = (int)((long long)number * i / numthreads);
        targs[i].b = (i == numthreads-1)
                   ? number
                   : (int)((long long)number * (i + 1) / numthreads);
        targs[i].result = 0;
        if(pthread_create(&tid[i],NULL,Sum, &targs[i]) != 0 ){
            fprintf(stderr, "Could not create thread %d\n", i);
            break;
        }
        started++;
    }

    // Join only the threads that were actually started.
    for(int i=0; i<started; i++){
        pthread_join(tid[i],NULL);
    }
    if(started != numthreads ){
        return 1;
    }
    printf("Threads Exited!\n");
    printf("Process collecting information...\n");

    // Integer accumulation keeps the total exact; it is converted only for printing.
    long int totalSum = 0;
    for(int i=0; i<numthreads; i++ ){
        totalSum += targs[i].result;
    }

    gettimeofday(&end, NULL); //end time of calculation
    long elapsed_us = (long)(end.tv_sec - start.tv_sec) * 1000000L
                    + (long)(end.tv_usec - start.tv_usec);
    printf("Total Sum is: %.2f | Taken Time: %ld mirco seconds \n",(double)totalSum, elapsed_us);
    return 0;
}
/*
 * Thread entry point: sums i*i for i in [a, b) and stores the total in the
 * caller's argument structure.
 *
 * param: pointer to a struct thread_args; tid, a and b are read, result is
 *        written.
 * Returns NULL.
 */
void *Sum(void *param) {
    struct thread_args *args = param;
    long int sum = 0;

    printf("I am thread %d | range: [%d,%d)\n", args->tid, args->a, args->b);
    for (int i = args->a; i < args->b; i++){
        // Widen before multiplying: i * i computed in int overflows once i exceeds 46340.
        sum += (long int)i * i;
    }
    args->result = sum;
    printf("I am thread %d | Sum: %ld\n\n", args->tid, args->result);
    return NULL;
}
Results:
hamza#hamza:~/Desktop/lab4$ ./sum_serial 10
The sum of squares in [1,10): 285 | Time Taken: 7 mirco seconds
hamza#hamza:~/Desktop/lab4$ ./sol 2 10
I am Process | range: [1,10)
Running Threads...
I am thread 0 | range: [0,5)
I am thread 0 | Sum: 30
I am thread 1 | range: [5,10)
I am thread 1 | Sum: 255
Threads Exited!
Process collecting information...
Total Sum is: 285.00 | Taken Time: 670 mirco seconds
hamza#hamza:~/Desktop/lab4$ ./sol 3 10
I am Process | range: [1,10)
Running Threads...
I am thread 0 | range: [0,3)
I am thread 0 | Sum: 5
I am thread 1 | range: [3,6)
I am thread 1 | Sum: 50
I am thread 2 | range: [6,10)
I am thread 2 | Sum: 230
Threads Exited!
Process collecting information...
Total Sum is: 285.00 | Taken Time: 775 mirco seconds
hamza#hamza:~/Desktop/lab4$
The two programs do very different things. For example, the threaded program produces much more text output and creates a bunch of threads. You're comparing very short runs (less than a thousandth of a second) so the overhead of those additional things is significant.
You have to test with much longer runs such that the cost of producing additional output and creating and synchronizing threads is lost.
To use an analogy, one person can tighten three screws faster than three people can because of the overhead of getting a tool to each person, deciding who will tighten which screw, and so on. But if you have 500 screws to tighten, then three people will get it done faster.

Using rand_r in OpenMP 'for' is slower with 2 threads

The following code performs better with 1 thread than with 2 (using 4 threads gives speed up, though):
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
// Times an OpenMP loop that adds up n values returned by rand_r.
// argv[1] is the iteration count n, argv[2] the number of threads.
int main(int argc, char **argv) {
int n = atoi(argv[1]);
int num_threads = atoi(argv[2]);
omp_set_num_threads(num_threads);
// One seed per thread, stored contiguously in a single heap array.
unsigned int *seeds = malloc(num_threads * sizeof(unsigned int));
for (int i = 0; i < num_threads; ++i) {
seeds[i] = 42 + i;
}
unsigned long long sum = 0;
double begin_time = omp_get_wtime();
#pragma omp parallel
{
// Each thread passes a pointer to its own element of the shared seeds array to rand_r.
// NOTE(review): neighbouring elements of seeds presumably sit in the same
// cache line, so these per-thread updates can contend (false sharing) - confirm by profiling.
unsigned int *seedp = &seeds[omp_get_thread_num()];
#pragma omp for reduction(+ : sum)
for (int i = 0; i < n; ++i) {
sum += rand_r(seedp);
}
}
double end_time = omp_get_wtime();
// Only the elapsed time is printed; sum itself is not reported.
printf("%fs\n", end_time - begin_time);
free(seeds);
return EXIT_SUCCESS;
}
On my laptop (2 cores, HT enabled) I get the following results:
$ gcc -fopenmp test.c && ./a.out 100000000 1
0.821497s
$ gcc -fopenmp test.c && ./a.out 100000000 2
1.096394s
$ gcc -fopenmp test.c && ./a.out 100000000 3
0.933494s
$ gcc -fopenmp test.c && ./a.out 100000000 4
0.748038s
The problem persists without reduction, drand48_r brings no difference, dynamic scheduling makes things even worse. However, if I replace the body of the loop with something not connected with random, i. e. sum += *seedp + i;, everything works as expected.
This is a textbook example of false sharing. By using an array of seeds from which each thread takes one element, you force the logically private variables to be physically located next to each other in memory. Therefore, they are all in the same cache line. This means that although no thread tries to modify another thread's seed, the cache line itself is modified by each thread at each iteration. The actual trouble is that the system cannot detect modifications of individual variables for cache coherency, only modifications of whole cache lines. Therefore, at each iteration and for each thread, the cache line has been modified by another thread and is no longer valid from the system's point of view. It has to be reloaded from memory (well, most likely from the shared L3 cache here), which slows down your code.
Try this one instead (not tested):
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
/*
 * Times an OpenMP loop that adds up n values returned by rand_r, with each
 * thread keeping its seed in a variable of its own.
 *
 * argv[1]: iteration count n
 * argv[2]: number of OpenMP threads (must be at least 1)
 *
 * Prints the elapsed time in seconds. Returns EXIT_FAILURE on a usage
 * error, EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <n> <num_threads>\n", argc > 0 ? argv[0] : "a.out");
        return EXIT_FAILURE;
    }
    int n = atoi(argv[1]);
    int num_threads = atoi(argv[2]);
    // omp_set_num_threads requires a positive thread count.
    if (num_threads < 1) {
        fprintf(stderr, "num_threads must be at least 1\n");
        return EXIT_FAILURE;
    }
    omp_set_num_threads(num_threads);

    unsigned long long sum = 0;
    double begin_time = omp_get_wtime();
#pragma omp parallel
    {
        // Declared inside the parallel region, so every thread has its own seed variable.
        unsigned int seed = 42 + omp_get_thread_num();
#pragma omp for reduction(+ : sum)
        for (int i = 0; i < n; ++i) {
            sum += rand_r(&seed);
        }
    }
    double end_time = omp_get_wtime();
    printf("%fs\n", end_time - begin_time);
    return EXIT_SUCCESS;
}

Why thread_id creates not in order?

I tried to create 10 threads and print each thread's index. My code is shown below. I am wondering why the indices repeat instead of appearing in order.
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "util.h"
#include <errno.h>
#include <unistd.h>
#include <signal.h>
#include <time.h>
pthread_mutex_t request_buf_lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Thread entry point: prints the integer that arg points to.
 *
 * arg: pointer to an int holding the thread index; it is read once, when
 *      the thread starts running.
 * Returns NULL.
 */
void * worker(void *arg)
{
    int thread_id = *(int*)arg;
    printf("%d\n",thread_id);
    // A thread start routine is declared to return void *, so return a defined value.
    return NULL;
}
// Starts 10 worker threads, handing each one a pointer to a thread index,
// then waits for all of them to finish.
int main(int argc, char** argv)
{
pthread_t dispatchers[100]; // not used in this function
pthread_t workers[100];
int i;
// A single int is allocated once; the same pointer is passed to every thread below.
int * thread_id = malloc(sizeof(int));
for (i = 0; i < 10; i++) {
// NOTE(review): this overwrites the one shared int while earlier threads may
// not have read it yet, so a worker can observe a later value of i.
*thread_id = i;
pthread_create(&workers[i], NULL, worker, (void*)thread_id);
}
for (i = 0; i < 10; i++) {
pthread_join(workers[i], NULL);
}
// NOTE(review): thread_id is not freed before returning.
return 0;
}
And the output result is:
4
5
5
6
6
6
7
8
9
9
But I expected it as:
0
1
2
3
4
5
6
7
8
9
Does anyone have any idea or advice?
All 10 threads execute in parallel, and they all share a single int object, the one created by the call to malloc.
By the time your first thread executes its printf call, the value of *thread_id has been set to 4. Your second and third threads execute their printf calls when *thread_id has been set to 5. And so on.
If you allocate a separate int object for each thread (either by moving the malloc call inside the loop or just by declaring an array of ints), you'll get a unique thread id in each thread. But they're still likely to be printed in arbitrary order, since there's no synchronization among the threads.

Strange behaviour in OpenMP nested loop

In the following program I get different results (serial vs OpenMP), what is the reason? At the moment I can only think that perhaps the loop is too "large" for the threads and perhaps I should write it in some other way but I am not sure, any hints?
Compilation: g++-4.2 -fopenmp main.c functions.c -o main_elec_gcc.exe
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
#define NRACK 64
#define NSTARS 1024
/*
 * Serial reference implementation.
 *
 * For every pair (rock j, moon i) adds ql[i] * qr[j] / dist to a running
 * total, where dist is the Euclidean distance between rocks[j] and moon[i].
 * The per-pair arithmetic is done in float; the total is kept in double.
 *
 * Returns the accumulated total.
 */
double mysumallatomic_serial(float rocks[NRACK][3],float moon[NSTARS][3],float qr[NRACK],float ql[NSTARS]) {
    double total = 0.;

    for (int rock = 0; rock < NRACK; rock++) {
        for (int star = 0; star < NSTARS; star++) {
            /* Component-wise separation of the two points. */
            float dx = rocks[rock][0] - moon[star][0];
            float dy = rocks[rock][1] - moon[star][1];
            float dz = rocks[rock][2] - moon[star][2];

            float dx2 = dx*dx;
            float dy2 = dy*dy;
            float dz2 = dz*dz;

            float dist = sqrt(dx2+dy2+dz2);
            float inv_dist = 1/dist;

            total += ql[star]*inv_dist*qr[rock];
        }
    }
    return total;
}
/*
 * OpenMP version of the pairwise sum.
 *
 * For every pair (rock j, moon i) adds ql[i] * qr[j] / dist to the total,
 * where dist is the Euclidean distance between rocks[j] and moon[i]. The
 * outer loop over j is divided among the threads.
 *
 * Returns the accumulated total. The partial sums are combined in an
 * unspecified order, so the last digits may differ from a serial summation.
 */
double mysumallatomic(float rocks[NRACK][3],float moon[NSTARS][3],float qr[NRACK],float ql[NSTARS]) {
    double S2 = 0.;

    /* reduction(+:S2) gives every thread a private partial sum and adds the
     * partial sums together when the loop ends. */
#pragma omp parallel for reduction(+:S2)
    for(int j=0; j<NRACK; j++){
        for(int i=0; i<NSTARS; i++){
            /* The temporaries are declared inside the loop body so that each
             * thread works on its own copies; variables declared outside the
             * parallel loop would be shared and overwritten concurrently. */
            float difx=rocks[j][0]-moon[i][0];
            float dify=rocks[j][1]-moon[i][1];
            float difz=rocks[j][2]-moon[i][2];
            float mod2x=difx*difx;
            float mod2y=dify*dify;
            float mod2z=difz*difz;
            float temp_sqrt=sqrt(mod2x+mod2y+mod2z);
            float temp_div=1/temp_sqrt;
            float myterm=ql[i]*temp_div*qr[j];
            S2 += myterm;
        }
    }
    return S2;
}

/*
 * Fills the rock and moon coordinates and the charge arrays with fixed test
 * values, then prints the sum computed by the serial and by the OpenMP
 * implementation.
 */
int main(int argc, char *argv[]) {
    float rocks[NRACK][3], moon[NSTARS][3];
    float qr[NRACK], ql[NSTARS];
    int i,j;

    for(j=0;j<NRACK;j++){
        rocks[j][0]=j;
        rocks[j][1]=j+1;
        rocks[j][2]=j+2;
        qr[j] = j*1e-4+1e-3;
        //qr[j] = 1;
    }
    for(i=0;i<NSTARS;i++){
        moon[i][0]=12000+i;
        moon[i][1]=12000+i+1;
        moon[i][2]=12000+i+2;
        ql[i] = i*1e-3 +1e-2 ;
        //ql[i] = 1 ;
    }
    printf(" serial: %f\n", mysumallatomic_serial(rocks,moon,qr,ql));
    printf(" openmp: %f\n", mysumallatomic(rocks,moon,qr,ql));
    return(0);
}
I think you should use reduction instead of shared variable and remove #pragma omp atomic, like:
#pragma omp parallel for reduction(+:S2)
And it should run faster, because there is no need for atomic operations, which are quite costly in terms of performance and thread synchronization.
UPDATE
You can also have some difference in results because of the operations order:
$$\sum_{i=1}^{100} x_i \;\neq\; \sum_{i=1}^{50} x_i + \sum_{i=51}^{100} x_i$$ (in floating-point arithmetic, where addition is not associative)
You have data races on most of the temporary variables you are using in the parallel region - difx, dify, difz, mod2x, mod2y, mod2z, temp_sqrt, and temp_div should all be private. You should make these variables private by using a private clause on the parallel for directive.

Resources