I have written two similar C programs. How can I make the outputs of both code same by editing one of the ELF files not the actual code?
/**
* prg1.c
*/
#include <stdio.h>
#include <stdlib.h>
int main()
{
int a = 5;
int b = 10;
int sum;
sum = a + b;
printf("sum is %d\n", sum);
return(0);
}
/**
* prg2.c
*/
#include <stdio.h>
#include <stdlib.h>
int main()
{
int a = 5;
int b = 20;
int sum;
sum = a + b;
printf("sum is %d\n", sum);
return(0);
}
In your second program's elf file find the occurrence of 20 and change it to 10.
To do that you can do something like this -
Find 14 (hex of 20) in your elf file and change it to A and making sure length is same by adding extra 0.
To do this you can use any elf editor, I use 'Hex Fiend' for mac.
Related
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
int main(void)
{
int n;
printf("Length? ");
scanf("%d", &n);
getchar();
char* str = (char*)malloc(sizeof(char) * (n+1));
fgets(str,sizeof(str),stdin);
for (int i = 0; i < n; i++)
printf("%c\n", str[i]);
free(str);
}
Process results like this!
Length? 5
abcde
a
b
c
?
(I wanted to upload the result image, but I got rejected since I didn't have 10 reputations)
I can't figure out why 'd' and 'e' won't be showing in the results.
What is the problem with my code??
(wellcome to stackoverflow :) (update #1)
str is a pointer to char instead of a character array therefore sizeof(str) is always 8 on 64-bit or 4 on 32-bit machines, no matter how much space you have allocated.
Demo (compilation succeeds only if X in static_assert(X) holds):
#include <assert.h>
#include <stdlib.h>
int main(void){
// Pointer to char
char *str=(char*)malloc(1024);
#if defined _WIN64 || defined __x86_64__ || defined _____LP64_____
static_assert(sizeof(str)==8);
#else
static_assert(sizeof(str)==4);
#endif
free(str);
// Character array
char arr[1024];
static_assert(sizeof(arr)==1024);
return 0;
}
fgets(char *str, int num, FILE *stream) reads until (num-1) characters have been read
Instead of fgets(str,sizeof(str),stdin) please fgets(str,n+1,stdin)
Fixed version:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
int main(void){
int n=0;
printf("Length? ");
scanf("%d",&n);
getchar();
char *str=(char*)calloc((n+1),sizeof(char));
static_assert(
sizeof(str)==sizeof(char*) && (
sizeof(str)==4 || // 32-bit machine
sizeof(str)==8 // 64-bit machine
)
);
fgets(str,n+1,stdin);
for(int i=0;i<n;++i)
printf("%c\n",str[i]);
free(str);
str=NULL;
}
Length? 5
abcde
a
b
c
d
e
I compile this code in Ubuntu. I did it more than 10 times but I got only AAA BBB CCC . I believe sequence can be changed but I don't know why. Please somebody kindly tell me the reason.
#include <stdio.h>
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
void *thread_entry(void *ptr)
{
char *name = (char *)ptr;
printf("%s-A\n", name);
sleep(1); //sched_yield();
printf("%s-B\n", name);
sleep(1); //sched_yield();
printf("%s-C\n", name);
}
int main()
{
#define MAX_THREAD 3
pthread_t thread[MAX_THREAD];
char *thread_name[MAX_THREAD] = {"thread1", "thread2", "thread3"};
int i;
for (i = 0; i < MAX_THREAD; i++)
pthread_create(&thread[i], NULL, thread_entry, thread_name[i]);
for (i = 0; i < MAX_THREAD; i++)
pthread_join(thread[i], NULL);
return 0;
}
Theoretically it's possible, but it's very unlikely you would see another ordering.
You spawn 3 threads, then one of them prints "A" and waits for 1 second until it will print "B". One second of sleeping is far more than enough for the rest threads to print "A". Same for "B" and "C".
When trying to write blocks to a file, with my blocks being unevenly distributed across my processes, one can use MPI_File_write_at with the good offset. As this function is not a collective operation, this works well.
Exemple :
#include <cstdio>
#include <cstdlib>
#include <string>
#include <mpi.h>
int main(int argc, char* argv[])
{
int rank, size;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int global = 7; // prime helps have unbalanced procs
int local = (global/size) + (global%size>rank?1:0);
int strsize = 5;
MPI_File fh;
MPI_File_open(MPI_COMM_WORLD, "output.txt", MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
for (int i=0; i<local; ++i)
{
size_t idx = i * size + rank;
std::string buffer = std::string(strsize, 'a' + idx);
size_t offset = buffer.size() * idx;
MPI_File_write_at(fh, offset, buffer.c_str(), buffer.size(), MPI_CHAR, MPI_STATUS_IGNORE);
}
MPI_File_close(&fh);
MPI_Finalize();
return 0;
}
However for more complexe write, particularly when writting multi dimensional data like raw images, one may want to create a view at the file with MPI_Type_create_subarray. However, when using this methods with simple MPI_File_write (which is suppose to be non collective) I run in deadlocks. Exemple :
#include <cstdio>
#include <cstdlib>
#include <string>
#include <mpi.h>
int main(int argc, char* argv[])
{
int rank, size;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int global = 7; // prime helps have unbalanced procs
int local = (global/size) + (global%size>rank?1:0);
int strsize = 5;
MPI_File fh;
MPI_File_open(MPI_COMM_WORLD, "output.txt", MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
for (int i=0; i<local; ++i)
{
size_t idx = i * size + rank;
std::string buffer = std::string(strsize, 'a' + idx);
int dim = 2;
int gsizes[2] = { buffer.size(), global };
int lsizes[2] = { buffer.size(), 1 };
int offset[2] = { 0, idx };
MPI_Datatype filetype;
MPI_Type_create_subarray(dim, gsizes, lsizes, offset, MPI_ORDER_C, MPI_CHAR, &filetype);
MPI_Type_commit(&filetype);
MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
MPI_File_write(fh, buffer.c_str(), buffer.size(), MPI_CHAR, MPI_STATUS_IGNORE);
}
MPI_File_close(&fh);
MPI_Finalize();
return 0;
}
How to avoid such a code to lock ? Keep in mind that by real code will really use the multidimensional capabilities of MPI_Type_create_subarray and cannot just use MPI_File_write_at
Also, it is difficult for me to know the maximum number of block in a process, so I'd like to avoid doing a reduce_all and then loop on the max number of block with empty writes when localnb <= id < maxnb
You don't use MPI_REDUCE when you have a variable number of blocks per node. You use MPI_SCAN or MPI_EXSCAN: MPI IO Writing a file when offset is not known
MPI_File_set_view is collective, so if 'local' is different on each processor, you'll find yourself calling a collective routine from less than all processors in the communicator. If you really really need to do so, open the file with MPI_COMM_SELF.
the MPI_SCAN approach means each process can set the file view as needed, and then blammo you can call the collective MPI_File_write_at_all (even if some processes have zero work -- they still need to participate) and take advantage of whatever clever optimizations your MPI-IO implementation provides.
I'm learning to programming using pthread for a adder program, after reference several codes still don't get how to pass multiple arguments into a thread using a struct, here is my buggy program:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
typedef struct s_addition {
int num1;
int num2;
int sum;
} addition;
void *thread_add_function (void *ad)
{
printf ("ad.num1:%d, ad.num2:%d\n",ad.num1, ad.num2);
ad.sum = ad.num1 + ad.num2;
pthread_exit(0);
}
int main()
{
int N = 5;
int a[N], b[N], c[N];
srand (time(NULL));
// fill them with random numbers
for ( int j = 0; j < N; j++ ) {
a[j] = rand() % 392;
b[j] = rand() % 321;
}
addition ad1;
pthread_t thread[N];
for (int i = 0; i < N; i++) {
ad1.num1 = a[i];
ad1.num2 = b[i];
printf ("ad1.num1:%d, ad1.num2:%d\n",ad1.num1, ad1.num2);
pthread_create (&thread[i], NULL, thread_add_function, &ad1);
pthread_join(thread[i], NULL);
c[i] = ad.sum;
}
printf( "This is the result of using pthread.\n");
for ( int i = 0; i < N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i]);
}
}
But when compiling I got the following error:
vecadd_parallel.c:15:39: error: member reference base type 'void *' is not a
structure or union
printf ("ad.num1:%d, ad.num2:%d\n",ad.num1, ad.num2);
I tried but still cannot get a clue, what I am doing wrong with it?
Seems like you have a problem with trying to access the members of a void datatype.
You will need to add a line to cast your parameter to thread_add_function to the correct datatype similar to addition* add = (addition*)ad;, and then use this variable in your function (note that you also have to change you r .'s to -> because it's a pointer)
You also should only pass data to threads that was malloc()'d, as stack allocated data may not be permanent. It should be fine for the current implementation, but changes later could easily give strange, unpredictable behaviour.
In the following program I get different results (serial vs OpenMP), what is the reason? At the moment I can only think that perhaps the loop is too "large" for the threads and perhaps I should write it in some other way but I am not sure, any hints?
Compilation: g++-4.2 -fopenmp main.c functions.c -o main_elec_gcc.exe
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
#define NRACK 64
#define NSTARS 1024
double mysumallatomic_serial(float rocks[NRACK][3],float moon[NSTARS][3],float qr[NRACK],float ql[NSTARS]) {
int j,i;
float temp_div=0.,temp_sqrt=0.;
float difx,dify,difz;
float mod2x, mod2y, mod2z;
double S2 = 0.;
for(j=0; j<NRACK; j++){
for(i=0; i<NSTARS;i++){
difx=rocks[j][0]-moon[i][0];
dify=rocks[j][1]-moon[i][1];
difz=rocks[j][2]-moon[i][2];
mod2x=difx*difx;
mod2y=dify*dify;
mod2z=difz*difz;
temp_sqrt=sqrt(mod2x+mod2y+mod2z);
temp_div=1/temp_sqrt;
S2 += ql[i]*temp_div*qr[j];
}
}
return S2;
}
double mysumallatomic(float rocks[NRACK][3],float moon[NSTARS][3],float qr[NRACK],float ql[NSTARS]) {
float temp_div=0.,temp_sqrt=0.;
float difx,dify,difz;
float mod2x, mod2y, mod2z;
double S2 = 0.;
#pragma omp parallel for shared(S2)
for(int j=0; j<NRACK; j++){
for(int i=0; i<NSTARS;i++){
difx=rocks[j][0]-moon[i][0];
dify=rocks[j][1]-moon[i][1];
difz=rocks[j][2]-moon[i][2];
mod2x=difx*difx;
mod2y=dify*dify;
mod2z=difz*difz;
temp_sqrt=sqrt(mod2x+mod2y+mod2z);
temp_div=1/temp_sqrt;
float myterm=ql[i]*temp_div*qr[j];
#pragma omp atomic
S2 += myterm;
}
}
return S2;
int main(int argc, char *argv[]) {
float rocks[NRACK][3], moon[NSTARS][3];
float qr[NRACK], ql[NSTARS];
int i,j;
for(j=0;j<NRACK;j++){
rocks[j][0]=j;
rocks[j][1]=j+1;
rocks[j][2]=j+2;
qr[j] = j*1e-4+1e-3;
//qr[j] = 1;
}
for(i=0;i<NSTARS;i++){
moon[i][0]=12000+i;
moon[i][1]=12000+i+1;
moon[i][2]=12000+i+2;
ql[i] = i*1e-3 +1e-2 ;
//ql[i] = 1 ;
}
printf(" serial: %f\n", mysumallatomic_serial(rocks,moon,qr,ql));
printf(" openmp: %f\n", mysumallatomic(rocks,moon,qr,ql));
return(0);
}
}
I think you should use reduction instead of shared variable and remove #pragma omp atomic, like:
#pragma omp parallel for reduction(+:S2)
And it should work faster, because there are no need for atomic operations which are quite painful in terms of performance and threads synchronization.
UPDATE
You can also have some difference in results because of the operations order:
\sum_1^100(x[i]) != \sum_1^50(x[i]) + \sum_51^100(x[i])
You have data races on most of the temporary variables you are using in the parallel region - difx, dify, difz, mod2x, mod2y, mod2z, temp_sqrt, and temp_div should all be private. You should make these variables private by using a private clause on the parallel for directive.