Separate the CUDA host code into a .cpp file - visual-c++

main.cpp
#include <iostream>
#include "cuda.h"
using namespace std;

void cuda_calculation();

int main()
{
    cuda_calculation();
    return 0;
}
cu.h
void call(int, int, float *, int);
cuda.cpp
#include <stdio.h>
#include <cuda.h>
#include "cu.h"

void cuda_calculation()
{
    float *a_h, *a_d;   // Pointer to host & device arrays
    const int N = 10;   // Number of elements in arrays
    size_t size = N * sizeof(float);
    a_h = (float *)malloc(size);      // Allocate array on host
    cudaMalloc((void **)&a_d, size);  // Allocate array on device

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    // Do calculation on device:
    int block_size = 4;
    int n_blocks = N / block_size + (N % block_size == 0 ? 0 : 1);
    void call(n_blocks, block_size, &a_d, N);
    /*square_array <<< n_blocks, block_size >>> (a_d, N);*/

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Print results
    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);

    // Cleanup
    free(a_h);
    cudaFree(a_d);
}
cu.cu
#include <stdio.h>
#include "cu.h"
#include <cuda.h>

// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) a[idx] = a[idx] * a[idx];
}

void call(int a, int b, float *c, int d)
{
    square_array <<< 3, 4 >>> (c, d);
}
I tried to separate the kernel code and the host code into a .cpp file, but the following error persists:
Error 'cudaMemcpy': identifier not found, and the other CUDA-related identifiers are not recognized either.
How do I use the CUDA-related identifiers in a .cpp file and call the kernel functions?

There are some errors: void cuda_calculation(); needs to be visible to main.cpp through a header file (cu.h).
Also make sure to compile your .cu files with nvcc and NOT as standard C++ files. Use the CUDA compilation rules to make this process easy (they are installed by default as part of the CUDA toolkit).
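For a plain command-line build on Windows the split looks roughly like this (a sketch only; the object names and the %CUDA_PATH% environment variable are assumptions about a default toolkit install):

nvcc -c cu.cu -o cu.obj
cl /c main.cpp cuda.cpp /I"%CUDA_PATH%\include"
cl main.obj cuda.obj cu.obj cudart.lib /link /LIBPATH:"%CUDA_PATH%\lib\x64"

Inside Visual Studio, the CUDA build customization typically handles the nvcc step for you once the .cu file's item type is set to CUDA C/C++.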

After a long trial, I arrived at the proper output.
To use the CUDA identifiers in the .cpp files we need to include not only cuda.h but also cuda_runtime.h.
cuda.cpp:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cu.h"
#include "cud.h"

void cuda_calculation()
{
    float *a_h, *a_d;   // Pointer to host & device arrays
    const int N = 10;   // Number of elements in arrays
    size_t size = N * sizeof(float);
    a_h = (float *)malloc(size);      // Allocate array on host
    cudaMalloc((void **)&a_d, size);  // Allocate array on device

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    // Do calculation on device:
    int block_size = 4;
    int n_blocks = N / block_size + (N % block_size == 0 ? 0 : 1);
    call(n_blocks, block_size, a_d, N);
    /*square_array <<< n_blocks, block_size >>> (a_d, N);*/

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Print results
    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);

    // Cleanup
    free(a_h);
    cudaFree(a_d);
}
The other files are:
main.cpp
#include <iostream>
#include "cud.h"
using namespace std;

int main()
{
    cuda_calculation();
    return 0;
}
cud.h
void cuda_calculation();
cu.h
void call(int, int, float *, int);
cu.cu
#include <stdio.h>
#include "cu.h"
#include <cuda.h>

// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) a[idx] = a[idx] * a[idx];
}

void call(int a, int b, float *c, int d)
{
    square_array <<< 3, 4 >>> (c, d);
}
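Note that this wrapper ignores its a and b arguments and hardcodes the <<<3,4>>> launch configuration. A sketch that forwards the configuration instead (the parameter names are my assumption about what a and b were meant to carry):

void call(int n_blocks, int block_size, float *c, int d)
{
    square_array <<< n_blocks, block_size >>> (c, d);
    cudaDeviceSynchronize(); // surface any launch error before returning to the host code
}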

Related

Decrease in random read IOPS on NVMe SSD if requests are issued over a small region

(TL;DR) On NVMe SSDs (an Intel P3600 as well as an Avant), I am seeing a decrease in IOPS if I issue random reads over a small subset of the disk instead of the entire disk.
While reading the same offset over and over, the IOPS are about 36-40K for a 4k block size. The IOPS gradually increase as I grow the region over which the random reads are issued. The program (below) uses asynchronous IO on Linux to submit the read requests.
Disk range (in 4k blocks), IOPS
0, 38833
1, 68596
10, 76100
30, 80381
40, 113647
50, 148205
100, 170374
200, 239798
400, 270197
800, 334767
OS : Linux 4.2.0-35-generic
SSD : Intel P3600 NVME Flash
What could be causing this problem?
The program can be run as follows
$ for i in 0 1 10 30 40 50 100 200 400 800
do
<program_name> /dev/nvme0n1 10 $i
done
to check whether you also see the increasing IOPS pattern shown above.
/**
* $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3
* $ progname /dev/nvme0n1 10 100
*/
#include <random>
#include <libaio.h>
#include <stdlib.h>//malloc, exit
#include <future> //async
#include <unistd.h> //usleep
#include <iostream>
#include <sys/time.h> // gettimeofday
#include <vector>
#include <fcntl.h> // open
#include <errno.h>
#include <sys/types.h> // open
#include <sys/stat.h> // open
#include <cassert>
#include <strings.h> // bzero
#include <semaphore.h>
io_context_t ioctx;
std::vector<char*> buffers;
int fd = -1;
sem_t sem;
constexpr int numPerRound = 20;
constexpr int numRounds = 100000;
constexpr int MAXEVENT = 10;
constexpr size_t BLKSIZE = 4096;
constexpr int QDEPTH = 200;
off_t startBlock = 0;
off_t numBlocks = 100;
const int numSubmitted = numRounds * numPerRound;
void DoGet()
{
io_event eventsArray[MAXEVENT];
int numCompleted = 0;
while (numCompleted != numSubmitted)
{
bzero(eventsArray, MAXEVENT * sizeof(io_event));
int numEvents;
do {
numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr);
} while (numEvents == -EINTR);
for (int i = 0; i < numEvents; i++)
{
io_event* ev = &eventsArray[i];
iocb* cb = (iocb*)(ev->data);
assert(ev->res2 == 0);
assert(ev->res == BLKSIZE);
sem_post(&sem); // free ioctx
}
numCompleted += numEvents;
}
std::cout << "completed=" << numCompleted << std::endl;
}
int main(int argc, char* argv[])
{
if (argc < 4) { // need the device name plus the two block arguments
std::cout << "usage <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl;
exit(1);
}
char* deviceName = argv[1];
startBlock = atoll(argv[2]);
numBlocks = atoll(argv[3]);
int ret = 0;
ret = io_queue_init(QDEPTH, &ioctx);
assert(ret == 0);
ret = sem_init(&sem, 0, QDEPTH);
assert(ret == 0);
auto DoGetFut = std::async(std::launch::async, DoGet);
// preallocate buffers
for (int i = 0; i < QDEPTH; i++)
{
char* buf ;
ret = posix_memalign((void**)&buf, 4096, BLKSIZE);
assert(ret == 0);
buffers.push_back(buf);
}
fd = open(deviceName, O_DIRECT | O_RDONLY); // open the device passed on the command line
assert(fd >= 0);
off_t offset = 0;
struct timeval start;
gettimeofday(&start, 0);
std::mt19937 generator (getpid());
// generate random offsets within [startBlock, startBlock + numBlocks]
std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks);
for (int j = 0; j < numRounds; j++)
{
iocb mycb[numPerRound];
iocb* posted[numPerRound];
bzero(mycb, sizeof(iocb) * numPerRound);
for (int i = 0; i < numPerRound; i++)
{
// the same buffer may get used in 2 different async reads;
// that's ok - we are not validating content in this program
char* iobuf = buffers[i];
iocb* cb = &mycb[i];
offset = offsetgen(generator) * BLKSIZE;
io_prep_pread(cb, fd, iobuf, BLKSIZE, offset);
cb->data = iobuf;
posted[i] = cb;
sem_wait(&sem); // wait for ioctx to be free
}
int ret = 0;
do {
ret = io_submit(ioctx, numPerRound, posted);
} while (ret == -EINTR);
assert(ret == numPerRound);
}
DoGetFut.wait();
struct timeval end;
gettimeofday(&end, 0);
uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec);
io_queue_release(ioctx);
std::cout
<< "ops=" << numRounds * numPerRound
<< " iops=" << (numRounds * numPerRound *(uint64_t)1000000)/diff
<< " region-size=" << (numBlocks * BLKSIZE)
<< std::endl;
}
Surely this has to do with the structure of the memory. Internally the drive is built from many flash chips and may have multiple memory buses. If you issue requests across a small range, all of them resolve to a single chip (or a few chips) and have to be queued. If you access the whole device, the requests are spread across many internal chips and buses and can run concurrently, so the drive delivers more throughput.
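One way to test that hypothesis against the question's program (a sketch; strideBlocks is a hypothetical parameter, not in the original code): keep the number of distinct blocks fixed, but spread them far apart so they are likely to map to different internal chips.

// Replace the offset generation in the submit loop:
// numBlocks distinct blocks, strideBlocks apart (e.g. strideBlocks = 100000).
std::uniform_int_distribution<off_t> blockpick(0, numBlocks);
off_t offset = (startBlock + blockpick(generator) * strideBlocks) * BLKSIZE;

If IOPS rise with the stride while the number of distinct blocks stays fixed, the bottleneck is per-chip queuing rather than the size of the data set.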

Zero copy using vmsplice()/splice() in Linux

I am trying to get zero-copy semantics working on Linux using
vmsplice()/splice(), but I don't see any performance improvement. This
is on Linux 3.10; I also tried 3.0.0 and 2.6.32. The following code does
file writes; I have tried network socket writes() as well and couldn't
see any improvement.
Can somebody tell me what I am doing wrong?
Has anyone gotten an improvement using vmsplice()/splice() in production?
#include <assert.h>
#include <fcntl.h>
#include <iostream>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/uio.h> // struct iovec for vmsplice()
#include <unistd.h>
#include <vector>
const char *filename = "Test-File";
const int block_size = 4 * 1024;
const int file_size = 4 * 1024 * 1024;
using namespace std;
int pipes[2];
vector<char *> file_data;
static int NowUsecs() {
struct timeval tv;
const int err = gettimeofday(&tv, NULL);
assert(err >= 0);
return tv.tv_sec * 1000000LL + tv.tv_usec;
}
void CreateData() {
for (int xx = 0; xx < file_size / block_size; ++xx) {
// The data buffer to fill.
char *data = NULL;
assert(posix_memalign(reinterpret_cast<void **>(&data), 4096, block_size) == 0);
file_data.emplace_back(data);
}
}
int SpliceWrite(int fd, char *buf, int buf_len) {
int len = buf_len;
struct iovec iov;
iov.iov_base = buf;
iov.iov_len = len;
while (len) {
int ret = vmsplice(pipes[1], &iov, 1, SPLICE_F_GIFT);
assert(ret >= 0);
if (!ret)
break;
len -= ret;
if (len) {
auto ptr = static_cast<char *>(iov.iov_base);
ptr += ret;
iov.iov_base = ptr;
iov.iov_len -= ret;
}
}
len = buf_len;
while (len) {
int ret = splice(pipes[0], NULL, fd, NULL, len, SPLICE_F_MOVE);
assert(ret >= 0);
if (!ret)
break;
len -= ret;
}
return 1;
}
int WriteToFile(const char *filename, bool use_splice) {
// Open and write to the file.
mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
int fd = open(filename, O_CREAT | O_RDWR, mode);
assert(fd >= 0);
const int start = NowUsecs();
for (int xx = 0; xx < file_size / block_size; ++xx) {
if (use_splice) {
SpliceWrite(fd, file_data[xx], block_size);
} else {
assert(write(fd, file_data[xx], block_size) == block_size);
}
}
const int time = NowUsecs() - start;
// Close file.
assert(close(fd) == 0);
return time;
}
void ValidateData() {
// Open and read from file.
const int fd = open(filename, O_RDWR);
assert(fd >= 0);
char *read_buf = (char *)malloc(block_size);
for (int xx = 0; xx < file_size / block_size; ++xx) {
assert(read(fd, read_buf, block_size) == block_size);
assert(memcmp(read_buf, file_data[xx], block_size) == 0);
}
// Close file.
assert(close(fd) == 0);
assert(unlink(filename) == 0);
}
int main(int argc, char **argv) {
auto res = pipe(pipes);
assert(res == 0);
CreateData();
const int without_splice = WriteToFile(filename, false /* use splice */);
ValidateData();
const int with_splice = WriteToFile(filename, true /* use splice */);
ValidateData();
cout << "TIME WITH SPLICE: " << with_splice << endl;
cout << "TIME WITHOUT SPLICE: " << without_splice << endl;
return 0;
}
I did a proof-of-concept some years ago where I got a 4x speedup using optimized, specially tailored vmsplice() code. This was measured against a generic socket/write() based solution. This blog post from natsys-lab echoes my findings. But I believe you need to have exactly the right use case to get near this number.
So what are you doing wrong? Primarily, I think you are measuring the wrong thing. When writing directly to a file you have one system call, write(), and you are not actually copying data (except to the kernel). When you have a buffer with data that you want to write to disk, it's not going to get faster than that.
In your vmsplice()/splice() setup you are still copying your data into the kernel, but you have a total of two system calls, vmsplice()+splice(), to get it to disk. The speed being identical to write() is probably just a testament to Linux system call speed :-)
A more "fair" setup would be to write one program that read()s from stdin and write()s the same data to stdout, and an identical program that simply splice()s stdin into a file (or points stdout to a file when you run it). Although this setup might be too simple to really show anything.
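A sketch of the splice side of that comparison (assumptions: stdin is a pipe, so splice() can move pages directly, and out.bin is an illustrative file name; error handling trimmed):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for splice()
#endif
#include <fcntl.h>
#include <unistd.h>

int main() {
  int fd = open("out.bin", O_CREAT | O_WRONLY | O_TRUNC, 0644);
  if (fd < 0) return 1;
  for (;;) {
    // move up to 64 KiB per call from stdin into the file
    ssize_t n = splice(0 /* stdin */, NULL, fd, NULL, 64 * 1024, SPLICE_F_MOVE);
    if (n <= 0) break; // 0 = EOF, < 0 = error
  }
  return close(fd);
}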
Aside: an (undocumented?) feature of vmsplice() is that you can also use it to read data from a pipe. I used this in my old POC. It was basically just an IPC layer based on the idea of passing memory pages around with vmsplice().
Note: NowUsecs() probably overflows the int
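A minimal fix sketch: tv_sec alone is already around 1.4e9, so tv_sec * 1000000 cannot fit in a 32-bit int; compute and return the value as 64-bit instead:

#include <stdint.h>
#include <sys/time.h>
#include <assert.h>

static int64_t NowUsecs64() {
  struct timeval tv;
  const int err = gettimeofday(&tv, NULL);
  assert(err >= 0);
  return tv.tv_sec * 1000000LL + tv.tv_usec; // stays 64-bit, including the return
}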

My semaphore module is not working properly (Dining Philosophers)

I'm implementing semaphore methods to understand synchronization and threading.
Using my semaphore, I tried to solve the Dining Philosophers problem.
My plan was to create a deadlock situation first.
But I found that only one philosopher eats repeatedly.
I checked that my semaphore works quite well on other synchronization problems, so I think there is some problem in how I am using it here.
Please let me know what the problem is.
Here is my code.
dining.c (including the main function)
#include "sem.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
static tsem_t *chopstick[5];
static tsem_t *updating;
static int update_status (int i, int eating)
{
static int status[5] = { 0, };
static int duplicated;
int idx;
int sum;
tsem_wait (updating);
status[i] = eating;
/* Check invalid state. */
duplicated = 0;
sum = 0;
for (idx = 0; idx < 5; idx++)
{
sum += status[idx];
if (status[idx] && status[(idx + 1) % 5])
duplicated++;
}
/* Avoid printing empty table. */
if (sum == 0)
{
tsem_signal (updating);
return 0;
}
for (idx = 0; idx < 5; idx++)
fprintf (stdout, "%3s ", status[idx] ? "EAT" : "...");
/* Stop on invalid state. */
if (sum > 2 || duplicated > 0)
{
fprintf (stdout, "invalid %d (duplicated:%d)!\n", sum, duplicated);
exit (1);
}
else
fprintf (stdout, "\n");
tsem_signal (updating);
return 0;
}
void *thread_func (void *arg)
{
int i = (int) (long) arg;
int k = (i + 1) % 5;
do
{
tsem_wait (chopstick[i]);
tsem_wait (chopstick[k]);
update_status (i, 1);
update_status (i, 0);
tsem_signal (chopstick[i]);
tsem_signal (chopstick[k]);
}
while (1);
return NULL;
}
int main (int argc,
char **argv)
{
int i;
for (i = 0; i < 5; i++)
chopstick[i] = tsem_new (1);
updating = tsem_new (1);
for (i = 0; i < 5; i++)
{
pthread_t tid;
pthread_create (&tid, NULL, thread_func, (void *) (long) i);
}
/* endless thinking and eating... */
while (1)
usleep (10000000);
return 0;
}
sem.c (including the semaphore methods)
#include "sem.h"
.
sem.h (header for sem.c)
#ifndef __SEM_H__
#define __SEM_H__
#include <pthread.h>
typedef struct test_semaphore tsem_t;
tsem_t *tsem_new (int value);
void tsem_free (tsem_t *sem);
void tsem_wait (tsem_t *sem);
int tsem_try_wait (tsem_t *sem);
void tsem_signal (tsem_t *sem);
#endif /* __SEM_H__ */
compile command
gcc sem.c dining.c -pthread -o dining
One problem is that in tsem_wait() you have the following code sequence outside of a lock:
while (sem->count <= 0)
    continue;
There's no guarantee that the program will actually re-read sem->count - the compiler is free to produce machine code that does something like the following:
int temp = sem->count;
while (temp <= 0)
    continue;
In fact, this will likely happen in an optimized build.
Try changing your busy wait loop to something like this so the count is checked while holding the lock:
void tsem_wait (tsem_t *sem)
{
    pthread_mutex_lock(&(sem->mutexLock));
    while (sem->count <= 0) {
        pthread_mutex_unlock(&(sem->mutexLock));
        usleep(1);
        pthread_mutex_lock(&(sem->mutexLock));
    }
    // sem->mutexLock is still held here...
    sem->count--;
    pthread_mutex_unlock(&(sem->mutexLock));
}
Strictly speaking, you should do something similar for tsem_try_wait() (which you're not using yet).
Note that you might want to consider using a pthread_cond_t to make waiting on the counter change more efficient, as in the sketch below.
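A condvar-based sketch (mutexLock and count come from your code above; the cond field and the exact struct layout are my assumptions, since sem.c isn't shown):

struct test_semaphore {
    pthread_mutex_t mutexLock;
    pthread_cond_t cond;
    int count;
};

void tsem_wait (tsem_t *sem)
{
    pthread_mutex_lock(&(sem->mutexLock));
    while (sem->count <= 0)
        pthread_cond_wait(&(sem->cond), &(sem->mutexLock)); // atomically releases the lock while sleeping
    sem->count--;
    pthread_mutex_unlock(&(sem->mutexLock));
}

void tsem_signal (tsem_t *sem)
{
    pthread_mutex_lock(&(sem->mutexLock));
    sem->count++;
    pthread_cond_signal(&(sem->cond)); // wake one waiter
    pthread_mutex_unlock(&(sem->mutexLock));
}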
Finally, the code that 'gets' the chopsticks in thread_func() has the classic Dining Philosophers deadlock: every philosopher can simultaneously acquire the 'left' chopstick (chopstick[i]) and then wait forever for the 'right' chopstick (chopstick[k]), since all the chopsticks are in some philosopher's left hand. A standard fix is sketched below.
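One standard fix is to break the symmetry by always acquiring the lower-numbered chopstick first (a sketch against your thread_func()):

void *thread_func (void *arg)
{
    int i = (int) (long) arg;
    int k = (i + 1) % 5;
    int first = (i < k) ? i : k;   // always lock the lower-numbered
    int second = (i < k) ? k : i;  // chopstick first
    do
    {
        tsem_wait (chopstick[first]);
        tsem_wait (chopstick[second]);
        update_status (i, 1);
        update_status (i, 0);
        tsem_signal (chopstick[second]);
        tsem_signal (chopstick[first]);
    }
    while (1);
    return NULL;
}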

Pthread program returning expected declaration specifiers or '...' before '&' token

I am getting the same error multiple times when compiling the following pthreads program on Linux using:
gcc -c -lpthread proj2_part1.c -lrt
#include <unistd.h>
#include <sys/types.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <semaphore.h>
#define BUFFER_SIZE 10
#define TRUE 1
struct cQueue{
int *buffer;
int front, rear;
}*queue;
int count;
pthread_mutex_t mutex;
sem_t full, empty;
........
/*MAIN*/
int main(int argc,char* argv[])
{
int a=0, b=0, count =0, buff[BUFFER_SIZE];
int p = atoi(argv[1]), c = atoi(argv[2]);
unsigned int s = atoi(argv[0]);
pthread_t prothread[p], conthread[c];
/*Initialize Semaphores*/
if (sem_init(&full,0,0) == -1)
printf("%s\n",strerror(errno));
if (sem_init(&empty,0,BUFFER_SIZE) == -1)
printf("%s\n",strerror(errno));
pthread_mutex_init(&mutex, NULL);
/*Initialize Circular Queue*/
queue->buffer=buff;
queue->front = queue->buffer[0];
queue->rear = queue->buffer[0];
if(argc!=3)
{
fprintf(stderr,"Syntax: ./a.out <int> <int> <int>");
return -1;
}
if(s<0)
{
fprintf(stderr,"Argument %d must be positive value\n",s);
return -1;
}
else
{
/*Create producer threads*/
int i;
for (i=0; i<p; i++)
{
b = pthread_create(&prothread[i], NULL, producerThread, (void*)argv[1]);
if (b<0)
{
printf("Error: unable to create thread, %d\n",b);
return -1;
}
}
/*Create consumer threads*/
int j;
for (j=0; j<c; j++)
(
a = pthread_create(&conthread[j], NULL, consumerThread, (void*)argv[2]);
if (a<0)
{
printf("Error: unable to create thread, %d\n",a);
return -1;
}
}
sleep(atoi(argv[0]));
}
return 0;
}
I am receiving the following error. I think it has something to do with my semaphore declaration.
proj2_part1.c:147:81: error: expected ')' before ';' token
a = pthread_create(&conthread[j], NULL, consumerThread, (void*)argv[2]);
^
proj2_part1.c:153:6: error: expected ';' before '}' token
}
^
proj2_part1.c: At top level:
proj2_part1.c:156:6: error: expected identifier or '(' before 'return'
return 0;
^
proj2_part1.c:157:1: error: expected identifier or '(' before '}' token
}
You've used a ( where you meant {:
for (j=0; j<c; j++)
(
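The corrected loop (the body is unchanged otherwise):

for (j=0; j<c; j++)
{
    a = pthread_create(&conthread[j], NULL, consumerThread, (void*)argv[2]);
    if (a<0)
    {
        printf("Error: unable to create thread, %d\n",a);
        return -1;
    }
}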

Why is my scanner code so slow with Pthread or OpenMP?

I want to scan an array and get the scan result.
In my code, I shuffle an array with the shuffle function, then scan it to pick out the numbers that are bigger than 60000. I split the array into threadnum parts, and every thread gets one part to deal with, so there should be no memory shared between the different threads.
So why are the two parallel versions so slow? Since they are so slow, I don't think padding (false sharing) is the main reason. Could anyone give me some tips? I am a beginner in parallel programming, thank you.
Here is my testing code, in three parts: serial / Pthread / OpenMP. You can copy it and test it on your own machine.
Serial code:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h> // gettimeofday
#define N (65532)
#define threadnum 2
typedef struct{
int *mydata;
int *myres;
int val;
int datalen;
int reslen;
int tid;
}arg_t;
int randint(int i, int k){
int ret;
if(i> k){
int t= i;i=k;k=t;
}
ret = i + rand()%(k-i+1);
return ret;
}
void shuffle(int *org,int n){
int j=0,i;
for(i = n-1;i !=0;i--){
j = randint(0,i);
int t = org[j];org[j] = org[i];org[i] =t;
}
}
void scan(arg_t *arg){
int i;
arg->reslen=0;
for(i=0;i<arg->datalen;i++){
//if(arg->mydata[i] > arg->val){
arg->myres[arg->reslen] = arg->mydata[i];
arg->reslen +=(arg->mydata[i] > arg->val);
//}
}
}
int main(){
struct timeval begin,end;
int i,A[N],*res,reslen,perthread;
double diff_usec;
arg_t args[threadnum];
for(i=0;i<N;i++){
A[i] = i+1;
}
shuffle(A,N);
gettimeofday(&begin,NULL);
res = malloc(sizeof(int)*N); /* 688 was too small: 5532 of the values 1..N exceed 60000 */
reslen = 0;
for(i=0;i<N;i++){
//if(arg->mydata[i] > arg->val){
res[reslen] =A[i];
reslen +=(A[i] > 60000);
//}
}
gettimeofday(&end,NULL);
diff_usec = (((end).tv_sec*1000000L + (end).tv_usec)- ((begin).tv_sec*1000000L+(begin).tv_usec));
printf("\n%.4lf %d\n",diff_usec,reslen);
return 0;
}
Pthread code:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h> // gettimeofday
#include <string.h>   // memcpy
#include <pthread.h>
#define N 65532
#define threadnum 2
typedef struct{
int *mydata;
int *myres;
int val;
int datalen;
int reslen;
int tid;
}arg_t;
int randint(int i, int k){
int ret;
if(i> k){
int t= i;i=k;k=t;
}
ret = i + rand()%(k-i+1);
return ret;
}
void shuffle(int *org,int n){
int j=0,i;
for(i = n-1;i !=0;i--){
j = randint(0,i);
int t = org[j];org[j] = org[i];org[i] =t;
}
}
void *scan(void *arg_){ /* signature matches pthread_create's start routine */
arg_t *arg = (arg_t *)arg_;
int i;
arg->reslen=0;
for(i=0;i<arg->datalen;i++){
/* branchless form of: if(arg->mydata[i] > arg->val) arg->myres[arg->reslen++] = arg->mydata[i]; */
arg->myres[arg->reslen] = arg->mydata[i];
arg->reslen +=(arg->mydata[i] > arg->val);
}
return NULL;
}
int get_time(struct timespec *begin , struct timespec *end){
return 1000 * (end -> tv_sec - begin -> tv_sec) + (end -> tv_nsec - begin -> tv_nsec)/1000000; /* milliseconds */
}
int main(){
struct timeval begin,end;
//struct timespec begin,end;
int i,A[N],*res,reslen,perthread;
arg_t args[threadnum];
double diff_usec;
pthread_t tid[threadnum];
for(i=0;i<N;i++){
A[i] = i+1;
}
shuffle(A,N);
gettimeofday(&begin,NULL);
//clock_gettime(CLOCK_REALTIME,&begin);
perthread = N/threadnum;
for(i=0;i<threadnum;i++){
args[i].mydata = A+i*perthread;
args[i].myres = malloc(sizeof(int)*perthread);
args[i].datalen = (i == threadnum-1)?(N-(threadnum-1)*perthread):perthread;
args[i].val = 60000;
args[i].tid = i;
pthread_create(&tid[i],NULL,scan,(void*)&args[i]);
}
reslen =0;
for(i=0;i<threadnum;i++){
pthread_join(tid[i],NULL);
reslen += args[i].reslen;
}
res = malloc(sizeof(int)*reslen);
reslen=0;
for(i=0;i<threadnum;i++){
memcpy(res+reslen,args[i].myres,args[i].reslen*sizeof(int)); /* reslen counts ints, memcpy takes bytes */
reslen += args[i].reslen;
}
//clock_gettime(CLOCK_REALTIME,&end);
gettimeofday(&end,NULL);
diff_usec = (((end).tv_sec*1000000L + (end).tv_usec)- ((begin).tv_sec*1000000L+(begin).tv_usec));
//printf("\n%dms %d\n",get_time(&begin,&end),reslen);
printf("\n%.4lf us %d\n",diff_usec,reslen); /* diff_usec is in microseconds, not ms */
return 0;
}
OpenMP code:
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <time.h>
#include <sys/time.h> // gettimeofday
#include <string.h>   // memcpy
#define N 65532
#define threadnum 16
typedef struct{
int *mydata;
int *myres;
int val;
int datalen;
int reslen;
}arg_t;
int get_time(struct timespec *begin , struct timespec *end){
return 1000 * (end -> tv_sec - begin -> tv_sec) + (end -> tv_nsec - begin -> tv_nsec)/1000000; /* milliseconds */
}
int randint(int i, int k){
int ret;
if(i> k){
int t= i;i=k;k=t;
}
ret = i + rand()%(k-i+1);
return ret;
}
void shuffle(int *org,int n){
int j=0,i;
for(i = n-1;i !=0;i--){
j = randint(0,i);
int t = org[j];org[j] = org[i];org[i] =t;
}
}
void scan(arg_t *arg){
int i;
arg->reslen=0;
for(i=0;i<arg->datalen;i++){
//if(arg->mydata[i] > arg->val){
arg->myres[arg->reslen] = arg->mydata[i];
arg->reslen +=(arg->mydata[i] > arg->val);
//}
}
}
int main(){
struct timeval begin,end;
//struct timespec begin,end;
int i,A[N],*res,reslen,perthread;
double diff_usec;
arg_t args[threadnum];
for(i=0;i<N;i++){
A[i] = i+1;
}
shuffle(A,N);
gettimeofday(&begin,NULL);
//clock_gettime(CLOCK_REALTIME,&begin);
perthread = N/threadnum;
for(i=0;i<threadnum;i++){
args[i].mydata = A+i*perthread;
args[i].myres = malloc(sizeof(int)*perthread);
args[i].datalen = (i == threadnum-1)?(N-(threadnum-1)*perthread):perthread;
args[i].val = 60000;
}
omp_set_num_threads(threadnum);
#pragma omp parallel
{
scan(&args[omp_get_thread_num()]);
}
reslen =0;
for(i=0;i<threadnum;i++){
reslen += args[i].reslen;
}
res = malloc(sizeof(int)*reslen);
reslen=0;
for(i=0;i<threadnum;i++){
memcpy(res+reslen,args[i].myres,args[i].reslen*sizeof(int)); /* reslen counts ints, memcpy takes bytes */
reslen += args[i].reslen;
}
gettimeofday(&end,NULL);
//clock_gettime(CLOCK_REALTIME,&end);
diff_usec = (((end).tv_sec*1000000L + (end).tv_usec)- ((begin).tv_sec*1000000L+(begin).tv_usec));
printf("\n%.4lf us %d\n",diff_usec,reslen); /* diff_usec is in microseconds, not ms */
return 0;
}
