Copy struct with function pointer to device - struct

I have a struct containing the parameters of a linear function, as well as the function itself. What I want to do is copy this struct to the device and then evaluate the linear function. The following example doesn't make sense but it is sufficient to describe the difficulties I have:
struct model
{
double* params;
double (*func)(double*, double);
};
I don't know how to copy this struct to the device.
Here are my functions:
Init function
// init function for struct model
__host__ void model_init(model* m, double* params, double(*func)(double*,double))
{
if(m)
{
m->params = params;
m->func = func;
}
}
Evaluation function
__device__ double model_evaluate(model* m, double x)
{
if(m)
{
return m->func(m->params, x);
}
return 0.0;
}
The actual function
__host__ __device__ double linear_function(double* params, double x)
{
return params[0] + params[1] * x;
}
Function called inside kernel
__device__ double compute(model *d_linear_model)
{
return model_evaluate(d_linear_model,1.0);
}
The kernel itself
__global__ void kernel(double *array, model *d_linear_model, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N)
{
array[idx] = compute(d_linear_model);
}
}
I know how to copy an array from host to device but I don't know how to do this for this concrete struct which contains a function.
The kernel call in main then looks like this:
int block_size = 4;
int n_blocks = N_array/block_size + (N_array % block_size == 0 ? 0:1);
kernel<<<n_blocks, block_size>>>(device_array, d_linear_model, N_array);

You've outlined two items that I consider to be somewhat more difficult than beginner-level CUDA programming:
use of device function pointers
a "deep copy" operation (on the embedded params pointer in your model structure)
Both of these topics have been covered in other questions. For example this question/answer discusses deep copy operations - when a data structure has embedded pointers to other data. And this question/answer links to a variety of resources on device function pointer usage.
But I'll go ahead and offer a possible solution for your posted case. Most of your code is usable as-is (at least for demonstration purposes). As mentioned already, your model structure will present two challenges:
struct model
{
double* params; // requires a "deep copy" operation
double (*func)(double*, double); // requires special handling for device function pointers
};
As a result, although most of your code is usable as-is, your "init" function is not. That might work for a host realization, but not for a device realization.
The deep copy operation requires us to copy the overall structure, plus separately copy the data pointed to by the embedded pointer, plus separately copy or "fixup" the embedded pointer itself.
The usage of a device function pointer is restricted by the fact that we cannot grab the actual device function pointer in host code - that is illegal in CUDA. So one possible solution is to use a __device__ construct to "capture" the device function pointer, then do a cudaMemcpyFromSymbol operation in host code, to retrieve the numerical value of the device function pointer, which can then be moved about in ordinary fashion.
Here's a worked example building on what you have shown, demonstrating the two concepts above. I have not created a "device init" function - but all the code necessary to do that is in the main function. Once you've grasped the concepts, you can take whatever code you wish out of the main function below and craft it into your "device init" function, if you wish to create one.
Here's a worked example:
$ cat t968.cu
#include <iostream>
#define NUM_PARAMS 2
#define ARR_SIZE 1
#define nTPB 256
struct model
{
double* params;
double (*func)(double*, double);
};
// init function for struct model -- not using this for device operations
__host__ void model_init(model* m, double* params, double(*func)(double*,double))
{
if(m)
{
m->params = params;
m->func = func;
}
}
__device__ double model_evaluate(model* m, double x)
{
if(m)
{
return m->func(m->params, x);
}
return 0.0;
}
__host__ __device__ double linear_function(double* params, double x)
{
return params[0] + params[1] * x;
}
__device__ double compute(model *d_linear_model)
{
return model_evaluate(d_linear_model,1.0);
}
__global__ void kernel(double *array, model *d_linear_model, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N)
{
array[idx] = compute(d_linear_model);
}
}
__device__ double (*linear_function_ptr)(double*, double) = linear_function;
int main(){
// grab function pointer from device code
double (*my_fp)(double*, double);
cudaMemcpyFromSymbol(&my_fp, linear_function_ptr, sizeof(void *));
// setup model
model my_model;
my_model.params = new double[NUM_PARAMS];
my_model.params[0] = 1.0;
my_model.params[1] = 2.0;
my_model.func = my_fp;
// setup for device copy of model
model *d_model;
cudaMalloc(&d_model, sizeof(model));
// setup "deep copy" for params
double *d_params;
cudaMalloc(&d_params, NUM_PARAMS*sizeof(double));
cudaMemcpy(d_params, my_model.params, NUM_PARAMS*sizeof(double), cudaMemcpyHostToDevice);
// copy model to device
cudaMemcpy(d_model, &my_model, sizeof(model), cudaMemcpyHostToDevice);
// fixup device params pointer in device model
cudaMemcpy(&(d_model->params), &d_params, sizeof(double *), cudaMemcpyHostToDevice);
// run test
double *d_array, *h_array;
cudaMalloc(&d_array, ARR_SIZE*sizeof(double));
h_array = new double[ARR_SIZE];
for (int i = 0; i < ARR_SIZE; i++) h_array[i] = i;
cudaMemcpy(d_array, h_array, ARR_SIZE*sizeof(double), cudaMemcpyHostToDevice);
kernel<<<(ARR_SIZE+nTPB-1)/nTPB,nTPB>>>(d_array, d_model, ARR_SIZE);
cudaMemcpy(h_array, d_array, ARR_SIZE*sizeof(double), cudaMemcpyDeviceToHost);
std::cout << "Results: " << std::endl;
for (int i = 0; i < ARR_SIZE; i++) std::cout << h_array[i] << " ";
std::cout << std::endl;
return 0;
}
$ nvcc -o t968 t968.cu
$ cuda-memcheck ./t968
========= CUDA-MEMCHECK
Results:
3
========= ERROR SUMMARY: 0 errors
$
For brevity of presentation, I've dispensed with proper cuda error checking (instead I have run the code with cuda-memcheck to demonstrate that it is without runtime error) but I would recommend proper error checking if you're having any trouble with a code.

Related

'PTX JIT compilation failed' from cuModuleLoadData

Below is the code:
#define FILENAME "kernel.code"
#define kernel_name "hello_world"
#define THREADS 4
std::vector<char> load_file()
{
std::ifstream file(FILENAME, std::ios::binary | std::ios::ate);
std::streamsize fsize = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> buffer(fsize);
if (!file.read(buffer.data(), fsize)) {
failed("could not open code object '%s'\n", FILENAME);
}
return buffer;
}
struct joinable_thread : std::thread
{
template <class... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
{
}
joinable_thread& operator=(joinable_thread&& other) = default;
joinable_thread(joinable_thread&& other) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
void run(const std::vector<char>& buffer) {
CUdevice device;
CUDACHECK(cuDeviceGet(&device, 0));
CUcontext context;
CUDACHECK(cuCtxCreate(&context, 0, device));
CUmodule Module;
CUDACHECK(cuModuleLoadData(&Module, &buffer[0]));
...
}
void run_multi_threads(uint32_t n) {
{
auto buffer = load_file();
std::vector<joinable_thread> threads;
for (uint32_t i = 0; i < n; i++) {
threads.emplace_back(std::thread{[&, i, buffer] {
run(buffer);
}});
}
}
}
int main() {
CUDACHECK(cuInit(0));
run_multi_threads(THREADS);
}
And the code kernel.cu used for ptx is as follows:
#include "cuda_runtime.h"
extern "C" __global__ void hello_world(float* a, float* b) {
int tx = threadIdx.x;
b[tx] = a[tx];
}
I m generating the ptx in this way
nvcc --ptx kernel.cu -o kernel.code
Im using a machine with GeForce GTX TITAN X.
And Im facing this "PTX JIT compilation failed" from cuModuleLoadData error, only when I m trying to use this with multiple threads. If i remove the multi-threading part and run normally, this error doesn't occur.
Can anyone please tell me what is going wrong and how to overcome this.
As mentioned in the comments, I was able to get it to work by moving the load_file() call to the main, so that the buffer read from the file is valid, and then pass only the buffer to all the threads.
Actually in the original code, the buffer will be deconstructed once it leaves the '{...}' scope. So when thread starts, you may read the invalid buffer.
If you put your buffer in the main, it will not be deconstructed or freed until the program exits.
So yes, it's because you pass the invalid buffer (which may have already been freed) to the cu code.

can we convert Audio (.mp3) to video (mp4) in android studio? how?

i am new in this and i am working on App of media player and recording app. in which i have shown song list of device in the listview and recording start / stop / play. Now i want to convert that .mp3 recorded file into .mp4 and one image will show on behalf of a video in that file. Help me to achive this i have no idea and i refer many links and i didnt find anything.
Please check this link for your first question:
Why can't we initialize class members at their declaration?
Usually constructor is use to initialize value to data variables of class.
For 2nd Question:
If data member is not initialize after creation of object, It will contain garbage value. So initialize or assign suitable value to as per your need.
Check below code:
#include<iostream>
using namespace std;
class swap_values
{
int a, b, temp;
public:
swap_values(){
a=0;b=0;temp=0;
}
swap_values(int x, int y){
a = x;
b = y;
temp = 0;
}
void swapped()
{
temp = b;
b=a;
a=temp;
}
void print(){
cout<<"a: "<<a<<" b: "<<b<<endl;
}
};
int main()
{
int x =10; int y = 20;
swap_values obj(x, y);
obj.print();
obj.swapped();
obj.print();
return 0;
}
Everything can be done in better ways but just using your code this will work for you -
#include <iostream>
using namespace std;
class Swap {
private:
int a,b,temp;
public:
Swap()
{
a=10;
b=20;
temp=0;
}
void swapNums()
{
temp=a; a=b; b=temp;
cout<<a<<" " <<b<<endl;
}
};
int main() {
Swap s;
s.swapNums();
return 0;
}
You can avoid using class name as some function name. You can instead use constructor without a return type where you can initialise the member variables. swap method looks fine.
i am not able to initialize my variable in class.
class swap
{
int a=10; \\cannot declare here
int b=20; \\ cannot declare here
}
Since C++11, this is fine, you can have default member initialization.
The error is due to missing semicolon after }.
why it has garbage value with b ??
a=b;
b=temp;
temp=a;
Since temp was never initialized before assigning it to b, temp has an indeterminate value.
Any usage will lead to undefined behavior.
Here's a simple Swap struct:
struct Swap
{
int a = 10; // default member initialization
int b = 20; // default member initialization
Swap(int a = 20, int b = 10): a(b), b(a) {}; // swap on initialization
// using member initializer list
};
Swap s;
std::cout << s.a // 20
<< s.b // 10
<< std::endl;
In this example, default member initialization is "obsolete" / "redundant" due to member initializer list.

Cuda object copy

I'm trying to use CUDA with objects, this is a little test code i put together to try out things, but i ran into a problem. When i'm doing anything to the device version of the variable, the copy back to the host fails with "cuda Error Ilegal Address", but if i just copy the code to the device and back it works.
If i comment out the printf... line, it the works.
class A {
public:
int s;
};
__device__ A *d_a;
__global__ void MethodA() {
printf("%d\n", d_a->s);
}
int main() {
A *a = new A();
a->s = 10;
cudaError e;
e = cudaMalloc((void**)&d_a, sizeof(A));
e = cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
MethodA << <1, 1 >> > ();
e = cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
std::cout << cudaGetErrorName(e) << std::endl;
delete(a);
std::getchar();
return 0;
}
Use of the __device__ variable is causing difficulty. It is intended to be used for static allocations, known at compile time.
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
Some problems with your approach:
You are using an incorrect API for modifying a __device__ variable. We don't use cudaMemcpy. We use cudaMemcpyToSymbol, etc.
You are not allowed to take the address of a device entity in host code:
e = cudaMalloc((void**)&d_a, sizeof(A));
^
cudaMalloc expects to store the allocated pointer value in host memory, not in device memory. It will point to a location in device memory, but it should be stored in a host variable.
If you want to stay with your method, the following modifications should make it correct:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
class A {
public:
int s;
};
__device__ A *d_a;
__global__ void MethodA() {
printf("%d\n", d_a->s);
}
int main() {
A *a = new A();
a->s = 10;
A *temp_d_a;
cudaMalloc((void**)&temp_d_a, sizeof(A));
cudaMemcpy(temp_d_a, a, sizeof(A), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_a, &temp_d_a, sizeof(A *));
MethodA << <1, 1 >> > ();
cudaMemcpy(a, temp_d_a, sizeof(A), cudaMemcpyDeviceToHost);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
cudaFree(temp_d_a);
delete(a);
return 0;
}
$ nvcc t89.cu -o t89
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
EDIT: Regarding my previous statement:
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
and asked about in the comments below, here is a worked example showing that approach:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
class A {
public:
int s;
};
__global__ void MethodA(A *a) {
printf("%d\n", a->s);
}
int main() {
A *a = new A();
a->s = 10;
A *d_a; // an ordinary host-based pointer
cudaMalloc((void**)&d_a, sizeof(A)); //dynamic allocation created at runtime
cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
MethodA << <1, 1 >> > (d_a); // passed to kernel via parameter
cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
cudaFree(d_a);
delete(a);
return 0;
}
$ nvcc -o t89 t89.cu
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$

CUDA copy linked lists from device to host

I am trying to populate a number of linked lists on the device and then return those lists back to the hosts.
From my understanding I need to allocate memory for my struct Element, but I don't know how to go about it since I will have many linked lists, each with an unknown number of elements. I've tried a couple of different things but it still didn't work. So I'm back to the starting point. Here is my code:
//NODE CLASS
class Node{
public:
int x,y;
Node *parent;
__device__ __host__ Node(){}
__device__ __host__ Node(int cX, int cY){x = cX; y = cY;}
__device__ __host__ int get_row() { return x; }
__device__ __host__ int get_col() { return y; }
};
//LINKED LIST
class LinkedList{
public:
__device__ __host__ struct Element{
Node n1;
Element *next;
};
__device__ __host__ LinkedList(){
head = NULL;
}
__device__ __host__ void addNode(Node n){
Element *el = new Element();
el->n1 = n;
el->next = head;
head = el;
}
__device__ __host__ Node popFirstNode(){
Element *cur = head;
Node n;
if(cur != NULL){
n = cur -> n1;
head = head -> next;
}
delete cur;
return n;
}
__device__ __host__ bool isEmpty(){
Element *cur = head;
if(cur == NULL){
return true;
}
return false;
}
Element *head;
};
//LISTS
__global__ void listsKernel(LinkedList* d_Results, int numLists){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
Node n(1,1);
if(idx < numLists){
d_Results[idx].addNode(n);
d_Results[idx].addNode(n);
d_Results[idx].addNode(n);
d_Results[idx].addNode(n);
}
}
int main(){
int numLists = 10;
size_t size = numLists * sizeof(LinkedList);
LinkedList curList;
LinkedList* h_Results = (LinkedList*)malloc(size);
LinkedList* d_Results;
cudaMalloc((void**)&d_Results, size);
listsKernel<<<256,256>>>(d_Results, numLists);
cudaMemcpy(h_Results, d_Results, sizeof(LinkedList)*numLists, cudaMemcpyDeviceToHost);
for(int i = 0; i < numLists; i++){
curList = h_Results[i];
while(curList.isEmpty() == false){
Node n = curList.popFirstNode();
std::cout << "x: " << n.get_row() << " y: " << n.get_col();
}
}
}
As you can see I'm trying to populate 10 linked lists on the device and then return them back to the host, but the code above results in unhandled exception - Access violation reading location. I am assuming it is not coping the pointers from the device.
Any help would be great.
Just eyeballing the code, it seems you have a fundamental misconception: there is host memory which cannot be accessed from the device, and device memory which cannot be accessed from the host. So when you create linked list nodes in device memory and copy the pointers back to the host, the host cannot dereference those pointers, because they are pointing to device memory.
If you truly want to pass linked lists back and forth between host and device, your best bet is probably to copy the entries into an array, do the memcpy, then copy the array back into a linked list. Other things can be done too, depending on just what your use case is.
(it is possible to allocate a region of memory that is accessible both from the host and from the device, but there is some awkwardness with it and I have no experience using it)

C++11 Thread Safety of Atomic Containers

I am trying to implement a thread safe STL vector without mutexes. So I followed through this post and implemented a wrapper for the atomic primitives.
However when I ran the code below, it displayed out Failed!twice from the below code (only two instances of race conditions) so it doesn't seem to be thread safe. I'm wondering how can I fix that?
Wrapper Class
template<typename T>
struct AtomicVariable
{
std::atomic<T> atomic;
AtomicVariable() : atomic(T()) {}
explicit AtomicVariable(T const& v) : atomic(v) {}
explicit AtomicVariable(std::atomic<T> const& a) : atomic(a.load()) {}
AtomicVariable(AtomicVariable const&other) :
atomic(other.atomic.load()) {}
inline AtomicVariable& operator=(AtomicVariable const &rhs) {
atomic.store(rhs.atomic.load());
return *this;
}
inline AtomicVariable& operator+=(AtomicVariable const &rhs) {
atomic.store(rhs.atomic.load() + atomic.load());
return *this;
}
inline bool operator!=(AtomicVariable const &rhs) {
return !(atomic.load() == rhs.atomic.load());
}
};
typedef AtomicVariable<int> AtomicInt;
Functions and Testing
// Vector of 100 elements.
vector<AtomicInt> common(100, AtomicInt(0));
void add10(vector<AtomicInt> &param){
for (vector<AtomicInt>::iterator it = param.begin();
it != param.end(); ++it){
*it += AtomicInt(10);
}
}
void add100(vector<AtomicInt> &param){
for (vector<AtomicInt>::iterator it = param.begin();
it != param.end(); ++it){
*it += AtomicInt(100);
}
}
void doParallelProcessing(){
// Create threads
std::thread t1(add10, std::ref(common));
std::thread t2(add100, std::ref(common));
// Join 'em
t1.join();
t2.join();
// Print vector again
for (vector<AtomicInt>::iterator it = common.begin();
it != common.end(); ++it){
if (*it != AtomicInt(110)){
cout << "Failed!" << endl;
}
}
}
int main(int argc, char *argv[]) {
// Just for testing purposes
for (int i = 0; i < 100000; i++){
// Reset vector
common.clear();
common.resize(100, AtomicInt(0));
doParallelProcessing();
}
}
Is there such a thing as an atomic container? I've also tested this with a regular vector<int> it didn't have any Failed output but that might just be a coincidence.
Just write operator += as:
inline AtomicVariable& operator+=(AtomicVariable const &rhs) {
atomic += rhs.atomic;
return *this;
}
In documentation: http://en.cppreference.com/w/cpp/atomic/atomic operator += is atomic.
Your example fails because below scenario of execution is possible:
Thread1 - rhs.atomic.load() - returns 10 ; Thread2 - rhs.atomic.load() - returns 100
Thread1 - atomic.load() - returns 0 ; Thread2 - atomic.load - returns 0
Thread1 - add values (0 + 10 = 10) ; Thread2 - add values (0 + 100)
Thread1 - atomic.store(10) ; Thread2 - atomic.store(100)
Finally in this case in atomic value might be 10 or 100, depends of which thread first execute atomic.store.
please note that
atomic.store(rhs.atomic.load() + atomic.load());
is not atomic
You have two options to solve it.
memoery
1) Use a mutex.
EDIT as T.C mentioned in the comments this is irrelevant since the operation here will be load() then load() then store() (not relaxed mode) - so memory order is not related here.
2) Use memory order http://bartoszmilewski.com/2008/12/01/c-atomics-and-memory-ordering/
memory_order_acquire: guarantees that subsequent loads are not moved before the current load or any preceding loads.
memory_order_release: preceding stores are not moved past the current store or any subsequent stores.
I'm still not sure about 2, but I think if the stores will not be on parallel, it will work.

Resources