CUDA copy linked lists from device to host - struct

I am trying to populate a number of linked lists on the device and then return those lists back to the hosts.
From my understanding I need to allocate memory for my struct Element, but I don't know how to go about it since I will have many linked lists, each with an unknown number of elements. I've tried a couple of different things but it still didn't work. So I'm back to the starting point. Here is my code:
//NODE CLASS
class Node{
public:
int x,y;
Node *parent;
__device__ __host__ Node(){}
__device__ __host__ Node(int cX, int cY){x = cX; y = cY;}
__device__ __host__ int get_row() { return x; }
__device__ __host__ int get_col() { return y; }
};
//LINKED LIST
class LinkedList{
public:
__device__ __host__ struct Element{
Node n1;
Element *next;
};
__device__ __host__ LinkedList(){
head = NULL;
}
__device__ __host__ void addNode(Node n){
Element *el = new Element();
el->n1 = n;
el->next = head;
head = el;
}
__device__ __host__ Node popFirstNode(){
Element *cur = head;
Node n;
if(cur != NULL){
n = cur -> n1;
head = head -> next;
}
delete cur;
return n;
}
__device__ __host__ bool isEmpty(){
Element *cur = head;
if(cur == NULL){
return true;
}
return false;
}
Element *head;
};
//LISTS
__global__ void listsKernel(LinkedList* d_Results, int numLists){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
Node n(1,1);
if(idx < numLists){
d_Results[idx].addNode(n);
d_Results[idx].addNode(n);
d_Results[idx].addNode(n);
d_Results[idx].addNode(n);
}
}
int main(){
int numLists = 10;
size_t size = numLists * sizeof(LinkedList);
LinkedList curList;
LinkedList* h_Results = (LinkedList*)malloc(size);
LinkedList* d_Results;
cudaMalloc((void**)&d_Results, size);
listsKernel<<<256,256>>>(d_Results, numLists);
cudaMemcpy(h_Results, d_Results, sizeof(LinkedList)*numLists, cudaMemcpyDeviceToHost);
for(int i = 0; i < numLists; i++){
curList = h_Results[i];
while(curList.isEmpty() == false){
Node n = curList.popFirstNode();
std::cout << "x: " << n.get_row() << " y: " << n.get_col();
}
}
}
As you can see I'm trying to populate 10 linked lists on the device and then return them back to the host, but the code above results in unhandled exception - Access violation reading location. I am assuming it is not coping the pointers from the device.
Any help would be great.

Just eyeballing the code, it seems you have a fundamental misconception: there is host memory which cannot be accessed from the device, and device memory which cannot be accessed from the host. So when you create linked list nodes in device memory and copy the pointers back to the host, the host cannot dereference those pointers, because they are pointing to device memory.
If you truly want to pass linked lists back and forth between host and device, your best bet is probably to copy the entries into an array, do the memcpy, then copy the array back into a linked list. Other things can be done too, depending on just what your use case is.
(it is possible to allocate a region of memory that is accessible both from the host and from the device, but there is some awkwardness with it and I have no experience using it)

Related

Querying free space in LocalFileSystem on the MBED board

Is there any supported API to get free space in the LocalFileSystem on an MBED board? I've tried statvfs but it doesn't seem to work... Any ideas?
I guess I could simply list all files and subtract the total from the total size, but I was wondering if there's a better way.
This is what I tried:
long GetAvailableSpace(const char* path)
{
struct statvfs stat;
if (statvfs(path, &stat) != 0) {
// error happens, just quits here
return -1;
}
// the available size is f_bsize * f_bavail
return stat.f_bsize * stat.f_bavail;
}
UPDATE:
I ended up iterating over all files and calculating it:
#define MAX_STORAGE 512000
int LocalFileSystemFreeSpace(){
char filename[MAX_FILENAME];
DIR *d;
struct dirent *dir;
int total = 0;
d = opendir("/local");
if(d){
while((dir = readdir(d)) != NULL){
sprintf(filename, "/local/%s", dir->d_name);
int size = FileSize(filename);
total += size;
//printf("%s -> %d\r\n",filename,size);
}
closedir(d);
}
// printf("Total files: %d\r\n", total);
// printf("Free: %d\r\n",MAX_STORAGE-total);
return MAX_STORAGE-total;
}
int FileSize(char * filename){
FILE * fp = fopen(filename,"r");
if(fp==NULL){
return 0;
}
int prev=ftell(fp);
fseek(fp, 0L, SEEK_END);
int sz=ftell(fp);
fclose(fp);
return sz;
}
I don't think it's possible right now. Semihosting is used for the LocalFileSystem API, and the only commands that are currently implemented are here. Nothing for free disk space...

Copy struct with function pointer to device

I have a struct containing the parameters of a linear function, as well as the function itself. What I want to do is copy this struct to the device and then evaluate the linear function. The following example doesn't make sense but it is sufficient to describe the difficulties I have:
struct model
{
double* params;
double (*func)(double*, double);
};
I don't know how to copy this struct to the device.
Here are my functions:
Init function
// init function for struct model
__host__ void model_init(model* m, double* params, double(*func)(double*,double))
{
if(m)
{
m->params = params;
m->func = func;
}
}
Evaluation function
__device__ double model_evaluate(model* m, double x)
{
if(m)
{
return m->func(m->params, x);
}
return 0.0;
}
The actual function
__host__ __device__ double linear_function(double* params, double x)
{
return params[0] + params[1] * x;
}
Function called inside kernel
__device__ double compute(model *d_linear_model)
{
return model_evaluate(d_linear_model,1.0);
}
The kernel itself
__global__ void kernel(double *array, model *d_linear_model, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N)
{
array[idx] = compute(d_linear_model);
}
}
I know how to copy an array from host to device but I don't know how to do this for this concrete struct which contains a function.
The kernel call in main then looks like this:
int block_size = 4;
int n_blocks = N_array/block_size + (N_array % block_size == 0 ? 0:1);
kernel<<<n_blocks, block_size>>>(device_array, d_linear_model, N_array);
You've outlined two items that I consider to be somewhat more difficult than beginner-level CUDA programming:
use of device function pointers
a "deep copy" operation (on the embedded params pointer in your model structure)
Both of these topics have been covered in other questions. For example this question/answer discusses deep copy operations - when a data structure has embedded pointers to other data. And this question/answer links to a variety of resources on device function pointer usage.
But I'll go ahead and offer a possible solution for your posted case. Most of your code is usable as-is (at least for demonstration purposes). As mentioned already, your model structure will present two challenges:
struct model
{
double* params; // requires a "deep copy" operation
double (*func)(double*, double); // requires special handling for device function pointers
};
As a result, although most of your code is usable as-is, your "init" function is not. That might work for a host realization, but not for a device realization.
The deep copy operation requires us to copy the overall structure, plus separately copy the data pointed to by the embedded pointer, plus separately copy or "fixup" the embedded pointer itself.
The usage of a device function pointer is restricted by the fact that we cannot grab the actual device function pointer in host code - that is illegal in CUDA. So one possible solution is to use a __device__ construct to "capture" the device function pointer, then do a cudaMemcpyFromSymbol operation in host code, to retrieve the numerical value of the device function pointer, which can then be moved about in ordinary fashion.
Here's a worked example building on what you have shown, demonstrating the two concepts above. I have not created a "device init" function - but all the code necessary to do that is in the main function. Once you've grasped the concepts, you can take whatever code you wish out of the main function below and craft it into your "device init" function, if you wish to create one.
Here's a worked example:
$ cat t968.cu
#include <iostream>
#define NUM_PARAMS 2
#define ARR_SIZE 1
#define nTPB 256
struct model
{
double* params;
double (*func)(double*, double);
};
// init function for struct model -- not using this for device operations
__host__ void model_init(model* m, double* params, double(*func)(double*,double))
{
if(m)
{
m->params = params;
m->func = func;
}
}
__device__ double model_evaluate(model* m, double x)
{
if(m)
{
return m->func(m->params, x);
}
return 0.0;
}
__host__ __device__ double linear_function(double* params, double x)
{
return params[0] + params[1] * x;
}
__device__ double compute(model *d_linear_model)
{
return model_evaluate(d_linear_model,1.0);
}
__global__ void kernel(double *array, model *d_linear_model, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N)
{
array[idx] = compute(d_linear_model);
}
}
__device__ double (*linear_function_ptr)(double*, double) = linear_function;
int main(){
// grab function pointer from device code
double (*my_fp)(double*, double);
cudaMemcpyFromSymbol(&my_fp, linear_function_ptr, sizeof(void *));
// setup model
model my_model;
my_model.params = new double[NUM_PARAMS];
my_model.params[0] = 1.0;
my_model.params[1] = 2.0;
my_model.func = my_fp;
// setup for device copy of model
model *d_model;
cudaMalloc(&d_model, sizeof(model));
// setup "deep copy" for params
double *d_params;
cudaMalloc(&d_params, NUM_PARAMS*sizeof(double));
cudaMemcpy(d_params, my_model.params, NUM_PARAMS*sizeof(double), cudaMemcpyHostToDevice);
// copy model to device
cudaMemcpy(d_model, &my_model, sizeof(model), cudaMemcpyHostToDevice);
// fixup device params pointer in device model
cudaMemcpy(&(d_model->params), &d_params, sizeof(double *), cudaMemcpyHostToDevice);
// run test
double *d_array, *h_array;
cudaMalloc(&d_array, ARR_SIZE*sizeof(double));
h_array = new double[ARR_SIZE];
for (int i = 0; i < ARR_SIZE; i++) h_array[i] = i;
cudaMemcpy(d_array, h_array, ARR_SIZE*sizeof(double), cudaMemcpyHostToDevice);
kernel<<<(ARR_SIZE+nTPB-1)/nTPB,nTPB>>>(d_array, d_model, ARR_SIZE);
cudaMemcpy(h_array, d_array, ARR_SIZE*sizeof(double), cudaMemcpyDeviceToHost);
std::cout << "Results: " << std::endl;
for (int i = 0; i < ARR_SIZE; i++) std::cout << h_array[i] << " ";
std::cout << std::endl;
return 0;
}
$ nvcc -o t968 t968.cu
$ cuda-memcheck ./t968
========= CUDA-MEMCHECK
Results:
3
========= ERROR SUMMARY: 0 errors
$
For brevity of presentation, I've dispensed with proper cuda error checking (instead I have run the code with cuda-memcheck to demonstrate that it is without runtime error) but I would recommend proper error checking if you're having any trouble with a code.

Using malloc to create a linked list

i used malloc in order to allocate new nodes in the list,yet i am facing an error with a certain part of my code;
the following solution applies only to deleting and inserting
#include <stdio.h>
#include <malloc.h>
struct Node{
int value;
struct Node * Next;
struct Node * Previous;
};
typedef struct Node Node;
struct List{
int Count;
int Total;
Node * First;
Node * Last;
};
typedef struct List List;
List Create();
void Add(List a,int value);
void Remove(List a,Node * b);
List Create()
{
List a;
a.Count=0;
return a;
}
void Add(List a,int value)
{
Node * b = (Node *)malloc(sizeof(Node));
if(b==NULL)
printf("Memory allocation error \n");
b->value=value;
if(a.Count==0)
{
b->Next=NULL;
b->Previous=NULL;
a.First=b;
}
else
{
b->Next=NULL;
b->Previous=a.Last;
a.Last->Next=b;
}
++a.Count;
a.Total+=value;
a.Last=b;
}
void Remove(List a,Node * b)
{
if(a.Count>1)
{
if(a.Last==b)
{
b->Previous->Next=NULL;
}
else
{
b->Previous->Next=b->Next;
b->Next->Previous=b->Previous;
}
}
free(b);
}
in the delete function,in the last else condition,i am not certain whether or not using b->Next->Previous is okay,and will work;when using the -> operator,am i adressing to the node pointer or to it's value?
The short answer: Yes, b->Next->Previous is fine -- it's a struct Node*, just like the right hand side b->Previous.
I think that your error lies with the handling of Count: It is incremented by Add(), but Remove() doesn't decrement it. In fact, as the list itself only needs to know whether it is empty or not, you can remove it and instead see if a.First == NULL. (Your a.Count == 1 test can likewise be replaced with a.First != NULL && a.First->Next == NULL test.)
If you're promising Count in you API, you can add it back when you've got the list itself working. The same "remove-then-add-back" might be useful with Total. Think of both of these as caches.
An even better solution would be to implement a circular list:
struct List
{
Node Anchor;
//...
};
List Create()
{
List l;
l.Anchor.Next = l.Anchor.Previous = &l;
return l;
}
bool IsEmpty(List const* l)
{
// Both or neither point at 'l'.
assert((l->Anchor.Next == l) == (l->Anchor.Previous == l));
return l->Anchor.Next == l;
}
// Add a node 'n' to some list after 'ln'.
void AddAfter(Node* n, Node* ln)
{
n->Previous = ln;
n->Next = ln->Next;
n->Next->Previous = n->Previous->Next = n;
}
Node* Remove(Node* n)
{
n->Previous->Next = n->Next;
n->Next->Previous = n->Previous;
n->Next = n->Previous = n; // nice and proper
return x;
}
Now you longer need special cases for empty lists. I let Remove() return the node itself, to make it easy to either move nodes between lists (AddAfter(Remove(somenode), &otherlist.Anchor)) or remove and delete notes (free(Remove(somenode))).
One wart here is that my Anchor node now wastes space for data that never will be used -- but that is easily fixable.

Overloading "*" Operator for custom SmartPointer

I am trying to directly access integer from a pointer class, by overloading * operator, but it seems VC++ 10 is not allowing it. Kindly help:
#include "stdafx.h"
#include <iostream>
#include <conio.h>
using namespace std;
int MAX7 = 10;
struct node{
int value;
node *next;
};
struct node *head = NULL;
struct node *current = NULL;
int count = 0;
class SmartPointer{
public:
SmartPointer(){
}
int push(int i){
if(count == MAX7) return 0;
if(head == NULL){
head = new node();
current = head;
head -> next = NULL;
head -> value = i;
count = 1;
}
else{
struct node *ptr = head;
while(ptr->next != NULL) ptr = ptr->next;
ptr->next = new node;
ptr = ptr->next;
ptr->next = NULL;
ptr->value = i;
count++;
}
return 1;
}
void Display(){
node *ptr = head;
while(ptr != NULL){
cout << ptr->value << "(" << ptr << ")";
if( ptr == current )
cout << "*";
cout << ", ";
ptr = ptr->next;
}
}
int operator *(){
if(current == NULL) return -1;
struct node *ptr = current;
return ptr->value;
}
};
int main(){
SmartPointer *sp;
sp = new SmartPointer();
sp->push(99);
for(int i=100; i<120; i++){
if(sp->push(i))
cout << "\nPushing ("<<i<<"): Successful!";
else
cout << "\nPushing ("<<i<<"): Failed!";
}
cout << "\n";
sp->Display();
int i = *sp;
getch();
return 0;
}
Error#
1>test7.cpp(71): error C2440: 'initializing' : cannot convert from 'SmartPointer' to 'int'
1> No user-defined-conversion operator available that can perform this conversion, or the operator cannot be called
sp is not a smart pointer - it's a plain old dumb pointer to SmartPointer class. *sp uses built-in dereference operator, producing an lvalue of SmartPointer type. It does not call SmartPointer::operator*() - for that, you need to write **sp (two stars).
It's not at all clear why you want to allocate SmartPointer instance on the heap. That's an unusual thing to want to do (also too, you leak it). I'm pretty sure you would be better off with
SmartPointer sp;
sp.push(99);
and so on.
short answer:
int i = **sp;
You should not allocate objects with new. Your code looks like java. In C++, you must delete everything you allocate with new. In C++ you can write:
SmartPointer sp;
sp.push(99);
int i = *sp;

Memory allocation of struct in C++

My struct is as follows:
typedef struct KeypointSt {
float row, col;
float scale, ori;
unsigned char *descrip; /* Vector of descriptor values */
struct KeypointSt *next;
} *Keypoint;
The following is a part of a code in C. How can I translate it to C++, considering allocation and de-allocation of heap.
Keypoint k, keys = NULL;
for (i = 0; i < num; i++) {
/* Allocate memory for the keypoint. */
k = (Keypoint) malloc(sizeof(struct KeypointSt));
k->next = keys;
keys = k;
k->descrip = malloc(len);
for (j = 0; j < len; j++) {
k->descrip[j] = (unsigned char) val;
}
}
One possible way of converting to C++ is:
#include <cstring> // memset()
typedef struct KeypointSt
{
float row, col;
float scale, ori;
size_t len;
unsigned char *descrip; /* Vector of descriptor values */
KeypointSt *next;
KeypointSt(int p_len, int p_val) : row(0.0), col(0.0), scale(0.0),
ori(0.0), len(p_len),
descrip(new unsigned char[len]), next(0)
{ memset(descrip, len, p_val); }
~KeypointSt() { delete descrip; }
} *Keypoint;
extern KeypointSt *init_keypoints(size_t num, size_t len, unsigned char val);
extern void free_keypoints(KeypointSt *list);
KeypointSt *init_keypoints(size_t num, size_t len, unsigned char val)
{
KeypointSt *keys = NULL;
for (size_t i = 0; i < num; i++)
{
/* Allocate memory for the keypoint. */
KeypointSt *k = new KeypointSt(len, val);
k->next = keys;
keys = k;
}
return keys;
}
void free_keypoints(KeypointSt *list)
{
while (list != 0)
{
KeypointSt *next = list->next;
delete list;
list = next;
}
}
int main(void)
{
KeypointSt *keys = init_keypoints(4, 5, 6);
free_keypoints(keys);
return 0;
}
The only reason I've kept the typedef in place is because you have existing code; the C++ code would be better using KeypointSt * everywhere — or renaming the structure tag to Keypoint and using Keypoint * in place of your original Keypoint. I don't like non-opaque types where the typedef conceals a pointer. If I see a declaration XYZ xyz;, and it is a structure or class type, I expect to use xyz.pqr and not xyz->pqr.
We can debate code layout of the constructor code, the absence of a default constructor (no arrays), and the absence of a copy constructor and an assignment operator (both needed because of the allocation for descrip). The code of init_keypoints() is not exception safe; a memory allocation failure will leak memory. Fixing that is left as an exercise (it isn't very hard, I think, but I don't claim exception-handling expertise). I've not attempted to consider any extra requirements imposed by C++11. Simply translating from C to C++ is 'easy' until you look at the extra demands that C++ makes — demands that make your life easier in the long run, but at a short-term cost in pain.

Resources