Core dump size can be restricted using limits (ulimit, rlimit, etc.).
What I'm wondering is how this is actually implemented - for instance: Is the core dump generation smart enough to prioritize the stack? Memory referenced by local variable pointers? Or is it literally the entire address space of the process, truncated at N bytes?
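(For reference, the limit I am talking about is the per-process RLIMIT_CORE resource, i.e. what ulimit -c configures. A minimal sketch of querying and raising it from code, with purely illustrative values:)
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    /* Read the current core-dump size limit (what "ulimit -c" reports).
     * RLIM_INFINITY shows up as a very large number. */
    if (getrlimit(RLIMIT_CORE, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    printf("core limit: soft=%llu hard=%llu\n",
           (unsigned long long)rl.rlim_cur,
           (unsigned long long)rl.rlim_max);

    /* Raise the soft limit to the hard limit so a crash can produce a full dump. */
    rl.rlim_cur = rl.rlim_max;
    if (setrlimit(RLIMIT_CORE, &rl) != 0) {
        perror("setrlimit");
        return 1;
    }
    return 0;
}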
As I recently ran into a truncated core dump for one of my processes, I can share my experience on this. The Linux kernel writes the core dump in the context of the crashing process. In the kernel source, the core dump data is transferred from the kernel to the core file by repeated calls to dump_emit() in fs/coredump.c:
/*
 * Core dumping helper functions. These are the only things you should
 * do on a core-file: use only these functions to write out all the
 * necessary info.
 */
int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
    struct file *file = cprm->file;
    loff_t pos = file->f_pos;
    ssize_t n;

    if (cprm->written + nr > cprm->limit)
        return 0;
    while (nr) {
        if (dump_interrupted())
            return 0;
        n = __kernel_write(file, addr, nr, &pos);
        if (n <= 0)
            return 0;
        file->f_pos = pos;
        cprm->written += n;
        cprm->pos += n;
        nr -= n;
    }
    return 1;
}
EXPORT_SYMBOL(dump_emit);
The above function checks two main things:
1. Is the current size of the core dump still under the configured limit for the process?
   if (cprm->written + nr > cprm->limit)
2. Is there a pending signal for the current process?
   if (dump_interrupted())
If either check fails, the transfer of the core dump from the kernel to the core file stops, and the resulting file is truncated at whatever point had been reached. On an ELF-configured kernel, this helper is called, for example, from elf_core_dump() in fs/binfmt_elf.c:
/*
 * Actual dumper
 *
 * This is a two-pass process; first we find the offsets of the bits,
 * and then they are actually written out. If we run out of core limit
 * we just truncate.
 */
static int elf_core_dump(struct coredump_params *cprm)
{
    int has_dumped = 0;
    mm_segment_t fs;
    int segs, i;
    size_t vma_data_size = 0;
    struct vm_area_struct *vma, *gate_vma;
    struct elfhdr *elf = NULL;
    loff_t offset = 0, dataoff;
    struct elf_note_info info = { };
    struct elf_phdr *phdr4note = NULL;
    struct elf_shdr *shdr4extnum = NULL;
    Elf_Half e_phnum;
    elf_addr_t e_shoff;
    elf_addr_t *vma_filesz = NULL;

    /*
     * We no longer stop all VM operations.
     *
     * This is because those proceses that could possibly change map_count
     * or the mmap / vma pages are now blocked in do_exit on current
     * finishing this core dump.
     *
     * Only ptrace can touch these memory addresses, but it doesn't change
     * the map_count or the pages allocated. So no possibility of crashing
     * exists while dumping the mm->vm_next areas to the core file.
     */

    /* alloc memory for large data structures: too large to be on stack */
    elf = kmalloc(sizeof(*elf), GFP_KERNEL);
    if (!elf)
        goto out;
    /*
     * The number of segs are recored into ELF header as 16bit value.
     * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
     */
    segs = current->mm->map_count;
    segs += elf_core_extra_phdrs();

    gate_vma = get_gate_vma(current->mm);
    if (gate_vma != NULL)
        segs++;

    /* for notes section */
    segs++;

    /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
     * this, kernel supports extended numbering. Have a look at
     * include/linux/elf.h for further information. */
    e_phnum = segs > PN_XNUM ? PN_XNUM : segs;

    /*
     * Collect all the non-memory information about the process for the
     * notes. This also sets up the file header.
     */
    if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
        goto cleanup;

    has_dumped = 1;

    fs = get_fs();
    set_fs(KERNEL_DS);

    offset += sizeof(*elf);                       /* Elf header */
    offset += segs * sizeof(struct elf_phdr);     /* Program headers */

    /* Write notes phdr entry */
    {
        size_t sz = get_note_info_size(&info);

        sz += elf_coredump_extra_notes_size();

        phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
        if (!phdr4note)
            goto end_coredump;

        fill_elf_note_phdr(phdr4note, sz, offset);
        offset += sz;
    }

    dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);

    if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
        goto end_coredump;
    vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
    if (!vma_filesz)
        goto end_coredump;

    for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
            vma = next_vma(vma, gate_vma)) {
        unsigned long dump_size;

        dump_size = vma_dump_size(vma, cprm->mm_flags);
        vma_filesz[i++] = dump_size;
        vma_data_size += dump_size;
    }

    offset += vma_data_size;
    offset += elf_core_extra_data_size();
    e_shoff = offset;

    if (e_phnum == PN_XNUM) {
        shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
        if (!shdr4extnum)
            goto end_coredump;
        fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
    }

    offset = dataoff;

    if (!dump_emit(cprm, elf, sizeof(*elf)))
        goto end_coredump;

    if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
        goto end_coredump;

    /* Write program headers for segments dump */
    for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
            vma = next_vma(vma, gate_vma)) {
        struct elf_phdr phdr;

        phdr.p_type = PT_LOAD;
        phdr.p_offset = offset;
        phdr.p_vaddr = vma->vm_start;
        phdr.p_paddr = 0;
        phdr.p_filesz = vma_filesz[i++];
        phdr.p_memsz = vma->vm_end - vma->vm_start;
        offset += phdr.p_filesz;
        phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
        if (vma->vm_flags & VM_WRITE)
            phdr.p_flags |= PF_W;
        if (vma->vm_flags & VM_EXEC)
            phdr.p_flags |= PF_X;
        phdr.p_align = ELF_EXEC_PAGESIZE;

        if (!dump_emit(cprm, &phdr, sizeof(phdr)))
            goto end_coredump;
    }

    if (!elf_core_write_extra_phdrs(cprm, offset))
        goto end_coredump;

    /* write out the notes section */
    if (!write_note_info(&info, cprm))
        goto end_coredump;

    if (elf_coredump_extra_notes_write(cprm))
        goto end_coredump;

    /* Align to page */
    if (!dump_skip(cprm, dataoff - cprm->pos))
        goto end_coredump;

    for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
            vma = next_vma(vma, gate_vma)) {
        unsigned long addr;
        unsigned long end;

        end = vma->vm_start + vma_filesz[i++];

        for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
            struct page *page;
            int stop;

            page = get_dump_page(addr);
            if (page) {
                void *kaddr = kmap(page);
                stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
                kunmap(page);
                put_page(page);
            } else
                stop = !dump_skip(cprm, PAGE_SIZE);
            if (stop)
                goto end_coredump;
        }
    }
    dump_truncate(cprm);

    if (!elf_core_write_extra_data(cprm))
        goto end_coredump;

    if (e_phnum == PN_XNUM) {
        if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
            goto end_coredump;
    }

end_coredump:
    set_fs(fs);

cleanup:
    free_note_info(&info);
    kfree(shdr4extnum);
    vfree(vma_filesz);
    kfree(phdr4note);
    kfree(elf);
out:
    return has_dumped;
}
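So, to answer the original question: the dumper makes no attempt to prioritize the stack or memory referenced by local variables. It writes the ELF header, program headers and notes first, then walks the VMAs in plain address order, and when dump_emit() hits the limit the file is simply cut off at that point (the comment "If we run out of core limit we just truncate" says exactly that). One way to check whether a core was cut short is to compare each PT_LOAD segment's p_offset + p_filesz against the actual file size. Below is a minimal sketch of that idea; it is my own illustration, not kernel code, and assumes a 64-bit ELF core with no PN_XNUM extended numbering; the file name "core" is just an example.
#include <elf.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
    const char *path = (argc > 1) ? argv[1] : "core";
    FILE *f = fopen(path, "rb");
    if (!f) { perror("fopen"); return 1; }

    struct stat st;
    if (stat(path, &st) != 0) { perror("stat"); return 1; }

    Elf64_Ehdr ehdr;
    if (fread(&ehdr, sizeof(ehdr), 1, f) != 1 ||
        memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
        fprintf(stderr, "not an ELF file\n");
        return 1;
    }

    for (int i = 0; i < ehdr.e_phnum; i++) {
        Elf64_Phdr phdr;
        if (fseek(f, (long)(ehdr.e_phoff + (Elf64_Off)i * ehdr.e_phentsize), SEEK_SET) != 0 ||
            fread(&phdr, sizeof(phdr), 1, f) != 1) {
            perror("read phdr");
            return 1;
        }
        if (phdr.p_type != PT_LOAD)
            continue;
        /* A PT_LOAD segment whose file data extends past EOF was truncated
         * by the core limit while the dumper was writing it. */
        if ((off_t)(phdr.p_offset + phdr.p_filesz) > st.st_size)
            printf("segment %d (vaddr 0x%llx) is truncated\n",
                   i, (unsigned long long)phdr.p_vaddr);
    }
    fclose(f);
    return 0;
}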
Related
On my system there is already one pmem node created in /dev (/dev/pmem). I then use the fdisk command to create a partition.
After creating the partition I run lsblk and the result is:
pmem1 259:4 0 15.8G 0 disk
└─pmem1p2 259:7 0 14.3G 0 part
After that, I write mmap code (refer to the devmem utility) like this:
off_t address = 0x00;
int width = 10;
void *virtladdr, *phyaddr;
unsigned long mapped_size, page_size, offset_in_page;
int a = 5;
int *lendata = &a;

int main(void)
{
    int mode = 0x0777;
    char *filename = "/dev/pmem0";
    int fdout;

    if ((fdout = open(filename, O_RDWR | O_CREAT | O_TRUNC, mode)) < 0) // edited here
    {
        printf("can't create file");
        return 0;
    }

    mapped_size = page_size = getpagesize();
    offset_in_page = (uint64_t)address & (page_size - 1);
    if (offset_in_page + (width * 8) > page_size) {
        mapped_size *= 2;
    }
    printf("map size %ld\n", mapped_size);
    phyaddr = mmap(0, mapped_size, (PROT_READ | PROT_WRITE), MAP_SHARED, fdout,
                   address & ~(off_t)(page_size - 1));
1. I write the value 10 at address 0x00 of /dev/pmem1.
2. I write the value 20 at address 0x00 of /dev/pmem1p2.
Then I read back via mmap:
off_t address = 0x00;
int width = 10;

int main(void)
{
    int retVal = -1;
    int fd;
    unsigned long mapped_size, page_size, offset_in_page;
    void *virtladdr, *phyaddr;

    fd = open("/dev/pmem0", O_RDWR | O_SYNC);
    if (fd < 0)
    {
        printf("file not open \n");
    }

    mapped_size = page_size = getpagesize();
    offset_in_page = (uint64_t)address & (page_size - 1);
    if (offset_in_page + (width * 8) > page_size) {
        mapped_size *= 2;
    }

    /* Map one page */
    phyaddr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE, fd,
                   address & ~(off_t)(page_size - 1));
1. Reading address 0x00 from /dev/pmem1 gives 10.
2. Reading address 0x00 from /dev/pmem1p2 gives 20.
So what happened here? Why is the value at address 0 different? Does fdisk create two partitions whose addresses both start from 0x00, or am I going about this the wrong way?
Please help!
I am using MPI (the Message Passing Interface) in Python for a ring communication, which means that every rank sends to and receives from its neighbours. I know one way to realize this is by using, for instance, MPI.COMM_WORLD.issend() and MPI.COMM_WORLD.recv(); that works and is done.
Now I want to produce the same output in a different way by using MPI.Topocomm.Neighbor_alltoallw, but this is not working. I wrote a C code that works, so the same output can be reached with this function, but when I implement it in Python it does not work. Please find below the C code and the Python code.
The definition of the function says (mpi4py package for Python):
Neighbor_alltoallw(...)
Topocomm.Neighbor_alltoallw(self, sendbuf, recvbuf)
Neighbor All-to-All Generalized
I do not understand the following things:
why is recvbuf not a return value? It seems to be an argument here
how can this be implemented for a ring communication in Python?
Thank you for your time and support!
My working C code:
#include <stdio.h>
#include <mpi.h>

#define to_right 201
#define max_dims 1

int main (int argc, char *argv[])
{
    int my_rank, size;
    int snd_buf, rcv_buf;
    int right, left;
    int sum, i;

    MPI_Comm new_comm;
    int dims[max_dims],
        periods[max_dims],
        reorder;

    MPI_Aint snd_displs[2], rcv_displs[2];
    int snd_counts[2], rcv_counts[2];
    MPI_Datatype snd_types[2], rcv_types[2];

    MPI_Status status;
    MPI_Request request;

    MPI_Init(&argc, &argv);

    /* Get process info. */
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Set cartesian topology. */
    dims[0] = size;
    periods[0] = 1;
    reorder = 1;

    MPI_Cart_create(MPI_COMM_WORLD, max_dims, dims, periods,
                    reorder, &new_comm);

    /* Get coords */
    MPI_Comm_rank(new_comm, &my_rank);
    /* MPI_Cart_coords(new_comm, my_rank, max_dims, my_coords); */

    /* Get nearest neighbour rank. */
    MPI_Cart_shift(new_comm, 0, 1, &left, &right);

    /* Compute global sum. */
    sum = 0;
    snd_buf = my_rank;
    rcv_buf = -1000; /* unused value, should be overwritten by first MPI_Recv; only for test purpose */

    rcv_counts[0] = 1; MPI_Get_address(&rcv_buf, &rcv_displs[0]); snd_types[0] = MPI_INT;
    rcv_counts[1] = 0; rcv_displs[1] = 0 /*unused*/;              snd_types[1] = MPI_INT;
    snd_counts[0] = 0; snd_displs[0] = 0 /*unused*/;              rcv_types[0] = MPI_INT;
    snd_counts[1] = 1; MPI_Get_address(&snd_buf, &snd_displs[1]); rcv_types[1] = MPI_INT;

    for (i = 0; i < size; i++)
    {
        /* Substituted by MPI_Neighbor_alltoallw() :
        MPI_Issend(&snd_buf, 1, MPI_INT, right, to_right,
                   new_comm, &request);
        MPI_Recv(&rcv_buf, 1, MPI_INT, left, to_right,
                 new_comm, &status);
        MPI_Wait(&request, &status);
        */
        MPI_Neighbor_alltoallw(MPI_BOTTOM, snd_counts, snd_displs, snd_types,
                               MPI_BOTTOM, rcv_counts, rcv_displs, rcv_types, new_comm);
        snd_buf = rcv_buf;
        sum += rcv_buf;
    }

    printf ("PE%i:\tSum = %i\n", my_rank, sum);

    MPI_Finalize();
}
My non-working Python code:
from mpi4py import MPI
size = MPI.COMM_WORLD.Get_size()
my_rank = MPI.COMM_WORLD.Get_rank()
to_right =201
max_dims=1
dims = [max_dims]
periods=[max_dims]
dims[0]=size
periods[0]=1
reorder = True
new_comm=MPI.Intracomm.Create_cart(MPI.COMM_WORLD,dims,periods,True)
my_rank= new_comm.Get_rank()
left_right= MPI.Cartcomm.Shift(new_comm,0,1)
left=left_right[0]
right=left_right[1]
sum=0
snd_buf=my_rank
rcv_buf=-1000 #unused value, should be overwritten, only for test purpose
for counter in range(0, size):
    MPI.Topocomm.Neighbor_alltoallw(new_comm, snd_buf, rcv_buf)
    snd_buf = rcv_buf
    sum = sum + rcv_buf

print('PE ', my_rank, 'sum=', sum)
I use spike to run the test program in "riscv-tools/riscv-tests/build/benchmarks":
$ spike multiply.riscv
And the output shows:
mcycle = 24096
minstret = 24103
Why is mcycle less than minstret?
Does it mean that Spike can run more than one instruction per cycle?
(I tried to trace the Spike code but could not find how mcycle is counted.)
The printed mcycle and minstret values do not come from Spike in this case; they come from the test (benchmark) itself. Here is the code:
https://github.com/ucb-bar/riscv-benchmarks/blob/master/common/syscalls.c
#define NUM_COUNTERS 2
static uintptr_t counters[NUM_COUNTERS];
static char* counter_names[NUM_COUNTERS];

static int handle_stats(int enable)
{
    int i = 0;
#define READ_CTR(name) do { \
        while (i >= NUM_COUNTERS) ; \
        uintptr_t csr = read_csr(name); \
        if (!enable) { csr -= counters[i]; counter_names[i] = #name; } \
        counters[i++] = csr; \
    } while (0)
    READ_CTR(mcycle);
    READ_CTR(minstret);
#undef READ_CTR
    return 0;
}
There is some code between the reads of mcycle and minstret, and the difference in the printed values tells you how many instructions were executed between those two readings.
In Spike, mcycle and minstret are always equal by definition (they are handled by the same code): https://github.com/riscv/riscv-isa-sim/blob/9e012462f53113dc9ed00d7fbb89aeafeb9b89e9/riscv/processor.cc#L347
case CSR_MINSTRET:
case CSR_MCYCLE:
    if (xlen == 32)
        state.minstret = (state.minstret >> 32 << 32) | (val & 0xffffffffU);
    else
        state.minstret = val;
    break;
syscalls.c is linked into the multiply.riscv binary by https://github.com/ucb-bar/riscv-benchmarks/blob/master/multiply/bmark.mk (multiply_riscv_bin = multiply.riscv):
$(multiply_riscv_bin): ... $(patsubst %.c, %.o, ... syscalls.c ... )
The _init function in syscalls.c calls the test's main and prints the values recorded from the SYS_stats "syscall" by handle_stats:
void _init(int cid, int nc)
{
    init_tls();
    thread_entry(cid, nc);

    // only single-threaded programs should ever get here.
    int ret = main(0, 0);

    char buf[NUM_COUNTERS * 32] __attribute__((aligned(64)));
    char* pbuf = buf;
    for (int i = 0; i < NUM_COUNTERS; i++)
        if (counters[i])
            pbuf += sprintf(pbuf, "%s = %d\n", counter_names[i], counters[i]);
    if (pbuf != buf)
        printstr(buf);

    exit(ret);
}
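To make the ordering effect concrete, here is a minimal sketch of my own (not from the benchmark). It assumes an RV64 bare-metal build running in machine mode, as the riscv-tests benchmarks do, so the mcycle/minstret CSRs are directly readable; under an OS you would read the unprivileged cycle/instret counters instead.
#include <stdint.h>
#include <stdio.h>

static inline uint64_t read_mcycle(void)
{
    uint64_t x;
    __asm__ volatile ("csrr %0, mcycle" : "=r"(x));
    return x;
}

static inline uint64_t read_minstret(void)
{
    uint64_t x;
    __asm__ volatile ("csrr %0, minstret" : "=r"(x));
    return x;
}

void show_gap(void)
{
    uint64_t c = read_mcycle();   /* sampled first */
    uint64_t i = read_minstret(); /* sampled a few instructions later */

    /* On Spike both CSRs mirror the same retired-instruction count, so
     * i - c is simply the number of instructions executed between the two
     * reads -- which is why the benchmark prints minstret > mcycle. */
    printf("minstret - mcycle = %llu\n", (unsigned long long)(i - c));
}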
int file_write(struct m_inode * inode, struct file * filp, char * buf, int count){
    off_t pos;
    int block, c;
    struct buffer_head * bh;
    char * p;
    int i = 0;

    .......

    if (!(filp->f_flags & O_APPEND)) {
        filp->f_pos = pos;
        inode->i_ctime = CURRENT_TIME;
    }
}
I think it should be:
int file_write(struct m_inode * inode, struct file * filp, char * buf, int count){
    off_t pos, tmp;
    int block, c;
    struct buffer_head * bh;
    char * p;
    int i = 0;

    if (filp->f_flags & O_APPEND)
        pos = inode->i_size;
    else
        pos = filp->f_pos;
    tmp = pos;

    .......

    if (!(filp->f_flags & O_APPEND)) {
        filp->f_pos = tmp;
        inode->i_ctime = CURRENT_TIME;
    }
}
Otherwise the filp->f_pos seen by file_read is wrong.
Please verify my idea!
Thank you!
As far as I understand, if the file is opened with O_APPEND, file_write always writes at the end of the file (that is, at offset i_size) and updates i_size. If it is not opened with O_APPEND, file_write writes at wherever f_pos is and updates f_pos.
I think you expect file_write, when the file is not opened with O_APPEND, to keep f_pos constant, but as far as I can tell that is not how it works.
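To make that concrete, here is a small user-space sketch of my own (modern Linux semantics, not the 0.11 code itself; demo.txt is just an example file) showing the two behaviours from the outside:
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* Plain write: the file offset (f_pos in the kernel) advances past the data. */
    int fd = open("demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) { perror("open"); return 1; }
    if (write(fd, "hello", 5) != 5) perror("write");
    printf("offset after plain write: %ld\n", (long)lseek(fd, 0, SEEK_CUR)); /* 5 */
    close(fd);

    /* O_APPEND: every write lands at the current end of file (i_size),
     * no matter where we seek beforehand. */
    fd = open("demo.txt", O_WRONLY | O_APPEND);
    if (fd < 0) { perror("open"); return 1; }
    lseek(fd, 0, SEEK_SET);
    if (write(fd, "world", 5) != 5) perror("write");
    printf("file size after append: %ld\n", (long)lseek(fd, 0, SEEK_END)); /* 10 */
    close(fd);
    return 0;
}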
I am new to ArrayFire and to CUDA development in general; I just started using ArrayFire a couple of days ago after failing miserably with Thrust.
I am building an ArrayFire-based algorithm that is supposed to search for a single 32x32-pixel frame in a database of a couple hundred thousand 32x32 frames stored in device memory.
First I initialize a matrix with 1024 + 1 pixel rows (I need an extra one to keep a frame group id) and a predefined number (in this case 1000) of frames, indexed by column.
Here is the function that performs the search. If I uncomment pixels_uint32 = device_frame_ptr[pixel_group_idx]; the program crashes. The pointer seems to be valid, so I do not understand why this happens. Maybe there is something I do not know about accessing device memory in this way?
#include <iostream>
#include <stdio.h>
#include <sys/types.h>
#include <arrayfire.h>
#include "utils.h"

using namespace af;
using namespace std;

/////////////////////////// CUDA settings ////////////////////////////////
#define TEST_DEBUG false
#define MAX_NUMBER_OF_FRAMES 1000 // maximum (2499999 frames) X (1024 + 1 pixels per frame) x (2 bytes per pixel) = 5.124.997.950 bytes (~ 5GB)
#define BLOB_FINGERPRINT_SIZE 1024 //32x32
//percentage of macroblocks that should match: 0.9 means 90%
#define MACROBLOCK_COMPARISON_OVERALL_THRESHOLD 768 //1024 * 0.75
//////////////////////// End of CUDA settings ////////////////////////////

array search_frame(array d_db_vec)
{
    try {
        uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;

        // create one-element array to hold the result of the computation
        array frame_found(1, MAX_NUMBER_OF_FRAMES, u32);
        frame_found = 0;

        gfor (array frame_idx, MAX_NUMBER_OF_FRAMES) {
            // get the blob id it's the last coloumn of the matrix
            array blob_id = d_db_vec(number_of_uint32_for_frame, frame_idx); // addressing with (pixel_idx, frame_idx)

            // define some hardcoded pixel to search for
            uint8_t searched_r = 0x0;
            uint8_t searched_g = 0x3F;
            uint8_t searched_b = 0x0;

            uint8_t b1 = 0;
            uint8_t g1 = 0;
            uint8_t r1 = 0;
            uint8_t b2 = 0;
            uint8_t g2 = 0;
            uint8_t r2 = 0;
            uint32_t sum1 = 0;
            uint32_t sum2 = 0;

            uint32_t *device_frame_ptr = NULL;
            uint32_t pixels_uint32 = 0;
            uint pixel_match_counter = 0;
            //uint pixel_match_counter = 0;

            array frame = d_db_vec(span, frame_idx);
            device_frame_ptr = frame.device<uint32_t>();

            for (uint pixel_group_idx = 0; pixel_group_idx < number_of_uint32_for_frame; pixel_group_idx++) {
                // test to see if the whole matrix is traversed
                // d_db_vec(pixel_group_idx, frame_idx) = 0;

                /////////////////////////////// PROBLEMATIC CODE ///////////////////////////////////
                pixels_uint32 = 0x7E007E0;
                //pixels_uint32 = device_frame_ptr[pixel_group_idx]; //why does this crash the program?
                // if I uncomment the above line the program tries to copy the u32 frame into the pixels_uint32 variable
                // something goes wrong, since the pointer device_frame_ptr is not NULL and the elements should be there judging by the lines above
                ////////////////////////////////////////////////////////////////////////////////////

                // splitting the first pixel into its components
                b1 = (pixels_uint32 & 0xF8000000) >> 27; //(input & 11111000000000000000000000000000)
                g1 = (pixels_uint32 & 0x07E00000) >> 21; //(input & 00000111111000000000000000000000)
                r1 = (pixels_uint32 & 0x001F0000) >> 16; //(input & 00000000000111110000000000000000)

                // splitting the second pixel into its components
                b2 = (pixels_uint32 & 0xF800) >> 11; //(input & 00000000000000001111100000000000)
                g2 = (pixels_uint32 & 0x07E0) >> 5;  //(input & 00000000000000000000011111100000)
                r2 = (pixels_uint32 & 0x001F);       //(input & 00000000000000000000000000011111)

                // checking if they are a match
                sum1 = abs(searched_r - r1) + abs(searched_g - g1) + abs(searched_b - b1);
                sum2 = abs(searched_r - r2) + abs(searched_g - g2) + abs(searched_b - b2);

                // if they match, increment the local counter
                pixel_match_counter = (sum1 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
                pixel_match_counter = (sum2 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
            }

            bool is_found = pixel_match_counter > MACROBLOCK_COMPARISON_OVERALL_THRESHOLD;

            // write down if the frame is a match or not
            frame_found(0, frame_idx) = is_found ? frame_found(0, frame_idx) : blob_id;
        }

        // test to see if the whole matrix is traversed - this has to print zeroes
        if (TEST_DEBUG)
            print(d_db_vec);

        // return the matches array
        return frame_found;
    } catch (af::exception& e) {
        fprintf(stderr, "%s\n", e.what());
        throw;
    }
}

// make 2 green pixels
uint32_t make_test_pixel_group() {
    uint32_t b1 = 0x0;       //11111000000000000000000000000000
    uint32_t g1 = 0x7E00000; //00000111111000000000000000000000
    uint32_t r1 = 0x0;       //00000000000111110000000000000000
    uint32_t b2 = 0x0;       //00000000000000001111100000000000
    uint32_t g2 = 0x7E0;     //00000000000000000000011111100000
    uint32_t r2 = 0x0;       //00000000000000000000000000011111
    uint32_t green_pix = b1 | g1 | r1 | b2 | g2 | r2;
    return green_pix;
}

int main(int argc, char ** argv)
{
    info();

    /////////////////////////////////////// CREATE THE DATABASE ///////////////////////////////////////
    uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;
    array d_db_vec(number_of_uint32_for_frame + 1, // fingerprint size + 1 extra u32 for blob id
                   MAX_NUMBER_OF_FRAMES,           // number of frames
                   u32);                           // type of elements is 32-bit unsigned integer (unsigned) with the configuration RGBRGB (565565)

    if (TEST_DEBUG == true) {
        for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
            for (uint pix_idx = 0; pix_idx < number_of_uint32_for_frame; pix_idx++) {
                d_db_vec(pix_idx, frame_idx) = make_test_pixel_group(); // fill everything with green :D
            }
        }
    } else {
        d_db_vec = rand(number_of_uint32_for_frame + 1, MAX_NUMBER_OF_FRAMES);
    }

    cout << "Setting blob ids. \n\n";
    for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
        // set the blob id to 123456
        d_db_vec(number_of_uint32_for_frame, frame_idx) = 123456; // blob_id = 123456
    }
    if (TEST_DEBUG)
        print(d_db_vec);
    cout << "Done setting blob ids. \n\n";

    //////////////////////////////////// CREATE THE SEARCHED FRAME ///////////////////////////////////
    // to be done, for now we use the hardcoded values at line 37-39 to simulate the searched pixel:
    //37 uint8_t searched_r = 0x0;
    //38 uint8_t searched_g = 0x3F;
    //39 uint8_t searched_b = 0x0;

    ///////////////////////////////////////////// SEARCH /////////////////////////////////////////////
    clock_t timer = startTimer();
    for (int i = 0; i < 1000; i++) {
        array frame_found = search_frame(d_db_vec);
        if (TEST_DEBUG)
            print(frame_found);
    }
    stopTimer(timer);

    return 0;
}
Here is the console output with the line commented:
arrayfire/examples/helloworld$ ./helloworld
ArrayFire v1.9.1 (64-bit Linux, build 9af23ea)
License: Server (27000#server.accelereyes.com)
CUDA toolkit 5.0, driver 304.54
GPU0 Tesla C2075, 5376 MB, Compute 2.0
Memory Usage: 5312 MB free (5376 MB total)
Setting blob ids.
Done setting blob ids.
Time: 0.03 seconds.
Here is the console output with the line uncommented:
arrayfire/examples/helloworld$ ./helloworld
ArrayFire v1.9.1 (64-bit Linux, build 9af23ea)
License: Server (27000#server.accelereyes.com)
CUDA toolkit 5.0, driver 304.54
GPU0 Tesla C2075, 5376 MB, Compute 2.0
Memory Usage: 5312 MB free (5376 MB total)
Setting blob ids.
Done setting blob ids.
Segmentation fault
Thanks in advance for any help on this issue. I really tried everything but without success.
Disclaimer: I am the lead developer of ArrayFire. I see that you have posted on the AccelerEyes forums as well, but I am posting here to clear up some common issues with your code.
Do not use .device(), .host(), .scalar() inside a gfor loop. This causes divergences inside the GFOR loop, and GFOR was not designed for this.
You cannot index into a device pointer from host code. The pointer refers to a location on the GPU; when you do device_frame_ptr[pixel_group_idx], the system looks for the equivalent position on the CPU. This is the reason for your segmentation fault.
Use vectorized code. For example, you don't need the inner for loop of the gfor. Instead of doing b1 = (pixels_uint32 & 0xF8000000) >> 27; inside a for loop, you can do array B1 = (frame & 0xF8000000) >> 27; i.e. instead of bringing data back to the CPU and looping over it, you perform the entire operation on the GPU.
Don't use if-else or ternary operators inside GFOR. These cause divergences again. For example, use pixel_match_counter = sum(sum1 <= 16) + sum(sum2 <= 16); and frame_found(0, frame_idx) = is_found * frame_found(0, frame_idx) + (1 - is_found) * blob_id;.
I have answered the particular problem you are facing. If you have any follow-up questions, please follow up on our forums and/or our support email. Stack Overflow is good for asking a specific question, but not for debugging your entire program.