We have a C application which uses the GetOpenFileName common dialog to let the user select a file. We have been having crashes on Windows Server 2008 R2. I figured out that if we put a DEP exception on our application, the crashes stop.
However, I can't figure out what we are doing wrong or what we can do to stop the crash in the first place. I have placed our code below.
/* Bundles an OPENFILENAME with the buffers its pointer members refer to, so
 * the whole request can be allocated, returned and freed as one object. */
typedef struct {
OPENFILENAME ofn; /* dialog parameters passed to GetOpenFileName/GetSaveFileName */
COUNT nInternal; /* not touched by RequestFileNameEx; meaning not visible here */
COUNT nExternal; /* not touched by RequestFileNameEx; meaning not visible here */
char szDirName[_MAX_DIR]; /* receives the _getcwd() result used as ofn.lpstrInitialDir */
char szFile[_MAX_PATH]; /* backing store for ofn.lpstrFile */
char szFileTitle[_MAX_PATH]; /* backing store for ofn.lpstrFileTitle */
char szFilter[128]; /* backing store for ofn.lpstrFilter: copy of lpExt with '|' replaced by NUL */
} OPENFILENAMEINFO;
/* Far-pointer alias for OPENFILENAMEINFO. */
typedef OPENFILENAMEINFO FAR *LPOPENFILENAMEINFO;
/*
 * Shows the common "Open" or "Save As" dialog and returns the user's choice.
 *
 * hDlg   - window that owns the dialog.
 * lpExt  - filter specification in which '|' separates the parts, e.g.
 *          "Text files|*.txt|All files|*.*". May be NULL, in which case no
 *          filter is passed to the dialog. A specification longer than the
 *          filter buffer is truncated instead of overrunning it.
 * bSave  - non-zero selects GetSaveFileName, zero selects GetOpenFileName.
 * lpInit - not used by this function.
 *
 * Returns a block obtained from mballc() whose szFile/szFileTitle members
 * hold the selection; the caller releases it with mbfree(). Returns NULL
 * when the allocation fails, the user cancels, or the dialog reports an
 * error (errors are forwarded to ResourceHandleError first).
 */
LPOPENFILENAMEINFO RequestFileNameEx(HWND hDlg, LPSTR lpExt, BOOL bSave, LPSTR lpInit)
{
    LPOPENFILENAMEINFO lpFileNameInfo;
    size_t cchFilter;
    size_t i;
    DWORD dwError;
    BOOL bOk;

    (void)lpInit;

    lpFileNameInfo = (LPOPENFILENAMEINFO)mballc(1, sizeof(OPENFILENAMEINFO));
    if (lpFileNameInfo == NULL)
        return(NULL);

    /* Zero everything, not just ofn: the dialog reads szFile as the initial
     * file name and needs szFilter to end in two NULs, so neither may depend
     * on whether mballc() happens to zero-fill. */
    memset(lpFileNameInfo, 0, sizeof(OPENFILENAMEINFO));

    lpFileNameInfo->ofn.lStructSize = sizeof(OPENFILENAME);
    lpFileNameInfo->ofn.hwndOwner = hDlg;

    if (lpExt != NULL) {
        /* Keep the last two bytes of the buffer as the double-NUL terminator. */
        cchFilter = strlen(lpExt);
        if (cchFilter > sizeof(lpFileNameInfo->szFilter) - 2)
            cchFilter = sizeof(lpFileNameInfo->szFilter) - 2;
        memcpy(lpFileNameInfo->szFilter, lpExt, cchFilter);

        /* The dialog expects NUL-separated strings; callers pass '|' instead. */
        for (i = 0; i < cchFilter; i++) {
            if (lpFileNameInfo->szFilter[i] == '|')
                lpFileNameInfo->szFilter[i] = '\0';
        }
        lpFileNameInfo->ofn.lpstrFilter = lpFileNameInfo->szFilter;
    }

    lpFileNameInfo->ofn.nFilterIndex = 1;
    lpFileNameInfo->ofn.lpstrFile = lpFileNameInfo->szFile;
    lpFileNameInfo->ofn.nMaxFile = sizeof(lpFileNameInfo->szFile);
    lpFileNameInfo->ofn.lpstrFileTitle = lpFileNameInfo->szFileTitle;
    lpFileNameInfo->ofn.nMaxFileTitle = sizeof(lpFileNameInfo->szFileTitle);
    /* _getcwd() yields NULL when the path does not fit; a NULL initial
     * directory is accepted by the dialog. */
    lpFileNameInfo->ofn.lpstrInitialDir = _getcwd(lpFileNameInfo->szDirName, _MAX_DIR);

    if (bSave) {
        lpFileNameInfo->ofn.Flags = OFN_SHOWHELP | OFN_PATHMUSTEXIST | OFN_OVERWRITEPROMPT | OFN_NOCHANGEDIR;
        bOk = GetSaveFileName(&lpFileNameInfo->ofn);
    } else {
        lpFileNameInfo->ofn.Flags = OFN_SHOWHELP | OFN_PATHMUSTEXIST | (bDir==FALSE?OFN_FILEMUSTEXIST:0) | OFN_NOCHANGEDIR;
        bOk = GetOpenFileName(&lpFileNameInfo->ofn);
    }

    if (!bOk) {
        /* Zero from CommDlgExtendedError() means the user cancelled. */
        dwError = CommDlgExtendedError();
        if (dwError)
            ResourceHandleError(GETOPENFAIL, dwError);
        mbfree(lpFileNameInfo);
        return(NULL);
    }
    return(lpFileNameInfo);
}
a crash dump stack trace looks like
0023:73E61FFF (0x080D7974 0x080D7970 0x078B62F0 0x00000000) msxml6.dll
0023:73E68165 (0x080D7970 0x080D78F0 0x078B62F0 0x080D78F0) msxml6.dll, DllCanUnloadNow()+22084 byte(s)
0023:73E67D08 (0x078B62F0 0x080D7970 0x00000000 0x080D78F0) msxml6.dll, DllCanUnloadNow()+20967 byte(s)
0023:73E6827A (0x080D78F0 0x080D7970 0x080D7950 0x59E489BA) msxml6.dll, DllCanUnloadNow()+22361 byte(s)
0023:73E68241 (0x080D7970 0x080D7950 0x59E489BA 0x00000000) msxml6.dll, DllCanUnloadNow()+22304 byte(s)
0023:73E69DDF (0x00000000 0x080D7950 0x00000000 0x0762FAE0) msxml6.dll, DllCanUnloadNow()+29374 byte(s)
0023:73E6BF9F (0x080D7970 0x080D7950 0x71932915 0x078B5E90) msxml6.dll, DllGetClassObject()+5125 byte(s)
0023:73E6BF83 (0x73E81B38 0x080D39C0 0x080D39C0 0x080D3980) msxml6.dll, DllGetClassObject()+5097 byte(s)
0023:73E6C318 (0x71932881 0x06148CB8 0x06148CB8 0x00000000) msxml6.dll, DllGetClassObject()+6014 byte(s)
0023:73E6CD18 (0x720B35A0 0x0762FBD8 0x06148CB8 0x0762FD68) msxml6.dll, DllGetClassObject()+8574 byte(s)
0023:73E78671 (0x720B35A0 0x0762FBD8 0x0762FD68 0x00000000) msxml6.dll, DllGetClassObject()+56023 byte(s)
0023:73E6AAE5 (0x73E6AC28 0x00000000 0x720B35A0 0x0762FBD8) msxml6.dll, DllCanUnloadNow()+32708 byte(s)
0023:74B0A0E1 (0x00000000 0x00000000 0x00000000 0x00000001) ole32.dll, CoCreateInstanceEx()+0915 byte(s)
0023:74B09FA1 (0x720B3614 0x00000000 0x00000017 0x00000000) ole32.dll, CoCreateInstanceEx()+0595 byte(s)
0023:74B09E25 (0x720B3614 0x00000000 0x00000017 0x00000000) ole32.dll, CoCreateInstanceEx()+0215 byte(s)
0023:74B09D86 (0x720B3614 0x00000000 0x00000017 0x00000000) ole32.dll, CoCreateInstanceEx()+0056 byte(s)
0023:74B09D3F (0x720B3614 0x00000000 0x00000017 0x720B35A0) ole32.dll, CoCreateInstance()+0052 byte(s)
0023:720B352B (0x0553A7C0 0x00000000 0x0070E2DC 0x0070E288) FunDisc.dll
0023:720B9470 (0x0553A7C0 0x00000000 0x00000001 0x00000001) FunDisc.dll, DllGetClassObject()+21871 byte(s)
0023:720C3B69 (0x00000001 0x0070E288 0x8007000E 0x00000000) FunDisc.dll, DllUnregisterServer()+20504 byte(s)
0023:720B75AA (0x73751590 0x00000000 0x00000001 0x00000000) FunDisc.dll, DllGetClassObject()+13993 byte(s)
0023:720B1CE9 (0x73751590 0x00000000 0x00000001 0x055874F8) FunDisc.dll
0023:720B1C39 (0x00709310 0x73751590 0x00000000 0x00000001) FunDisc.dll
0023:73752F84 (0x055E2F28 0x00709310 0x73751590 0x00000000) NetworkItemFactory.dll
0023:737530A5 (0x055E2F28 0x0762FF88 0x763643C0 0x055E2F28) NetworkItemFactory.dll
0023:73753144 (0x055E2F28 0x00000000 0x00000000 0x03EDFB9C) NetworkItemFactory.dll
0023:763643C0 (0x03EDFB9C 0x0762FFD4 0x77029EF2 0x03EDFB9C) SHLWAPI.dll, IUnknown_QueryService()+0346 byte(s)
0023:74C9339A (0x03EDFB9C 0x13BB74FB 0x00000000 0x00000000) kernel32.dll, BaseThreadInitThunk()+0018 byte(s)
0023:77029EF2 (0x763642ED 0x03EDFB9C 0xFFFFFFFF 0x770B736F) ntdll.dll, RtlInitializeExceptionChain()+0099 byte(s)
0023:77029EC5 (0x00000000 0x00000000 0x00000000 0x00000000) ntdll.dll, RtlInitializeExceptionChain()+0054 byte(s)
I had the same problem and found a possible solution by Synastry on
http://social.msdn.microsoft.com/Forums/en-US/vcmfcatl/thread/5037519a-78e2-42f4-94cd-bbe88e0f16d6/
All of us suffering this problem has a call stack in the function 'CoUninitialize'. This function is automatically called when a worker thread of COM ends up. This worker thread is not created directly by the user but when some function using COM library is called, it creates it(or them). And wagscallion and I commonly called the function 'GetOpenFileName'. I guess that GSansoucie also called some COM automated function too.
Ending up the worker thread and being called 'CoUninitialize' is legal and normal action of COM library. That's not the reason. The exception that we met is caused by uninitializing COM Server while it's still in use. But our(or at least my) code is also legal and proper except one thing. I did never called 'CoInitialize' or 'CoInitializeEx' function in my code.
The COM library has an internal count (similar to a reference counter) that is incremented by calling CoInitialize or CoInitializeEx and decremented by calling CoUninitialize. But I didn't call any initializer function. Although I didn't call it, the GetOpenFileName function calls it in its implementation for its worker threads. After the function returns, the worker threads wait for another COM job for a while. This is why the exception does not occur immediately when GetOpenFileName returns. But when the worker threads decide to terminate, they call CoUninitialize; the internal count of the COM library then drops to 0 and CoUninitialize frees all resources from memory.
But after GetOpenFileName function returns, some of resources should remain in memory(I can't sure about this but if this assumption is not true, we would never meet an exception). To maintain them not to be freed, we need to call CoInitialize or CoInitializeEx(MSDN recommends later one) in the initialization of our program. Also we need to call CoUninitialize before our program finishes.
IN SHORT, WE NEED TO CALL 'CoInitialize' or 'CoInitializeEx' at the start of our program and 'CoUninitialize' at the end of program. But MSDN doesn't describe about this for GetOpenFileName or any other functions using COM library. :-(
In my case, by calling initializer and uninitializer the problem is gone and now everythings work well. Take a look and apply it to your code. If there's another reason causing this exception you know, please let us know too. :-)
Thank you for reading.
For myself this was not the solution but a hint where to look at. I have used CoInitialize in my multi-thread application which internally calls CoInitializeEx with COINIT_APARTMENTTHREADED.
I changed the call now from CoInitialize(NULL) to CoInitializeEx(NULL, COINIT_MULTITHREADED) and my problems seem to be gone.
Related
I was reading source code of glibc.
In function void *__libc_malloc(size_t bytes):
void *__libc_malloc(size_t bytes) {
mstate ar_ptr;
void *victim;
_Static_assert(PTRDIFF_MAX <= SIZE_MAX / 2, "PTRDIFF_MAX is not more than half of SIZE_MAX");
if (!__malloc_initialized) ptmalloc_init();
...
}
It shows that if the first thread was created, it calls ptmalloc_init(), and links thread_arena with main_arena, and sets __malloc_initialized to true.
On the other hand, the second thread was blocked by the following code in ptmalloc_init():
static void ptmalloc_init(void) {
if (__malloc_initialized) return;
__malloc_initialized = true;
thread_arena = &main_arena;
malloc_init_state(&main_arena);
...
Thus the thread_arena of the second thread is NULL, and it has to mmap() additional arena.
My question is:
It seems possible to cause race condition because there's no any lock with __malloc_initialized, and thread_arenas of the first thread and second thread may both link with main_arena, why not use lock to protect __malloc_initialized?
It seems possible to cause race condition because there's no any lock with __malloc_initialized
It is impossible[1] for a program to create a second running thread without having called an allocation routine (and therefore ptmalloc_init) while it was still single-threaded.
Because of that, ptmalloc_init can assume that it runs while there is only a single thread.
[1] Why is it impossible? Because creating a thread itself calls calloc.
For example, in this program:
#include <pthread.h>
void *fn(void *p) { return p; } /* thread start routine: returns its argument unchanged */
int main() /* creates one thread running fn and waits for it */
{
pthread_t tid;
pthread_create(&tid, NULL, fn, NULL); /* ptmalloc_init is reached from inside this call (see the backtrace that follows) */
pthread_join(tid, NULL); /* wait for the thread to finish */
return 0;
}
ptmalloc_init is called here (only a single thread exists at that point):
Breakpoint 2, ptmalloc_init () at /usr/src/debug/glibc-2.34-42.fc35.x86_64/malloc/arena.c:283
283 if (__malloc_initialized)
(gdb) bt
#0 ptmalloc_init () at /usr/src/debug/glibc-2.34-42.fc35.x86_64/malloc/arena.c:283
#1 __libc_calloc (n=17, elem_size=16) at malloc.c:3526
#2 0x00007ffff7fdd6c3 in calloc (b=16, a=17) at ../include/rtld-malloc.h:44
#3 allocate_dtv (result=result@entry=0x7ffff7dae640) at ../elf/dl-tls.c:375
#4 0x00007ffff7fde0e2 in __GI__dl_allocate_tls (mem=mem@entry=0x7ffff7dae640) at ../elf/dl-tls.c:634
#5 0x00007ffff7e514e5 in allocate_stack (stacksize=<synthetic pointer>, stack=<synthetic pointer>,
pdp=<synthetic pointer>, attr=0x7fffffffde30)
at /usr/src/debug/glibc-2.34-42.fc35.x86_64/nptl/allocatestack.c:429
#6 __pthread_create_2_1 (newthread=0x7fffffffdf58, attr=0x0, start_routine=0x401136 <fn>, arg=0x0)
at pthread_create.c:648
#7 0x0000000000401167 in main () at p.c:7
GLIBC's dynamic memory allocator is designed to deliver performances in both mono-threaded and multi-threaded programs. Several mutexes are used instead of having a centralized unique one which would at the end serialize every concurrent accesses to the dynamic memory allocator. The concept of arenas protected by one mutex has been introduced to have a kind of reserved memory area for each thread. Hence, the threads can access the memory allocator data structures in parallel as long as they use different arenas.
The main goal is to avoid as much as possible the contention on the mutexes.
The initialization step is critical because the main arena must be set up once. The __malloc_initialized global variable is a flag to prevent multiple initializations. Of course, in a multi-threaded environment, the latter should be protected by a mutex because checking the value of a variable is not multi-thread safe. But doing this would break the main design principle consisting to avoid a centralized mutex which would somehow serialize the execution of the concurrent threads during the process life time.
So, the unprotected __malloc_initialized is a trade-off that works as long as the first access to the memory allocator is done in mono-threaded mode.
Under Linux, a process starts single-threaded (the main thread). With dynamically and statically linked programs, the GLIBC library has an initialization entry point (CSU = C Start Up) called __libc_start_main(), defined in csu/libc-start.c in the library's source tree. It performs many initializations before calling the main() function. This is where a first call to the dynamic allocator occurs to initialize the main arena.
Let's look at the following program which does not explicitly call any service from the dynamic memory allocator and does not create any thread:
#include <unistd.h>
int main(void) /* makes no explicit allocator call and creates no thread */
{
pause(); /* suspend the process until a signal is delivered */
return 0;
}
Let's compile it and run it with gdb and a breakpoint on malloc():
$ gcc -g mm.c -o mm
$ gdb ./mm
[...]
(gdb) br malloc
Function "malloc" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (malloc) pending.
(gdb) run
Starting program: /.../mm
Breakpoint 1, malloc (n=1441) at dl-minimal.c:49
49 dl-minimal.c: No such file or directory.
(gdb) where
#0 malloc (n=1441) at dl-minimal.c:49
#1 0x00007ffff7fec5e5 in calloc (nmemb=<optimized out>, size=size@entry=1) at dl-minimal.c:103
#2 0x00007ffff7fdc284 in _dl_new_object (realname=realname@entry=0x7ffff7ff4342 "", libname=libname@entry=0x7ffff7ff4342 "", type=type@entry=0, loader=loader@entry=0x0,
mode=mode@entry=536870912, nsid=nsid@entry=0) at dl-object.c:89
#3 0x00007ffff7fd1d2f in dl_main (phdr=0x555555554040, phnum=<optimized out>, user_entry=<optimized out>, auxv=<optimized out>) at rtld.c:1330
#4 0x00007ffff7febc4b in _dl_sysdep_start (start_argptr=start_argptr@entry=0x7fffffffdf70, dl_main=dl_main@entry=0x7ffff7fd15e0 <dl_main>) at ../elf/dl-sysdep.c:252
#5 0x00007ffff7fd104c in _dl_start_final (arg=0x7fffffffdf70) at rtld.c:449
#6 _dl_start (arg=0x7fffffffdf70) at rtld.c:539
#7 0x00007ffff7fd0108 in _start () from /lib64/ld-linux-x86-64.so.2
#8 0x0000000000000001 in ?? ()
#9 0x00007fffffffe2e2 in ?? ()
#10 0x0000000000000000 in ?? ()
(gdb)
The above display shows that even if malloc() is not called explicitly in the main program, the GLIBC's internals call at least once the memory allocator triggering the initialization of the main arena.
We may consequently wonder why we need to check the __malloc_initialized variable during the process life time after the internal initialization step. The GLIBC initialization sets up various internal modules (main stack, pthreads...) and some of them may call the dynamic memory allocator. Hence __malloc_initialized is here to allow calling the allocator at any time during the initialization step. And, if the allocator is not needed because of some specific esoteric configuration, then it will not be initialized at all.
I have a list of function pointers called tasks_ready_master. The pointers point to functions (tasks) defined in a separate module. I want to execute them in parallel using threads. Each thread has a queue called "thread_queue" of capacity 1. This queue will contain the task that should be executed by the thread. Once it is done, the task is removed from the queue. We also have a queue where we put all the tasks (called "master_queue"). This is my implementation of the execution subroutine:
!> Runs tasks first_task..last_task of tasks_ready_master in parallel.
!! The tasks are loaded into a shared master queue before the parallel region.
!! Every thread of the team then repeatedly takes the head task while holding
!! a lock, runs it outside the lock, and stops once the master queue is empty.
!! All threads (including thread 0) take tasks, so the tasks also run when the
!! team consists of a single thread.
subroutine master_worker_execution(self,var,tasks_ready_master,first_task,last_task)
   type(tcb),dimension(20)::tasks_ready_master !< the master array of tasks
   integer::i_task !< the task counter
   type(tcb)::self !< self
   integer,intent(in)::first_task,last_task !< inclusive index range of the tasks to run
   type(variables),intent(inout)::var !< the variables
   !OpenMP variables
   type(QUEUE_STRUCT),pointer:: thread_queue !< per-thread queue of capacity 1 holding the task being run
   type(QUEUE_STRUCT),pointer::master_queue !< shared queue of the tasks still to be run
   logical::success !< status flag returned by queue_append_data
   logical::have_task !< .true. when this thread took a task in the current iteration
   integer(kind = OMP_lock_kind) :: lck !< lock guarding every access to master_queue

   ! Nothing to schedule: avoid creating a queue of non-positive capacity.
   if (last_task < first_task) return

   ! Fill the master queue serially, before any other thread can see it.
   call queue_create(master_queue,last_task-first_task+1) !< create the master queue
   do i_task=first_task,last_task
      call queue_append_data(master_queue,tasks_ready_master(i_task),success) !< add the list elements to the queue (full queue)
   end do

   call OMP_init_lock(lck) !< lock initialization
   ! success and have_task are per-thread scratch values; sharing them would be a data race.
   !$OMP PARALLEL PRIVATE(thread_queue,have_task,success) &
   !$OMP SHARED(self,var,master_queue,lck)
   ! One thread queue per thread, created once, so the destroy below always
   ! receives a queue this thread actually created.
   call queue_create(thread_queue,1) !< create a thread queue of capacity 1
   do
      ! The emptiness test and the removal must happen under the same lock;
      ! otherwise another thread can drain the queue between the two steps.
      call OMP_set_lock(lck) !< set the lock
      have_task = .not. queue_empty(master_queue)
      if (have_task) then
         call queue_append_data(thread_queue,master_queue%data(1),success) !< add the first element of the list to the thread queue
         call queue_retrieve_data(master_queue) !< retire the first element of the master queue
      end if
      call OMP_unset_lock(lck) !< unset the lock
      if (.not. have_task) exit
      ! The task itself runs outside the lock so that tasks execute concurrently.
      call thread_queue%data(1)%f_ptr(self,var) !< execute the one and only element of the thread queue
      call queue_retrieve_data(thread_queue) !< retire the element
   end do
   call queue_destroy(thread_queue) !< destroy the thread queue
   !$OMP END PARALLEL
   call OMP_destroy_lock(lck) !< destroy the lock
   ! Destroyed after the parallel region, when no thread can still be using it.
   call queue_destroy(master_queue) !< destroy the master queue
end subroutine master_worker_execution
The problem is that I get a segmentation fault:
Program received signal SIGSEGV: Segmentation fault - invalid memory reference.
Backtrace for this error:
Program received signal SIGSEGV: Segmentation fault - invalid memory reference.
Backtrace for this error:
#0 0x7f30fd3ca700 in ???
#0 0x7f30fd3ca700 in ???
#1 0x7f30fd3c98a5 in ???
#1 0x7f30fd3c98a5 in ???
#2 0x7f30fd06920f in ???
#2 0x7f30fd06920f in ???
#3 0x56524a0f1d08 in __master_worker_MOD_master_worker_execution._omp_fn.0
at /home/hakim/stage_hecese_HPC/OpenMP/hecese_OMP/master_worker.f90:70
#4 0x7f30fd230a85 in ???
#3 0x56524a0f1ad7 in __queue_MOD_queue_destroy
at /home/hakim/stage_hecese_HPC/OpenMP/hecese_OMP/queue.f90:64
#4 0x56524a0f1d94 in __master_worker_MOD_master_worker_execution._omp_fn.0
at /home/hakim/stage_hecese_HPC/OpenMP/hecese_OMP/master_worker.f90:81
#5 0x7f30fd227e75 in ???
#6 0x56524a0f1f68 in __master_worker_MOD_master_worker_execution
at /home/hakim/stage_hecese_HPC/OpenMP/hecese_OMP/master_worker.f90:54
#7 0x56524a0f29b5 in __app_management_MOD_management
at /home/hakim/stage_hecese_HPC/OpenMP/hecese_OMP/app_management_without_t.f90:126
#8 0x56524a0f579b in hecese
at /home/hakim/stage_hecese_HPC/OpenMP/hecese_OMP/program_hecese.f90:398
#9 0x56524a0ed26e in main
at /home/hakim/stage_hecese_HPC/OpenMP/hecese_OMP/program_hecese.f90:13
Erreur de segmentation (core dumped)
I tried removing the while loop and it works (no segfault). I don't understand where the mistake comes from.
While debugging with gdb, it guides me to the line where we use queue_append_data and queue_retrieve_data.
This is the output I get when I use valgrind:
==13100== Memcheck, a memory error detector
==13100== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==13100== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==13100== Command: ./output_hecese_omp
==13100==
==13100== Thread 3:
==13100== Jump to the invalid address stated on the next line
==13100== at 0x0: ???
==13100== by 0x10EB64: __master_worker_MOD_master_worker_execution._omp_fn.0 (master_worker.f90:73)
==13100== by 0x4C8BA85: ??? (in /usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0)
==13100== by 0x4F1D608: start_thread (pthread_create.c:477)
==13100== by 0x4DD7292: clone (clone.S:95)
==13100== Address 0x0 is not stack'd, malloc'd or (recently) free'd
==13100==
Program received signal SIGSEGV: Segmentation fault - invalid memory reference.
Backtrace for this error:
#0 0x4888700 in ???
#1 0x48878a5 in ???
#2 0x4cfb20f in ???
#3 0x0 in ???
==13100==
==13100== Process terminating with default action of signal 11 (SIGSEGV)
==13100== at 0x4CFB169: raise (raise.c:46)
==13100== by 0x4CFB20F: ??? (in /usr/lib/x86_64-linux-gnu/libc-2.31.so)
==13100==
==13100== HEAP SUMMARY:
==13100== in use at exit: 266,372 bytes in 121 blocks
==13100== total heap usage: 194 allocs, 73 frees, 332,964 bytes allocated
==13100==
==13100== LEAK SUMMARY:
==13100== definitely lost: 29,280 bytes in 3 blocks
==13100== indirectly lost: 2,416 bytes in 2 blocks
==13100== possibly lost: 912 bytes in 3 blocks
==13100== still reachable: 233,764 bytes in 113 blocks
==13100== suppressed: 0 bytes in 0 blocks
==13100== Rerun with --leak-check=full to see details of leaked memory
==13100==
==13100== For lists of detected and suppressed errors, rerun with: -s
==13100== ERROR SUMMARY: 3 errors from 1 contexts (suppressed: 0 from 0)
I am learning pthreads programming.
I understood that there are two states of thread:
1. Joinable
2. Detachable
In case of Joinable, we need to call pthread_join to free the resources(stack), whereas in case of detached there is no need to call pthread_join and the resources will be freed on thread exit.
I wrote a sample program to observe the behavior
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* Thread start routine: detaches the calling thread, waits one second,
 * prints a line and terminates the thread. The argument is not used. */
void *threadFn(void *arg)
{
    pthread_t self = pthread_self();

    /* Mark this thread detached so that no pthread_join() is required. */
    pthread_detach(self);
    sleep(1);
    fputs("Thread Fn\n", stdout);
    pthread_exit(NULL);
}
/* Starts one thread running threadFn, then ends the main thread with
 * pthread_exit() so the process keeps running until that thread finishes.
 * Exits with status 1 if the thread cannot be created. */
int main(int argc, char *argv[])
{
    pthread_t tid;
    int ret;

    (void)argc;
    (void)argv;

    ret = pthread_create(&tid, NULL, threadFn, NULL);
    if (ret != 0) {
        /* pthread_create() returns the error number and does not set errno,
         * so the message is built from the return value, not with perror(). */
        fprintf(stderr, "Thread Creation Error: %s\n", strerror(ret));
        exit(EXIT_FAILURE);
    }
    printf("After thread created in Main\n");
    pthread_exit(NULL);
}
When I check for memory leaks with valgrind, it reports a leak of 272 bytes. Can you explain why the leak is happening here?
$valgrind --leak-check=full ./app
==38649==
==38649== HEAP SUMMARY:
==38649== in use at exit: 272 bytes in 1 blocks
==38649== total heap usage: 7 allocs, 6 frees, 2,990 bytes allocated
==38649==
==38649== 272 bytes in 1 blocks are possibly lost in loss record 1 of 1
==38649== at 0x4C31B25: calloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
==38649== by 0x40134A6: allocate_dtv (dl-tls.c:286)
==38649== by 0x40134A6: _dl_allocate_tls (dl-tls.c:530)
==38649== by 0x4E44227: allocate_stack (allocatestack.c:627)
==38649== by 0x4E44227: pthread_create@@GLIBC_2.2.5 (pthread_create.c:644)
==38649== by 0x108902: main (2.c:18)
==38649==
==38649== LEAK SUMMARY:
==38649== definitely lost: 0 bytes in 0 blocks
==38649== indirectly lost: 0 bytes in 0 blocks
==38649== possibly lost: 272 bytes in 1 blocks
==38649== still reachable: 0 bytes in 0 blocks
==38649== suppressed: 0 bytes in 0 blocks
==38649==
==38649== For counts of detected and suppressed errors, rerun with: -v
==38649== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
Your expectation is correct that there shouldn't be any leaks in main thread once you call pthread_exit.
However, what you observe is a quirk of the implementation you're using (which is likely to be glibc) - pthreads library (glibc implementation) re-uses the initially allocated stack for threads - like a cache so that previously allocated stacks can be re-used whenever possible.
Valgrind simply reports what it "sees" (something was allocated but not de-allocated). But it's not a real leak, so you don't need to worry about this.
If you "reverse" the logic (main thread exits as the last thread) then you wouldn't see leaks because the initially allocated stack space is properly free'd by the main thread. But this leak isn't a real leak in any case and you can safely ignore this.
You can also set up a suppression file so that Valgrind doesn't complain about this (which tells Valgrind "I know this isn't a real leak, so don't report it"), such as:
{
Pthread_Stack_Leaks_Ignore
Memcheck:Leak
fun:calloc
fun:allocate_dtv
fun:_dl_allocate_tls
fun:allocate_stack
fun:pthread_create*
}
I have a very simple CUDA component in my application. Valgrind reports a lot of leaks and still-reachables, all related to the cudaMalloc calls.
Are these leaks real? I call cudaFree for every cudaMalloc. Is this valgrind's inability to interpret GPU memory allocation? If these leaks are not real, can I suppress them and have valgrind only analyse the non-gpu part of the application?
/**
 * Runs the `cube` kernel over `nodec` input records and returns its
 * per-record output.
 *
 * @param nodec   number of records; one unsigned int is produced per record.
 * @param h_nodev host buffer holding nodec * len bytes of input.
 * @param len     size in bytes of each record.
 * @return a malloc()'d host array of nodec values that the caller must
 *         free(), or NULL when the arguments are invalid or any allocation,
 *         copy or kernel launch fails. Device buffers are released on every
 *         path.
 */
extern "C"
unsigned int *gethash(int nodec, char *h_nodev, int len) {
    if (nodec <= 0 || len < 0 || h_nodev == NULL) {
        return NULL;
    }

    // Compute the byte counts once, in size_t, so the products cannot
    // overflow int.
    const size_t in_bytes = sizeof(char) * static_cast<size_t>(len) * static_cast<size_t>(nodec);
    const size_t out_bytes = sizeof(unsigned int) * static_cast<size_t>(nodec);

    unsigned int *h_out = static_cast<unsigned int *>(malloc(out_bytes));
    char *d_in = NULL;
    unsigned int *d_out = NULL;
    bool ok = false;

    if (h_out != NULL
        && cudaMalloc(&d_in, in_bytes) == cudaSuccess
        && cudaMalloc(&d_out, out_bytes) == cudaSuccess
        && cudaMemcpy(d_in, h_nodev, in_bytes, cudaMemcpyHostToDevice) == cudaSuccess) {
        // Round up so every record gets a thread, without launching a
        // surplus block when nodec is a multiple of the block size.
        const int threads = 512;
        const int blocks = nodec / threads + (nodec % threads != 0 ? 1 : 0);
        cube<<<blocks, threads>>>(d_out, d_in, nodec, len);

        // cudaGetLastError() reports launch failures; errors raised while
        // the kernel runs surface through the copy back to the host.
        ok = cudaGetLastError() == cudaSuccess
             && cudaMemcpy(h_out, d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    // cudaFree() accepts a null pointer, so this is safe on every path.
    cudaFree(d_in);
    cudaFree(d_out);

    if (!ok) {
        free(h_out);
        h_out = NULL;
    }
    return h_out;
}
Last bit of the Valgrind output:
...
==5727== 5,468 (5,020 direct, 448 indirect) bytes in 1 blocks are definitely lost in loss record 506 of 523
==5727== at 0x402B965: calloc (in /usr/lib/valgrind/vgpreload_memcheck-x86-linux.so)
==5727== by 0x4843910: ??? (in /usr/lib/nvidia-319-updates/libcuda.so.319.60)
==5727== by 0x48403E9: ??? (in /usr/lib/nvidia-319-updates/libcuda.so.319.60)
==5727== by 0x498B32D: ??? (in /usr/lib/nvidia-319-updates/libcuda.so.319.60)
==5727== by 0x494A6E4: ??? (in /usr/lib/nvidia-319-updates/libcuda.so.319.60)
==5727== by 0x4849534: ??? (in /usr/lib/nvidia-319-updates/libcuda.so.319.60)
==5727== by 0x48191DD: cuInit (in /usr/lib/nvidia-319-updates/libcuda.so.319.60)
==5727== by 0x406B4D6: ??? (in /usr/lib/i386-linux-gnu/libcudart.so.5.0.35)
==5727== by 0x406B61F: ??? (in /usr/lib/i386-linux-gnu/libcudart.so.5.0.35)
==5727== by 0x408695D: cudaMalloc (in /usr/lib/i386-linux-gnu/libcudart.so.5.0.35)
==5727== by 0x804A006: gethash (hashkernel.cu:36)
==5727== by 0x804905F: chkisomorphs (bdd.c:326)
==5727==
==5727== LEAK SUMMARY:
==5727== definitely lost: 10,240 bytes in 6 blocks
==5727== indirectly lost: 1,505 bytes in 54 blocks
==5727== possibly lost: 7,972 bytes in 104 blocks
==5727== still reachable: 626,997 bytes in 1,201 blocks
==5727== suppressed: 0 bytes in 0 blocks
It's a known issue that valgrind reports false-positives for a bunch of CUDA stuff. The best way to avoid seeing it would be to use valgrind suppressions, which you can read all about here:
http://valgrind.org/docs/manual/manual-core.html#manual-core.suppress
If you want to jumpstart into something a little closer to your specific issue, an interesting post is this one on the Nvidia dev forums. It has a link to a sample suppression rule file.
https://devtalk.nvidia.com/default/topic/404607/valgrind-3-4-suppressions-a-little-howto/
Try using cuda-memcheck --leak-check full. Cuda-memcheck is a set of tools that provides similar functionality to Valgrind for CUDA applications. It is installed as part of the CUDA toolkit. You can get more documentation about how to use cuda-memcheck here : http://docs.nvidia.com/cuda/cuda-memcheck/
Note that cuda-memcheck is not a direct replacement for valgrind and can't be used to detect host side memory leaks or buffer overflows.
To add to scarl3tt's answer, this may be overly general for some applications, but if you want to use valgrind while ignoring most of the cuda issues, use the option --suppressions=valgrind-cuda.supp where valgrind-cuda.supp is a file with the following rules:
{
alloc_libcuda
Memcheck:Leak
match-leak-kinds: reachable,possible
fun:*alloc
...
obj:*libcuda.so*
...
}
{
alloc_libcufft
Memcheck:Leak
match-leak-kinds: reachable,possible
fun:*alloc
...
obj:*libcufft.so*
...
}
{
alloc_libcudaart
Memcheck:Leak
match-leak-kinds: reachable,possible
fun:*alloc
...
obj:*libcudart.so*
...
}
I wouldn't trust valgrind or any other leak detector (like VLD) with CUDA. I'm sure they weren't designed with GPU allocations in mind. I don't know whether Nvidia's Nsight has the capability these days (I haven't done GPU programming for almost 6 months now), but that's the best thing I used for CUDA debugging, and to be quite honest, it was buggy as hell.
The code you've posted shouldn't create a leak.
Since I don't have 50 reputation, I cannot leave a comment on @Vyas's answer.
I find it strange that cuda-memcheck does not detect a CUDA memory leak.
I wrote a very simple program with a CUDA memory leak, but cuda-memcheck --leak-check full reports no leak. Here it is:
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
// Demo: copies ten floats to the GPU and exits without freeing the device
// buffer, to see whether cuda-memcheck reports the allocation as leaked.
// NOTE(review): cuda-memcheck's leak check reportedly needs the CUDA context
// to be destroyed (e.g. cudaDeviceReset()) before exit - confirm in its docs.
int main(){
float* cpu_data;
float* gpu_data;
int buf_size = 10 * sizeof(float); // size in bytes of ten floats
cpu_data = (float*)malloc(buf_size);
// Fill the host buffer with 0.0f .. 9.0f.
for(int i=0; i<10; i++){
cpu_data[i] = 1.0f * i;
}
// The returned status is stored but never inspected.
cudaError_t cudaStatus = cudaMalloc(&gpu_data, buf_size);
cudaMemcpy(gpu_data, cpu_data, buf_size, cudaMemcpyHostToDevice);
free(cpu_data);
// Left disabled on purpose: without it gpu_data is never released.
//cudaFree(gpu_data);
return 0;
}
Note the commented-out line, which I think makes this program leak CUDA memory. However, running cuda-memcheck ./a.out gives:
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
All,
I am debugging a 24-thread program with GDB. I have found the line in the code where the error occurs, but I cannot tell what the error is from the output of GDB. The following line of code leads to the error; it's just a normal insertion into a map structure.
current_node->children.insert(std::pair<string, ComponentTrieNode*>(comps[j], temp_node));
I used GDB to find out in which thread the error happens and switched to that thread, the backtrace command shows the function calls in the stack. (The last several lines try to print the value of some variables in a function, but failed.)
What should I do to find out exactly what error is happening?
[root@localhost nameComponentEncoding]# gdb NCE_david
GNU gdb (GDB) Fedora (7.2.90.20110429-36.fc15)
Copyright (C) 2011 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /mnt/disk2/experiments_BLOODMOON/two_stage_bloom_filter/programs/nameComponentEncoding/NCE_david...done.
(gdb) r /mnt/disk2/FIB_with_port/10_1.txt /mnt/disk2/trace/a_10_1.trace /mnt/disk2/FIB_with_port/10_2.txt
Starting program: /mnt/disk2/experiments_BLOODMOON/two_stage_bloom_filter/programs/nameComponentEncoding/NCE_david /mnt/disk2/FIB_with_port/10_1.txt /mnt/disk2/trace/a_10_1.trace /mnt/disk2/FIB_with_port/10_2.txt
[Thread debugging using libthread_db enabled]
[New Thread 0x7fffd2bf5700 (LWP 13129)]
[New Thread 0x7fffd23f4700 (LWP 13130)]
[New Thread 0x7fffd1bf3700 (LWP 13131)]
[New Thread 0x7fffd13f2700 (LWP 13132)]
[New Thread 0x7fffd0bf1700 (LWP 13133)]
[New Thread 0x7fffd03f0700 (LWP 13134)]
[New Thread 0x7fffcfbef700 (LWP 13135)]
[New Thread 0x7fffcf3ee700 (LWP 13136)]
[New Thread 0x7fffcebed700 (LWP 13137)]
[New Thread 0x7fffce3ec700 (LWP 13138)]
[New Thread 0x7fffcdbeb700 (LWP 13139)]
[New Thread 0x7fffcd3ea700 (LWP 13140)]
[New Thread 0x7fffccbe9700 (LWP 13141)]
[New Thread 0x7fffcc3e8700 (LWP 13142)]
[New Thread 0x7fffcbbe7700 (LWP 13143)]
[New Thread 0x7fffcb3e6700 (LWP 13144)]
[New Thread 0x7fffcabe5700 (LWP 13145)]
[New Thread 0x7fffca3e4700 (LWP 13146)]
[New Thread 0x7fffc9be3700 (LWP 13147)]
[New Thread 0x7fffc93e2700 (LWP 13148)]
[New Thread 0x7fffc8be1700 (LWP 13149)]
[New Thread 0x7fffc83e0700 (LWP 13150)]
[New Thread 0x7fffc7bdf700 (LWP 13151)]
this is thread 1
this is thread 7
this is thread 14
this is thread 18
this is thread 2
this is thread 19
this is thread 6
this is thread 8
this is thread 24
base: 64312646
this is thread 11
this is thread 5
this is thread 12
this is thread 13
this is thread 3
this is thread 15
this is thread 16
this is thread 17
this is thread 4
this is thread 20
this is thread 21
this is thread 22
this is thread 23
this is thread 9
this is thread 10
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fffc8be1700 (LWP 13149)]
std::local_Rb_tree_rotate_left (__x=0xa057c90, __root=@0x608118) at ../../../../libstdc++-v3/src/tree.cc:126
126 __x->_M_right = __y->_M_left;
(gdb) info threads
Id Target Id Frame
24 Thread 0x7fffc7bdf700 (LWP 13151) "NCE_david" compare (__n=<optimized out>, __s2=<optimized out>, __s1=<optimized out>)
at /usr/lib/gcc/x86_64-redhat-linux/4.6.0/../../../../include/c++/4.6.0/bits/char_traits.h:257
(... other 22 threads not listed)
2 Thread 0x7fffd2bf5700 (LWP 13129) "NCE_david" compare (__n=<optimized out>, __s2=<optimized out>, __s1=<optimized out>)
at /usr/lib/gcc/x86_64-redhat-linux/4.6.0/../../../../include/c++/4.6.0/bits/char_traits.h:257
1 Thread 0x7ffff7fe57a0 (LWP 13126) "NCE_david" strtok () at ../sysdeps/x86_64/strtok.S:76
(gdb) thread 22
[Switching to thread 22 (Thread 0x7fffc8be1700 (LWP 13149))]
#0 std::local_Rb_tree_rotate_left (__x=0xa057c90, __root=@0x608118) at ../../../../libstdc++-v3/src/tree.cc:126
126 __x->_M_right = __y->_M_left;
(gdb) bt
#0 std::local_Rb_tree_rotate_left (__x=0xa057c90, __root=@0x608118) at ../../../../libstdc++-v3/src/tree.cc:126
#1 0x0000003cdd26e848 in std::_Rb_tree_insert_and_rebalance (__insert_left=<optimized out>, __x=0x7fffc0005ba0, __p=<optimized out>, __header=...)
at ../../../../libstdc++-v3/src/tree.cc:266
#2 0x00000000004029ca in std::_Rb_tree<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ComponentTrieNode*>, std::_Select1st<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ComponentTrieNode*> >, std::less<std::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ComponentTrieNode*> > >::_M_insert_ (this=0x608108, __x=<optimized out>, __p=0x16cd3e30, __v=...)
at /usr/lib/gcc/x86_64-redhat-linux/4.6.0/../../../../include/c++/4.6.0/bits/stl_pair.h:87
#3 0x0000000000402b7d in std::_Rb_tree<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ComponentTrieNode*>, std::_Select1st<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ComponentTrieNode*> >, std::less<std::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ComponentTrieNode*> > >::_M_insert_unique (this=0x608108, __v=...)
at /usr/lib/gcc/x86_64-redhat-linux/4.6.0/../../../../include/c++/4.6.0/bits/stl_tree.h:1281
#4 0x000000000040444c in insert (__x=..., this=0x608108) at /usr/lib/gcc/x86_64-redhat-linux/4.6.0/../../../../include/c++/4.6.0/bits/stl_map.h:518
#5 ComponentTrie::add_prefix (this=0x7fffffffe2e0, prefix_input=<optimized out>, port=10) at ComponentTrie_david.cpp:112
#6 0x0000000000401c3b in main._omp_fn.0 () at NameComponentEncoding_david.cpp:277
#7 0x0000003cd2607fea in gomp_thread_start (xdata=<optimized out>) at ../../../libgomp/team.c:115
#8 0x0000003cd0607cd1 in start_thread (arg=0x7fffc8be1700) at pthread_create.c:305
#9 0x0000003cd02dfd3d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
(gdb) p 'ComponentTrie::add_prefix(char*, int)'::comps[j]
No symbol "comps" in specified context.
(gdb) p 'ComponentTrie::add_prefix(char*, int)'::prefix
No symbol "prefix" in specified context.
Edit: I have run the code with valgrind --tool=memcheck; the result is shown below.
[root@localhost nameComponentEncoding]# valgrind --tool=memcheck ./NCE_david /mnt/disk2/FIB_with_port/10_1.txt /mnt/disk2/trace/a_10_1.trace /mnt/disk2/FIB_with_port/10_2.txt
(... many lines omitted)
==13261==
==13261== Thread 11:
==13261== Invalid read of size 1
==13261== at 0x3CD02849BC: strtok (strtok.S:141)
==13261== by 0x40426A: ComponentTrie::add_prefix(char*, int) (ComponentTrie_david.cpp:99)
==13261== by 0x40242C: main._omp_fn.0 (NameComponentEncoding_david.cpp:531)
==13261== by 0x3CD2607FE9: gomp_thread_start (team.c:115)
==13261== by 0x3CD0607CD0: start_thread (pthread_create.c:305)
==13261== by 0x3CD02DFD3C: clone (clone.S:115)
==13261== Address 0x234422c02 is not stack'd, malloc'd or (recently) free'd
==13261==
==13261== Invalid read of size 1
==13261== at 0x3CD02849EC: strtok (strtok.S:167)
==13261== by 0x40426A: ComponentTrie::add_prefix(char*, int) (ComponentTrie_david.cpp:99)
==13261== by 0x40242C: main._omp_fn.0 (NameComponentEncoding_david.cpp:531)
==13261== by 0x3CD2607FE9: gomp_thread_start (team.c:115)
==13261== by 0x3CD0607CD0: start_thread (pthread_create.c:305)
==13261== by 0x3CD02DFD3C: clone (clone.S:115)
==13261== Address 0x234422c02 is not stack'd, malloc'd or (recently) free'd
==13261==
Insertion and lookup cost time(us): 994669532 67108864 14.821731 0.067469
component number:4849478, state number: 2545847
Parallel threads:24
==13261==
==13261== HEAP SUMMARY:
==13261== in use at exit: 4,239,081,584 bytes in 76,746,193 blocks
==13261== total heap usage: 80,050,114 allocs, 3,303,921 frees, 4,323,622,103 bytes allocated
==13261==
==13261== LEAK SUMMARY:
==13261== definitely lost: 0 bytes in 0 blocks
==13261== indirectly lost: 0 bytes in 0 blocks
==13261== possibly lost: 4,111,951,106 bytes in 74,746,429 blocks
==13261== still reachable: 127,130,478 bytes in 1,999,764 blocks
==13261== suppressed: 0 bytes in 0 blocks
==13261== Rerun with --leak-check=full to see details of leaked memory
==13261==
==13261== For counts of detected and suppressed errors, rerun with: -v
==13261== Use --track-origins=yes to see where uninitialised values come from
==13261== ERROR SUMMARY: 45 errors from 30 contexts (suppressed: 6 from 6)
We know that the program is segfaulting on this line:
current_node->children.insert(std::pair<string, ComponentTrieNode*>(comps[j], temp_node));
From the stack trace, we know that the segfault happens deep in the red black tree implementation of std::map:
#0 std::local_Rb_tree_rotate_left (__x=0xa057c90, __root=@0x608118) at ../../../../libstdc++-v3/src/tree.cc:126
126 __x->_M_right = __y->_M_left;
This implies that the segfault could be caused by any of the following:
- evaluating __x->_M_right
- evaluating __y->_M_left
- storing the right-hand side to the left-hand side of __x->_M_right = __y->_M_left
std::map::insert() being called implies that the segfault was NOT caused while building the arguments to the call. In particular comps[j] is not out of bounds.
This leads me to think that your heap had already been corrupted by earlier memory errors by the time of the crash, and that the crash in std::map::insert() is a symptom and not the cause.
Run your program under the Valgrind memcheck tool:
$ valgrind --tool=memcheck /mnt/disk2/experiments_BLOODMOON/two_stage_bloom_filter/programs/nameComponentEncoding/NCE_david /mnt/disk2/FIB_with_port/10_1.txt /mnt/disk2/trace/a_10_1.trace /mnt/disk2/FIB_with_port/10_2.txt
and carefully read Valgrind's output afterwards to find the first memory error in your program.
Valgrind is implemented as a virtual CPU, so your program will slow down by a factor of ~30. This is time-consuming but should allow you to make progress in troubleshooting the problem.
In addition to Valgrind, you might also want to try enabling debug mode for the libstdc++ containers:
To use the libstdc++ debug mode, compile your application with the compiler flag -D_GLIBCXX_DEBUG. Note that this flag changes the sizes and behavior of standard class templates such as std::vector, and therefore you can only link code compiled with debug mode and code compiled without debug mode if no instantiation of a container is passed between the two translation units.
If your program uses no external libraries then rebuilding the whole thing with -D_GLIBCXX_DEBUG added to CXXFLAGS in the Makefile should work. Otherwise you'd need to know whether C++ containers are passed between components compiled with and without the debug flag.
Valgrind Log Review
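I'm surprised that you're using strtok() in a multi-threaded program: strtok() keeps its parsing position in hidden static state, so it is not thread-safe. Are you sure ComponentTrie::add_prefix() is never called from two threads concurrently? The backtrace shows it being called from an OpenMP worker thread (main._omp_fn.0 via gomp_thread_start). While fixing the invalid read by inspecting how strtok() is used at ComponentTrie_david.cpp:99, you might also want to replace strtok() with the reentrant strtok_r().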
Concurrent Access to STL Containers
The standard C++ containers are explicitly documented to not do thread synchronization:
The user code must guard against concurrent function calls which access any particular library object's state when one or more of those accesses modifies the state. An object will be modified by invoking a non-const member function on it or passing it as a non-const argument to a library function. An object will not be modified by invoking a const member function on it or passing it to a function as a pointer- or reference-to-const. Typically, the application programmer may infer what object locks must be held based on the objects referenced in a function call and whether the objects are accessed as const or non-const.
(That's from the GNU libstdc++ documentation, but the C++11 standard specifies essentially the same behavior.) Concurrent modification of std::map and other containers is a serious error and is likely the culprit behind the crash. Guard each container with its own pthread_mutex_t, or use the OpenMP synchronization mechanisms.