I've been whittling down this segfault for a while, and here's a pretty minimal reproducible example on my machine (below). I have the sinking feeling that it's a driver bug, but I'm very unfamiliar with OpenGL, so it's more likely I'm just doing something wrong.
Is this correct OpenGL 3.3 code? Should be fine regardless of platform and compiler and all that?
Here's the code, compiled with gcc -ggdb -lGL -lSDL2
#include <stdio.h>
#include "GL/gl.h"
#include "GL/glext.h"
#include "SDL2/SDL.h"
// this section is for loading OpenGL things from later versions.
typedef void (APIENTRY *GLGenVertexArrays) (GLsizei n, GLuint *arrays);
typedef void (APIENTRY *GLGenBuffers) (GLsizei n, GLuint *buffers);
typedef void (APIENTRY *GLBindVertexArray) (GLuint array);
typedef void (APIENTRY *GLBindBuffer) (GLenum target, GLuint buffer);
typedef void (APIENTRY *GLBufferData) (GLenum target, GLsizeiptr size, const GLvoid* data, GLenum usage);
typedef void (APIENTRY *GLBufferSubData) (GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid* data);
typedef void (APIENTRY *GLGetBufferSubData) (GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid* data);
typedef void (APIENTRY *GLFlush) (void);
typedef void (APIENTRY *GLFinish) (void);
GLGenVertexArrays glGenVertexArrays = NULL;
GLGenBuffers glGenBuffers = NULL;
GLBindVertexArray glBindVertexArray = NULL;
GLBindBuffer glBindBuffer = NULL;
GLBufferData glBufferData = NULL;
GLBufferSubData glBufferSubData = NULL;
GLGetBufferSubData glGetBufferSubData = NULL;
void load_gl_pointers() {
glGenVertexArrays = (GLGenVertexArrays)SDL_GL_GetProcAddress("glGenVertexArrays");
glGenBuffers = (GLGenBuffers)SDL_GL_GetProcAddress("glGenBuffers");
glBindVertexArray = (GLBindVertexArray)SDL_GL_GetProcAddress("glBindVertexArray");
glBindBuffer = (GLBindBuffer)SDL_GL_GetProcAddress("glBindBuffer");
glBufferData = (GLBufferData)SDL_GL_GetProcAddress("glBufferData");
glBufferSubData = (GLBufferSubData)SDL_GL_GetProcAddress("glBufferSubData");
glGetBufferSubData = (GLGetBufferSubData)SDL_GL_GetProcAddress("glGetBufferSubData");
}
// end OpenGL loading stuff
#define CAPACITY (1 << 8)
// return nonzero if an OpenGL error has occurred.
int opengl_checkerr(const char* const label) {
GLenum err;
switch(err = glGetError()) {
case GL_INVALID_ENUM:
printf("GL_INVALID_ENUM");
break;
case GL_INVALID_VALUE:
printf("GL_INVALID_VALUE");
break;
case GL_INVALID_OPERATION:
printf("GL_INVALID_OPERATION");
break;
case GL_INVALID_FRAMEBUFFER_OPERATION:
printf("GL_INVALID_FRAMEBUFFER_OPERATION");
break;
case GL_OUT_OF_MEMORY:
printf("GL_OUT_OF_MEMORY");
break;
case GL_STACK_UNDERFLOW:
printf("GL_STACK_UNDERFLOW");
break;
case GL_STACK_OVERFLOW:
printf("GL_STACK_OVERFLOW");
break;
default: return 0;
}
printf(" %s\n", label);
return 1;
}
int main(int nargs, const char* args[]) {
printf("initializing..\n");
SDL_Init(SDL_INIT_EVERYTHING);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 3);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
SDL_Window* const w =
SDL_CreateWindow(
"broken",
SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
1, 1,
SDL_WINDOW_OPENGL
);
if(w == NULL) {
printf("window was null\n");
return 0;
}
SDL_GLContext context = SDL_GL_CreateContext(w);
if(context == NULL) {
printf("context was null\n");
return 0;
}
load_gl_pointers();
if(opengl_checkerr("init")) {
return 1;
}
printf("GL_VENDOR: %s\n", glGetString(GL_VENDOR));
printf("GL_RENDERER: %s\n", glGetString(GL_RENDERER));
float* const vs = malloc(CAPACITY * sizeof(float));
memset(vs, 0, CAPACITY * sizeof(float));
unsigned int i = 0;
while(i < 128000) {
GLuint vertex_array;
GLuint vertex_buffer;
glGenVertexArrays(1, &vertex_array);
glBindVertexArray(vertex_array);
glGenBuffers(1, &vertex_buffer);
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
if(opengl_checkerr("gen/binding")) {
return 1;
}
glBufferData(
GL_ARRAY_BUFFER,
CAPACITY * sizeof(float),
vs, // initialize with `vs` just to make sure it's allocated.
GL_DYNAMIC_DRAW
);
// verify that the memory is allocated by reading it back into `vs`.
glGetBufferSubData(
GL_ARRAY_BUFFER,
0,
CAPACITY * sizeof(float),
vs
);
if(opengl_checkerr("creating buffer")) {
return 1;
}
glFlush();
glFinish();
// segfault occurs here..
glBufferSubData(
GL_ARRAY_BUFFER,
0,
CAPACITY * sizeof(float),
vs
);
glFlush();
glFinish();
++i;
}
return 0;
}
When I bump the iterations from 64k to 128k, I start getting:
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff754c859 in __memcpy_sse2_unaligned () from /usr/lib/libc.so.6
(gdb) bt
#0 0x00007ffff754c859 in __memcpy_sse2_unaligned () from /usr/lib/libc.so.6
#1 0x00007ffff2ea154d in ?? () from /usr/lib/xorg/modules/dri/i965_dri.so
#2 0x0000000000400e5c in main (nargs=1, args=0x7fffffffe8d8) at opengl-segfault.c:145
However, I can more than double the capacity (keeping the number of iterations at 64k) without segfaulting.
GL_VENDOR: Intel Open Source Technology Center
GL_RENDERER: Mesa DRI Intel(R) Haswell Mobile
I had a very similar issue when calling glGenTextures and glBindTexture. I tried debugging and when i would try to step through these lines I would get something like:
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff26eaaa8 in ?? () from /usr/lib/x86_64-linux-gnu/dri/i965_dri.so
Note that prior to adding textures, I could successfully run programs with vbos and vaos and generate meshes fine. After looking into the answer suggesting switching from xf86-video-intel driver to xf86-video-fbdev driver, I would advise against it(There really isn't that much info on this issue or users facing segfaults on linux with integrated intel graphics cards. perhaps a good question to ask the folks over at Intel OpenSource).
The solution I found was to stop using freeglut. Switch to glfw instead. Whether there actually is some problem with the intel linux graphics stack is besides the matter, it seems the solvable problem is freeglut. If you want to use glfw with your machines most recent opengl core profile you need just the following:
glfwWindowHint (GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint (GLFW_CONTEXT_VERSION_MINOR, 0);
glfwWindowHint (GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
Setting forward compat(although ive seen lots of post argueing you shouldnt do this) means mesa is free to select a core context permitted one sets the minimum context to 3.0 or higher. I guess freeglut must be going wrong somewhere in its interactions with mesa, if anyone can share some light on this that would be great!
This is a bug in the intel graphics drivers for Linux. Switching from the xf86-video-intel driver to xf86-video-fbdev driver solves the problem.
Edit: I'm not recommending switching to fbdev, just using it as an experiment to see whether the segfault goes away.
Related
Once the following code is running, it will eat all my memory and cause OOM issue on my ARM 64-bit processor. it seems that some 'delete' operations do not work... it confused me. Could any experts help to analysis what is the root cause? Ttoolchain or libc?
I run the same code in another two platforms, one is x64, one is another ARM 32-bit chip; they all fine except my IPQ ARM chip.
environment:
libc version: musl libc 1.1.16
gcc version: 5.2.0
cpu chip: IPQ ARM 64bit
os: linux 4.1
#include <iostream>
#include <cstring>
#include <list>
#include <memory>
#include <pthread.h>
void *h(void *param)
{
while(1)
{
char *p = new char[4096];
delete[] p;
}
return NULL;
}
int main(int argc, char *argv)
{
pthread_t th[50];
int thread_cnt = 2;
for(int i = 0; i < thread_cnt; i++)
pthread_create(th+i, NULL, h, NULL);
for(int i = 0; i < thread_cnt; i++)
pthread_join(th+i, NULL);
return 0;
}
If I use a mutex in the header-file thread callback, the issue will disappear; there is no memory leak.
std::mutex thlock;
void *h(void *param)
{
while(1)
{
std::lock_guard<std::mutex> lk(thlock);
char *p = new char[4096];
delete[] p;
}
return NULL;
}
If I change the memory allocation size to 1024, then a weird thing happed. 2 threads do not leak memory, but 20 threads will cause memory leak. Whatever the allocation size, there's no issue with only 1 thread.
void *h(void *param)
{
while(1)
{
//char *p = new char[4096];
char *p = new char[1024];
delete[] p;
}
return NULL;
}
If I replace new/delete to std::malloc/std::free, the issue will disappear; no memory leak.
void *h(void *param)
{
while(1)
{
//char *p = new char[4096];
char *p = (char *)std::malloc(4096);
std::free(p);
//delete[] p;
}
return NULL;
}
So I thought if I overload operator new/delete and my "new/delete" allocate memory by using std::malloc/free, maybe the issue could be solved. But
the weird thing happened again; the memory leak still exists.
void* operator new(std::size_t sz) // no inline, required by [replacement.functions]/3
{
std::printf("global op new called, size = %zu\n", sz);
if (sz == 0)
++sz; // avoid std::malloc(0) which may return nullptr on success
if (void *ptr = std::malloc(sz))
return ptr;
throw std::bad_alloc{}; // required by [new.delete.single]/3
}
void operator delete(void* ptr) noexcept
{
std::puts("global op delete called");
std::free(ptr);
}
void *h(void *param)
{
while(1)
{
char *p = new char[4096];
delete[] p;
}
return NULL;
}
So I thought the memory leak may be caused by my operator new/delete rather than malloc/free, so I changed the code to look like what's shown below. I don't really call malloc/free but return a pre-allocated buffer. But the memory leak issue disappeared, so it seems the issue not only comes from new/delete...
char pp[4096];
void* operator new(std::size_t sz) // no inline, required by [replacement.functions]/3
{
std::printf("global op new called, size = %zu\n", sz);
if (sz == 0)
++sz; // avoid std::malloc(0) which may return nullptr on success
return (void *)pp;
//if (void *ptr = std::malloc(sz))
//return ptr;
throw std::bad_alloc{}; // required by [new.delete.single]/3
}
void operator delete(void* ptr) noexcept
{
std::puts("global op delete called");
//std::free(ptr);
}
So, what happened? Could any experts give me some idea? I guess it maybe caused musl libc thread lib; it seems that thread synchronization does not work fine... or is there a patch that already fixed this issue?
this issue is involved by musl libc, and has been fixed by the following patch.
https://git.musl-libc.org/cgit/musl/commit/src/malloc?id=3e16313f8fe2ed143ae0267fd79d63014c24779f
above patch involved a buffur overflow issue of realloc and has been fixed by below 2 patches:
https://git.musl-libc.org/cgit/musl/commit/src/malloc?id=cb5babdc8d624a3e3e7bea0b4e28a677a2f2fc46
https://git.musl-libc.org/cgit/musl/commit/src/malloc?id=fca7428c096066482d8c3f52450810288e27515c
Below is the code:
#define FILENAME "kernel.code"
#define kernel_name "hello_world"
#define THREADS 4
std::vector<char> load_file()
{
std::ifstream file(FILENAME, std::ios::binary | std::ios::ate);
std::streamsize fsize = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> buffer(fsize);
if (!file.read(buffer.data(), fsize)) {
failed("could not open code object '%s'\n", FILENAME);
}
return buffer;
}
struct joinable_thread : std::thread
{
template <class... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
{
}
joinable_thread& operator=(joinable_thread&& other) = default;
joinable_thread(joinable_thread&& other) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
void run(const std::vector<char>& buffer) {
CUdevice device;
CUDACHECK(cuDeviceGet(&device, 0));
CUcontext context;
CUDACHECK(cuCtxCreate(&context, 0, device));
CUmodule Module;
CUDACHECK(cuModuleLoadData(&Module, &buffer[0]));
...
}
void run_multi_threads(uint32_t n) {
{
auto buffer = load_file();
std::vector<joinable_thread> threads;
for (uint32_t i = 0; i < n; i++) {
threads.emplace_back(std::thread{[&, i, buffer] {
run(buffer);
}});
}
}
}
int main() {
CUDACHECK(cuInit(0));
run_multi_threads(THREADS);
}
And the code kernel.cu used for ptx is as follows:
#include "cuda_runtime.h"
extern "C" __global__ void hello_world(float* a, float* b) {
int tx = threadIdx.x;
b[tx] = a[tx];
}
I m generating the ptx in this way
nvcc --ptx kernel.cu -o kernel.code
Im using a machine with GeForce GTX TITAN X.
And Im facing this "PTX JIT compilation failed" from cuModuleLoadData error, only when I m trying to use this with multiple threads. If i remove the multi-threading part and run normally, this error doesn't occur.
Can anyone please tell me what is going wrong and how to overcome this.
As mentioned in the comments, I was able to get it to work by moving the load_file() call to the main, so that the buffer read from the file is valid, and then pass only the buffer to all the threads.
Actually in the original code, the buffer will be deconstructed once it leaves the '{...}' scope. So when thread starts, you may read the invalid buffer.
If you put your buffer in the main, it will not be deconstructed or freed until the program exits.
So yes, it's because you pass the invalid buffer (which may have already been freed) to the cu code.
I am developing a C++ library realizing its interface by means of Qt, using VS2015. On the library side, 3 boost threads continously load images from 3 folders. I am trying to display these images in 3 different QLabel (or equivalent QWidgets), so the thread body consists of this functionality,
in particular by exploiting the setPixmap method. Although the call to the function is protected by a boost mutex, I got exceptions probably due to threads synchronization. Looking for a solution, I already awared that the QPixmap widget is not "thread-safe" (non-reentrant). I also tried to use QGraphicsView but it in turn relies on QPixmap, thus I came across the same problem.
So my question is: does an alternative to QPixmap exist to display images in Qt in a thread-safe
manner?
I would recommend to do not multi-threading in GUI programming. Although, Qt provides multi-threading support in general, IMHO, the widgets are not well-prepared for this.
Thus, to achieve image loaders which run concurrently in separate threads I would suggest the following concept:
Each threaded image loader feeds a private buffer. The GUI inspects from time to time (using QTimer) these buffers and updates its QPixmap. As access to buffers should be possible from the resp. image loader thread as well as the GUI thread they have to be mutex guarded, of course.
My sample code testLoadImageMT.cc:
#include <atomic>
#include <chrono>
#include <mutex>
#include <thread>
#include <QtWidgets>
// manually added types (normally provided by glib)
typedef unsigned guint;
typedef unsigned char guint8;
// the fluffy-cat image sample
struct Image {
guint width;
guint height;
guint bytes_per_pixel; /* 3:RGB, 4:RGBA */
guint8 pixel_data[1];
};
extern "C" const Image fluffyCat;
class ImageLoader {
private:
const Image &_img;
std::atomic<bool> _exit;
std::mutex _lock;
QImage _qImg;
std::thread _thread;
public: // main thread API
ImageLoader(const Image &img = fluffyCat):
_img(img),
_qImg(img.width, img.height, QImage::Format_RGB888),
_exit(false), _thread(&ImageLoader::loadImage, std::ref(*this))
{ }
~ImageLoader()
{
_exit = true;
_thread.join();
}
ImageLoader(const ImageLoader&) = delete;
void applyImage(QLabel &qLblImg)
{
std::lock_guard<std::mutex> lock(_lock);
qLblImg.setPixmap(QPixmap::fromImage(_qImg));
}
private: // thread private
void loadImage()
{
for (;;) {
{ std::lock_guard<std::mutex> lock(_lock);
_qImg.fill(0);
}
size_t i = 0;
for (int y = 0; y < (int)_img.height; ++y) {
for (int x = 0; x < (int)_img.width; ++x) {
const quint32 value
= _img.pixel_data[i + 2]
| (_img.pixel_data[i + 1] << 8)
| (_img.pixel_data[i + 0] << 16)
| (0xff << 24);
i += _img.bytes_per_pixel;
{ std::lock_guard<std::mutex> lock(_lock);
_qImg.setPixel(x, y, value);
}
if (_exit) return; // important: make thread co-operative
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // slow down CPU cooler
}
}
}
};
int main(int argc, char **argv)
{
// settings:
enum { N = 3 }; // number of images loaded/displayed
enum { Interval = 50 }; // update rate for GUI 50 ms -> 20 Hz (round about)
// build appl.
qDebug() << "Qt Version: " << QT_VERSION_STR;
QApplication app(argc, argv);
// build GUI
QWidget qMainWin;
QVBoxLayout qVBox;
QLabel *pQLblImgs[N];
for (int i = 0; i < N; ++i) {
qVBox.addWidget(
new QLabel(QString::fromUtf8("Image %1").arg(i + 1)));
qVBox.addWidget(
pQLblImgs[i] = new QLabel());
}
qMainWin.setLayout(&qVBox);
qMainWin.show();
// build image loaders
ImageLoader imgLoader[N];
// install timer
QTimer qTimer;
qTimer.setInterval(Interval); // ms
QObject::connect(&qTimer, &QTimer::timeout,
[&imgLoader, &pQLblImgs]() {
for (int i = 0; i < N; ++i) {
imgLoader[i].applyImage(*pQLblImgs[i]);
}
});
qTimer.start();
// exec. application
return app.exec();
}
Sorry, I used std::thread instead of boost::thread as I've no experience with the latter, nor a working installation. I believe (hope) the differences will be marginal. QThread would have been the "Qt native" alternative but again – no experiences.
To keep things simple, I just copied data out of a linked binary image (instead of loading one from file or from anywhere else). Hence, a second file has to be compiled and linked to make this an MCVE – fluffyCat.cc:
/* GIMP RGB C-Source image dump (fluffyCat.cc) */
// manually added types (normally provided by glib)
typedef unsigned guint;
typedef unsigned char guint8;
extern "C" const struct {
guint width;
guint height;
guint bytes_per_pixel; /* 3:RGB, 4:RGBA */
guint8 pixel_data[16 * 16 * 3 + 1];
} fluffyCat = {
16, 16, 3,
"x\211s\215\232\200gw`fx`at[cx^cw^fu\\itZerWn|ap~cv\204jnzedq^fr^kzfhv^Ra"
"GRbMWdR\\jXer^qw_\311\256\226\271\253\235\275\264\252\315\277\260\304\255"
"\231u~i\213\225\207l{fly`jx\\^nRlz_z\206nlx`t~i\221\211s\372\276\243\375"
"\336\275\376\352\340\356\312\301\235\216\212judgwcl~f\212\226u}\206h\212"
"\224q\231\237z\232\236{\216\225v\225\230\200\306\274\244\376\360\327\376"
"\361\331\376\360\341\326\275\272\253\240\244{\203p\202\220xp~e{\204^\222"
"\230n\212\217g\240\242{\234\236z\214\222r\270\271\247\360\353\340\376\370"
"\336\376\363\334\375\357\336\310\254\262\232\223\234\\gRfrX\204\220z\212"
"\225g\225\232j\254\255\177\252\250{\225\226u\304\302\265\374\365\351\376"
"\375\366\376\367\341\376\361\320\374\346\324\306\241\242\237\232\235n{fj"
"xckyfu~fUX#VZCfnT\231\231\207\374\374\371\377\372\354\376\376\374\376\376"
"\372\376\362\332\375\340\301\341\300\264\260\253\262jvdbq\\XkVJTDNTCCG8O"
"TE\322\321\313\377\377\375\376\376\373\376\377\376\376\376\375\376\374\362"
"\376\360\342\344\311\306\250\244\254R_PL^HXkT<#2OP#`dP\217\220\177\374\374"
"\370\377\377\374\376\375\371\377\377\376\376\374\360\377\367\336\376\350"
"\316\342\303\274\246\236\245jtbXdQTdNQYGU\\KchV\317\315\302\377\376\372\377"
"\376\367\376\373\360\377\376\367\376\366\337\376\355\312\374\331\271\323"
"\263\251\216\214\214\\hTP^HL\\FR[LMXI^dW\355\352\342\376\375\366\377\374"
"\360\376\374\361\376\374\361\376\356\321\374\331\264\374\330\266\330\270"
"\260\200||Y`SLVE>K9BJ<CN?VYP\347\330\322\376\366\345\376\363\330\376\367"
"\337\377\372\350\374\342\314\326\243\210\375\350\314\352\317\304shc^`TV`"
"RVbT>B4IS?PTD\244\232\216\374\355\320\376\354\311\376\351\306\376\362\332"
"\374\344\321\267\206u\375\362\337\326\274\272\\POMNBT]LNZH:<*<A*TV>OI;\242"
"\222\207\340\304\243\375\335\262\372\336\272\376\361\334\320\241\212\374"
"\352\322\266\233\237c\\WFH;MR>\\`F~xP\220\214[pqE\211\202\\g]=\230\214`\313"
"\266\207\344\303\240\362\336\274\323\257\201\333\304\240\305\252\204\254"
"\232p\216\206\\\206\203U\232\224b\234\244b\246\257m\220\232`\224\227h~\202"
"W\206\213]\204\210W\227\227i|\177RvzNlsGrtJwtLz}N{\204RlxF",
};
I compiled and tested in VS2013, with Qt 5.9.2 on Windows 10 (64 bit). This is how it looks:
I solved using signal/slot: the "non-GUI" thread emits a signal instead of displaying the images and the called slot paints the QLabel inside the GUI thread!
Below is an example of source I want to use on a machine running "Red Hat Enterprise Linux 5.5 (Tikanga) Kernel 2.6.18-194.el5xen x86_64" OS.
The general idea is that I want to have backtrace of some thread, so I am raising a SIGUSR1 signal for that thread and a handler does a backtrace() call.
In my scenario as below, FrameTwo function calls malloc and free in a loop. Whenever the signal is raised for this particular thread and free or malloc is on the callstack, the progream crashes when the signal handler calls backtrace().
(gdb) where (stack from gdb)
0 0x0000003e67207638 in ?? ()
1 0x0000003e672088bb in _Unwind_Backtrace
2 0x00000037ba0e5fa8 in backtrace ()
3 0x000000000040071a in handler ()
4 <signal handler called>
5 0x00000037ba071fac in _int_free ()
6 0x0000000a33605000 in ?? ()
7 0x000000004123b130 in ?? ()
8 0x00000000004007d4 in ThreadFunction ()
9 0x000000001f039020 in ?? ()
10 0x000000004123b940 in ?? ()
11 0x0000000000000001 in ?? ()
12 0x0000000000000000 in ?? ()
I learned from other sources that backtrace shouldn't be called from a signal handler, so I have written my own function grok_and_print_thread_stack() for this case.
It uses the RBP register to navigate the stack (RBP contains the base pointer of the current frame points to the previous frame's base pointer), but this algorithm does not work in this case either: when _int_free () is on the callstack, the RBP register navigation algorithm breaks, because the RBP of _int_free is some value like 0x20 which is not a valid frame's base pointer.
Does anyone know how a callstack can be navigated from the registers? Or how can I use backtrace for my purpose?
#include "stdio.h"
#include "stdlib.h"
#include "pthread.h"
#include "signal.h"
#include "syscall.h"
#include "string.h"
#include "inttypes.h"
//####################################################################
//gcc BacktraceTestProgram.c -o backtracetest -lpthread
//./backtracetest
//gdb -c core backtracetest
//####################################################################
volatile sig_atomic_t flag = 1;
int thlist[6] = {0};
int cnt = 0;
int *memory = NULL;
//####################################################################
void raiseUserSignal(int tid)
{
union sigval value;
value.sival_int = 1;
sigqueue(tid,SIGUSR1, value);
}
//####################################################################
int grok_and_print_thread_stack()
{
int ret = 0;
register uint64_t* rbp asm("rbp");
/*if buffer was built before, add separator */
uint64_t *previous_bp;
/*save pointers*/
previous_bp = rbp;
/* stack Traversal */
while(previous_bp)
{
uint64_t *next_bp;
next_bp = (uint64_t*)*previous_bp;
printf("Read BP: %lx \n", next_bp);
if ( NULL == (void*)next_bp )
{
printf("Reached the top of the stack\n");
fflush(stdout);
break;
}
previous_bp = next_bp;
}
return ret;
}
//####################################################################
void handler(int signum, siginfo_t *info, void *context)
{
int nptrs = 0 ;
void *buffer[100] = {NULL};
char **strings = NULL;
nptrs = backtrace(buffer, 100);
flag = 1;
}
//####################################################################
void FrameTwo(const char A)
{
do{
if( memory == NULL)
memory = (int *)malloc(sizeof(int) *5);
if(memory != NULL) {
free(memory);
memory = NULL;
}
}while(1);
}
//####################################################################
void FrameOne(int no)
{
FrameTwo('A');
}
//####################################################################
void *ThreadFunction( void *ptr )
{
int tid = syscall(SYS_gettid);
thlist[cnt++] = tid;
FrameOne(10);
}
//####################################################################
void RegisterSignalHandler()
{
/* Register a Signal Handler */
struct sigaction usrsig_action;
usrsig_action.sa_flags = SA_SIGINFO;
usrsig_action.sa_sigaction = &handler;
sigaction (SIGUSR1, &usrsig_action, NULL);
}
//####################################################################
int main(int no , char *argc[] )
{
int iret1;
pthread_t thread1;
RegisterSignalHandler();
/* Create independent threads each of which will execute function */
iret1 = pthread_create( &thread1, NULL, ThreadFunction, NULL);
while(cnt == 0);
while(1) {
if(flag == 1){
flag = 0;
raiseUserSignal(thlist[0]);
}
}
pthread_join( thread1, NULL);
return 0;
}
In general x86_64 programs are likely to have been built with -fomit-frame-pointer as it is the default when optimisation is on.
What that means is that RBP is not usable for unwinding the stack and you will either need to use the DWARF unwind information (if you have debugging information available) or the exception unwind table.
You may want to look at the libunwind project.
The primary goal of [libunwind] is to define a portable and efficient C programming interface (API) to determine the call-chain of a program. [...] As such, the API is useful in a number of applications. Some examples include:
debuggers
The libunwind API makes it trivial for debuggers to generate the call-chain (backtrace) of the threads in a running program/
In particular, have a look at the local unwinding section of their documentation, it contains explanations and the following code example (with you need to link with -lunwind) that prints the backtrace of the current function:
#define UNW_LOCAL_ONLY
#include <libunwind.h>
void show_backtrace (void) {
unw_cursor_t cursor; unw_context_t uc;
unw_word_t ip, sp;
unw_getcontext(&uc);
unw_init_local(&cursor, &uc);
while (unw_step(&cursor) > 0) {
unw_get_reg(&cursor, UNW_REG_IP, &ip);
unw_get_reg(&cursor, UNW_REG_SP, &sp);
printf ("ip = %lx, sp = %lx\n", (long) ip, (long) sp);
}
}
I just found out that someone is calling - from a signal handler - a definitely not async-signal-safe function that I wrote.
So, now I'm curious: how to circumvent this situation from happening again? I'd like to be able to easily determine if my code is running in signal handler context (language is C, but wouldn't the solution apply to any language?):
int myfunc( void ) {
if( in_signal_handler_context() ) { return(-1) }
// rest of function goes here
return( 0 );
}
This is under Linux.
Hope this isn't an easy answer, or else I'll feel like an idiot.
Apparently, newer Linux/x86 (probably since some 2.6.x kernel) calls signal handlers from the vdso. You could use this fact to inflict the following horrible hack upon the unsuspecting world:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
uintmax_t vdso_start = 0;
uintmax_t vdso_end = 0; /* actually, next byte */
int check_stack_for_vdso(uint32_t *esp, size_t len)
{
size_t i;
for (i = 0; i < len; i++, esp++)
if (*esp >= vdso_start && *esp < vdso_end)
return 1;
return 0;
}
void handler(int signo)
{
uint32_t *esp;
__asm__ __volatile__ ("mov %%esp, %0" : "=r"(esp));
/* XXX only for demonstration, don't call printf from a signal handler */
printf("handler: check_stack_for_vdso() = %d\n", check_stack_for_vdso(esp, 20));
}
void parse_maps()
{
FILE *maps;
char buf[256];
char path[7];
uintmax_t start, end, offset, inode;
char r, w, x, p;
unsigned major, minor;
maps = fopen("/proc/self/maps", "rt");
if (maps == NULL)
return;
while (!feof(maps) && !ferror(maps)) {
if (fgets(buf, 256, maps) != NULL) {
if (sscanf(buf, "%jx-%jx %c%c%c%c %jx %u:%u %ju %6s",
&start, &end, &r, &w, &x, &p, &offset,
&major, &minor, &inode, path) == 11) {
if (!strcmp(path, "[vdso]")) {
vdso_start = start;
vdso_end = end;
break;
}
}
}
}
fclose(maps);
printf("[vdso] at %jx-%jx\n", vdso_start, vdso_end);
}
int main()
{
struct sigaction sa;
uint32_t *esp;
parse_maps();
memset(&sa, 0, sizeof(struct sigaction));
sa.sa_handler = handler;
sa.sa_flags = SA_RESTART;
if (sigaction(SIGUSR1, &sa, NULL) < 0) {
perror("sigaction");
exit(1);
}
__asm__ __volatile__ ("mov %%esp, %0" : "=r"(esp));
printf("before kill: check_stack_for_vdso() = %d\n", check_stack_for_vdso(esp, 20));
kill(getpid(), SIGUSR1);
__asm__ __volatile__ ("mov %%esp, %0" : "=r"(esp));
printf("after kill: check_stack_for_vdso() = %d\n", check_stack_for_vdso(esp, 20));
return 0;
}
SCNR.
If we can assume your application doesn't manually block signals using sigprocmask() or pthread_sigmask(), then this is pretty simple: get your current thread ID (tid). Open /proc/tid/status and get the values for SigBlk and SigCgt. AND those two values. If the result of that AND is non-zero, then that thread is currently running from inside a signal handler. I've tested this myself and it works.
There are two proper ways to deal with this:
Have your co-workers stop doing the wrong thing. Good luck pulling this off with the boss, though...
Make your function re-entrant and async-safe. If necessary, provide a function with a different signature (e.g. using the widely-used *_r naming convention) with the additional arguments that are necessary for state preservation.
As for the non-proper way to do this, on Linux with GNU libc you can use backtrace() and friends to go through the caller list of your function. It's not easy to get right, safe or portable, but it might do for a while:
/*
* *** Warning ***
*
* Black, fragile and unportable magic ahead
*
* Do not use this, lest the daemons of hell be unleashed upon you
*/
int in_signal_handler_context() {
int i, n;
void *bt[1000];
char **bts = NULL;
n = backtrace(bt, 1000);
bts = backtrace_symbols(bt, n);
for (i = 0; i < n; ++i)
printf("%i - %s\n", i, bts[i]);
/* Have a look at the caller chain */
for (i = 0; i < n; ++i) {
/* Far more checks are needed here to avoid misfires */
if (strstr(bts[i], "(__libc_start_main+") != NULL)
return 0;
if (strstr(bts[i], "libc.so.6(+") != NULL)
return 1;
}
return 0;
}
void unsafe() {
if (in_signal_handler_context())
printf("John, you know you are an idiot, right?\n");
}
In my opinion, it might just be better to quit rather than be forced to write code like this.
You could work out something using sigaltstack. Set up an alternative signal stack, get the stack pointer in some async-safe way, if within the alternative stack go on, otherwise abort().
I guess you need to do the following. This is a complex solution, which combines the best practices not only from coding, but from software engineering as well!
Persuade your boss that naming convention on signal handlers is a good thing. Propose, for example, a Hungarian notation, and tell that it was used in Microsoft with great success.
So, all signal handlers will start with sighnd, like sighndInterrupt.
Your function that detects signal handling context would do the following:
Get the backtrace().
Look if any of the functions in it begin with sighnd.... If it does, then congratulations, you're inside a signal handler!
Otherwise, you're not.
Try to avoid working with Jimmy in the same company. "There can be only one", you know.
for code optimized at -O2 or better (istr) have found need to add -fno-omit-frame-pointer
else gcc will optimize out the stack context information