register_kprobe returns EINVAL (-22) error for instructions involving rip - linux

I am trying to insert probes at different instructions with kprobes in function of kernel module.
But register_kprobe is returning EINVAL(-22) error for 0xffffffffa33c1085 instruction addresses and 0xffffffffa33c109b from below assembly code (it passes for all other instruction addresses).
Instructions giving errors:
0xffffffffa33c1085 <test_increment+5>: mov 0x21bd(%rip),%eax # 0xffffffffa33c3248
0xffffffffa33c109b <test_increment+27>: mov %esi,0x21a7(%rip) # 0xffffffffa33c3248
Observed that both these instructions use rip register. Tried with functions of other modules, observed same error with instructions which use rip register.
Why is register_kprobe failing ? does it have any constraints involving rip ? Any help is appreciated.
System has kernel 3.10.0-514 on x86_64 installed.
kprobe function:
kp = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
kp->post_handler = exit_func;
kp->pre_handler = entry_func;
kp->addr = sym_addr;
atomic_set(&pcount, 0);
ret = register_kprobe(kp);
if ( ret != 0 ) {
printk(KERN_INFO "register_kprobe returned %d for %s\n", ret, str);
kfree(kp);
kp=NULL;
return ret;
}
probed function:
int race=0;
void test_increment()
{
race++;
printk(KERN_INFO "VALUE=%d\n",race);
return;
}
assembly code:
crash> dis -l test_increment
0xffffffffa33c1080 <test_increment>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffa33c1085 <test_increment+5>: mov 0x21bd(%rip),%eax # 0xffffffffa33c3248
0xffffffffa33c108b <test_increment+11>: push %rbp
0xffffffffa33c108c <test_increment+12>: mov $0xffffffffa33c2024,%rdi
0xffffffffa33c1093 <test_increment+19>: mov %rsp,%rbp
0xffffffffa33c1096 <test_increment+22>: lea 0x1(%rax),%esi
0xffffffffa33c1099 <test_increment+25>: xor %eax,%eax
0xffffffffa33c109b <test_increment+27>: mov %esi,0x21a7(%rip) # 0xffffffffa33c3248
0xffffffffa33c10a1 <test_increment+33>: callq 0xffffffff81659552 <printk>
0xffffffffa33c10a6 <test_increment+38>: pop %rbp
0xffffffffa33c10a7 <test_increment+39>: retq
Thanks

Turns out, register_kprobe does have limitations with instructions invoving rip relative addressing for x86_64.
Here is snippet of __copy_instruction function code causing error (register_kprobe -> prepare_kprobe -> arch_prepare_kprobe -> arch_copy_kprobe -> __copy_instruction )
#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
s64 newdisp;
u8 *disp;
kernel_insn_init(&insn, dest);
insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
* mode. Adjust the displacement for the difference between
* the original location of this instruction and the location
* of the copy that will actually be run. The tricky bit here
* is making sure that the sign extension happens correctly in
* this calculation, since we need a signed 32-bit result to
* be sign-extended to 64 bits when it's added to the %rip
* value and yield the same 64-bit result that the sign-
* extension of the original signed 32-bit displacement would
* have given.
*/
newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
if ((s64) (s32) newdisp != newdisp) {
pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value);
return 0;
}
disp = (u8 *) dest + insn_offset_displacement(&insn);
*(s32 *) disp = (s32) newdisp;
}
#endif
http://elixir.free-electrons.com/linux/v3.10/ident/__copy_instruction
A new displacement value is calculated based new instruction address (where orig insn is copied). If that value doesn't fit in 32 bit, it returns 0 which results in EINVAL error. Hence the failure.
As a workaround, we can set kprobe handler post previous instruction or pre next instruction based on need (works for me).

Related

x86 Linux ELF Loader Troubles

I'm trying to write an ELF executable loader for x86-64 Linux, similar to this, which was implemented on ARM. Chris Rossbach's advanced OS class includes a lab that does basically what I want to do. My goal is to load a simple (statically-linked) "hello world" type binary into my process's memory and run it without execveing. I have successfully mmap'd the ELF file, set up the stack, and jumped to the ELF's entry point (_start).
// put ELF file into memory. This is just one line of a complex
// for() loop that loads the binary from a file.
mmap((void*)program_header.p_vaddr, program_header.p_memsz, map, MAP_PRIVATE|MAP_FIXED, elffd, program_header.p_offset);
newstack = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); // Map a page for the stack
if((long)newstack < 0) {
fprintf(stderr, "ERROR: mmap returned error when allocating stack, %s\n", strerror(errno));
exit(1);
}
topstack = (unsigned long*)((unsigned char*)newstack+4096); // Top of new stack
*((unsigned long*)topstack-1) = 0; // Set up the stack
*((unsigned long*)topstack-2) = 0; // with argc, argv[], etc.
*((unsigned long*)topstack-3) = 0;
*((unsigned long*)topstack-4) = argv[1];
*((unsigned long*)topstack-5) = 1;
asm("mov %0,%%rsp\n" // Install new stack pointer
"xor %%rax, %%rax\n" // Zero registers
"xor %%rbx, %%rbx\n"
"xor %%rcx, %%rcx\n"
"xor %%rdx, %%rdx\n"
"xor %%rsi, %%rsi\n"
"xor %%rdi, %%rdi\n"
"xor %%r8, %%r8\n"
"xor %%r9, %%r9\n"
"xor %%r10, %%r10\n"
"xor %%r11, %%r11\n"
"xor %%r12, %%r12\n"
"xor %%r13, %%r13\n"
"xor %%r14, %%r14\n"
:
: "r"(topstack-5)
:"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14");
asm("push %%rax\n"
"pop %%rax\n"
:
:
: "rax");
asm("mov %0,%%rax\n" // Jump to the entry point of the loaded ELF file
"jmp *%%rax\n"
:
: "r"(jump_target)
: );
I then step through this code in gdb. I've pasted the first few instructions of the startup code below. Everything works great until the first push instruction (starred). The push causes a segfault.
0x60026000 xor %ebp,%ebp
0x60026002 mov %rdx,%r9
0x60026005 pop %rsi
0x60026006 mov %rsp,%rdx
0x60026009 and $0xfffffffffffffff0,%rsp
0x6002600d * push %rax
0x6002600e push %rsp
0x6002600f mov $0x605f4990,%r8
I have tried:
Using the stack from the original process.
mmaping a new stack (as in the above code): (1) and (2) both cause segfaults.
pushing and poping to/from the stack before jmping to the loaded ELF file. This does not cause a segfault.
Changing the protection flags for the stack in the second mmap to PROT_READ | PROT_WRITE | PROT_EXEC. This doesn't make a difference.
I suspect this maybe has something to do with the segment descriptors (maybe?). It seems like the code from the ELF file that I'm loading does not have write access to the stack segment, no matter where it is located. I have not tried to modify the segment descriptor for the newly loaded binary or change the architectural segment registers. Is this necessary? Does anybody know how to fix this?
It turned out that when I was stepping through the loaded code in gdb, the debugger would consistently blow by the first push instruction when I typed nexti and instead continue execution. It was not in fact the push instruction that was causing the segfault but a much later instruction in the C library start code. The problem was caused by a failed call to mmap in the initial binary load that I didn't error check.
Regarding gdb randomly deciding to continue execution instead of stepping: this can be fixed by loading the symbols from the target executable after jumping to the newly loaded executable.

VC++ SSE code generation - is this a compiler bug?

A very particular code sequence in VC++ generated the following instruction (for Win32):
unpcklpd xmm0,xmmword ptr [ebp-40h]
2 questions arise:
(1) As far as I understand the intel manual, unpcklpd accepts as 2nd argument a 128-aligned memory address. If the address is relative to a stack frame alignment cannot be forced. Is this really a compiler bug?
(2) Exceptions are thrown from at the execution of this instruction only when run from the debugger, and even then not always. Even attaching to the process and executing this code does not throw. How can this be??
The particular exception thrown is access violation at 0xFFFFFFFF, but AFAIK that's just a code for misalignment.
[Edit:]
Here's some source that demonstrates the bad code generation - but typically doesn't cause a crash. (that's mostly what I'm wondering about)
[Edit 2:]
The code sample now reproduces the actual crash. This one also crashes outside the debugger - I suspect the difference occurs because the debugger launches the program at different typical base addresses.
// mock.cpp
#include <stdio.h>
struct mockVect2d
{
double x, y;
mockVect2d() {}
mockVect2d(double a, double b) : x(a), y(b) {}
mockVect2d operator + (const mockVect2d& u) {
return mockVect2d(x + u.x, y + u.y);
}
};
struct MockPoly
{
MockPoly() {}
mockVect2d* m_Vrts;
double m_Area;
int m_Convex;
bool m_ParClear;
void ClearPar() { m_Area = -1.; m_Convex = 0; m_ParClear = true; }
MockPoly(int len) { m_Vrts = new mockVect2d[len]; }
mockVect2d& Vrt(int i) {
if (!m_ParClear) ClearPar();
return m_Vrts[i];
}
const mockVect2d& GetCenter() { return m_Vrts[0]; }
};
struct MockItem
{
MockItem() : Contour(1) {}
MockPoly Contour;
};
struct Mock
{
Mock() {}
MockItem m_item;
virtual int GetCount() { return 2; }
virtual mockVect2d GetCenter() { return mockVect2d(1.0, 2.0); }
virtual MockItem GetItem(int i) { return m_item; }
};
void testInner(int a)
{
int c = 8;
printf("%d", c);
Mock* pMock = new Mock;
int Flag = true;
int nlr = pMock->GetCount();
if (nlr == 0)
return;
int flr = 1;
if (flr == nlr)
return;
if (Flag)
{
if (flr < nlr && flr>0) {
int c = 8;
printf("%d", c);
MockPoly pol(2);
mockVect2d ctr = pMock->GetItem(0).Contour.GetCenter();
// The mess happens here:
// ; 74 : pol.Vrt(1) = ctr + mockVect2d(0., 1.0);
//
// call ? Vrt#MockPoly##QAEAAUmockVect2d##H#Z; MockPoly::Vrt
// movdqa xmm0, XMMWORD PTR $T4[ebp]
// unpcklpd xmm0, QWORD PTR tv190[ebp] **** crash!
// movdqu XMMWORD PTR[eax], xmm0
pol.Vrt(0) = ctr + mockVect2d(1.0, 0.);
pol.Vrt(1) = ctr + mockVect2d(0., 1.0);
}
}
}
void main()
{
testInner(2);
return;
}
If you prefer, download a ready vcxproj with all the switches set from here. This includes the complete ASM too.
Update: this is now a confirmed VC++ compiler bug, hopefully to be resolved in VS2015 RTM.
Edit: The connect report, like many others, is now garbage. However the compiler bug seems to be resolved in VS2017 - not in 2015 update 3.
Since no one else has stepped up, I'm going to take a shot.
1) If the address is relative to a stack frame alignment cannot be forced. Is this really a compiler bug?
I'm not sure it is true that you cannot force alignment for stack variables. Consider this code:
struct foo
{
char a;
int b;
unsigned long long c;
};
int wmain(int argc, wchar_t* argv[])
{
foo moo;
moo.a = 1;
moo.b = 2;
moo.c = 3;
}
Looking at the startup code for main, we see:
00E31AB0 push ebp
00E31AB1 mov ebp,esp
00E31AB3 sub esp,0DCh
00E31AB9 push ebx
00E31ABA push esi
00E31ABB push edi
00E31ABC lea edi,[ebp-0DCh]
00E31AC2 mov ecx,37h
00E31AC7 mov eax,0CCCCCCCCh
00E31ACC rep stos dword ptr es:[edi]
00E31ACE mov eax,dword ptr [___security_cookie (0E440CCh)]
00E31AD3 xor eax,ebp
00E31AD5 mov dword ptr [ebp-4],eax
Adding __declspec(align(16)) to moo gives
01291AB0 push ebx
01291AB1 mov ebx,esp
01291AB3 sub esp,8
01291AB6 and esp,0FFFFFFF0h <------------------------
01291AB9 add esp,4
01291ABC push ebp
01291ABD mov ebp,dword ptr [ebx+4]
01291AC0 mov dword ptr [esp+4],ebp
01291AC4 mov ebp,esp
01291AC6 sub esp,0E8h
01291ACC push esi
01291ACD push edi
01291ACE lea edi,[ebp-0E8h]
01291AD4 mov ecx,3Ah
01291AD9 mov eax,0CCCCCCCCh
01291ADE rep stos dword ptr es:[edi]
01291AE0 mov eax,dword ptr [___security_cookie (12A40CCh)]
01291AE5 xor eax,ebp
01291AE7 mov dword ptr [ebp-4],eax
Apparently the compiler (VS2010 compiled debug for Win32), recognizing that we will need specific alignments for the code, takes steps to ensure it can provide that.
2) Exceptions are thrown from at the execution of this instruction only when run from the debugger, and even then not always. Even attaching to the process and executing this code does not throw. How can this be??
So, a couple of thoughts:
"and even then not always" - Not standing over your shoulder when you run this, I can't say for certain. However it seems plausible that just by random chance, stacks could get created with the alignment you need. By default, x86 uses 4byte stack alignment. If you need 16 byte alignment, you've got a 1 in 4 shot.
As for the rest (from https://msdn.microsoft.com/en-us/library/aa290049%28v=vs.71%29.aspx#ia64alignment_topic4):
On the x86 architecture, the operating system does not make the alignment fault visible to the application. ...you will also suffer performance degradation on the alignment fault, but it will be significantly less severe than on the Itanium, because the hardware will make the multiple accesses of memory to retrieve the unaligned data.
TLDR: Using __declspec(align(16)) should give you the alignment you want, even for stack variables. For unaligned accesses, the OS will catch the exception and handle it for you (at a cost of performance).
Edit1: Responding to the first 2 comments below:
Based on MS's docs, you are correct about the alignment of stack parameters, but they propose a solution as well:
You cannot specify alignment for function parameters. When data that
has an alignment attribute is passed by value on the stack, its
alignment is controlled by the calling convention. If data alignment
is important in the called function, copy the parameter into correctly
aligned memory before use.
Neither your sample on Microsoft connect nor the code about produce the same code for me (I'm only on vs2010), so I can't test this. But given this code from your sample:
struct mockVect2d
{
double x, y;
mockVect2d(double a, double b) : x(a), y(b) {}
It would seem that aligning either mockVect2d or the 2 doubles might help.

Linux kernel system call returns -1 instead of {-1, -256}

I'm a kernel newbie and am facing a weird issue. I have written a proof-of-concept calculator syscall and while it works fine for most computations, it is returning -1 when the SUBTRACTION result is between -1 to -256. If someone can throw some light on what could be happening, would appreciate it. Below is the syscall code.
SYSCALL_DEFINE3(calc, int, a, int, b , char, op) {
int res_int;
switch(op) {
case '+': res_int = a + b;
break;
case '-': res_int = a - b;
break;
case '*': res_int = a * b;
break;
case '/': res_int = (a*1000) / b;
break;
}
printk(KERN_INFO "KERNEL CALC RESULT : %d %c %d = %ld",a, op, b, res_int);
return res_int;
}
EDIT:
Kernel version: Android Linux Kernel 3.10.xxx.
Platform: Nexus7 ARM EABI.
What I don't understand is why it is failing. The errno is not useful at all since it is setting -res_int to errno. Also, I don't understand why it is failing only when res_int is {-1, -256}.
a=1200, b=1300 op='-' => res_int=-100 is an example case where printk prints -100, but in my userspace app, I receive -1.
It looks like when res_int is {-1, -256}, errno is being set to -res_int.
root#android:/data/local # ./calc
Please enter request in 'num1 oper num2' format:
2.45 - 2.2
returned from syscall with res_int = 250
errno = 22, strerror(errno) = Invalid argument
Calculator result = 0.250000
root#android:/data/local # ./calc
Please enter request in 'num1 oper num2' format:
2.2 - 2.45
returned from syscall with res_int = -1
errno = 250, strerror(errno) = Unknown error 250
Calculator result = -0.001000
root#android:/data/local #
You didnt' mention your kernel version and platform but fwiw, from kernel's perspective a syscall usually returns zero on success and a negative number on error. Generally this is the ABI convention between kernel folks and application developers so the code can understand each other.
But user program always uses C library as a wrapper to make system calls and the C API should comply with the API standard. That is, the C library will check the return value from kernel space and set the errno based on that. (Note that errno is in user space and kernel doesn't know it.)
Say, if syscall foo(), from user's view, returns -1 on failure and should set the errno to indicate the error (ERR1 for failure 1, ERR2 for failure2, etc), the kernel syscall implementation would return -ERR1 or -ERR2 accordingly and the C library would check the return value against zero and if it's negative, it will return -1 to user and also set errno to ERR1/ERR2.
Hope this helps.
Update: I checked the x86_64 code of glibc, and this may help understand this:
.text
ENTRY (syscall)
movq %rdi, %rax /* Syscall number -> rax. */
movq %rsi, %rdi /* shift arg1 - arg5. */
movq %rdx, %rsi
movq %rcx, %rdx
movq %r8, %r10
movq %r9, %r8
movq 8(%rsp),%r9 /* arg6 is on the stack. */
syscall /* Do the system call. */
cmpq $-4095, %rax /* Check %rax for error. */
jae SYSCALL_ERROR_LABEL /* Jump to error handler if error. */
L(pseudo_end):
ret /* Return to caller. */
PSEUDO_END (syscall)
File: "sysdeps/unix/sysv/linux/x86_64/syscall.S"
Linus said he will make sure the no syscall returns a value in -1 .. -4095 as a valid result so we can savely test with -4095.
Update: so I guess it's your c library that converts the return value. You platform ABI may define that syscall returning {-1, -256} on failure so the C wrapper sets the return value to -1 in such cases and set the errno accordingly.

visual studio 2012 update 3 compiler bug - not calling dtor

I believe I've found a somewhat obscure but scary bug in the Visual Studio 2012 Update 3 C++ compiler. I found it while writing unit tests using gtest. The tests started showing memory leaks, and after investigating the problem seemed to reduce to a bug in the compiler.
I submitted the issue to Microsoft:
https://connect.microsoft.com/VisualStudio/feedback/details/794722/parameter-dtor-not-called-when-overloaded-operator-involved-in-return
In the past I've mistakenly called "compiler bug" on more of my own bugs than I care to admit. So I thought I'd post the question here in case anyone wants to attempt to reproduce the problem themselves. If I can be pointed towards a mistake of my own in this code, that would be extremely helpful! I'm really hoping it's not actually the case that the VC++ compiler fails to call destructors in the following program.
Note that the faulty behavior occurs with the optimizer disabled, so it's not an optimizer bug.
I tried this code in gcc 4.2.1 (i686-apple-darwin11) and it behaves as expected.
Here's the code for the single source file in the project:
#include <string>
int instance_count= 0;
class c {
public:
c( std::string s ) : m_s(s) { ++instance_count; }
c( const c& other ) : m_s(other.m_s) { ++instance_count; }
~c() {--instance_count;}
private:
std::string m_s;
};
class d {
public:
d() {}
void operator=(int) {}
};
void f( c c_ ) {
try {}
catch(...) { return d() = 5; }
}
int main( int argc, char* argv[] ) {
c instance("leak");
f(instance);
return instance_count == 1 ? 0 : -1;
}
To compile it in Visual Studio 2012 Update 3:
File -> New -> Project..., select Win32 Console Application, click OK then click Finish
Build -> Configuration Manager -> Active Solution Platform -> New..., select x64, click OK
Replace the contents of the main .cpp file with the above code
Either add #include "stdafx.h" to the top of the file or turn off precompiler headers
Run the program, note that the exit code is -1, I expect it to be 0. This seems to reproduce in both 32-bit and 64-bit builds, although I was focusing on 64-bit.
Comment out the try/catch blocks in f(), note that the exit code becomes 0. I don't see why this change should affect the exit code since the catch() block isn't even executing.
Looks like an issue in codegen. The dissassembly shows the following for function f.
With return statement -
try { }
002039B8 mov byte ptr [ebp-4],1
002039BC jmp f+6Eh (02039DEh)
catch(...) { return d() = 5; }
002039BE push 5
002039C0 lea ecx,[ebp-0D5h]
002039C6 call d::d (0201474h)
002039CB mov ecx,eax
002039CD call d::operator= (0201479h)
002039D2 mov eax,2039E7h
002039D7 ret
002039D8 mov eax,2039DEh
002039DD ret
$LN4:
002039DE mov dword ptr [ebp-4],0
002039E5 jmp $LN8+0Fh (02039F6h)
$LN8:
002039E7 mov dword ptr [ebp-4],0FFFFFFFFh
002039EE lea ecx,[c_]
002039F1 call c::~c (020101Eh)
}
Notice the jump f+6Eh(02039DEh) for dissassembly of try block. This jumps to
002039DE mov dword ptr [ebp-4],0
002039E5 jmp $LN8+0Fh (02039F6h)
which totally skips the call to destructor. One more thing to observe is that the call to destructor is before the closing brace ('}').
If we take a look at the code without return statement,
try { }
013839B8 mov byte ptr [ebp-4],1
013839BC jmp f+68h (013839D8h)
catch(...) { /*return*/ d() = 5; }
013839BE push 5
013839C0 lea ecx,[ebp-0D5h]
013839C6 call d::d (01381474h)
013839CB mov ecx,eax
013839CD call d::operator= (01381479h)
013839D2 mov eax,13839E1h
013839D7 ret
013839D8 mov dword ptr [ebp-4],0
013839DF jmp $LN8+7h (013839E8h)
$LN8:
013839E1 mov dword ptr [ebp-4],0
}
013839E8 mov dword ptr [ebp-4],0FFFFFFFFh
013839EF lea ecx,[c_]
013839F2 call c::~c (0138101Eh)
Here, the call to destructor is after the brace ('}').

Linux sys_call_table rip relative addressing x86_64

I am trying to get offset of sys_call_table on Linux x86_64.
First of all I read pointer to system_call entry by reading it from MSR_LSTAR and it's correct
static unsigned long read_msr(unsigned int msr)
{
unsigned low, high;
asm volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr));
return ((low) | ((u64)(high) << 32));
}
Then I parse it to find opcode of call instruction and it is also correct
#define CALL_OP 0xFF
#define CALL_MODRM 0x14
static unsigned long find_syscall_table(unsigned char *ptr)
{
//correct
for (; (*ptr != CALL_OP) || (*(ptr+1) != CALL_MODRM); ptr++);
//not correct
ptr += *(unsigned int*)(ptr + 3);
pr_info("%lx", (unsigned long)ptr);
return ptr;
}
But I failed to get address after call opcode. First byte of ptr is opcode, then ModRM byte, then SIB and then 32bit displacement, so I add 3 to ptr and dereferenced it as integer value and then add it to ptr, because it is %RIP, and address is RIP relative. But the result value is wrong, it don't coincide with value I see in gdb, so where am I wrong?
It's not x7e9fed00 but rather -0x7e9fed00 - a negative displacement.
That is the sign-magnitude form of the 2's complement negative number 0x81601300
which is stored by a little-endian processor as "00 13 60 81"
No idea if you will find sys_call_table at the resulting address however. As an alternative idea, it seems some people find it by searching memory for the known pointers to functions that should be listed in it.

Resources