How to test/validate the vmalloc guard page is working in Linux - linux

I am studying stack guarding in Linux. I found that the Linux kernel VMAP_STACK config parameter is using the guard page mechanism along with vmalloc() to provide stack guarding.
I am trying to find out how this guard page works in the Linux kernel. I googled and read through the kernel source, but could not find the relevant code.
A further question is how to verify the guarded stack.
I had a kernel module to underrun/overflow a process's kernel stack, like this
/*
 * Deliberately write to an address computed from the current task's
 * kernel stack so that the resulting fault can be observed.
 */
static void shoot_kernel_stack(void)
{
unsigned char *ptr = task_stack_page(current);
unsigned char *tmp = NULL;
/* Target: THREAD_SIZE + PAGE_SIZE bytes past the address returned by task_stack_page(). */
tmp = ptr + THREAD_SIZE + PAGE_SIZE + 0;
/*
 * NOTE(review): with the adjustment below commented out, tmp is the first
 * byte *after* the THREAD_SIZE + PAGE_SIZE span; re-enabling it would move
 * the write 0x100 bytes lower, i.e. inside the last page of that span --
 * confirm which of the two is the intended target.
 */
// tmp -= 0x100;
memset(tmp, 0xB4, 0x10); // Underrun: fill 0x10 bytes at tmp with 0xB4
}
I really get the kernel panic like below,
[ 8006.358354] BUG: stack guard page was hit at 00000000e8dc2d98 (stack is 00000000cff0f921..00000000653b24a9)
[ 8006.361276] kernel stack overflow (page fault): 0000 [#1] SMP PTI
Is this the right way to verify the guard page?

The VMAP_STACK Linux feature is used to map the kernel stacks of threads into VMAs. Because the stack is virtually mapped, the underlying physical pages don't need to be contiguous, and cross-page overflows can be detected by adding guard pages. As each VMA is followed by a guard page (unless the VM_NO_GUARD flag is passed at allocation time), the stacks allocated in those areas benefit from it for stack overflow detection.
ALLOCATION
The thread stacks are allocated at thread creation time with alloc_thread_stack_node() in kernel/fork.c. When VMAP_STACK is activated, the stacks are cached because according to the comments in the source code:
vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
flush. Try to minimize the number of calls by caching stacks.
The kernel stack size is THREAD_SIZE (equal to 4 pages on x86_64 platforms). The source code of the allocation invoked at thread creation time is:
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
void *stack;
int i;
[...] // <----- Part which gets a previously cached stack. If no stack in cache
// the following is run to allocate a brand new stack:
/*
* Allocated stacks are cached and later reused by new threads,
* so memcg accounting is performed manually on assigning/releasing
* stacks to tasks. Drop __GFP_ACCOUNT.
*/
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
[...]
__vmalloc_node_range() is defined in mm/vmalloc.c. It calls __get_vm_area_node(). Since the VM_NO_GUARD flag is not passed to the latter, an additional page is added at the end of the allocated area. This is the guard page of the VMA:
/*
 * Reserve a vmap area of @size bytes (rounded up to a page multiple, plus
 * one extra page unless VM_NO_GUARD is set in @flags) between @start and
 * @end, and return the vm_struct describing it.
 *
 * Returns NULL when @size is zero after alignment, when the vm_struct
 * cannot be allocated, or when alloc_vmap_area() fails.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
/* Must not be called from interrupt context. */
BUG_ON(in_interrupt());
size = PAGE_ALIGN(size);
if (unlikely(!size))
return NULL;
/* For ioremap areas the alignment is derived from the size order, clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER]. */
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, get_count_order_long(size),
PAGE_SHIFT, IOREMAP_MAX_ORDER);
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
if (!(flags & VM_NO_GUARD)) // <----- A GUARD PAGE IS ADDED
size += PAGE_SIZE;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
/* On failure, release the descriptor allocated above before bailing out. */
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
setup_vmalloc_vm(area, va, flags, caller);
return area;
}
OVERFLOW MANAGEMENT
The stack overflow management is architecture dependent (i.e. source code located in arch/...). The links referenced below provide some pointers on some architecture dependent implementations.
For x86_64 platform, the overflow check is done upon the page fault interruption which triggers the following chain of function calls: do_page_fault()->__do_page_fault()->do_kern_addr_fault()->bad_area_nosemaphore()->no_context() function defined in arch/x86/mm/fault.c. In no_context(), there is a part dedicated to VMAP_STACK management for the detection of the stack under/overflow:
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int signal, int si_code)
{
struct task_struct *tsk = current;
unsigned long flags;
int sig;
[...]
#ifdef CONFIG_VMAP_STACK
/*
* Stack overflow? During boot, we can fault near the initial
* stack in the direct map, but that's not an overflow -- check
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/*
* We're likely to be running with very little stack space
* left. It's plausible that we'd hit this condition but
* double-fault even before we get this far, in which case
* we're fine: the double-fault handler will deal with it.
*
* We don't want to make it all the way into the oops code
* and then double-fault, though, because we're likely to
* break the console driver and lose most of the stack dump.
*/
asm volatile ("movq %[stack], %%rsp\n\t"
"call handle_stack_overflow\n\t"
"1: jmp 1b"
: ASM_CALL_CONSTRAINT
: "D" ("kernel stack overflow (page fault)"),
"S" (regs), "d" (address),
[stack] "rm" (stack));
unreachable();
}
#endif
[...]
}
In the above code, when a stack under/overflow is detected, the handle_stack_overflow() function (defined in arch/x86/kernel/traps.c) is called:
#ifdef CONFIG_VMAP_STACK
/*
 * Report a hit on the kernel stack guard page; never returns.
 *
 * @message:       text handed to die() and, as a last resort, to panic()
 * @regs:          register state handed to die()
 * @fault_address: accessed address, printed in the BUG line
 */
__visible void __noreturn handle_stack_overflow(const char *message,
struct pt_regs *regs,
unsigned long fault_address)
{
/*
 * Log the faulting address with the first and last byte address of the
 * current task's stack.
 * NOTE(review): %p output may be hashed by printk, which would make the
 * printed addresses look unrelated to each other -- confirm for the
 * kernel version in use.
 */
printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
(void *)fault_address, current->stack,
(char *)current->stack + THREAD_SIZE - 1);
die(message, regs, 0);
/* Be absolutely certain we don't return. */
panic("%s", message);
}
#endif
The example error message "BUG: stack guard page was hit at..." pointed out in the question comes from the above handle_stack_overflow() function.
FROM YOUR EXAMPLE MODULE
When VMAP_STACK is defined, the stack_vm_area field of the task descriptor appears and is set with the VMA address associated to the stack. From there, it is possible to grab interesting information:
/* Fragment: dump information about the current task's kernel stack. */
struct task_struct *task;
#ifdef CONFIG_VMAP_STACK
struct vm_struct *vm;
#endif // CONFIG_VMAP_STACK
task = current;
/* Address held in task->stack. */
printk("\tKernel stack: 0x%lx\n", (unsigned long)(task->stack));
/* The unsigned long stored at task->stack itself, reported as the stack-end magic. */
printk("\tStack end magic: 0x%lx\n", *(unsigned long *)(task->stack));
#ifdef CONFIG_VMAP_STACK
/*
 * Details of the vm_struct backing the stack.
 * NOTE(review): vm is dereferenced without a NULL check -- confirm that
 * stack_vm_area is always set for the task being inspected.
 */
vm = task->stack_vm_area;
printk("\tstack_vm_area->addr = 0x%lx\n", (unsigned long)(vm->addr));
printk("\tstack_vm_area->nr_pages = %u\n", vm->nr_pages);
printk("\tstack_vm_area->size = %lu\n", vm->size);
#endif // CONFIG_VMAP_STACK
/* Address of the local variable 'task' itself, for comparison with the stack address printed first. */
printk("\tLocal var in stack: 0x%lx\n", (unsigned long)(&task));
The nr_pages field is the number of pages without the additional guard page. The unsigned long at the end of the stack (its lowest address, task->stack, which is what the snippet above reads) is set to STACK_END_MAGIC, defined in include/uapi/linux/magic.h as:
#define STACK_END_MAGIC 0x57AC6E9D
REFERENCES:
Preventing stack guard-page hopping
arm64: VMAP_STACK support
CONFIG_VMAP_STACK: Use a virtually-mapped stack
Linux 4.9 On x86_64 To Support Vmapped Stacks
A Decade of Linux Kernel Vulnerabilities

Related

Where does "Freeing unused kernel memory" come from?

I often see Freeing unused kernel memory: xxxK (......) from dmesg, but I can never find this log from kernel source code with the help of grep/rg.
Where does it come from?
That line of text does not exist as a single, complete string, hence your failure to grep it.
This all gets rolling when free_initmem() in init/main.c calls free_initmem_default().
The line in question originates from free_initmem_default() in include/linux/mm.h:
/*
 * Default method to free all the __init memory into the buddy system.
 * The freed pages will be poisoned with pattern "poison" if it's within
 * range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
static inline unsigned long free_initmem_default(int poison)
{
/* Bounds of the region to free: __init_begin up to __init_end. */
extern char __init_begin[], __init_end[];
/*
 * "unused kernel" is passed as the label argument of free_reserved_area();
 * it is the only part of the final log line that originates here.
 */
return free_reserved_area(&__init_begin, &__init_end,
poison, "unused kernel");
}
The rest of that text is from free_reserved_area() in mm/page_alloc.c:
unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{
void *pos;
unsigned long pages = 0;
...
if (pages && s)
pr_info("Freeing %s memory: %ldK\n",
s, pages << (PAGE_SHIFT - 10));
return pages;
}
(Code excerpts from v5.2)
From my answer here:
Some functions in the kernel source code are marked with __init because they run only once during initialization. This instructs the compiler to mark a function in a special way. The linker collects all such functions and puts them at the end of the final binary file.
Example method signature:
static int __init clk_disable_unused(void)
{
// some code
}
When the kernel starts, this code runs only once during initialization. After it runs, the kernel can free this memory to reuse it and you will see the kernel
message:
Freeing unused kernel memory: 108k freed

Why mm_struct->start_stack and vm_area_struct->start don't point to the same address?

As far as I understand memory management in the Linux kernel, there is an mm_struct structure responsible for the address space of each process. One important memory region is the stack. It should be described by a vm_area_struct memory region, and mm_struct itself has a field, mm_struct->start_stack, which holds the stack's address.
I came across the code below, and what I cannot understand is why none of the memory region start/end addresses is equal to the mm_struct->start_stack value. Any help in understanding this would be very much appreciated. Thanks
Some of the results of loading the compiled kernel module:
Vma number 14: Starts at 0x7fff4bb68000, Ends at 0x7fff4bb8a000
Vma number 15: Starts at 0x7fff4bbfc000, Ends at 0x7fff4bbfe000
Vma number 16: Starts at 0x7fff4bbfe000, Ends at 0x7fff4bc00000
Code Segment start = 0x400000, end = 0x400854
Data Segment start = 0x600858, end = 0x600a94
Stack Segment start = 0x7fff4bb88420
One can see that the stack segment start (0x7fff4bb88420) lies inside vma number 14, but I don't know why the addresses are different.
Kernel module source code:
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
static int pid_mem = 1;
/*
 * print_mem - log the address-space layout of @task.
 * @task: task whose mm_struct is inspected
 *
 * Prints the number of VMAs, the start/end address of every VMA on the
 * mm->mmap list, and the code/data/stack markers kept in the mm_struct.
 * Tasks without an mm_struct are reported and skipped instead of being
 * dereferenced.
 *
 * NOTE(review): the VMA list is walked without taking the mm's mmap
 * lock; acceptable for a demo, but confirm before reusing this elsewhere.
 */
static void print_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct vm_area_struct *vma;
	int count = 0;

	/* Kernel threads have no user address space: task->mm is NULL. */
	if (!mm) {
		printk("\n%s[%d] has no mm_struct, nothing to print.\n",
		       task->comm, task->pid);
		return;
	}

	printk("\nThis mm_struct has %d vmas.\n", mm->map_count);

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		printk ("\nVma number %d: \n", ++count);
		printk("  Starts at 0x%lx, Ends at 0x%lx\n",
		       vma->vm_start, vma->vm_end);
	}

	/* start_stack is a single address, not a range: only the start is printed. */
	printk("\nCode  Segment start = 0x%lx, end = 0x%lx \n"
	       "Data  Segment start = 0x%lx, end = 0x%lx\n"
	       "Stack Segment start = 0x%lx\n",
	       mm->start_code, mm->end_code,
	       mm->start_data, mm->end_data,
	       mm->start_stack);
}
/*
 * mm_exp_load - module entry point.
 *
 * Scans all processes and dumps the memory layout of each one whose pid
 * equals the pid_mem module parameter. Always returns 0.
 */
static int mm_exp_load(void)
{
	struct task_struct *p;

	printk("\nGot the process id to look up as %d.\n", pid_mem);

	for_each_process(p) {
		/* Skip everything that is not the requested pid. */
		if (p->pid != pid_mem)
			continue;

		printk("%s[%d]\n", p->comm, p->pid);
		print_mem(p);
	}

	return 0;
}
/* Module exit hook: only logs that the module is going away. */
static void mm_exp_unload(void)
{
printk("\nPrint segment information module exiting.\n");
}
module_init(mm_exp_load);
module_exit(mm_exp_unload);
module_param(pid_mem, int, 0);
MODULE_AUTHOR ("Krishnakumar. R, rkrishnakumar#gmail.com");
MODULE_DESCRIPTION ("Print segment information");
MODULE_LICENSE("GPL");
Looks like start_stack is the initial stack pointer address. It's calculated by the kernel when the program is executed and is based on the stack section address given in the executable file. I don't think it gets updated at all thereafter. The system uses start_stack in at least one instance: to identify which vma represents "the stack" (when providing /proc/<pid>/maps), as the vma containing that address is guaranteed to contain the (main) stack.
But note that this is only the stack for the "main" (initial) thread; a multi-threaded program will have other stacks too -- one per thread. Since they all share the same address space, all threads will show the same set of vmas, and I think you'll find they all have the same start_stack value as well. But only the main thread's stack pointer will be within the main stack vma. The other threads will each have their own stack vmas -- this is so that each thread's stack can grow independently.
In general, there is one mm_struct per process, but many vm_area_structs, each of which describes one mmapped area.
For example, on a 32-bit system a process has a virtual address space of 4GB, all of which is described by the mm_struct. However, there can be many regions within that 4GB space. Each region is described by a vm_area_struct and is bounded by vm_area_struct->vm_start and vm_area_struct->vm_end. So the mm_struct contains a list of vm_area_structs.
Here is the detail introduction.

How does linux know when to allocate more pages to a call stack?

Given the program below, segfault() will (As the name suggests) segfault the program by accessing 256k below the stack. nofault() however, gradually pushes below the stack all the way to 1m below, but never segfaults.
Additionally, running segfault() after nofault() doesn't result in an error either.
If I put sleep()s in nofault() and use the time to cat /proc/$pid/maps I see the allocated stack space grows between the first and second call, this explains why segfault() doesn't crash afterwards - there's plenty of memory.
But the disassembly shows there's no change to %rsp. This makes sense since that would screw up the call stack.
I presumed that the maximum stack size would be baked into the binary at compile time (In retrospect that would be very hard for a compiler to do) or that it would just periodically check %rsp and add a buffer after that.
How does the kernel know when to increase the stack memory?
#include <stdio.h>
#include <unistd.h>
/*
 * Read and print every byte from 256 KiB below the address of the local
 * variable x up to the end of x, walking upwards. The very first access is
 * therefore the one farthest below the current stack frame.
 */
void segfault(){
char * x;
int a;
/* Start 1024*256 bytes below &x and stop just past x itself. */
for( x = (char *)&x-1024*256; x<(char *)(&x+1); x++){
a = *x & 0xFF;
printf("%p = 0x%02x\n",x,a);
}
}
/*
 * Read and print bytes starting at the address of the local variable x and
 * walking downwards one byte at a time until 1 MiB below it. Each access is
 * thus only one byte lower than the previous one.
 */
void nofault(){
char * x;
int a;
/* Pause before the walk so the process can be inspected from outside. */
sleep(20);
for( x = (char *)(&x); x>(char *)&x-1024*1024; x--){
a = *x & 0xFF;
printf("%p = 0x%02x\n",x,a);
}
/* Pause again after the walk for a second inspection. */
sleep(20);
}
/* Run the gradual downward walk first, then the 256 KiB-below access. */
int main(){
nofault();
segfault();
}
The processor raises a page fault when you access an unmapped page. The kernel's page fault handler checks whether the address is reasonably close to the process's %rsp and if so, it allocates some memory and resumes the process. If you are too far below %rsp, the kernel passes the fault along to the process as a signal.
I tried to find the precise definition of what addresses are close enough to %rsp to trigger stack growth, and came up with this from linux/arch/x86/mm/fault.c:
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
return;
}
But experimenting with your program I found that 65536+32*sizeof(unsigned long) isn't the actual cutoff point between segfault and no segfault. It seems to be about twice that value. So I'll just stick with the vague "reasonably close" as my official answer.

Is the sscanf function in the Linux kernel susceptible to buffer overflow attacks?

From what I understand, a typical buffer overflow attack occurs when an attacker overflows a buffer of memory on the stack, thus allowing the attacker to inject malicious code and rewrite the return address on the stack to point to that code.
This is a common concern when using functions (such as sscanf) that blindly copy data from one area to another, checking one for a termination byte:
char str[8]; /* holds up to 8 bytes of data */
char *buf = "lots and lots of foobars"; /* way more than 8 bytes of data */
sscanf(buf, "%s", str); /* buffer overflow occurs here! */
I noticed some sysfs_ops store functions in the Linux kernel are implemented with the Linux kernel's version of the sscanf function:
static char str[8]; /* global string */
/*
 * sysfs store callback: copies the first whitespace-delimited word of @buf
 * into the 8-byte global str and reports the whole write as consumed.
 * @dev and @attr are not used.
 *
 * WARNING: the "%s" conversion carries no field width, so nothing in this
 * function limits how many bytes are written into str.
 */
static ssize_t my_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
sscanf(buf, "%s", str); /* buf holds more than 8 bytes! */
return size;
}
Suppose this store callback function is set to a writable sysfs attribute. Would a malicious user be able to intentionally overflow the buffer via a write call?
Normally, I would expect guards against buffer overflow attacks -- such as limiting the number of bytes read -- but I see none in a good number of functions (for example in drivers/scsi/scsi_sysfs.c).
Does the implementation of the Linux kernel version of sscanf protect against buffer overflow attacks; or is there another reason -- perhaps buffer overflow attacks are impossible given how the Linux kernel works under the hood?
The Linux sscanf() is vulnerable to buffer overflows; inspection of the source shows this. You can use width specifiers to limit the amount a %s is allowed to write. At some point your str must have had copy_from_user() run on it as well, since it is possible for user space to pass a garbage pointer to the kernel.
In the version of Linux you cited, the scsi_sysfs.c does have a buffer overflow. The latest version does not. The committed fix should fix the issue you see.
Short answer:
sscanf, when called correctly, will not cause a buffer overflow, especially in a sysfs xxx_store() function (there are many examples of sscanf in sysfs xxx_store() functions), because the Linux kernel adds a '\0' terminator after the input string (buf[len] = 0;) before it reaches your xxx_store() function, so sscanf never scans past the end of the input.
Long answer:
Normally, sysfs are defined to have a strict formatted data. Since you expect 8 bytes at most, it's reasonable to limit the size you get like this:
static char str[8]; /* global string: at most 7 characters plus the terminating '\0' */

/*
 * my_store - sysfs store callback that saves one word from @buf in str.
 * @dev:  device the attribute belongs to (unused)
 * @attr: attribute being written (unused)
 * @buf:  NUL-terminated data written by user space
 * @size: number of bytes written
 *
 * Returns @size on success, or -EINVAL when more than 8 bytes were written.
 */
static ssize_t my_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t size)
{
	if (size > 8) {
		printk("Error: Input size > 8: too large\n");
		return -EINVAL;
	}

	/*
	 * The size check alone is not enough: an 8-character write without a
	 * trailing newline passes it, and a bare "%s" would then store those
	 * 8 characters plus a '\0' -- 9 bytes -- into the 8-byte str. The
	 * field width caps the conversion at 7 characters plus the '\0'.
	 */
	sscanf(buf, "%7s", str);
	return size;
}
(Note: use 9 rather than 8 if you expect an 8-byte string plus '\n'; str must then be large enough to hold those 8 bytes plus the terminating '\0'.)
(Note that you do reject some inputs such as those with many leading white spaces. However, who would send a string with many leading white spaces? Those who want to break your code, right? If they don't follow your spec, just reject them.)
Note that the Linux kernel deliberately inserts a '\0' at offset len (i.e. buf[len] = 0;) when the user writes len bytes to sysfs, precisely so that sscanf can be used safely, as stated in a comment in kernel 2.6's fs/sysfs/file.c:
/*
 * Copy up to PAGE_SIZE - 1 bytes of user data from @buf into the buffer's
 * page and NUL-terminate it.
 *
 * Returns the number of bytes accepted (possibly clamped), -ENOMEM if the
 * page cannot be allocated, or -EFAULT if the copy from user space fails.
 */
static int
fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t count)
{
int error;
/* The backing page is allocated lazily, zero-filled, on first use. */
if (!buffer->page)
buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
if (!buffer->page)
return -ENOMEM;
/* Clamp so that there is always room for the terminating '\0' below. */
if (count >= PAGE_SIZE)
count = PAGE_SIZE - 1;
error = copy_from_user(buffer->page,buf,count);
buffer->needs_read_fill = 1;
/* if buf is assumed to contain a string, terminate it by \0,
so e.g. sscanf() can scan the string easily */
buffer->page[count] = 0;
return error ? -EFAULT : count;
}
...
/*
 * Write handler: stages the user data in the per-file sysfs buffer and then
 * hands it to flush_write_buffer(). Returns the value produced by the last
 * step that ran (a byte count, zero, or a negative error).
 */
static ssize_t
sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
struct sysfs_buffer * buffer = file->private_data;
ssize_t len;
/* The staging and flushing steps run with the buffer's mutex held. */
mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
/* Only flush when staging accepted at least one byte. */
if (len > 0)
len = flush_write_buffer(file->f_path.dentry, buffer, len);
/* Advance the file position only after a successful flush. */
if (len > 0)
*ppos += len;
mutex_unlock(&buffer->mutex);
return len;
}
Later kernel versions keep the same logic (though the code has since been completely rewritten).

Reading x86 MSR from kernel module

My main aim is to get the address values of the last 16 branches maintained by the LBR registers when a program crashes. I tried two ways till now -
1) msr-tools
This allows me to read the MSR values from the command line. I invoke it via system calls from the C program itself and try to read the values. But the register values seem to be nowhere near the addresses in the program itself. Most probably the registers are getting polluted by other branches in system code. I tried turning off recording of branches in ring 0 and far jumps, but that doesn't help. I am still getting unrelated values.
2) accessing through kernel module
Ok I wrote a very simple module (I've never done this before) to access the msr registers directly and possibly avoid register pollution.
Here's what I have -
#define LBR 0x1d9 //IA32_DEBUGCTL MSR
//I first set this to some non 0 value using wrmsr (msr-tools)
/*
 * Execute rdmsr for the MSR index @msr and print the result at KERN_EMERG
 * level. @unused2 is never referenced.
 *
 * NOTE(review): this function is marked __init but is called from
 * hello_init(), which is not -- confirm this does not cause a section
 * mismatch.
 */
static void __init do_rdmsr(unsigned msr, unsigned unused2)
{
uint64_t msr_value;
/*
 * NOTE(review): on x86-64, GCC documents the "A" constraint as "either
 * rax or rdx" for a value that fits one register, not as the EDX:EAX
 * pair. rdmsr writes both EAX and EDX, yet only one output is declared
 * here, so msr_value may not receive the full result -- verify against
 * the GCC machine-constraint documentation.
 */
__asm__ __volatile__ (" rdmsr"
: "=A" (msr_value)
: "c" (msr)
);
/* NOTE(review): %lu is paired with a uint64_t argument -- confirm the specifier matches on the target. */
printk(KERN_EMERG "%lu \n",msr_value);
}
/*
 * Module entry point: prints a "Value is " prefix and then lets do_rdmsr()
 * print the contents of the LBR-named MSR index. Always returns 0.
 */
static int hello_init(void)
{
printk(KERN_EMERG "Value is ");
/* The second argument is a dummy; do_rdmsr() ignores it. */
do_rdmsr (LBR,0);
return 0;
}
/* Module exit hook: only logs "End". */
static void hello_exit(void)
{
printk(KERN_EMERG "End\n");
}
module_init(hello_init);
module_exit(hello_exit);
But the problem is that every time I use dmesg to read the output I get just
Value is 0
(I have tried for other registers - it always comes as 0)
Is there something that I am forgetting here?
Any help? Thanks
Use the following:
/*
 * x86_get_msr - read a model-specific register.
 * @msr: MSR index, passed to the instruction in ECX
 *
 * Returns the 64-bit value assembled from the two halves the instruction
 * produces.
 */
unsigned long long x86_get_msr(int msr)
{
	unsigned long lo = 0;
	unsigned long hi = 0;
	unsigned long long value;

	/* rdmsr always returns its result as the EDX:EAX pair. */
	asm volatile ("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));

	/* EDX holds the upper 32 bits, EAX the lower 32 bits. */
	value = (unsigned long long)hi << 32;
	value |= lo;
	return value;
}
You can use Ilya Matveychikov's answer... or... OR :
#include <asm/msr.h>
int err;
unsigned int msr, cpu;
unsigned long long val;
/*
 * rdmsr without exception handling.
 * rdmsrl() is a two-argument macro that stores the result in its second
 * argument; it does not return the value.
 */
rdmsrl(msr, val);
/* rdmsr with exception handling: returns an error code, value via pointer */
err = rdmsrl_safe(msr, &val);
/* rdmsr on a given CPU (instead of current one) */
err = rdmsrl_safe_on_cpu(cpu, msr, &val);
And there are many more functions, such as :
int msr_set_bit(u32 msr, u8 bit)
int msr_clear_bit(u32 msr, u8 bit)
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
Have a look at /lib/modules/<uname -r>/build/arch/x86/include/asm/msr.h

Resources