Where could I find the code of "sched_getcpu()" - linux

Recently I'm using the function sched_getcpu() from the header file sched.h on Linux.
However, I'm wondering where could I find the source code of this function?
Thanks.

Under Linux, the sched_getcpu() function is a glibc wrapper to sys_getcpu() system call, which is architecture specific.
For the x86_64 architecture, it is defined under arch/x86/include/asm/vgtod.h as __getcpu() (tree 4.x):
#ifdef CONFIG_X86_64
#define VGETCPU_CPU_MASK 0xfff
static inline unsigned int __getcpu(void)
{
unsigned int p;
/*
* Load per CPU data from GDT. LSL is faster than RDTSCP and
* works on all CPUs. This is volatile so that it orders
* correctly wrt barrier() and to keep gcc from cleverly
* hoisting it out of the calling function.
*/
asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
return p;
}
#endif /* CONFIG_X86_64 */
Being this function called by __vdso_getcpu() declared in arch/entry/vdso/vgetcpu.c:
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
p = __getcpu();
if (cpu)
*cpu = p & VGETCPU_CPU_MASK;
if (node)
*node = p >> 12;
return 0;
}
(See vDSO for details regarding what vdso prefix is).
EDIT 1: (in reply to arm code location)
ARM code location
It can be found in the arch/arm/include/asm/thread_info.h file:
static inline struct thread_info *current_thread_info(void)
{
return (struct thread_info *)
(current_stack_pointer & ~(THREAD_SIZE - 1));
}
This function is used by raw_smp_processor_id() that is defined in the file arch/arm/include/asm/smp.h as:
#define raw_smp_processor_id() (current_thread_info()->cpu)
And it's called by getcpu system call declared in the file kernel/sys.c:
SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, struct getcpu_cache __user *, unused)
{
int err = 0;
int cpu = raw_smp_processor_id();
if (cpup)
err |= put_user(cpu, cpup);
if (nodep)
err |= put_user(cpu_to_node(cpu), nodep);
return err ? -EFAULT : 0;
}

Related

How could I use `kallsyms_lookup_name` function to fix `unknown character` error when loading Linux kernel module?

I'm trying to complete a hooking sample attachment in a program for my uni assignment. The task requires to get a system call sys_rt_sigaction hooked when initiating a loadable module in Linux kernel (I use Ubuntu 18.04 LTS, kernel version is 5.0.0-23-generic). So, the case I'm struggling originates from an error could not insert module <module name>: Unknown symbol in module once I started sudo insmod <my module name>.ko.
After some googling, I see clear this problem arises due to missing sys_call_table export to run inserting as smoothly as well. Following this post, I want to cope that invoking kallsyms_lookup_name call before kicking off init procedure.
There is .c-file which provides with definitions of operations accessible by module (file name is buffer.c):
#define __KERNEL__
#define MODULE
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <asm/uaccess.h>
#include <linux/syscalls.h>
#include <linux/kallsyms.h>
#include <linux/unistd.h>
void * sys_call_table = (void *) kallsyms_lookup_name("sys_call_table");// some wrongness here, but what exactly?
MODULE_LICENSE("GPL");
int (*real_rt_sigaction)(const char * path); // true syscall prototype
static int __init buffer_init_module(void);
static void __exit buffer_exit_module(void);
static int device_open(struct inode *, struct file *); // driver file opening
static int device_release(struct inode *, struct file *); // return of system resource control
static ssize_t device_read(struct file *, char *, size_t, loff_t *); // reading from driver file
static ssize_t device_write(struct file *, const char *, size_t, loff_t *); // writing into driver file
#define DEVICE_NAME "buffer"
#define BUF_LEN 80
// to be called instead
int alter_rt_sigaction(int signum, const struct sigaction *act,
struct sigaction *oldact, size_t sigsetsize) {
printk(KERN_INFO "Syscall function hooked - you've lost control of your experience");
return 0;
}
static int Major;
static int Device_Open = 0;
static int total_open = 1;
static char Buf[BUF_LEN + 1] = "Buffer is empty, add some input\n";
static char *Msg_ptr;
static int Buf_Char = 50;
static int Bytes_Read = 0;
static struct file_operations fops = {
.read = device_read,
.write = device_write,
.open = device_open,
.release = device_release
};
static int __init buffer_init_module(void)
{
printk(KERN_INFO
"Device initializing in progress...");
Major = register_chrdev(0, DEVICE_NAME, &fops);
if(Major < 0) {
printk("Major number hasn't been assigned - Driver registration failed\n");
return Major;
}
printk(KERN_INFO "Registration success - device major number: %d\n", Major);
real_rt_sigaction=sys_call_table[__NR_rt_sigaction];
sys_call_table[__NR_rt_sigaction]=alter_rt_sigaction; // hooking implementation
return 0;
}
static void __exit buffer_exit_module(void)
{
unregister_chrdev(Major, DEVICE_NAME);
printk(KERN_INFO "Outside the module - exit successfully completed\n");
sys_call_table[__NR_rt_sigaction]=real_rt_sigaction; // original call reset
}
static int device_open(struct inode *inode, struct file *file)
{
if(Device_Open)
return -EBUSY;
Device_Open++;
printk(KERN_INFO "Device file has been accessed %d time(s)\n", total_open++);
Msg_ptr = Buf;
try_module_get(THIS_MODULE);
Bytes_Read = 0;
return 0;
}
static int device_release(struct inode * node, struct file * filep)
{
Device_Open--;
module_put(THIS_MODULE);
printk(KERN_INFO "Device file gets close\n");
return 0;
}
static ssize_t device_read(struct file * filep, char * buffer, size_t len, loff_t * offset)
{
int got_read = Bytes_Read;
if(Bytes_Read >= Buf_Char)
return 0;
while(len && (Bytes_Read < Buf_Char)) {
put_user(Msg_ptr[Bytes_Read], buffer+Bytes_Read);
len--;
Bytes_Read++;
}
return Bytes_Read-got_read;
}
static ssize_t device_write(struct file * filep, const char * buffer, size_t len, loff_t * offset)
{
Buf_Char = 0;
if(Buf_Char >= BUF_LEN) {
return 0;
}
while(len && (Buf_Char < BUF_LEN))
{
get_user(Msg_ptr[Buf_Char], buffer+Buf_Char);
len--;
Buf_Char++;
}
return Buf_Char;
}
module_init(buffer_init_module);
module_exit(buffer_exit_module);
Additively, there is code in Makefile:
obj-m += buffer.o
all:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
The painful moment here is an error message initializer element is not constant whenever I was trying to build module via sudo make in my project folder. As I follow the beginner's tutorials and need for some basic insight, it might be highly appreciable to see any help with solution or even some ideas how to handle the same problem more effectively, indeed.

How to change scheduler policy in Loadable Kernel Module (LKM) in Linux?

I have written an LKM in Linux. The ioctl function, my_ioctl is called by a user-level program foo.c. I want to change the scheduling policy of foo.c. Therefore I am doing the following in my my_ioctl function from this link:
struct sched_attr attr;
int ret;
unsigned int flags = 0;
attr.size = sizeof(attr);
attr.sched_flags = 0;
attr.sched_nice = 0;
attr.sched_priority = 0;
/* This creates a 10ms/30ms reservation */
attr.sched_policy = SCHED_DEADLINE;
attr.sched_runtime = 10 * 1000 * 1000;
attr.sched_period = attr.sched_deadline = 30 * 1000 * 1000;
ret = sched_setattr(current->pid, &attr, flags);
if (ret < 0) {
perror("sched_setattr");
exit(-1);
}
The sched_setattr is following:
int sched_setattr(pid_t pid,
const struct sched_attr *attr,
unsigned int flags) {
return syscall(__NR_sched_setattr, pid, attr, flags);
}
I have changed the syscall to sys_mycall because it's LKM. struct sched_attr is also defined in the above mentioned Linux Kernel documentation link. However, I could not change the scheduling policy by this. It throws me error like scheduling policy cannot be changed from kernel space.
I don't understand why this is the case. There is an utility chrt which does the same thing for a process from the user-space; then why is this not possible from a LKM? Or am I missing something?

jprobe do_execve does not work with kernel 4.1

I want to set a jprobe hook on do_execve to catch every executed program.
My code is working on <= 3.2 linux kernel (debian). This is my output on linux kernel 3.2:
[ 628.534037] registered: do_execve, ret: 0
[ 723.995797] execve: /usr/bin/vi
[ 726.807025] execve: /bin/dmesg
on 4.1 kernel I the same result (everything is registered) but there is no "execve":
[ 8621.430568] registered: do_execve, ret: 0
And this is my code:
static struct jprobe jprobe_hooks[] = {
{
.entry = jdo_execve,
.kp = { .symbol_name = "do_execve" }
}};
static long jdo_execve(const char *filename, const char __user *const __user *argv, const char __user *const __user *envp, struct pt_regs *regs)
{
printk(KERN_INFO "execve: %s", filename );
}
//
// registration
//
int ret, x, reg_error;
reg_error = 0;
for (x = 0; x < sizeof(jprobe_hooks) / sizeof(jprobe_hooks[0]); x++)
{
ret = register_jprobe(&jprobe_hooks[x]);
if (ret < 0)
{
printk(KERN_INFO "register_jprobe failed, returned %d, item: %s\n", ret, jprobe_hooks[x].kp.symbol_name);
reg_error++;
}
else
{
printk(KERN_INFO "registered: %s, ret: %u\n", jprobe_hooks[x].kp.symbol_name, ret);
}
}
When I do a grep on kallsyms I get on 3.2:
grep do_execv /proc/kallsyms
ffffffff81100650 T do_execve
and on 4.2:
grep do_execv /proc/kallsyms
ffffffff811d2950 T do_execve
ffffffff811d2980 T do_execveat
I even tried to change the function (because do_execve prototype has changed) to this:
static int jdo_execve(struct filename *fname, const char __user *const __user *__argv, const char __user *const __user *__envp)
{
int i = 0;
printk(KERN_INFO "execve: %s ", fname->name );
}
and even that didn't help.
I can set hooks on other functions like do_fork or sys_open, but not on do_execve. Why? Anybody has ideas? Why is it not working anymore?
Edit:
I'm also hooking do_execveat:
static int jdo_execveat(int fd, struct filename *fname, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags)
There are several problems that may prevent you jprobe messages:
You don't end print messages printk(KERN_INFO "execve: %s", filename ); with newline, thus you log buffer is not flushed.
API had changed. Now do_execve has filename parameter as struct filename.
Your jprobe code is silly: you don't have module entry, jprobe routine must end with jprobe_return() call and so on. Look at the samples in kernel source tree at "samples/kprobes"
Try to fix it - maybe it will help.
Anyway, I tried it by myself - here is the code - and things is, indeed, looks strange. When I load the module it registers 2 jprobes - one for do_execve, another for do_execveat. But I don't see any messages when I execute programs. BUT what I do see is periodical messages like this:
jprobe: execve: /usr/lib/systemd/systemd-cgroups-agent
It means that jprobe itself works, but not for every execve call.
So I wrote a simple C program to call execve just to be sure it's really called and I still nothing happens execpt systemd-cgroups-agent.

How to test your own Linux module?

Today I am getting started with developing Linux modules. It was rather hard to write, compile and work with Helloworld, but I've done it.
My second module with open, write, read functions is ready, but I really dont know how to test it. Write method just makes printk(). My module is loaded, its name is iamnoob. How to test this write(...) function and to find smth in var/log/syslog?
cat > iamnoob just writes a file to the dir. Same with cp and other.
Sorry for noob question, i've googled, but no answer has been found. Sorry for poor English.
A basic kernel module would normally include registering a character device.
Simple imlementation requires:
Register chrdev region with specific major & minor.
Allocate file operations structure and implement the basic read / write APIs.
Initialize and register character device with the file operations structure to the major / minor region.
See the following code snippet as a template of a module (only read / write APIs are imlemented):
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm-generic/uaccess.h>
#define MY_BUFFER_SIZE (1024 * 10)
#define MY_CHRDEV_MAJOR 217
#define MY_CHRDEV_MINOR 0
static struct cdev my_cdev;
static unsigned char *my_buf;
static dev_t my_dev = MKDEV(MY_CHRDEV_MAJOR, MY_CHRDEV_MINOR);
ssize_t my_read(struct file *file, char __user * buf, size_t count, loff_t * ppos)
{
int size;
size = MY_BUFFER_SIZE - 100 - (int)*ppos;
if (size > count)
size = count;
if (copy_to_user(buf, my_buf + *ppos, count))
return -EFAULT;
*ppos += size;
return size;
}
ssize_t my_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
int size;
size = MY_BUFFER_SIZE - 100 - (int)*ppos;
if (size > count)
size = count;
if (copy_from_user(my_buf + *ppos, buf, count))
return -EFAULT;
*ppos += size;
return size;
}
long my_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
int my_mmap(struct file *f, struct vm_area_struct *vma)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
int my_open(struct inode *i, struct file *f)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
int my_release(struct inode *i, struct file *f)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
struct file_operations my_fops =
{
.owner = THIS_MODULE,
.read = &my_read,
.write = &my_write,
.unlocked_ioctl = &my_unlocked_ioctl,
.mmap = &my_mmap,
.open = &my_open,
.release = &my_release,
};
static int __init my_module_init(void)
{
int line = 0;
unsigned char *pos;
printk ("%s!\n", __FUNCTION__);
my_buf = (unsigned char *)kzalloc(MY_BUFFER_SIZE, 0);
if (my_buf == NULL) {
printk("%s - failed to kzallocate buf!\n", __FUNCTION__);
return -1;
}
pos = my_buf;
while (pos - my_buf < MY_BUFFER_SIZE - 100) {
sprintf(pos, "Line #%d\n", line++);
pos += strlen(pos);
}
cdev_init(&my_cdev, &my_fops);
if (register_chrdev_region(my_dev, 1, "my_dev")) {
pr_err("Failed to allocate device number\n");
}
cdev_add(&my_cdev, my_dev, 1);
printk ("%s - registered chrdev\n", __FUNCTION__);
return 0;
}
static void __exit my_module_exit(void)
{
printk ("my_module_exit.\n");
unregister_chrdev_region(my_dev, 1);
return;
}
module_init(my_module_init);
module_exit(my_module_exit);
MODULE_LICENSE("GPL");
This module uses a buffer for file operations, therefore can be tested on any machine, regardless of its HW. Make sure you avoid unnecessary printk's as loops may harm your kernel stability.
Once this is done, in user-space shell you should create a /dev node to represent your character device:
sudo mknod /dev/[dev_name] c [major] [minor]
for example:
sudo mknod /dev/my_dev c 217 0
Then you can test your read / write APIs with:
sudo insmod my_modult.ko
cat /dev/my_dev
less -f /dev/my_dev
sudo su
root> echo "This is a test" > /dev/my_dev
root> exit
cat /dev/my_dev
The shell commands listed above perform read, then login as root (to allow writing to device), write to the char dev, then exit and read again to see the changes.
Now you'd normally implement ioctl and mmap if needed.

Intercepting a system call [duplicate]

I'm trying to write some simple test code as a demonstration of hooking the system call table.
"sys_call_table" is no longer exported in 2.6, so I'm just grabbing the address from the System.map file, and I can see it is correct (Looking through the memory at the address I found, I can see the pointers to the system calls).
However, when I try to modify this table, the kernel gives an "Oops" with "unable to handle kernel paging request at virtual address c061e4f4" and the machine reboots.
This is CentOS 5.4 running 2.6.18-164.10.1.el5. Is there some sort of protection or do I just have a bug? I know it comes with SELinux, and I've tried putting it in to permissive mode, but it doesn't make a difference
Here's my code:
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
asmlinkage int our_sys_open(const char* file, int flags, int mode)
{
printk("A file was opened\n");
return original_call(file, flags, mode);
}
int init_module()
{
// sys_call_table address in System.map
sys_call_table = (void*)0xc061e4e0;
original_call = sys_call_table[__NR_open];
// Hook: Crashes here
sys_call_table[__NR_open] = our_sys_open;
}
void cleanup_module()
{
// Restore the original call
sys_call_table[__NR_open] = original_call;
}
I finally found the answer myself.
http://www.linuxforums.org/forum/linux-kernel/133982-cannot-modify-sys_call_table.html
The kernel was changed at some point so that the system call table is read only.
cypherpunk:
Even if it is late but the Solution
may interest others too: In the
entry.S file you will find: Code:
.section .rodata,"a"
#include "syscall_table_32.S"
sys_call_table -> ReadOnly You have to
compile the Kernel new if you want to
"hack" around with sys_call_table...
The link also has an example of changing the memory to be writable.
nasekomoe:
Hi everybody. Thanks for replies. I
solved the problem long ago by
modifying access to memory pages. I
have implemented two functions that do
it for my upper level code:
#include <asm/cacheflush.h>
#ifdef KERN_2_6_24
#include <asm/semaphore.h>
int set_page_rw(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ | VM_WRITE;
return change_page_attr(pg, 1, prot);
}
int set_page_ro(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ;
return change_page_attr(pg, 1, prot);
}
#else
#include <linux/semaphore.h>
int set_page_rw(long unsigned int _addr)
{
return set_memory_rw(_addr, 1);
}
int set_page_ro(long unsigned int _addr)
{
return set_memory_ro(_addr, 1);
}
#endif // KERN_2_6_24
Here's a modified version of the original code that works for me.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
asmlinkage int our_sys_open(const char* file, int flags, int mode)
{
printk("A file was opened\n");
return original_call(file, flags, mode);
}
int set_page_rw(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ | VM_WRITE;
return change_page_attr(pg, 1, prot);
}
int init_module()
{
// sys_call_table address in System.map
sys_call_table = (void*)0xc061e4e0;
original_call = sys_call_table[__NR_open];
set_page_rw(sys_call_table);
sys_call_table[__NR_open] = our_sys_open;
}
void cleanup_module()
{
// Restore the original call
sys_call_table[__NR_open] = original_call;
}
Thanks Stephen, your research here was helpful to me. I had a few problems, though, as I was trying this on a 2.6.32 kernel, and getting WARNING: at arch/x86/mm/pageattr.c:877 change_page_attr_set_clr+0x343/0x530() (Not tainted) followed by a kernel OOPS about not being able to write to the memory address.
The comment above the mentioned line states:
// People should not be passing in unaligned addresses
The following modified code works:
int set_page_rw(long unsigned int _addr)
{
return set_memory_rw(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
int set_page_ro(long unsigned int _addr)
{
return set_memory_ro(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
Note that this still doesn't actually set the page as read/write in some situations. The static_protections() function, which is called inside of set_memory_rw(), removes the _PAGE_RW flag if:
It's in the BIOS area
The address is inside .rodata
CONFIG_DEBUG_RODATA is set and the kernel is set to read-only
I found this out after debugging why I still got "unable to handle kernel paging request" when trying to modify the address of kernel functions. I was eventually able to solve that problem by finding the page table entry for the address myself and manually setting it to writable. Thankfully, the lookup_address() function is exported in version 2.6.26+. Here is the code I wrote to do that:
void set_addr_rw(unsigned long addr) {
unsigned int level;
pte_t *pte = lookup_address(addr, &level);
if (pte->pte &~ _PAGE_RW) pte->pte |= _PAGE_RW;
}
void set_addr_ro(unsigned long addr) {
unsigned int level;
pte_t *pte = lookup_address(addr, &level);
pte->pte = pte->pte &~_PAGE_RW;
}
Finally, while Mark's answer is technically correct, it'll case problem when ran inside Xen. If you want to disable write-protect, use the read/write cr0 functions. I macro them like this:
#define GPF_DISABLE write_cr0(read_cr0() & (~ 0x10000))
#define GPF_ENABLE write_cr0(read_cr0() | 0x10000)
Hope this helps anyone else who stumbles upon this question.
Note that the following will also work instead of using change_page_attr and cannot be depreciated:
static void disable_page_protection(void) {
unsigned long value;
asm volatile("mov %%cr0,%0" : "=r" (value));
if (value & 0x00010000) {
value &= ~0x00010000;
asm volatile("mov %0,%%cr0": : "r" (value));
}
}
static void enable_page_protection(void) {
unsigned long value;
asm volatile("mov %%cr0,%0" : "=r" (value));
if (!(value & 0x00010000)) {
value |= 0x00010000;
asm volatile("mov %0,%%cr0": : "r" (value));
}
}
If you are dealing with kernel 3.4 and later (it can also work with earlier kernels, I didn't test it) I would recommend a smarter way to acquire the system callы table location.
For example
#include <linux/module.h>
#include <linux/kallsyms.h>
static unsigned long **p_sys_call_table;
/* Aquire system calls table address */
p_sys_call_table = (void *) kallsyms_lookup_name("sys_call_table");
That's it. No addresses, it works fine with every kernel I've tested.
The same way you can use a not exported Kernel function from your module:
static int (*ref_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write);
ref_access_remote_vm = (void *)kallsyms_lookup_name("access_remote_vm");
Enjoy!
As others have hinted, the whole story is a bit different now on modern kernels. I'll be covering x86-64 here, for syscall hijacking on modern arm64 refer to this other answer of mine. Also NOTE: this is plain and simple syscall hijacking. Non-invasive hooking can be done in a much nicer way using kprobes.
Since Linux v4.17, x86 (both 64 and 32 bit) now uses syscall wrappers that take a struct pt_regs * as the only argument (see commit 1, commit 2). You can see arch/x86/include/asm/syscall.h for the definitions.
Additionally, as others have described already in different answers, the simplest way to modify sys_call_table is to temporarily disable CR0 WP (Write-Protect) bit, which could be done using read_cr0() and write_cr0(). However, since Linux v5.3, [native_]write_cr0 will check sensitive bits that should never change (like WP) and refuse to change them (commit). In order to work around this, we need to write CR0 manually using inline assembly.
Here is a working kernel module (tested on Linux 5.10 and 5.18) that does syscall hijacking on modern Linux x86-64 considering the above caveats and assuming that you already know the address of sys_call_table (if you also want to find that in the module, see Proper way of getting the address of non-exported kernel symbols in a Linux kernel module):
// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/**
* Test syscall table hijacking on x86-64. This module will replace the `read`
* syscall with a simple wrapper which logs every invocation of `read` using
* printk().
*
* Tested on Linux x86-64 v5.10, v5.18.
*
* Usage:
*
* sudo cat /proc/kallsyms | grep sys_call_table # grab address
* sudo insmod syscall_hijack.ko sys_call_table_addr=0x<address_here>
*/
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <asm/special_insns.h> // {read,write}_cr0()
#include <asm/processor-flags.h> // X86_CR0_WP
#include <asm/unistd.h> // __NR_*
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
static sys_call_ptr_t *real_sys_call_table;
static sys_call_ptr_t original_read;
static unsigned long sys_call_table_addr;
module_param(sys_call_table_addr, ulong, 0);
MODULE_PARM_DESC(sys_call_table_addr, "Address of sys_call_table");
// Since Linux v5.3 [native_]write_cr0 won't change "sensitive" CR0 bits, need
// to re-implement this ourselves.
static void write_cr0_unsafe(unsigned long val)
{
asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
}
static long myread(const struct pt_regs *regs)
{
pr_info("read(%ld, 0x%lx, %lx)\n", regs->di, regs->si, regs->dx);
return original_read(regs);
}
static int __init modinit(void)
{
unsigned long old_cr0;
real_sys_call_table = (typeof(real_sys_call_table))sys_call_table_addr;
pr_info("init\n");
// Temporarily disable CR0 WP to be able to write to read-only pages
old_cr0 = read_cr0();
write_cr0_unsafe(old_cr0 & ~(X86_CR0_WP));
// Overwrite syscall and save original to be restored later
original_read = real_sys_call_table[__NR_read];
real_sys_call_table[__NR_read] = myread;
// Restore CR0 WP
write_cr0_unsafe(old_cr0);
pr_info("init done\n");
return 0;
}
static void __exit modexit(void)
{
unsigned long old_cr0;
pr_info("exit\n");
old_cr0 = read_cr0();
write_cr0_unsafe(old_cr0 & ~(X86_CR0_WP));
// Restore original syscall
real_sys_call_table[__NR_read] = original_read;
write_cr0_unsafe(old_cr0);
pr_info("goodbye\n");
}
module_init(modinit);
module_exit(modexit);
MODULE_VERSION("0.1");
MODULE_DESCRIPTION("Test syscall table hijacking on x86-64.");
MODULE_AUTHOR("Marco Bonelli");
MODULE_LICENSE("Dual MIT/GPL");

Resources