Related
Is there a way to reserve a particular range of virtual address space in a process memory map to stop ld.so (dynamic linker) from loading any shared objects into that range. Something like a system wide configuration option that reserves a particular range.
I want to be able to map a region of shared memory into exactly the same virtual address space in several processes so that my pointers in my data-structures will still work. I know I could redesign to use offsets instead of pointers but I don't want to do that.
You can do this by creating a simple shared object and running it via LD_PRELOAD. Compile the following code:
#include <sys/mman.h> // for mmap, munmap, and related constants
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
void my_program_init() __attribute__((constructor));
void *const address = ((void*)0x10000000);
const int size = 0x1000;
void my_program_init() {
printf("Hello from my_program_init!\n");
int fd = shm_open("/mysharedmem", O_CREAT | O_RDWR, 0666);
if (fd == -1) {
printf("shm_open\n");
return;
}
if (ftruncate(fd, size) == -1) {
printf("ftruncate\n");
return;
}
void* shared_mem = mmap(address, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
if (shared_mem == MAP_FAILED) {
printf("mmap\n");
return;
}
return;
}
with the following options:
gcc -shared -fPIC -o libmylib.so myso.c
Then you can run your program like this:
LD_PRELOAD=./libmylib.so ./your_prog
The so is then loaded before any runtime linking happens in your program. The function in the so tagged as a constructor runs immediately and uses mmap to reserve the memory you want for your shared block.
You can see this working with the following example program:
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>
int main() {
char *data = (char*)0x10000000;
const char *message = "Hello, world!\n";
memcpy(data, message, strlen(message));
printf("Wrote %ld bytes to memory at address %p %s\n", strlen(message), data, data);
return 0;
}
If you run this without the LD_PRELOAD it will segfault, but if you include the preload the shared block of memory is available as expected.
$ LD_PRELOAD=./libmylib.so ./a.out
Hello from my_program_init!
Wrote 14 bytes to memory at address 0x10000000 Hello, world!
You can construct your own tests to validate that the memory block is actually shared but the easiest check is to recompile the test program again without the memcpy and see that the string is still there from the first run of the program.
I'm trying to write some simple test code as a demonstration of hooking the system call table.
"sys_call_table" is no longer exported in 2.6, so I'm just grabbing the address from the System.map file, and I can see it is correct (Looking through the memory at the address I found, I can see the pointers to the system calls).
However, when I try to modify this table, the kernel gives an "Oops" with "unable to handle kernel paging request at virtual address c061e4f4" and the machine reboots.
This is CentOS 5.4 running 2.6.18-164.10.1.el5. Is there some sort of protection or do I just have a bug? I know it comes with SELinux, and I've tried putting it in to permissive mode, but it doesn't make a difference
Here's my code:
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
asmlinkage int our_sys_open(const char* file, int flags, int mode)
{
printk("A file was opened\n");
return original_call(file, flags, mode);
}
int init_module()
{
// sys_call_table address in System.map
sys_call_table = (void*)0xc061e4e0;
original_call = sys_call_table[__NR_open];
// Hook: Crashes here
sys_call_table[__NR_open] = our_sys_open;
}
void cleanup_module()
{
// Restore the original call
sys_call_table[__NR_open] = original_call;
}
I finally found the answer myself.
http://www.linuxforums.org/forum/linux-kernel/133982-cannot-modify-sys_call_table.html
The kernel was changed at some point so that the system call table is read only.
cypherpunk:
Even if it is late but the Solution
may interest others too: In the
entry.S file you will find: Code:
.section .rodata,"a"
#include "syscall_table_32.S"
sys_call_table -> ReadOnly You have to
compile the Kernel new if you want to
"hack" around with sys_call_table...
The link also has an example of changing the memory to be writable.
nasekomoe:
Hi everybody. Thanks for replies. I
solved the problem long ago by
modifying access to memory pages. I
have implemented two functions that do
it for my upper level code:
#include <asm/cacheflush.h>
#ifdef KERN_2_6_24
#include <asm/semaphore.h>
int set_page_rw(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ | VM_WRITE;
return change_page_attr(pg, 1, prot);
}
int set_page_ro(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ;
return change_page_attr(pg, 1, prot);
}
#else
#include <linux/semaphore.h>
int set_page_rw(long unsigned int _addr)
{
return set_memory_rw(_addr, 1);
}
int set_page_ro(long unsigned int _addr)
{
return set_memory_ro(_addr, 1);
}
#endif // KERN_2_6_24
Here's a modified version of the original code that works for me.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
asmlinkage int our_sys_open(const char* file, int flags, int mode)
{
printk("A file was opened\n");
return original_call(file, flags, mode);
}
int set_page_rw(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ | VM_WRITE;
return change_page_attr(pg, 1, prot);
}
int init_module()
{
// sys_call_table address in System.map
sys_call_table = (void*)0xc061e4e0;
original_call = sys_call_table[__NR_open];
set_page_rw(sys_call_table);
sys_call_table[__NR_open] = our_sys_open;
}
void cleanup_module()
{
// Restore the original call
sys_call_table[__NR_open] = original_call;
}
Thanks Stephen, your research here was helpful to me. I had a few problems, though, as I was trying this on a 2.6.32 kernel, and getting WARNING: at arch/x86/mm/pageattr.c:877 change_page_attr_set_clr+0x343/0x530() (Not tainted) followed by a kernel OOPS about not being able to write to the memory address.
The comment above the mentioned line states:
// People should not be passing in unaligned addresses
The following modified code works:
int set_page_rw(long unsigned int _addr)
{
return set_memory_rw(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
int set_page_ro(long unsigned int _addr)
{
return set_memory_ro(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
Note that this still doesn't actually set the page as read/write in some situations. The static_protections() function, which is called inside of set_memory_rw(), removes the _PAGE_RW flag if:
It's in the BIOS area
The address is inside .rodata
CONFIG_DEBUG_RODATA is set and the kernel is set to read-only
I found this out after debugging why I still got "unable to handle kernel paging request" when trying to modify the address of kernel functions. I was eventually able to solve that problem by finding the page table entry for the address myself and manually setting it to writable. Thankfully, the lookup_address() function is exported in version 2.6.26+. Here is the code I wrote to do that:
void set_addr_rw(unsigned long addr) {
unsigned int level;
pte_t *pte = lookup_address(addr, &level);
if (pte->pte &~ _PAGE_RW) pte->pte |= _PAGE_RW;
}
void set_addr_ro(unsigned long addr) {
unsigned int level;
pte_t *pte = lookup_address(addr, &level);
pte->pte = pte->pte &~_PAGE_RW;
}
Finally, while Mark's answer is technically correct, it'll case problem when ran inside Xen. If you want to disable write-protect, use the read/write cr0 functions. I macro them like this:
#define GPF_DISABLE write_cr0(read_cr0() & (~ 0x10000))
#define GPF_ENABLE write_cr0(read_cr0() | 0x10000)
Hope this helps anyone else who stumbles upon this question.
Note that the following will also work instead of using change_page_attr and cannot be depreciated:
static void disable_page_protection(void) {
unsigned long value;
asm volatile("mov %%cr0,%0" : "=r" (value));
if (value & 0x00010000) {
value &= ~0x00010000;
asm volatile("mov %0,%%cr0": : "r" (value));
}
}
static void enable_page_protection(void) {
unsigned long value;
asm volatile("mov %%cr0,%0" : "=r" (value));
if (!(value & 0x00010000)) {
value |= 0x00010000;
asm volatile("mov %0,%%cr0": : "r" (value));
}
}
If you are dealing with kernel 3.4 and later (it can also work with earlier kernels, I didn't test it) I would recommend a smarter way to acquire the system callы table location.
For example
#include <linux/module.h>
#include <linux/kallsyms.h>
static unsigned long **p_sys_call_table;
/* Aquire system calls table address */
p_sys_call_table = (void *) kallsyms_lookup_name("sys_call_table");
That's it. No addresses, it works fine with every kernel I've tested.
The same way you can use a not exported Kernel function from your module:
static int (*ref_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write);
ref_access_remote_vm = (void *)kallsyms_lookup_name("access_remote_vm");
Enjoy!
As others have hinted, the whole story is a bit different now on modern kernels. I'll be covering x86-64 here, for syscall hijacking on modern arm64 refer to this other answer of mine. Also NOTE: this is plain and simple syscall hijacking. Non-invasive hooking can be done in a much nicer way using kprobes.
Since Linux v4.17, x86 (both 64 and 32 bit) now uses syscall wrappers that take a struct pt_regs * as the only argument (see commit 1, commit 2). You can see arch/x86/include/asm/syscall.h for the definitions.
Additionally, as others have described already in different answers, the simplest way to modify sys_call_table is to temporarily disable CR0 WP (Write-Protect) bit, which could be done using read_cr0() and write_cr0(). However, since Linux v5.3, [native_]write_cr0 will check sensitive bits that should never change (like WP) and refuse to change them (commit). In order to work around this, we need to write CR0 manually using inline assembly.
Here is a working kernel module (tested on Linux 5.10 and 5.18) that does syscall hijacking on modern Linux x86-64 considering the above caveats and assuming that you already know the address of sys_call_table (if you also want to find that in the module, see Proper way of getting the address of non-exported kernel symbols in a Linux kernel module):
// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/**
* Test syscall table hijacking on x86-64. This module will replace the `read`
* syscall with a simple wrapper which logs every invocation of `read` using
* printk().
*
* Tested on Linux x86-64 v5.10, v5.18.
*
* Usage:
*
* sudo cat /proc/kallsyms | grep sys_call_table # grab address
* sudo insmod syscall_hijack.ko sys_call_table_addr=0x<address_here>
*/
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <asm/special_insns.h> // {read,write}_cr0()
#include <asm/processor-flags.h> // X86_CR0_WP
#include <asm/unistd.h> // __NR_*
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
static sys_call_ptr_t *real_sys_call_table;
static sys_call_ptr_t original_read;
static unsigned long sys_call_table_addr;
module_param(sys_call_table_addr, ulong, 0);
MODULE_PARM_DESC(sys_call_table_addr, "Address of sys_call_table");
// Since Linux v5.3 [native_]write_cr0 won't change "sensitive" CR0 bits, need
// to re-implement this ourselves.
static void write_cr0_unsafe(unsigned long val)
{
asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
}
static long myread(const struct pt_regs *regs)
{
pr_info("read(%ld, 0x%lx, %lx)\n", regs->di, regs->si, regs->dx);
return original_read(regs);
}
static int __init modinit(void)
{
unsigned long old_cr0;
real_sys_call_table = (typeof(real_sys_call_table))sys_call_table_addr;
pr_info("init\n");
// Temporarily disable CR0 WP to be able to write to read-only pages
old_cr0 = read_cr0();
write_cr0_unsafe(old_cr0 & ~(X86_CR0_WP));
// Overwrite syscall and save original to be restored later
original_read = real_sys_call_table[__NR_read];
real_sys_call_table[__NR_read] = myread;
// Restore CR0 WP
write_cr0_unsafe(old_cr0);
pr_info("init done\n");
return 0;
}
static void __exit modexit(void)
{
unsigned long old_cr0;
pr_info("exit\n");
old_cr0 = read_cr0();
write_cr0_unsafe(old_cr0 & ~(X86_CR0_WP));
// Restore original syscall
real_sys_call_table[__NR_read] = original_read;
write_cr0_unsafe(old_cr0);
pr_info("goodbye\n");
}
module_init(modinit);
module_exit(modexit);
MODULE_VERSION("0.1");
MODULE_DESCRIPTION("Test syscall table hijacking on x86-64.");
MODULE_AUTHOR("Marco Bonelli");
MODULE_LICENSE("Dual MIT/GPL");
I have a few questions on using shared memory with processes. I looked at several previous posts and couldn't glean the answers precisely enough. Thanks in advance for your help.
I'm using shm_open + mmap like below. This code works as intended with parent and child alternating to increment g_shared->count (the synchronization is not portable; it works only for certain memory models, but good enough for my case for now). However, when I change MAP_SHARED to MAP_ANONYMOUS | MAP_SHARED, the memory isn't shared and the program hangs since the 'flag' doesn't get flipped. Removing the flag confirms what's happening with each process counting from 0 to 10 (implying that each has its own copy of the structure and hence the 'count' field). Is this the expected behavior? I don't want the memory to be backed by a file; I really want to emulate what might happen if these were threads instead of processes (they need to be processes for other reasons).
Do I really need shm_open? Since the processes belong to the same hierarchy, can I just use mmap alone instead? I understand this would be fairly straightforward if there wasn't an 'exec,' but how do I get it to work when there is an 'exec' following the 'fork?'
I'm using kernel version 3.2.0-23 on x86_64 (Intel i7-2600). For this implementation, does mmap give the same behavior (correctness as well as performance) as shared memory with pthreads sharing the same global object? For example, does the MMU map the segment with 'cacheable' MTRR/TLB attributes?
Is the cleanup_shared() code correct? Is it leaking any memory? How could I check? For example, is there an equivalent of System V's 'ipcs?'
thanks,
/Doobs
shmem.h:
#ifndef __SHMEM_H__
#define __SHMEM_H__
//includes
#define LEN 1000
#define ITERS 10
#define SHM_FNAME "/myshm"
typedef struct shmem_obj {
int count;
char buff[LEN];
volatile int flag;
} shmem_t;
extern shmem_t* g_shared;
extern char proc_name[100];
extern int fd;
void cleanup_shared() {
munmap(g_shared, sizeof(shmem_t));
close(fd);
shm_unlink(SHM_FNAME);
}
static inline
void init_shared() {
int oflag;
if (!strcmp(proc_name, "parent")) {
oflag = O_CREAT | O_RDWR;
} else {
oflag = O_RDWR;
}
fd = shm_open(SHM_FNAME, oflag, (S_IREAD | S_IWRITE));
if (fd == -1) {
perror("shm_open");
exit(EXIT_FAILURE);
}
if (ftruncate(fd, sizeof(shmem_t)) == -1) {
perror("ftruncate");
shm_unlink(SHM_FNAME);
exit(EXIT_FAILURE);
}
g_shared = mmap(NULL, sizeof(shmem_t),
(PROT_WRITE | PROT_READ),
MAP_SHARED, fd, 0);
if (g_shared == MAP_FAILED) {
perror("mmap");
cleanup_shared();
exit(EXIT_FAILURE);
}
}
static inline
void proc_write(const char* s) {
fprintf(stderr, "[%s] %s\n", proc_name, s);
}
#endif // __SHMEM_H__
shmem1.c (parent process):
#include "shmem.h"
int fd;
shmem_t* g_shared;
char proc_name[100];
void work() {
int i;
for (i = 0; i < ITERS; ++i) {
while (g_shared->flag);
++g_shared->count;
sprintf(g_shared->buff, "%s: %d", proc_name, g_shared->count);
proc_write(g_shared->buff);
g_shared->flag = !g_shared->flag;
}
}
int main(int argc, char* argv[], char* envp[]) {
int status, child;
strcpy(proc_name, "parent");
init_shared(argv);
fprintf(stderr, "Map address is: %p\n", g_shared);
if (child = fork()) {
work();
waitpid(child, &status, 0);
cleanup_shared();
fprintf(stderr, "Parent finished!\n");
} else { /* child executes shmem2 */
execvpe("./shmem2", argv + 2, envp);
}
}
shmem2.c (child process):
#include "shmem.h"
int fd;
shmem_t* g_shared;
char proc_name[100];
void work() {
int i;
for (i = 0; i < ITERS; ++i) {
while (!g_shared->flag);
++g_shared->count;
sprintf(g_shared->buff, "%s: %d", proc_name, g_shared->count);
proc_write(g_shared->buff);
g_shared->flag = !g_shared->flag;
}
}
int main(int argc, char* argv[], char* envp[]) {
int status;
strcpy(proc_name, "child");
init_shared(argv);
fprintf(stderr, "Map address is: %p\n", g_shared);
work();
cleanup_shared();
return 0;
}
Passing MAP_ANONYMOUS causes the kernel to ignore your file descriptor argument and give you a private mapping instead. That's not what you want.
Yes, you can create an anonymous shared mapping in a parent process, fork, and have the child process inherit the mapping, sharing the memory with the parent and any other children. That obvoiusly doesn't survive an exec() though.
I don't understand this question; pthreads doesn't allocate memory. The cacheability will depend on the file descriptor you mapped. If it's a disk file or anonymous mapping, then it's cacheable memory. If it's a video framebuffer device, it's probably not.
That's the right way to call munmap(), but I didn't verify the logic beyond that. All processes need to unmap, only one should call unlink.
2b) as a middle-ground of a sort, it is possible to call:
int const shm_fd = shm_open(fn,...);
shm_unlink(fn);
in a parent process, and then pass fd to a child process created by fork()/execve() via argp or envp. since open file descriptors of this type will survive the fork()/execve(), you can mmap the fd in both the parent process and any dervied processes. here's a more complete code example copied and simplified/sanitized from code i ran successfully under Ubuntu 12.04 / linux kernel 3.13 / glibc 2.15:
int create_shm_fd( void ) {
int oflags = O_RDWR | O_CREAT | O_TRUNC;
string const fn = "/some_shm_fn_maybe_with_pid";
int fd;
neg_one_fail( fd = shm_open( fn.c_str(), oflags, S_IRUSR | S_IWUSR ), "shm_open" );
if( fd == -1 ) { rt_err( strprintf( "shm_open() failed with errno=%s", str(errno).c_str() ) ); }
// for now, we'll just pass the open fd to our child process, so
// we don't need the file/name/link anymore, and by unlinking it
// here we can try to minimize the chance / amount of OS-level shm
// leakage.
neg_one_fail( shm_unlink( fn.c_str() ), "shm_unlink" );
// by default, the fd returned from shm_open() has FD_CLOEXEC
// set. it seems okay to remove it so that it will stay open
// across execve.
int fd_flags = 0;
neg_one_fail( fd_flags = fcntl( fd, F_GETFD ), "fcntl" );
fd_flags &= ~FD_CLOEXEC;
neg_one_fail( fcntl( fd, F_SETFD, fd_flags ), "fcntl" );
// resize the shm segment for later mapping via mmap()
neg_one_fail( ftruncate( fd, 1024*1024*4 ), "ftruncate" );
return fd;
}
it's not 100% clear to me if it's okay spec-wise to remove the FD_CLOEXEC and/or assume that after doing so the fd really will survive the exec. the man page for exec is unclear; it says: "POSIX shared memory regions are unmapped", but to me that's redundant with the general comments earlier that mapping are not preserved, and doesn't say that shm_open()'d fd will be closed. any of course there's the fact that, as i mentioned, the code does seem to work in at least one case.
the reason i might use this approach is that it would seem to reduce the chance of leaking the shared memory segment / filename, and it makes it clear that i don't need persistence of the memory segment.
So I'm trying to write a kernel module that uses the linux/timer.h file. I got it to work inside just the module, and now I am trying to get it to work from a user program.
Here is my kernel module:
//Necessary Includes For Device Drivers.
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <linux/timer.h>
#include <linux/ioctl.h>
#define DEVICE_NAME "mytimer"
#define DEVICE_FILE_NAME "mytimer"
#define MAJOR_NUM 61
#define MINOR_NUM 0
MODULE_LICENSE("Dual BSD/GPL");
static struct timer_list my_timer;
struct file_operations FileOps =
{
//No File Operations for this timer.
};
//Function to perform when timer expires.
void TimerExpire(int data)
{
printk("Timer Data: %d\n", data);
}
//Function to set up timers.
void TimerSetup(void)
{
setup_timer(&my_timer, TimerExpire, 5678);
mod_timer(&my_timer, jiffies + msecs_to_jiffies(5000));
}
//Module Init and Exit Functions.
int init_module(void)
{
int initResult = register_chrdev(MAJOR_NUM, "mytimer", &FileOps);
if (initResult < 0)
{
printk("Cannot obtain major number %d\n", MAJOR_NUM);
return initResult;
}
printk("Loading MyTimer Kernel Module...\n");
return 0;
}
void cleanup_module(void)
{
unregister_chrdev(MAJOR_NUM, "mytimer");
printk("Unloading MyTimer Kernel Module...\n");
}
More specifically, I want my user program to call the TimerSetup() function. I know that I'll need to use ioctl() but I'm not sure how to specify in my MODULE FILE that TimerSetup() should be callable via ioctl().
Also, my second question: I was able to insmod my module and also mknod into /dev/mytimer with the correct major number. But when I tried to open() it so that I can get the file descriptor from it, it kept returning -1, which I'm assuming is wrong. I made sure the permissions were fine (in fact, I made it 777 just to be sure)... It still doesn't work... Is there something I'm missing?
Here is the user program just in case:
#include <stdio.h>
int main(int argc, char* argv[])
{
int fd = open("/dev/mytimer", "r");
printf("fd: %d\n", fd);
return 0;
}
The example code you need can be found in drivers/watchdog/softdog.c (from Linux 2.6.33 at the time this was written), which illustrates proper file operations as well as how to permit userland to fill a structure with ioctl().
It's actually a great, working tutorial for anyone who needs to write trivial character device drivers.
I dissected softdog's ioctl interface when answering my own question, which may be helpful to you.
Here's the gist of it (though far from exhaustive) ...
In softdog_ioctl() you see a simple initialization of struct watchdog_info that advertises functionality, version and device information:
static const struct watchdog_info ident = {
.options = WDIOF_SETTIMEOUT |
WDIOF_KEEPALIVEPING |
WDIOF_MAGICCLOSE,
.firmware_version = 0,
.identity = "Software Watchdog",
};
We then look at a simple case where the user just wants to obtain these capabilities:
switch (cmd) {
case WDIOC_GETSUPPORT:
return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0;
... which of course, will fill the corresponding userspace watchdog_info with the initialized values above. If copy_to_user() fails, -EFAULT is returned which causes the corresponding userspace ioctl() call to return -1 with a meaningful errno being set.
Note, the magic requests are actually defined in linux/watchdog.h , so that the kernel and userspace share them:
#define WDIOC_GETSUPPORT _IOR(WATCHDOG_IOCTL_BASE, 0, struct watchdog_info)
#define WDIOC_GETSTATUS _IOR(WATCHDOG_IOCTL_BASE, 1, int)
#define WDIOC_GETBOOTSTATUS _IOR(WATCHDOG_IOCTL_BASE, 2, int)
#define WDIOC_GETTEMP _IOR(WATCHDOG_IOCTL_BASE, 3, int)
#define WDIOC_SETOPTIONS _IOR(WATCHDOG_IOCTL_BASE, 4, int)
#define WDIOC_KEEPALIVE _IOR(WATCHDOG_IOCTL_BASE, 5, int)
#define WDIOC_SETTIMEOUT _IOWR(WATCHDOG_IOCTL_BASE, 6, int)
#define WDIOC_GETTIMEOUT _IOR(WATCHDOG_IOCTL_BASE, 7, int)
#define WDIOC_SETPRETIMEOUT _IOWR(WATCHDOG_IOCTL_BASE, 8, int)
#define WDIOC_GETPRETIMEOUT _IOR(WATCHDOG_IOCTL_BASE, 9, int)
#define WDIOC_GETTIMELEFT _IOR(WATCHDOG_IOCTL_BASE, 10, int)
WDIOC obviously signifying "Watchdog ioctl"
You can easily take that a step further, having your driver do something and place the result of that something in the structure and copy it to userspace. For instance, if struct watchdog_info also had a member __u32 result_code. Note, __u32 is just the kernel's version of uint32_t.
With ioctl(), the user passes the address of an object, be it a structure, integer, whatever to the kernel expecting the kernel to write its reply in an identical object and copy the results to the address that was provided.
The second thing you are going to need to do is make sure your device knows what to do when someone opens, reads from it, writes to it, or uses a hook like ioctl(), which you can easily see by studying softdog.
Of interest is:
static const struct file_operations softdog_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.write = softdog_write,
.unlocked_ioctl = softdog_ioctl,
.open = softdog_open,
.release = softdog_release,
};
Where you see the unlocked_ioctl handler going to ... you guessed it, softdog_ioctl().
I think you might be juxtaposing a layer of complexity that really doesn't exist when dealing with ioctl(), it really is that simple. For that same reason, most kernel developers frown on new ioctl interfaces being added unless they are absolutely necessary. Its just too easy to lose track of the type that ioctl() is going to fill vs the magic you use to do it, which is the primary reason that copy_to_user() fails often resulting in the kernel rotting with hordes of userspace processes stuck in disk sleep.
For a timer, I agree, ioctl() is the shortest path to sanity.
You are missing a .open function pointer in your file_operations structure to specify the function to be called when a process attempts to open the device file. You will need to specify a .ioctl function pointer for your ioctl function as well.
Try reading through The Linux Kernel Module Programming Guide, specifically chapters 4 (Character Device Files) and 7 (Talking to Device Files).
Chapter 4 introduces the file_operations structure, which holds pointers to functions defined by the module/driver that perform various operations such as open or ioctl.
Chapter 7 provides information on communicating with a module/drive via ioctls.
Linux Device Drivers, Third Edition is another good resource.
Minimal runnable example
Tested in a fully reproducible QEMU + Buildroot environment, so might help others get their ioctl working. GitHub upstream:
kernel module |
shared header |
userland.
The most annoying part was understanding that some low ids are hijacked: ioctl is not called if cmd = 2 , you have to use _IOx macros.
Kernel module:
#include <asm/uaccess.h> /* copy_from_user, copy_to_user */
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/printk.h> /* printk */
#include "ioctl.h"
MODULE_LICENSE("GPL");
static struct dentry *dir;
static long unlocked_ioctl(struct file *filp, unsigned int cmd, unsigned long argp)
{
void __user *arg_user;
union {
int i;
lkmc_ioctl_struct s;
} arg_kernel;
arg_user = (void __user *)argp;
pr_info("cmd = %x\n", cmd);
switch (cmd) {
case LKMC_IOCTL_INC:
if (copy_from_user(&arg_kernel.i, arg_user, sizeof(arg_kernel.i))) {
return -EFAULT;
}
pr_info("0 arg = %d\n", arg_kernel.i);
arg_kernel.i += 1;
if (copy_to_user(arg_user, &arg_kernel.i, sizeof(arg_kernel.i))) {
return -EFAULT;
}
break;
case LKMC_IOCTL_INC_DEC:
if (copy_from_user(&arg_kernel.s, arg_user, sizeof(arg_kernel.s))) {
return -EFAULT;
}
pr_info("1 arg = %d %d\n", arg_kernel.s.i, arg_kernel.s.j);
arg_kernel.s.i += 1;
arg_kernel.s.j -= 1;
if (copy_to_user(arg_user, &arg_kernel.s, sizeof(arg_kernel.s))) {
return -EFAULT;
}
break;
default:
return -EINVAL;
break;
}
return 0;
}
static const struct file_operations fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = unlocked_ioctl
};
static int myinit(void)
{
dir = debugfs_create_dir("lkmc_ioctl", 0);
/* ioctl permissions are not automatically restricted by rwx as for read / write,
* but we could of course implement that ourselves:
* https://stackoverflow.com/questions/29891803/user-permission-check-on-ioctl-command */
debugfs_create_file("f", 0, dir, NULL, &fops);
return 0;
}
static void myexit(void)
{
debugfs_remove_recursive(dir);
}
module_init(myinit)
module_exit(myexit)
Shared header between the kernel module and userland:
ioctl.h
#ifndef IOCTL_H
#define IOCTL_H
#include <linux/ioctl.h>
typedef struct {
int i;
int j;
} lkmc_ioctl_struct;
#define LKMC_IOCTL_MAGIC 0x33
#define LKMC_IOCTL_INC _IOWR(LKMC_IOCTL_MAGIC, 0, int)
#define LKMC_IOCTL_INC_DEC _IOWR(LKMC_IOCTL_MAGIC, 1, lkmc_ioctl_struct)
#endif
Userland:
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "../ioctl.h"
int main(int argc, char **argv)
{
int fd, arg_int, ret;
lkmc_ioctl_struct arg_struct;
if (argc < 2) {
puts("Usage: ./prog <ioctl-file>");
return EXIT_FAILURE;
}
fd = open(argv[1], O_RDONLY);
if (fd == -1) {
perror("open");
return EXIT_FAILURE;
}
/* 0 */
{
arg_int = 1;
ret = ioctl(fd, LKMC_IOCTL_INC, &arg_int);
if (ret == -1) {
perror("ioctl");
return EXIT_FAILURE;
}
printf("arg = %d\n", arg_int);
printf("ret = %d\n", ret);
printf("errno = %d\n", errno);
}
puts("");
/* 1 */
{
arg_struct.i = 1;
arg_struct.j = 1;
ret = ioctl(fd, LKMC_IOCTL_INC_DEC, &arg_struct);
if (ret == -1) {
perror("ioctl");
return EXIT_FAILURE;
}
printf("arg = %d %d\n", arg_struct.i, arg_struct.j);
printf("ret = %d\n", ret);
printf("errno = %d\n", errno);
}
close(fd);
return EXIT_SUCCESS;
}
I'm new to 64-bits architecture. Could you tell me what's MAX file size supported by file mapping in 64 bits linux machine. I want to open more than 20GB files by file mapping, is it available?
I write a sample code. But it causes Bus Error when I get the value of the pointer in GBSIZE offset:
unsigned char* pCur = pBegin + GBSIZE;
//pBegin is the pointer returned by mmap
printf("%c",*pCur);
BTW, printf("%c",*pBegin ); works fine. and my address sizes : 38 bits physical, 48 bits virtual
Here is the full code:
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
//#define FILEPATH "smallfile"
#define FILEPATH "bigfile"
#define GBSIZE (1024L*1024L*1024L)
#define TBSIZE (1024L*GBSIZE)
#define NUMSIZE (20L * GBSIZE)
//#define NUMSIZE (10)
#define FILESIZE (NUMINTS * sizeof(int))
int main(int argc, char *argv[])
{
int i;
int fd;
unsigned char *pBegin;
fd = open(FILEPATH, O_RDONLY);
if (fd == -1) {
perror("Error opening file for reading");
exit(EXIT_FAILURE);
}
pBegin = mmap(0, NUMSIZE, PROT_READ, MAP_SHARED, fd, 0);
if (pBegin == MAP_FAILED) {
close(fd);
perror("Error mmapping the file");
exit(EXIT_FAILURE);
}
/** ERROR happens here!!! **/
unsigned char* pCur = pBegin + GBSIZE;
printf("%c",*pCur);
if (munmap(pBegin, NUMSIZE) == -1) {
perror("Error un-mmapping the file");
}
close(fd);
return 0;
}
Although pointers are 64-bit wide, most processors do not actually support virtual addresses using the full 64 bits. To see what size virtual addresses your processor supports, look in /proc/cpuinfo (48 bits is typical).
grep "address sizes" /proc/cpuinfo
Additionally, half of the virtual address space is used by the kernel and not available to userspace - leaving 47 bits in the current Linux implementation.
However, even taking this into account, you will still have plenty of room for a 20GB file. 47 bits in theory means a virtual address space of 128TB.
From the mmap(2) man page:
void *mmap(void *addr, size_t length, int prot, int flags,
int fd, off_t offset);
length is a size_t, which on 64-bit machines is 64 bits in length. Therefore yes, you can theoretically map a 20GB file.
64-bit addresses allow for many orders of magnitude more than 20 GB.
(This answer was originally edited into the question by OP)
You have requested a 20GB map onto a file which was only 50MB in size.
As described by the mmap man page, mmap succeeds when you request the length too big, however it will give SIGBUS or SIGSEGV when you actually try to read beyond the end of the underlying file.
Agree with MarkR, you are dereference an invalid address.
// A bug in these lines.
unsigned char* pCur = pBegin + GBSIZE;
printf("%c",*pCur);
unsigned char* pEnd = pBegin + NUMSIZE;
unsigned char* pLast = pEnd - 1;
unsigned char* pCur = pLast;
I modified your code to use HUGE TLB flags as the following.
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
#define KSIZE 1024L
#define MSIZE (1024L*1024L)
#define GSIZE (1024L*1024L*1024L)
#define TSIZE (1024L*GSIZE)
#define INIT_MEM 0
// Fail on my MacBook Pro (Retina, 13-inch, Early 2015)
// Darwin Kernel Version 16.5.0:x86_64
// #define NUMSIZE (16L * TSIZE)
// mmap ok; init: got killed; signal 9
// #define NUMSIZE (8L * TSIZE)
// Got killed signal 9
// #define NUMSIZE (1L * TSIZE)
// OK
// #define NUMSIZE (200L * GSIZE)
// OK
#define NUMSIZE (20L * GSIZE)
typedef unsigned long long ETYPE;
#define MEMSIZE (NUMSIZE*sizeof(ETYPE))
#define PGSIZE (16*KSIZE)
void init(ETYPE* ptr) {
*ptr = (ETYPE)ptr;
}
int verify(ETYPE* ptr) {
if (*ptr != (ETYPE)ptr) {
fprintf(stderr, "ERROR: 0x%016llx != %p.\n", *ptr, ptr);
return -1;
}
else {
fprintf(stdout, "OK: 0x%016llx = %p.\n", *ptr, ptr);
}
return 0;
}
int main(int argc, char *argv[])
{
int i;
int fd;
ETYPE *pBegin;
int flags = MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB;
printf("mmap memory size:%lu GB\n", MEMSIZE/GSIZE);
pBegin = (ETYPE*) mmap(0, MEMSIZE, PROT_READ | PROT_WRITE, flags, -1, 0);
if (pBegin == MAP_FAILED) {
perror("Error mmapping the file");
exit(EXIT_FAILURE);
}
ETYPE* pEnd = pBegin + NUMSIZE;
ETYPE* pCur = pBegin;
#if INIT_MEM
while (pCur < pEnd) {
init(pCur);
// ++pCur; //slow if init all addresses.
pCur += (PGSIZE/sizeof(ETYPE));
}
#endif
init(&pBegin[0]);
init(&pBegin[NUMSIZE-1]);
verify(&pBegin[0]);
verify(&pBegin[NUMSIZE-1]);
if (munmap(pBegin, MEMSIZE) == -1) {
perror("Error un-mmapping the file");
}
return 0;
}