KASAN complains when calling copy_from/to_user - linux

We are developing a linux driver, and noticed KASAN complains when I read/write the created device file.
The minimal example is listed as below (so not well designed).
It creates file /dev/test_ctl and enables read/write/ioctl.
We compiled a 4.6.2 kernel with KASAN enabled and this code in updated Fedora.
After modprobe the module, I tried to read(cat)/write(echo foo > ) and called ioctl, all got BUG: KASAN: user-memory-access on address ... .... I expected these operations runs without any warning.
According to the pr_info, I noticed the behavior is caused by copy_to/from_user functions.
How can we get rid of this noise since we use KASAN for memory related runtime checks.
#define pr_fmt(fmt) "test dev : " fmt
#include <linux/module.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <asm-generic/uaccess.h>
#include <asm-generic/ioctl.h>
#include <linux/spinlock.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("FOO");
MODULE_DESCRIPTION("BAR");
#define LEN 1024
#define IOCTL_READ _IOR('t', 0xD0, u8[LEN])
#define IOCTL_WRITE _IOW('t', 0xD1, u8[LEN])
static u8 buf[LEN];
static dev_t major;
static struct class *class;
static struct cdev cdev;
static struct device *cdevice;
static long test_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
long err;
void __user *ptr = (void __user *)arg;
pr_info("ioctl: cmd: %08x, arg: %p, local: %p\n", cmd, ptr, buf);
switch (cmd) {
case IOCTL_READ:
pr_info("ioctl/r: calling copy_to_user\n");
if (copy_to_user(ptr, buf, LEN)) {
pr_info("ioctl/r: failed to copy to user\n");
err = -EFAULT;
} else {
pr_info("ioctl/r: buffer copied, val: %8ph\n", buf);
err = 0;
}
break;
case IOCTL_WRITE:
pr_info("ioctl/r: calling copy_from_user\n");
if (copy_from_user(buf, ptr, LEN)) {
pr_info("ioctl/w: failed to copy from user\n");
err = -EFAULT;
} else {
pr_info("ioctl/w: buffer copied, val: %8ph\n", buf);
err = 0;
}
break;
default:
pr_info("ioctl: invalid command\n");
err = -EINVAL;
break;
}
return err;
}
static int test_open(struct inode *inode, struct file *file)
{
return 0;
}
static int test_release(struct inode *inode, struct file *file)
{
return 0;
}
static ssize_t test_read(struct file *filep,
char __user *ptr, size_t len, loff_t *offset)
{
ssize_t r;
if (*offset) {
return 0;
}
if (len > LEN) {
len = LEN;
}
*offset += len;
pr_info("calling copy_to_user\n");
r = copy_to_user(ptr, buf, len) ? -EFAULT : len;
pr_info("called copy_to_user\n");
return r;
}
static ssize_t test_write(struct file *filep,
const char __user *ptr, size_t len, loff_t *offset)
{
ssize_t r;
if (*offset) {
return -EINVAL;
}
if (len > LEN) {
len = LEN;
}
*offset += len;
pr_info("calling copy_from_user\n");
r = copy_from_user(buf, ptr, len) ? -EFAULT : len;
pr_info("called copy_from_user\n");
return r;
}
static const struct file_operations cdev_ops = {
.owner = THIS_MODULE,
.unlocked_ioctl = test_ioctl,
.open = test_open,
.release = test_release,
.read = test_read,
.write = test_write,
};
static int test_init(void)
{
int rc;
pr_info("test device initing\n");
rc = alloc_chrdev_region(&major, 0, 1, "test");
if (rc) {
goto fail_alloc_chrdev_region;
}
pr_info("major assigned: %d\n", (int)MAJOR(major));
class = class_create(THIS_MODULE, "test_ctl");
if (IS_ERR(class)) {
pr_err("failed to create class\n");
rc = PTR_RET(class);
goto fail_create_class;
}
cdev_init(&cdev, &cdev_ops);
cdev.owner = THIS_MODULE;
rc = cdev_add(&cdev, MKDEV(MAJOR(major), 0), 1);
if (rc) {
pr_info("failed to add char dev\n");
goto fail_cdev_add;
}
cdevice = device_create(class, NULL, MKDEV(MAJOR(major), 0), NULL,
"test_ctl");
if (IS_ERR(cdevice)) {
pr_err("failed to create /dev/ file\n");
rc = PTR_RET(cdevice);
goto fail_device_create;
}
pr_info("driver initialized\n");
return 0;
device_destroy(class, MKDEV(major, 0));
fail_device_create:
cdev_del(&cdev);
fail_cdev_add:
class_destroy(class);
fail_create_class:
unregister_chrdev_region(major, 1);
fail_alloc_chrdev_region:
return rc;
}
module_init(test_init);
Here is the kernel message during od /dev/test_ctl
[18088.583185] test dev : called copy_to_user
[18117.378665] test dev : ioctl: cmd: 00005401, arg: 00007ffeb2303070, local: ffffffffa018aba0
[18117.380954] test dev : ioctl: invalid command
[18117.386294] test dev : calling copy_to_user
[18117.388772] ==================================================================
[18117.390903] BUG: KASAN: user-memory-access on address 00007f52831a8000
[18117.392150] Write of size 1024 by task od/2057
[18117.393305] CPU: 1 PID: 2057 Comm: od Tainted: G B O 4.6.2-kasan-outline #1
[18117.395448] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[18117.396655] ffff880009cf8000 00000000da76c32b ffff88000a2efbc8 ffffffff816e1398
[18117.399220] 0000000000000400 ffff88000a2efc60 ffff88000a2efc50 ffffffff8134fe46
[18117.401765] 0000000000000246 ffffffffa0189140 0000000000000292 ffff88000a2efc00
[18117.404304] Call Trace:
[18117.405375] [<ffffffff816e1398>] dump_stack+0x85/0xcd
[18117.406550] [<ffffffff8134fe46>] kasan_report_error+0x456/0x560
[18117.407777] [<ffffffff81197896>] ? debug_lockdep_rcu_enabled+0x26/0x40
[18117.409039] [<ffffffff813504e8>] kasan_report+0x58/0x60
[18117.410222] [<ffffffff8134f498>] ? memcpy+0x28/0x40
[18117.411394] [<ffffffff8134f08d>] __asan_storeN+0x12d/0x180
[18117.412592] [<ffffffff8134f498>] memcpy+0x28/0x40
[18117.413766] [<ffffffffa0188019>] __copy_to_user+0x9/0x10 [test_drv]
[18117.415010] [<ffffffffa0188132>] test_read+0x72/0xa0 [test_drv]
[18117.416233] [<ffffffff8138529d>] __vfs_read+0xdd/0x260
[18117.417419] [<ffffffff813851c0>] ? vfs_iter_write+0x190/0x190
[18117.418644] [<ffffffff813f7140>] ? __fsnotify_update_child_dentry_flags.part.1+0x160/0x160
[18117.442467] [<ffffffff812e0117>] ? vm_mmap_pgoff+0x167/0x1a0
[18117.443686] [<ffffffff8116ff18>] ? up_write+0x28/0x50
[18117.444874] [<ffffffff812e0117>] ? vm_mmap_pgoff+0x167/0x1a0
[18117.446093] [<ffffffff81636dd5>] ? security_file_permission+0xd5/0x100
[18117.447344] [<ffffffff81386bb7>] vfs_read+0xb7/0x1a0
[18117.448536] [<ffffffff81388dba>] SyS_read+0xba/0x150
[18117.449713] [<ffffffff81388d00>] ? vfs_copy_file_range+0x370/0x370
[18117.450945] [<ffffffff811776c6>] ? trace_hardirqs_on_caller+0x16/0x290
[18117.452193] [<ffffffff8100401b>] ? trace_hardirqs_on_thunk+0x1b/0x1d
[18117.453441] [<ffffffff81b386fc>] entry_SYSCALL_64_fastpath+0x1f/0xbd
[18117.454689] [<ffffffff811717b6>] ? trace_hardirqs_off_caller+0x16/0x120
[18117.455943] ==================================================================
[18117.462008] test dev : called copy_to_user
When calling echo 1 > /dev/test_ctl, I got following kernel message:
[18212.023598] test dev : calling copy_from_user
[18212.024844] ==================================================================
[18212.027020] BUG: KASAN: user-memory-access on address 00007f31bf34f000
[18212.028272] Read of size 2 by task bash/1982
[18212.029425] CPU: 1 PID: 1982 Comm: bash Tainted: G B O 4.6.2-kasan-outline #1
[18212.031585] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[18212.032904] ffff8800306e0000 000000003803bec2 ffff88000a2dfbc0 ffffffff816e1398
[18212.035569] 0000000000000002 ffff88000a2dfc58 ffff88000a2dfc48 ffffffff8134fe46
[18212.038169] 0000000000000246 ffffffffa0189040 0000000000000286 ffff88000a2dfbf8
[18212.040760] Call Trace:
[18212.041835] [<ffffffff816e1398>] dump_stack+0x85/0xcd
[18212.043032] [<ffffffff8134fe46>] kasan_report_error+0x456/0x560
[18212.044276] [<ffffffff81197896>] ? debug_lockdep_rcu_enabled+0x26/0x40
[18212.045538] [<ffffffff813504e8>] kasan_report+0x58/0x60
[18212.046742] [<ffffffff8134f48d>] ? memcpy+0x1d/0x40
[18212.047944] [<ffffffff8134ef0a>] __asan_loadN+0x12a/0x180
[18212.049155] [<ffffffff8134f48d>] memcpy+0x1d/0x40
[18212.050340] [<ffffffffa0188019>] __copy_to_user+0x9/0x10 [test_drv]
[18212.051593] [<ffffffffa0188097>] test_write+0x77/0xa0 [test_drv]
[18212.052840] [<ffffffff813854fd>] __vfs_write+0xdd/0x260
[18212.054040] [<ffffffff81385420>] ? __vfs_read+0x260/0x260
[18212.055252] [<ffffffff81300b40>] ? __pmd_alloc+0x250/0x250
[18212.056464] [<ffffffff813b9595>] ? __fd_install+0x5/0x3f0
[18212.057736] [<ffffffff813b92ac>] ? __alloc_fd+0x3c/0x2b0
[18212.058952] [<ffffffff81197896>] ? debug_lockdep_rcu_enabled+0x26/0x40
[18212.060220] [<ffffffff81197896>] ? debug_lockdep_rcu_enabled+0x26/0x40
[18212.061488] [<ffffffff81636d68>] ? security_file_permission+0x68/0x100
[18212.062757] [<ffffffff81386d96>] vfs_write+0xf6/0x260
[18212.063959] [<ffffffff81388f0a>] SyS_write+0xba/0x150
[18212.065165] [<ffffffff81388e50>] ? SyS_read+0x150/0x150
[18212.066382] [<ffffffff811776c6>] ? trace_hardirqs_on_caller+0x16/0x290
[18212.067666] [<ffffffff8100401b>] ? trace_hardirqs_on_thunk+0x1b/0x1d
[18212.068933] [<ffffffff81b386fc>] entry_SYSCALL_64_fastpath+0x1f/0xbd
[18212.070199] [<ffffffff811717b6>] ? trace_hardirqs_off_caller+0x16/0x120
[18212.071470] ==================================================================
[18212.073790] test dev : called copy_from_user

So KASAN complains about user-space accesses. I don't know why. But #include <asm-generic/uaccess.h> looks suspicious: outer code (like modules) should rarely include asm-specific headers. Use standard #include <linux/uaccess.h>. BTW, for x86 asm-generic version of uaccess.h is never used: its asm/uaccess.h header defines user access functions manually. It could be a reason of your problem. The same is true for asm-generic/ioctl.h inclusion. – Tsyvarev
Thank you for your help. I used linux/uaccess.h instead of asm-generic/uaccess.h and this behavior is not happening again. – OstCollector

Related

Failed to get kernel data using copy_to_user not working with debugfs

I am trying to implement simple debugfs interface module. Code attached for reference. To write data I'm using echo 'string' > /sys/kernel/debug/debugexercise/text and its working as expected data being copied into kernel buffer.
But when I try to retrieve data back using cat command i.e. cat /sys/kernel/debug/debugexercise/text , its not printing any data on terminal.
I have also tried using simple_read_from_buffer instead of copy_to_user but got the same result.
Anybody have idea what is the problem with this code. 4.13.0-45-generic is the kernel version on my system.
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#define LEN 512
static struct dentry *test_dir;
static struct dentry *test_file;
static char ker_buf[LEN] ;
/* read file operation */
static ssize_t test_read(struct file *fp, char __user *user_buffer, size_t count, loff_t *position){
printk(KERN_NOTICE "debugfs_read called, count %d\n", count);
return copy_to_user(user_buffer, ker_buf, LEN);
}
static ssize_t test_write(struct file *fp, const char __user *user_buffer, size_t count, loff_t *position){
printk(KERN_NOTICE "debugfs_write called, count %d\n",count);
if(count > LEN )
return -EINVAL;
copy_from_user(ker_buf, user_buffer, count);
printk(KERN_NOTICE "write buffer complete: %s\n",ker_buf);
return count;
}
static struct file_operations fops_debug = {
.read = test_read,
.write = test_write,
};
static int __init init_debug(void)
{
test_dir = debugfs_create_dir("debugexercise", NULL);
if(NULL == test_dir){
printk(KERN_ERR "debugfs_create_dir() Failed\n");
return -1;
}
else
printk(KERN_NOTICE "debugexercise created\n");
test_file = debugfs_create_file("text", 0644, test_dir, NULL, &fops_debug);
if(NULL == test_file){
printk(KERN_ERR "debugfs_create_file() Failed\n");
debugfs_remove(test_dir);
return -1;
}
else
printk(KERN_NOTICE "text under debugexercise created\n");
return 0;
}
static void __exit exit_debug(void)
{
printk(KERN_NOTICE "removing module\n");
debugfs_remove(test_file);
debugfs_remove(test_dir);
}
module_init(init_debug)
module_exit(exit_debug)
MODULE_LICENSE("GPL");
copy_to_user returns the number of bytes that could not be copied. On success, this will be zero. Hence, the cat displays 0 characters. I believe you should do:
if (copy_to_user(user_buffer, ker_buf, LEN)){
printk(KERN_INFO "copy to user failed.\n");
return -EINVAL; /* For instance ... */
}
return LEN;

"/dev/**** No such device" error coming even with valid charachter device

I am writing a kernel driver in drivers/char/new_driver.c .
This new driver registers two new devices /dev/device1 and /dev/device2 by calling misc_register() api.
error = misc_register(&device1);
error = misc_register(&device2);
static struct miscdevice device1 = {
MISC_DYNAMIC_MINOR,
"device1",
&device1_fops
};
static struct miscdevice device2 = {
MISC_DYNAMIC_MINOR,
"device2",
&device2_fops
};
When I load the module I see that 2 two devices are getting created properly in /dev/device1 and /dev/device2
But when I try to write/read operation on this device it gives error saying that there is "No such device " .
Any idea what causes this type of error ? Anything missing in the driver code ?
I still believe that this is permission issue. I am attaching the sample code (without any real implementation) skeleton. Please have a look into it.
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>
static int sample_open(struct inode *inode, struct file *file)
{
pr_info("Maverick: %s func :\n",__func__);
return 0;
}
static int sample_close(struct inode *inodep, struct file *filp)
{
pr_info("Maverick: %s func :\n",__func__);
return 0;
}
static ssize_t sample_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
pr_info("Maverick: %s func :\n",__func__);
return len; /* Not doing anything with the data */
}
static ssize_t sample_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
pr_info("Maverick: %s func :\n",__func__);
return len; /* Not do anything with the data */
}
static const struct file_operations sample_fops = {
.owner = THIS_MODULE,
.write = sample_write,
.read = sample_read,
.open = sample_open,
.release = sample_close,
};
struct miscdevice sample_device = {
.minor = MISC_DYNAMIC_MINOR,
.name = "device1",
.fops = &sample_fops,
};
static int __init misc_init(void)
{
int error;
error = misc_register(&sample_device);
if (error) {
pr_err("can't misc_register :(\n");
return error;
}
pr_info("Maverick: %s func :\n",__func__);
return 0;
}
static void __exit misc_exit(void)
{
misc_deregister(&sample_device);
pr_info("Maverick: %s func :\n",__func__);
}
module_init(misc_init)
module_exit(misc_exit)
MODULE_DESCRIPTION("Sample Misc Driver");
MODULE_AUTHOR("Vinod Maverick <vinodmaverickr007#gmail.com>");
MODULE_LICENSE("GPL");
Then compile the driver:
make
sudo insmod misc_sample.ko
ilab#SSID-iLBPG3:~/vinod/ldd$ echo "hello" > /dev/device1
-bash: /dev/device1: Permission denied
ilab#SSID-iLBPG3:~/vinod/ldd$ sudo su
root#SSID-iLBPG3:/home/ilab/vinod/ldd# echo "hello" > /dev/device1
However I have not any code inside the read/write callback but still you can see the dmesg without any error
root#SSID-iLBPG3:/home/ilab/vinod/ldd# dmesg
[2903599.416005] Maverick: misc_init func :
[2903623.966281] Maverick: sample_open func :
[2903623.966292] Maverick: sample_write func :
[2903623.966295] Maverick: sample_close func :
root#SSID-iLBPG3:/home/ilab/vinod/ldd#
To anyone who still has this problem, I managed to fix it by changing my minor number from a crazy high number (999) to a more reasonable number (71). Here's my resulting miscdevice struct:
static struct miscdevice chrdev = {
.minor = 71,
.name = "tracefunc",
.fops = &chrdev_fops,
.mode = S_IRUGO,
};

Linux - proc_fs implementation in a basic kernel module

So I am trying to implement and see how does a module creates a virtual file in /proc and this is a very simple code I wrote:
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#define BUF_SIZE 48
struct proc_dir_entry *proc_entry,*root_dir=NULL;
char result_buffer[BUF_SIZE];
char before[BUF_SIZE];
int count,temp=1;
ssize_t write(struct file *f, const char __user *buf, size_t len, loff_t *off)
{
printk(KERN_ERR "my4: Somebody toyed here\n");
if (copy_from_user(result_buffer,buf,len))
return -EFAULT;
return len;
}
// this read implementation is just a fudge, i am just trying to get the grasp of basic concepts here
ssize_t read(struct file *f, char __user *buf, size_t c, loff_t *off)
{
int len = 0;
if (temp == 0) {
temp = 1;
return 0;
}
sprintf(before,"[%s]",result_buffer);
if (count <= 1)
sprintf(result_buffer,"my4 read %d\n",count++);
else
sprintf(result_buffer,"my4 read again!! count=%d\n", count++);
len = strlen(result_buffer);
if (copy_to_user(buf, result_buffer, len)) return -EFAULT;
printk(KERN_ERR "my4: page before=[%s]\n\t page after=[%s]\n", before, result_buffer);
temp = 0;
return len;
}
static const struct file_operations file_ops = {
.owner = THIS_MODULE,
.read = read,
.write = write
};
int init_my4(void) {
count = 1;
proc_entry = proc_create("my4", 438, NULL, &file_ops);
strcpy(result_buffer, "initialized\n");
if (proc_entry == NULL) {
printk(KERN_ERR "my4: could not create proc entry\n");
return -ENOMEM;
}
printk(KERN_INFO "my4: Module loaded successfully\n");
return 0;
}
void unload_my4(void) {
remove_proc_entry("my4",root_dir);
printk(KERN_INFO "my4: Module unloaded successfully\n");
}
module_init(init_my4);
module_exit(unload_my4);
MODULE_LICENSE("GPL");
What the problem is that BUF_SIZE being just 48 if I write more than that to the file like :
echo "Lets write more than 48 bytes to the file and see if it breaks or overflows or if the kernel kills my process" > /proc/my4
Now when I cat /proc/my4 I get:
my4: Somebody toyed here
[23482.029609] my4: page before=[[Lets write more than 48 bytes to the file and see if it breaks my4 read again!! count=3
]
page after=[my4 read again!! count=3
I fail to understand what is happening here that is how does the kernel simply ignores the input greater than 48 bytes and where does it go?
As I thought that either it will overflow with a segfault or the kernel will kill the process!

How to test your own Linux module?

Today I am getting started with developing Linux modules. It was rather hard to write, compile and work with Helloworld, but I've done it.
My second module with open, write, read functions is ready, but I really dont know how to test it. Write method just makes printk(). My module is loaded, its name is iamnoob. How to test this write(...) function and to find smth in var/log/syslog?
cat > iamnoob just writes a file to the dir. Same with cp and other.
Sorry for noob question, i've googled, but no answer has been found. Sorry for poor English.
A basic kernel module would normally include registering a character device.
Simple imlementation requires:
Register chrdev region with specific major & minor.
Allocate file operations structure and implement the basic read / write APIs.
Initialize and register character device with the file operations structure to the major / minor region.
See the following code snippet as a template of a module (only read / write APIs are imlemented):
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm-generic/uaccess.h>
#define MY_BUFFER_SIZE (1024 * 10)
#define MY_CHRDEV_MAJOR 217
#define MY_CHRDEV_MINOR 0
static struct cdev my_cdev;
static unsigned char *my_buf;
static dev_t my_dev = MKDEV(MY_CHRDEV_MAJOR, MY_CHRDEV_MINOR);
ssize_t my_read(struct file *file, char __user * buf, size_t count, loff_t * ppos)
{
int size;
size = MY_BUFFER_SIZE - 100 - (int)*ppos;
if (size > count)
size = count;
if (copy_to_user(buf, my_buf + *ppos, count))
return -EFAULT;
*ppos += size;
return size;
}
ssize_t my_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
int size;
size = MY_BUFFER_SIZE - 100 - (int)*ppos;
if (size > count)
size = count;
if (copy_from_user(my_buf + *ppos, buf, count))
return -EFAULT;
*ppos += size;
return size;
}
long my_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
int my_mmap(struct file *f, struct vm_area_struct *vma)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
int my_open(struct inode *i, struct file *f)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
int my_release(struct inode *i, struct file *f)
{
printk ("%s!\n", __FUNCTION__);
return 0;
}
struct file_operations my_fops =
{
.owner = THIS_MODULE,
.read = &my_read,
.write = &my_write,
.unlocked_ioctl = &my_unlocked_ioctl,
.mmap = &my_mmap,
.open = &my_open,
.release = &my_release,
};
static int __init my_module_init(void)
{
int line = 0;
unsigned char *pos;
printk ("%s!\n", __FUNCTION__);
my_buf = (unsigned char *)kzalloc(MY_BUFFER_SIZE, 0);
if (my_buf == NULL) {
printk("%s - failed to kzallocate buf!\n", __FUNCTION__);
return -1;
}
pos = my_buf;
while (pos - my_buf < MY_BUFFER_SIZE - 100) {
sprintf(pos, "Line #%d\n", line++);
pos += strlen(pos);
}
cdev_init(&my_cdev, &my_fops);
if (register_chrdev_region(my_dev, 1, "my_dev")) {
pr_err("Failed to allocate device number\n");
}
cdev_add(&my_cdev, my_dev, 1);
printk ("%s - registered chrdev\n", __FUNCTION__);
return 0;
}
static void __exit my_module_exit(void)
{
printk ("my_module_exit.\n");
unregister_chrdev_region(my_dev, 1);
return;
}
module_init(my_module_init);
module_exit(my_module_exit);
MODULE_LICENSE("GPL");
This module uses a buffer for file operations, therefore can be tested on any machine, regardless of its HW. Make sure you avoid unnecessary printk's as loops may harm your kernel stability.
Once this is done, in user-space shell you should create a /dev node to represent your character device:
sudo mknod /dev/[dev_name] c [major] [minor]
for example:
sudo mknod /dev/my_dev c 217 0
Then you can test your read / write APIs with:
sudo insmod my_modult.ko
cat /dev/my_dev
less -f /dev/my_dev
sudo su
root> echo "This is a test" > /dev/my_dev
root> exit
cat /dev/my_dev
The shell commands listed above perform read, then login as root (to allow writing to device), write to the char dev, then exit and read again to see the changes.
Now you'd normally implement ioctl and mmap if needed.

How to mmap a Linux kernel buffer to user space?

Let's say the buffer is allocated using a page based scheme. One way to implement mmap would be to use remap_pfn_range but LDD3 says this does not work for conventional memory. It appears we can work around this by marking the page(s) reserved using SetPageReserved so that it gets locked in memory. But isn't all kernel memory already non-swappable i.e. already reserved? Why the need to set the reserved bit explicitly?
Does this have something to do with pages allocated from HIGH_MEM?
The simplest way to map a set of pages from the kernel in your mmap method is to use the fault handler to map the pages. Basically you end up with something like:
static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
vma->vm_ops = &my_vm_ops;
return 0;
}
static const struct file_operations my_fops = {
.owner = THIS_MODULE,
.open = nonseekable_open,
.mmap = my_mmap,
.llseek = no_llseek,
};
(where the other file operations are whatever your module needs). Also in my_mmap you do whatever range checking etc. is needed to validate the mmap parameters.
Then the vm_ops look like:
static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
vmf->page = my_page_at_index(vmf->pgoff);
get_page(vmf->page);
return 0;
}
static const struct vm_operations_struct my_vm_ops = {
.fault = my_fault
}
where you just need to figure out for a given vma / vmf passed to your fault function which page to map into userspace. This depends on exactly how your module works. For example, if you did
my_buf = vmalloc_user(MY_BUF_SIZE);
then the page you use would be something like
vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));
But you could easily create an array and allocate a page for each entry, use kmalloc, whatever.
[just noticed that my_fault is a slightly amusing name for a function]
Minimal runnable example and userland test
Kernel module:
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h> /* min */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h> /* copy_from_user, copy_to_user */
#include <linux/slab.h>
static const char *filename = "lkmc_mmap";
enum { BUFFER_SIZE = 4 };
struct mmap_info {
char *data;
};
/* After unmap. */
static void vm_close(struct vm_area_struct *vma)
{
pr_info("vm_close\n");
}
/* First page access. */
static vm_fault_t vm_fault(struct vm_fault *vmf)
{
struct page *page;
struct mmap_info *info;
pr_info("vm_fault\n");
info = (struct mmap_info *)vmf->vma->vm_private_data;
if (info->data) {
page = virt_to_page(info->data);
get_page(page);
vmf->page = page;
}
return 0;
}
/* After mmap. TODO vs mmap, when can this happen at a different time than mmap? */
static void vm_open(struct vm_area_struct *vma)
{
pr_info("vm_open\n");
}
static struct vm_operations_struct vm_ops =
{
.close = vm_close,
.fault = vm_fault,
.open = vm_open,
};
static int mmap(struct file *filp, struct vm_area_struct *vma)
{
pr_info("mmap\n");
vma->vm_ops = &vm_ops;
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_private_data = filp->private_data;
vm_open(vma);
return 0;
}
static int open(struct inode *inode, struct file *filp)
{
struct mmap_info *info;
pr_info("open\n");
info = kmalloc(sizeof(struct mmap_info), GFP_KERNEL);
pr_info("virt_to_phys = 0x%llx\n", (unsigned long long)virt_to_phys((void *)info));
info->data = (char *)get_zeroed_page(GFP_KERNEL);
memcpy(info->data, "asdf", BUFFER_SIZE);
filp->private_data = info;
return 0;
}
static ssize_t read(struct file *filp, char __user *buf, size_t len, loff_t *off)
{
struct mmap_info *info;
ssize_t ret;
pr_info("read\n");
if ((size_t)BUFFER_SIZE <= *off) {
ret = 0;
} else {
info = filp->private_data;
ret = min(len, (size_t)BUFFER_SIZE - (size_t)*off);
if (copy_to_user(buf, info->data + *off, ret)) {
ret = -EFAULT;
} else {
*off += ret;
}
}
return ret;
}
static ssize_t write(struct file *filp, const char __user *buf, size_t len, loff_t *off)
{
struct mmap_info *info;
pr_info("write\n");
info = filp->private_data;
if (copy_from_user(info->data, buf, min(len, (size_t)BUFFER_SIZE))) {
return -EFAULT;
} else {
return len;
}
}
static int release(struct inode *inode, struct file *filp)
{
struct mmap_info *info;
pr_info("release\n");
info = filp->private_data;
free_page((unsigned long)info->data);
kfree(info);
filp->private_data = NULL;
return 0;
}
static const struct file_operations fops = {
.mmap = mmap,
.open = open,
.release = release,
.read = read,
.write = write,
};
static int myinit(void)
{
proc_create(filename, 0, NULL, &fops);
return 0;
}
static void myexit(void)
{
remove_proc_entry(filename, NULL);
}
module_init(myinit)
module_exit(myexit)
MODULE_LICENSE("GPL");
GitHub upstream.
Userland test:
#define _XOPEN_SOURCE 700
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> /* uintmax_t */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h> /* sysconf */
/* Format documented at:
* https://github.com/torvalds/linux/blob/v4.9/Documentation/vm/pagemap.txt
*/
typedef struct {
uint64_t pfn : 54;
unsigned int soft_dirty : 1;
unsigned int file_page : 1;
unsigned int swapped : 1;
unsigned int present : 1;
} PagemapEntry;
/* Parse the pagemap entry for the given virtual address.
*
* #param[out] entry the parsed entry
* #param[in] pagemap_fd file descriptor to an open /proc/pid/pagemap file
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
size_t nread;
ssize_t ret;
uint64_t data;
nread = 0;
while (nread < sizeof(data)) {
ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data),
(vaddr / sysconf(_SC_PAGE_SIZE)) * sizeof(data) + nread);
nread += ret;
if (ret <= 0) {
return 1;
}
}
entry->pfn = data & (((uint64_t)1 << 54) - 1);
entry->soft_dirty = (data >> 54) & 1;
entry->file_page = (data >> 61) & 1;
entry->swapped = (data >> 62) & 1;
entry->present = (data >> 63) & 1;
return 0;
}
/* Convert the given virtual address to physical using /proc/PID/pagemap.
*
* #param[out] paddr physical address
* #param[in] pid process to convert for
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
char pagemap_file[BUFSIZ];
int pagemap_fd;
snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
pagemap_fd = open(pagemap_file, O_RDONLY);
if (pagemap_fd < 0) {
return 1;
}
PagemapEntry entry;
if (pagemap_get_entry(&entry, pagemap_fd, vaddr)) {
return 1;
}
close(pagemap_fd);
*paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
return 0;
}
enum { BUFFER_SIZE = 4 };
int main(int argc, char **argv)
{
int fd;
long page_size;
char *address1, *address2;
char buf[BUFFER_SIZE];
uintptr_t paddr;
if (argc < 2) {
printf("Usage: %s <mmap_file>\n", argv[0]);
return EXIT_FAILURE;
}
page_size = sysconf(_SC_PAGE_SIZE);
printf("open pathname = %s\n", argv[1]);
fd = open(argv[1], O_RDWR | O_SYNC);
if (fd < 0) {
perror("open");
assert(0);
}
printf("fd = %d\n", fd);
/* mmap twice for double fun. */
puts("mmap 1");
address1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (address1 == MAP_FAILED) {
perror("mmap");
assert(0);
}
puts("mmap 2");
address2 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (address2 == MAP_FAILED) {
perror("mmap");
return EXIT_FAILURE;
}
assert(address1 != address2);
/* Read and modify memory. */
puts("access 1");
assert(!strcmp(address1, "asdf"));
/* vm_fault */
puts("access 2");
assert(!strcmp(address2, "asdf"));
/* vm_fault */
strcpy(address1, "qwer");
/* Also modified. So both virtual addresses point to the same physical address. */
assert(!strcmp(address2, "qwer"));
/* Check that the physical addresses are the same.
* They are, but TODO why virt_to_phys on kernel gives a different value? */
assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address1));
printf("paddr1 = 0x%jx\n", (uintmax_t)paddr);
assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address2));
printf("paddr2 = 0x%jx\n", (uintmax_t)paddr);
/* Check that modifications made from userland are also visible from the kernel. */
read(fd, buf, BUFFER_SIZE);
assert(!memcmp(buf, "qwer", BUFFER_SIZE));
/* Modify the data from the kernel, and check that the change is visible from userland. */
write(fd, "zxcv", 4);
assert(!strcmp(address1, "zxcv"));
assert(!strcmp(address2, "zxcv"));
/* Cleanup. */
puts("munmap 1");
if (munmap(address1, page_size)) {
perror("munmap");
assert(0);
}
puts("munmap 2");
if (munmap(address2, page_size)) {
perror("munmap");
assert(0);
}
puts("close");
close(fd);
return EXIT_SUCCESS;
}
GitHub upstream.
Tested on kernel 5.4.3.
Though the pages are reserved via a kernel driver, it is meant to be accessed via user space. As a result, the PTE (page table entries) do not know if the pfn belongs to user space or kernel space (even though they are allocated via kernel driver).
This is why they are marked with SetPageReserved.

Resources