Related
I have try to compile a program which manage userfault-fd to collect some dirty pages on memory. In this program i have used the uffdio_writeprotect structure to survey a memory region where program try to access. I am using Ubuntu 18.04 with linux kernel version 5.14.0. But when i compile a program with the command gcc -o with_userfault -I. with_userfault.c
i still have the same errors such as: with_userfault.c:108:34: error: storage size of ‘wp’ isn’t known
struct uffdio_writeprotect wp;
with_userfault.c:114:21: error: ‘UFFDIO_WRITEPROTECT’ undeclared (first use in this function); did you mean ‘UFFDIO_REGISTER’?
if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) == -1)
I don't know how to solve this problem. Please I need your help to solve this issue because i don't know what i am suppose to do now. thanks!!!!!
this is a source code:
The header file:
#ifndef __RDTSC_H_DEFINED__
#define __RDTSC_H_DEFINED__
#if defined(__i386__)
static __inline__ unsigned long long rdtsc(void)
{
unsigned long long int x;
__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
return x;
}
#elif defined(__x86_64__)
static __inline__ unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
#elif defined(__powerpc__)
static __inline__ unsigned long long rdtsc(void)
{
unsigned long long int result=0;
unsigned long int upper, lower,tmp;
__asm__ volatile(
"0: \n"
"\tmftbu %0 \n"
"\tmftb %1 \n"
"\tmftbu %2 \n"
"\tcmpw %2,%0 \n"
"\tbne 0b \n"
: "=r"(upper),"=r"(lower),"=r"(tmp)
);
result = upper;
result = result<<32;
result = result|lower;
return(result);
}
#else
#error "No tick counter is available!"
#endif
/* $RCSfile: $ $Author: kazutomo $
* $Revision: 1.6 $ $Date: 2005/04/13 18:49:58 $
*/
#endif
The source file
/*
* Example program about using userfaultfd(2) for garbage collection.
*
* This establishes a couple pages, all of which are filled from
* compressed files on disk when first accessed. For simplicity
* these are
* one file per page. Files are written at the beginning of the
* program.
*
* Later, this program demonstrates the use of write protection to
* get
* a notification on write access, analogous to using
* mprotect(!PROT_WRITE)
* and doing the bookkeeping in a SIGSEGV handler.
*
*/
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <time.h>
#include <math.h>
#include <unistd.h>
#include <strings.h>
#include <string.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <poll.h>
#include <pthread.h>
#include "rdtsc.h"
//#define size 102400
#define FILE_BUF 25
#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
#define n_pages 102400
//102400
#define iterations 1
pthread_t tracker;
void *tracker_task();
int i;
unsigned long main_tsc_start, main_tsc_end;
// This is doing the work in the uffd handler thread
void *tracker_task(void *data)
{
int fd = *(int *)(data);
for (;;)
{
struct uffd_msg msg;
struct pollfd pollfd;
int pollres, readret;
unsigned long addr, page_begin, whichpage;
unsigned long handler_tsc_start, handler_tsc_end;
pollfd.fd = fd;
pollfd.events = POLLIN;
pollres = poll(&pollfd, 1, -1);
if( pollres == -1 )
perror("poll");
if (pollfd.revents & POLLERR)
{
fprintf(stderr, "POLLERR on userfaultfd\n");
exit(1);
}
readret = read(fd, &msg, sizeof(msg));
if (readret == -1)
perror("read userfaultfd");
if (readret != sizeof(msg))
{
fprintf(stderr, "short read, not expected, exiting\n");
exit(1);
}
/*
* Proper sequence is important here.
*
* For the GC we expect that write-protected pages can only
* be pages already backed by physical pages.
* Regular writes into unprotected pages that come before
* reads need the page be filled.
*
* So we do the WP case first and get it out of the way.
* Then both of the other cases need the page read.
*/
if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)
{
// send write unlock
struct uffdio_writeprotect wp;
wp.range.start = msg.arg.pagefault.address;
wp.range.len = PAGE_SIZE;
wp.mode = 0;
//printf("sending !UFFDIO_WRITEPROTECT event to
//userfaultfd\n");
if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) == -1)
perror("ioctl(UFFDIO_WRITEPROTECT)");
//continue;
}
}
printf("end\n");
return NULL;
}
int main(int argc, char *argv[])
{
unsigned long *region;
int uffd, uffd_flags, expected, t_create;
void *status;
struct uffdio_writeprotect wp;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
char *clear = "3", *drop_caches_path = "/proc/sys/vm/drop_caches";
FILE *drop_caches_file;
main_tsc_start = rdtsc();
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
{
perror("syscall");
exit(2);
}
uffdio_api.api = UFFD_API;
uffdio_api.features = 1;
if (ioctl(uffd, UFFDIO_API, &uffdio_api))
{
fprintf(stderr, "UFFDIO_API\n");
return 1;
}
//printf("Features: 0x%llx\n", uffdio_api.features);
if (uffdio_api.api != UFFD_API)
{
fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
return 1;
}
/* Allocate memory that will be tracked */
region = (unsigned long *) mmap(NULL, PAGE_SIZE * n_pages, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (!region)
{
perror("mmap");
exit(2);
}
/* Force alignment of contiguous pages */
if (posix_memalign((void **)region, PAGE_SIZE, PAGE_SIZE * (n_pages - 1)))
{
fprintf(stderr, "cannot align by PAGE_SIZE %ld\n", PAGE_SIZE);
exit(1);
}
uffdio_register.range.start = (unsigned long)region;
uffdio_register.range.len = PAGE_SIZE * n_pages;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
//main_tsc_start = rdtsc();
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
{
perror("ioctl(UFFDIO_REGISTER)");
exit(1);
}
expected = UFFD_API_RANGE_IOCTLS;
if ((uffdio_register.ioctls & expected) != expected)
{
fprintf(stderr, "ioctl set is incorrect\n");
exit(1);
}
if( (t_create = pthread_create(&tracker, NULL, tracker_task,
&uffd)) != 0 )
{
errno = t_create;
perror("pthread_create");
}
//printf("mainline writing writable pages.\n");
for (unsigned long i = 0; i < n_pages; i++)
{
unsigned long entry = (i * PAGE_SIZE / sizeof(unsigned long)) + ((rand()%10000) % 512);//(i % 512);
region[entry] = i;
}
drop_caches_file = fopen(drop_caches_path, "w");
for (i = 0; i < iterations; i++)//
{
/* Indicate the range of pages to be write-protected */
//sync();
fwrite(clear, sizeof(clear), 1, drop_caches_file);
wp.range.start = (unsigned long long)region;
wp.range.len = PAGE_SIZE * n_pages;
wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
/* Write-protect pages */
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1)
{
perror("ioctl(UFFDIO_WRITEPROTECT)");
exit(1);
}
/* Now try to "touch" the pages to trigger page faults and handling by the tracker thread */
for (unsigned long i = 0; i < n_pages; i++)
{
unsigned long entry = (i * PAGE_SIZE / sizeof(unsigned long)) + ((rand()%10000) % 512);//(i % 512);
region[entry] = i;
}
}
fclose(drop_caches_file);
if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
fprintf(stderr, "ioctl unregister failure\n");
return 1;
}
main_tsc_end = rdtsc();
printf("Apllication TSC : %lu\n", (unsigned long) (main_tsc_end-main_tsc_start));
return 0;
}
I have built a custom hardware configuration in Vivado for Xilinx SoC board, and used petalinux to create a custom driver to control the hardware logic. It seems like after running insmod command, the driver is never initialized and the ->probe() function is not called.
I am new to this domain, and wondering if anyone ran into a similar issue and has some pointers on where and what to check in order to see where the issue is. Any advice would be very helpful!
Running the dmesg command after inserting the driver into the kernel:
root#plzwork3:/proc/device-tree/amba_pl#0/simpleMultiplier#a0000000# dmesg
[ 3351.680317] <1>Hello module world.
[ 3351.683735] <1>Module parameters were (0xdeadbeef) and "default"
Device Tree entity created by Vivado:
/ {
amba_pl: amba_pl#0 {
#address-cells = <2>;
#size-cells = <2>;
compatible = "simple-bus";
ranges ;
simpleMultiplier_0: simpleMultiplier#a0000000 {
clock-names = "axiliteregport_aclk";
clocks = <&zynqmp_clk 71>;
compatible = "xlnx,simpleMultiplier-1.0";
reg = <0x0 0xa0000000 0x0 0x10000>;
xlnx,axiliteregport-addr-width = <0x4>;
xlnx,axiliteregport-data-width = <0x20>;
};
};
};
Device driver code snippets:
#ifdef CONFIG_OF
static struct of_device_id simpmod_of_match[] = {
{ .compatible = "xlnx,simpleMultiplier-1.0", },
{ /* end of list */ },
};
Full device driver code:
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/io.h>
#include <linux/interrupt.h>
#include <linux/of_address.h>
#include <linux/of_device.h>
#include <linux/of_platform.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
#include <linux/fs.h>
/* Standard module information, edit as appropriate */
MODULE_LICENSE("GPL");
MODULE_AUTHOR
("Xilinx Inc.");
MODULE_DESCRIPTION("simpmod - loadable module template generated by petalinux-create -t modules");
#define DRIVER_NAME "simpmod"
#define CLASS_NAME "CLASS_TUT"
static char ker_buf[100];
static int currLen = 0;
// Parts of the math operation (2,2,+) is 2+2
static int operand_1 = 0;
static int operand_2 = 0;
static int result = 0;
static struct class *driver_class = NULL;
static dev_t first;
static struct cdev c_dev; // Global variable for the character device
static struct device *ourDevice;
// Pointer to the IP registers
volatile unsigned int *regA;
volatile unsigned int *regB;
volatile unsigned int *regC;
// Structure to hold device specific data
struct simpmod_local {
int irq;
unsigned long mem_start;
unsigned long mem_end;
void __iomem *base_addr;
};
// Character callbacks prototype
static int dev_open(struct inode *inod, struct file *fil);
static ssize_t dev_read(struct file *fil, char *buf, size_t len, loff_t *off);
static ssize_t dev_write(struct file *fil, const char *buf, size_t len, loff_t *off);
static int dev_release(struct inode *inod, struct file *fil);
static struct file_operations fops = {
.read=dev_read,
.write=dev_write,
.open=dev_open,
.release=dev_release,
};
static irqreturn_t simpmod_irq(int irq, void *lp){
printk("simpmod interrupt\n");
return IRQ_HANDLED;
}
static int simpmod_probe(struct platform_device *pdev){
struct resource *r_irq; /* Interrupt resources */
struct resource *r_mem; /* IO mem resources */
struct device *dev = &pdev->dev;
static struct simpmod_local *lp = NULL;
int rc = 0;
printk("Device Tree Probing\n");
// Get data of type IORESOURCE_MEM(reg-addr) from the device-tree
// Other types defined here:
// http://lxr.free-electrons.com/source/include/linux/ioport.h#L33
r_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!r_mem) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
// Allocate memory (continuous physical)to hold simpmod_local struct
lp = (struct simpmod_local *) kmalloc(sizeof(struct simpmod_local), GFP_KERNEL);
if (!lp) {
printk("Cound not allocate simpmod device\n");
return -ENOMEM;
}
dev_set_drvdata(dev, lp);
// Save data on simpmod_local strucutre
lp->mem_start = r_mem->start;
lp->mem_end = r_mem->end;
// Ask the kernel the memory region defined on the device-tree and
// prevent other drivers to overlap on this region
// This is needed before the ioremap
if (!request_mem_region(lp->mem_start,
lp->mem_end - lp->mem_start + 1,
DRIVER_NAME)) {
dev_err(dev, "Couldn't lock memory region at %p\n",
(void *)lp->mem_start);
rc = -EBUSY;
goto error1;
}
// Get an virtual address from the device physical address with a
// range size: lp->mem_end - lp->mem_start + 1
lp->base_addr = ioremap(lp->mem_start, lp->mem_end - lp->mem_start + 1);
if (!lp->base_addr) {
dev_err(dev, "simpmod: Could not allocate iomem\n");
rc = -EIO;
goto error2;
}
// ****************** NORMAL Device diver *************************
// register a range of char device numbers
if (alloc_chrdev_region(&first, 0, 1, "Leonardo") < 0){
printk(KERN_ALERT "alloc_chrdev_region failed\n");
return -1;
}
// Create class (/sysfs)
driver_class = class_create(THIS_MODULE, CLASS_NAME);
if (driver_class == NULL) {
printk(KERN_ALERT "Create class failed\n");
unregister_chrdev_region(first, 1);
return -1;
}
ourDevice = device_create(driver_class, NULL, first, NULL, "tutDevice");
if ( ourDevice == NULL){
printk(KERN_ALERT "Create device failed\n");
class_destroy(driver_class);
unregister_chrdev_region(first, 1);
return -1;
}
// Create a character device /dev/tutDevice
cdev_init(&c_dev, &fops);
if (cdev_add(&c_dev, first, 1) == -1){
printk(KERN_ALERT "Create character device failed\n");
device_destroy(driver_class, first);
class_destroy(driver_class);
unregister_chrdev_region(first, 1);
return -1;petalinux-config -c rootfs
}
// Create the attribute file on /sysfs/class/CLASS_TUT/ called
// parCrtl and isBusy
// Get data of type IORESOURCE_IRQ(interrupt) from the device-tree
r_irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
if (!r_irq) {
printk("no IRQ found\n");
printk("simpmod at 0x%08x mapped to 0x%08x\n",
(unsigned int __force)lp->mem_start,
(unsigned int __force)lp->base_addr);
// Configuring pointers to the IP registers
regA = (unsigned int __force)lp->base_addr + 0x10;
regB = (unsigned int __force)lp->base_addr + 0x18;
regC = (unsigned int __force)lp->base_addr + 0x26;
printk("regA: 0x%08x\n",(unsigned int)regA);
printk("regB: 0x%08x\n",(unsigned int)regB);
printk("regC: 0x%08x\n",(unsigned int)regC);
return 0;
}
lp->irq = r_irq->start;
rc = request_irq(lp->irq, &simpmod_irq, 0, DRIVER_NAME, lp);
if (rc) {
dev_err(dev, "testmodule: Could not allocate interrupt %d.\n",
lp->irq);
goto error3;
}
return 0;
error3:
free_irq(lp->irq, lp);
error2:
release_mem_region(lp->mem_start, lp->mem_end - lp->mem_start + 1);
error1:
kfree(lp);
dev_set_drvdata(dev, NULL);
return rc;
}
static int simpmod_remove(struct platform_device *pdev){
struct device *dev = &pdev->dev;
struct simpmod_local *lp = dev_get_drvdata(dev);
free_irq(lp->irq, lp);
release_mem_region(lp->mem_start, lp->mem_end - lp->mem_start + 1);
kfree(lp);
dev_set_drvdata(dev, NULL);
return 0;
}
// Indicate which type of hardware we handle on this case(simpleAlu-1.0)
#ifdef CONFIG_OF
static struct of_device_id simpmod_of_match[] = {
{ .compatible = "xlnx,simpleMultiplier-1.0", },
{ /* end of list */ },
};
MODULE_DEVICE_TABLE(of, simpmod_of_match);
#else
# define simpmod_of_match
#endif
static struct platform_driver simpmod_driver = {
.driver = {
.name = DRIVER_NAME,
.owner = THIS_MODULE,
.of_match_table = simpmod_of_match,
},
.probe = simpmod_probe,
.remove = simpmod_remove,
};
static int __init simpmod_init(void)
{
printk("<1>Simple device driver.\n");
printk("Hussam was here, and he init the module\n");
return platform_driver_register(&simpmod_driver);
}
static void __exit simpmod_exit(void)
{
platform_driver_unregister(&simpmod_driver);
printk(KERN_ALERT "Goodbye module world.\n");
}
static int dev_open(struct inode *inod, struct file *fil){
printk(KERN_ALERT "Character device opened\n");
return 0;
}
// Just send to the user a string with the value of result
static ssize_t dev_read(struct file *fil, char *buf, size_t len, loff_t *off){
// Return the result only once (otherwise a simple cat will loop)
// Copy from kernel space to user space
printk(KERN_ALERT "Reading device rx: %d\n",(int)len);
int n = sprintf(ker_buf, "%d\n", *regC);
// Copy back to user the result (to,from,size)
copy_to_user(buf,ker_buf,n);
printk(KERN_ALERT "Returning %s rx: %d\n",ker_buf,n);
return n;
}
// Parse the input stream ex: "50,2,*" to some operand variables.
static ssize_t dev_write(struct file *fil, const char *buf, size_t len, loff_t *off){
// Get data from user space to kernel space
copy_from_user(ker_buf,buf,len);
sscanf (ker_buf,"%d,%d,%c",&operand_1,&operand_2);
ker_buf[len] = 0;
// Change the IP registers to the parsed operands (on rega and regb)
*regA = (unsigned int)operand_1;
*regB = (unsigned int)operand_2;
printk(KERN_ALERT "Receiving math operation <%d %d>\n",operand_1,operand_2);
return len;
}
static int dev_release(struct inode *inod, struct file *fil){
printk(KERN_ALERT "Device closed\n");
return 0;
}
module_init(simpmod_init);
module_exit(simpmod_exit);
the problem is, that the (compatible) names in your device tree does not match with those in your code:
Device Tree
compatible = "simple-bus";
Code:
#ifdef CONFIG_OF
static struct of_device_id simpmod_of_match[] = {
{ .compatible = "xlnx,simpleMultiplier-1.0", },
{ /* end of list */ },
};
Put your device tree:
compatible = "xlnx,simpleMultiplier-1.0";
Both entries should match!
Now your "probe" function should be called!
BR
Andreas
your of_match table section in the driver code:
// Indicate which type of hardware we handle on this case(simpleAlu-1.0)
#ifdef CONFIG_OF
static struct of_device_id simpmod_of_match[] = {
{ .compatible = "xlnx,simpleMultiplier-1.0", },
{ /* end of list */ },
};
MODULE_DEVICE_TABLE(of, simpmod_of_match);
#else
# define simpmod_of_match
#endif
should be like this:
static struct of_device_id simpmod_of_match[] = {
{ .compatible = "xlnx,simpleMultiplier-1.0", },
{ /* end of list */ },
};
MODULE_DEVICE_TABLE(of, simpmod_of_match);
and make sure the compatible line strings have to match in the pl.dtsi and of_match table
I want to write a kernel module that uses Netfilter to modify the source ip to "100.100.100.100" whiche packet the Destination IP is "192.68.4.103" and the protocol is TCP.
I write a .c file but when i install this module ,it will cause the computer crash. How should I rewrite it?
The system is ubuntu16.04 . Here is the code I wrote:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include<linux/inet.h>
#include<net/ip.h>
#include<net/tcp.h>
#include <linux/netdevice.h>
#include <linux/inet.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("LINHOS");
#define IF_NAME "eno1" //Network name which is got by command "ifconfig"
struct nf_hook_ops nfho;
static unsigned int checkIP(
unsigned int hooknum,
struct sk_buff *__skb,
const struct net_device *in,
const struct net_device *out,
int(*okfn)(struct sk_buff *)){
struct sk_buff *skb;
struct net_device *dev;
struct iphdr *iph;
struct tcphdr *tcph;
int tot_len;
unsigned int iph_len;
int tcph_len;
int ret;
skb = __skb;
if (skb == NULL)
return NF_ACCEPT;
iph = ip_hdr(skb);
if (iph == NULL)
return NF_ACCEPT;
tot_len = ntohs(iph->tot_len);
if (iph->daddr =="192.68.4.103") { //locla ip
iph_len = ip_hdrlen(skb);
skb_pull(skb, iph_len);
skb_reset_transport_header(skb);
if (iph->protocol == IPPROTO_TCP) {
tcph = tcp_hdr(skb);
tcph_len = tcp_hdrlen(skb);
iph->saddr = in_aton("100.100.100.100");
dev = dev_get_by_name(&init_net, IF_NAME);
tcph->check = 0;
skb->csum = csum_partial((unsigned char *) tcph,
tot_len - iph_len,
0);
tcph->check = csum_tcpudp_magic(iph->saddr,
iph->daddr,
ntohs(iph->tot_len) - iph_len,
iph->protocol,
skb->csum);
iph->check = 0;
iph->check = ip_fast_csum(iph, iph->ihl);
skb->ip_summed = CHECKSUM_NONE;
skb->pkt_type = PACKET_HOST;
skb->dev = dev;
skb_push(skb, iph_len);
skb_push(skb, ETH_ALEN);
ret = dev_queue_xmit(skb);
if (ret < 0) {
printk(KERN_ERR "dev_queue_xmit() error!\n");
return NF_DROP;
}
return NF_STOLEN;
}
skb_push(skb, iph_len);
skb_reset_transport_header(skb);
}
return NF_ACCEPT;
}
static int __initfilter_init(void){
printk("----------------checkIP is OK!--------------");
nfho.hook = checkIP;
nfho.pf = AF_INET;
nfho.hooknum = NF_INET_PRE_ROUTING;
nfho.priority = NF_IP_PRI_FIRST;
int ret = nf_register_hook(&nfho);
if (ret < 0) {
printk(KERN_ERR "can't modify skb hook!");
return ret;
}
return 0;
}
static void filter_exit(void) {
nf_unregister_hook(&nfho);
}
module_init(filter_init);
module_exit(filter_exit);
Could anyone explain to me what I am doing wrong?
I am trying to understand procfs for communication between userspace and kernel module. My module has basic two functions for procfs write_proc, driver_mmap.
I call multiple times write_proc by calling fputs("123456789",fd). where fd is file descriptor to procfs entry in /proc directory. But I don't see write_proc called multiple time. Code is attached by here.
<code>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <sound/core.h>
#include <sound/initval.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h> /* mmap related stuff */
#define BUF_SIZE 64 * 1024
int *MmapBuffer;
int Factor = 1;
static int write_proc(struct file *filp, int *buf, size_t count, loff_t *offp)
{
int rc,i;
printk("in Write \n");
for (i = 1; i <= 16*1024 ; i++)
MmapBuffer[i-1] = (i+1)*Factor;
Factor++;
return count;
}
static int driver_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret;
vma->vm_flags |= VM_LOCKED|VM_SHARED;
ret = remap_pfn_range(vma, vma->vm_start,
virt_to_phys(MmapBuffer) >> PAGE_SHIFT,
vma->vm_end-vma->vm_start, vma->vm_page_prot);
if(ret != 0)
printk("MMAP Failed \n");
SetPageReserved(virt_to_page(MmapBuffer));
printk("MMAP Succeeded \n");
return 0;
}
// file operations
struct file_operations proc_fops =
{
.write = write_proc,
.mmap = driver_mmap,
};
// init module
int init_module_test(void)
{
printk("<1>Hello world\n");
MmapBuffer = kzalloc(BUF_SIZE,__GFP_COLD|GFP_DMA);
if(MmapBuffer == NULL)
printk("Kzalloc failed. reduce buffer size \n");
proc_create ("Test_fs",0,NULL, &proc_fops);
return 0;
}
// exit module
void cleanup_module_test(void)
{
kfree(MmapBuffer);
remove_proc_entry ("Test_fs", NULL);
printk("Goodbye world\n");
}
module_init(init_module_test);
module_exit(cleanup_module_test);
MODULE_LICENSE("GPL");
</code>
Application code
<code>
#include<stdio.h>
#include<stdlib.h>
#include<sys/mman.h>
#include<errno.h>
#include <fcntl.h>
int main(void)
{
int fd;
int i,j;
int *msg ;
printf("Allocation started \n ");
msg=(int*)malloc(64*1024);
if(msg == NULL)
printf("Allocation failed \n");
//unsigned int *addr;
printf("Starting opening \n ");
if((fd=open("/proc/Test_fs", O_RDONLY ))<0)
{
printf("File not opened ");
}
printf("Starting mapping \n ");
msg = mmap(NULL, 64*1024, PROT_READ, MAP_SHARED , fd, 0);
printf("done from module \n ");
if(msg == MAP_FAILED)
{
printf("MAP failed and error is %s", strerror(errno));
return 0;
}
close(fd);
printf("Successful mapping");
FILE *f;
f=fopen("/proc/Test_fs", "wr");
if(!f)
{
printf("File not opened ");
}
for (j = 0; j < 10 ; j++)
{
if(fputs("1234567890,",f) <= 0)
printf("write failed, ");
for (i = 0; i < 16*1024 ; i++)
printf("%d, ", msg[i]);
printf("\n \n done \n \n ");
}
fclose(f);
return 0;
}
</code>
Let's say the buffer is allocated using a page based scheme. One way to implement mmap would be to use remap_pfn_range but LDD3 says this does not work for conventional memory. It appears we can work around this by marking the page(s) reserved using SetPageReserved so that it gets locked in memory. But isn't all kernel memory already non-swappable i.e. already reserved? Why the need to set the reserved bit explicitly?
Does this have something to do with pages allocated from HIGH_MEM?
The simplest way to map a set of pages from the kernel in your mmap method is to use the fault handler to map the pages. Basically you end up with something like:
static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
vma->vm_ops = &my_vm_ops;
return 0;
}
static const struct file_operations my_fops = {
.owner = THIS_MODULE,
.open = nonseekable_open,
.mmap = my_mmap,
.llseek = no_llseek,
};
(where the other file operations are whatever your module needs). Also in my_mmap you do whatever range checking etc. is needed to validate the mmap parameters.
Then the vm_ops look like:
static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
vmf->page = my_page_at_index(vmf->pgoff);
get_page(vmf->page);
return 0;
}
static const struct vm_operations_struct my_vm_ops = {
.fault = my_fault
}
where you just need to figure out for a given vma / vmf passed to your fault function which page to map into userspace. This depends on exactly how your module works. For example, if you did
my_buf = vmalloc_user(MY_BUF_SIZE);
then the page you use would be something like
vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));
But you could easily create an array and allocate a page for each entry, use kmalloc, whatever.
[just noticed that my_fault is a slightly amusing name for a function]
Minimal runnable example and userland test
Kernel module:
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h> /* min */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h> /* copy_from_user, copy_to_user */
#include <linux/slab.h>
static const char *filename = "lkmc_mmap";
enum { BUFFER_SIZE = 4 };
struct mmap_info {
char *data;
};
/* After unmap. */
static void vm_close(struct vm_area_struct *vma)
{
pr_info("vm_close\n");
}
/* First page access. */
static vm_fault_t vm_fault(struct vm_fault *vmf)
{
struct page *page;
struct mmap_info *info;
pr_info("vm_fault\n");
info = (struct mmap_info *)vmf->vma->vm_private_data;
if (info->data) {
page = virt_to_page(info->data);
get_page(page);
vmf->page = page;
}
return 0;
}
/* After mmap. TODO vs mmap, when can this happen at a different time than mmap? */
static void vm_open(struct vm_area_struct *vma)
{
pr_info("vm_open\n");
}
static struct vm_operations_struct vm_ops =
{
.close = vm_close,
.fault = vm_fault,
.open = vm_open,
};
static int mmap(struct file *filp, struct vm_area_struct *vma)
{
pr_info("mmap\n");
vma->vm_ops = &vm_ops;
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_private_data = filp->private_data;
vm_open(vma);
return 0;
}
static int open(struct inode *inode, struct file *filp)
{
struct mmap_info *info;
pr_info("open\n");
info = kmalloc(sizeof(struct mmap_info), GFP_KERNEL);
pr_info("virt_to_phys = 0x%llx\n", (unsigned long long)virt_to_phys((void *)info));
info->data = (char *)get_zeroed_page(GFP_KERNEL);
memcpy(info->data, "asdf", BUFFER_SIZE);
filp->private_data = info;
return 0;
}
static ssize_t read(struct file *filp, char __user *buf, size_t len, loff_t *off)
{
struct mmap_info *info;
ssize_t ret;
pr_info("read\n");
if ((size_t)BUFFER_SIZE <= *off) {
ret = 0;
} else {
info = filp->private_data;
ret = min(len, (size_t)BUFFER_SIZE - (size_t)*off);
if (copy_to_user(buf, info->data + *off, ret)) {
ret = -EFAULT;
} else {
*off += ret;
}
}
return ret;
}
static ssize_t write(struct file *filp, const char __user *buf, size_t len, loff_t *off)
{
struct mmap_info *info;
pr_info("write\n");
info = filp->private_data;
if (copy_from_user(info->data, buf, min(len, (size_t)BUFFER_SIZE))) {
return -EFAULT;
} else {
return len;
}
}
static int release(struct inode *inode, struct file *filp)
{
struct mmap_info *info;
pr_info("release\n");
info = filp->private_data;
free_page((unsigned long)info->data);
kfree(info);
filp->private_data = NULL;
return 0;
}
static const struct file_operations fops = {
.mmap = mmap,
.open = open,
.release = release,
.read = read,
.write = write,
};
static int myinit(void)
{
proc_create(filename, 0, NULL, &fops);
return 0;
}
static void myexit(void)
{
remove_proc_entry(filename, NULL);
}
module_init(myinit)
module_exit(myexit)
MODULE_LICENSE("GPL");
GitHub upstream.
Userland test:
#define _XOPEN_SOURCE 700
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> /* uintmax_t */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h> /* sysconf */
/* Format documented at:
* https://github.com/torvalds/linux/blob/v4.9/Documentation/vm/pagemap.txt
*/
typedef struct {
uint64_t pfn : 54;
unsigned int soft_dirty : 1;
unsigned int file_page : 1;
unsigned int swapped : 1;
unsigned int present : 1;
} PagemapEntry;
/* Parse the pagemap entry for the given virtual address.
*
* #param[out] entry the parsed entry
* #param[in] pagemap_fd file descriptor to an open /proc/pid/pagemap file
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
size_t nread;
ssize_t ret;
uint64_t data;
nread = 0;
while (nread < sizeof(data)) {
ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data),
(vaddr / sysconf(_SC_PAGE_SIZE)) * sizeof(data) + nread);
nread += ret;
if (ret <= 0) {
return 1;
}
}
entry->pfn = data & (((uint64_t)1 << 54) - 1);
entry->soft_dirty = (data >> 54) & 1;
entry->file_page = (data >> 61) & 1;
entry->swapped = (data >> 62) & 1;
entry->present = (data >> 63) & 1;
return 0;
}
/* Convert the given virtual address to physical using /proc/PID/pagemap.
*
* #param[out] paddr physical address
* #param[in] pid process to convert for
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
char pagemap_file[BUFSIZ];
int pagemap_fd;
snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
pagemap_fd = open(pagemap_file, O_RDONLY);
if (pagemap_fd < 0) {
return 1;
}
PagemapEntry entry;
if (pagemap_get_entry(&entry, pagemap_fd, vaddr)) {
return 1;
}
close(pagemap_fd);
*paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
return 0;
}
enum { BUFFER_SIZE = 4 };
int main(int argc, char **argv)
{
int fd;
long page_size;
char *address1, *address2;
char buf[BUFFER_SIZE];
uintptr_t paddr;
if (argc < 2) {
printf("Usage: %s <mmap_file>\n", argv[0]);
return EXIT_FAILURE;
}
page_size = sysconf(_SC_PAGE_SIZE);
printf("open pathname = %s\n", argv[1]);
fd = open(argv[1], O_RDWR | O_SYNC);
if (fd < 0) {
perror("open");
assert(0);
}
printf("fd = %d\n", fd);
/* mmap twice for double fun. */
puts("mmap 1");
address1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (address1 == MAP_FAILED) {
perror("mmap");
assert(0);
}
puts("mmap 2");
address2 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (address2 == MAP_FAILED) {
perror("mmap");
return EXIT_FAILURE;
}
assert(address1 != address2);
/* Read and modify memory. */
puts("access 1");
assert(!strcmp(address1, "asdf"));
/* vm_fault */
puts("access 2");
assert(!strcmp(address2, "asdf"));
/* vm_fault */
strcpy(address1, "qwer");
/* Also modified. So both virtual addresses point to the same physical address. */
assert(!strcmp(address2, "qwer"));
/* Check that the physical addresses are the same.
* They are, but TODO why virt_to_phys on kernel gives a different value? */
assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address1));
printf("paddr1 = 0x%jx\n", (uintmax_t)paddr);
assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address2));
printf("paddr2 = 0x%jx\n", (uintmax_t)paddr);
/* Check that modifications made from userland are also visible from the kernel. */
read(fd, buf, BUFFER_SIZE);
assert(!memcmp(buf, "qwer", BUFFER_SIZE));
/* Modify the data from the kernel, and check that the change is visible from userland. */
write(fd, "zxcv", 4);
assert(!strcmp(address1, "zxcv"));
assert(!strcmp(address2, "zxcv"));
/* Cleanup. */
puts("munmap 1");
if (munmap(address1, page_size)) {
perror("munmap");
assert(0);
}
puts("munmap 2");
if (munmap(address2, page_size)) {
perror("munmap");
assert(0);
}
puts("close");
close(fd);
return EXIT_SUCCESS;
}
GitHub upstream.
Tested on kernel 5.4.3.
Though the pages are reserved via a kernel driver, it is meant to be accessed via user space. As a result, the PTE (page table entries) do not know if the pfn belongs to user space or kernel space (even though they are allocated via kernel driver).
This is why they are marked with SetPageReserved.