How to get the size of the VDSO on a Linux x86_64 system - linux

I'd like to dump the VDSO to disk in a way that I can verify it is correct with objdump -D.
We can get the base address of the VDSO with getauxval(AT_SYSINFO_EHDR) as documented in vdso(7). But how does one get the size of the object?
I happen to know it is exactly two pages long, but I'm not certain I can rely on that.
I can't see anything in the ELF header that would indicate the size of the object as a whole, and I've also tried iterating and dumping the sections via dl_iterate_phdr(3) to no joy (presumably this would skip the ELF header?).
Any ideas? Do I really have to scrape the size out of the proc maps?

The VDSO header gives you the start of the file. To find the end, calculate the maximum of:
the end of all segments from the program header table (offset + size)
the end of the section header table
the end of the program header table
Then write everything between the start and end to disk. The following program should do the trick:
#include <stdlib.h>
#include <stdio.h>
#include <sys/auxv.h>
struct elf_fhdr_64
{
uint32_t magic;
uint8_t ei_class;
uint8_t ei_data;
uint8_t ei_version;
uint8_t ei_osabi;
uint8_t ei_abiversion;
uint8_t pad[7];
uint16_t e_type;
uint16_t e_machine;
uint32_t e_version;
uint64_t e_entry;
uint64_t e_phoff; // program header offset
uint64_t e_shoff;
uint32_t e_flags;
uint16_t e_ehsize;
uint16_t e_phentsize;
uint16_t e_phnum; // number of program headers
uint16_t e_shentsize;
uint16_t e_shnum;
// ...
};
struct elf_phdr_64
{
uint32_t p_type;
uint32_t p_flags;
uint64_t p_offset; // offset in file
uint64_t p_vaddr;
uint64_t p_paddr;
uint64_t p_filesz; // size in file
// ...
};
struct elf_shdr_64
{
uint32_t sh_name;
uint32_t sh_type;
uint64_t sh_flags;
uint64_t sh_addr; // virtual addr when loaded
uint64_t sh_offset; // offset in file
uint64_t sh_size; // size in file
// ...
};
int main(int argc, char **argv)
{
unsigned long vdso_hdr = getauxval(AT_SYSINFO_EHDR);
uint32_t elf_magic = * (uint32_t *)vdso_hdr;
if (elf_magic == 0x464C457F) {
printf("has elf magic, proceeding...\n");
}
else {
printf("no elf magic.\n");
exit(1);
}
struct elf_fhdr_64 * fhdrp = (struct elf_fhdr_64 *) vdso_hdr;
int num_phdrs = fhdrp->e_phnum;
uint16_t phentsize = fhdrp->e_phentsize;
long max_offs = 0;
for (int i = 0; i < num_phdrs; i++) {
struct elf_phdr_64 * phdr = (struct elf_phdr_64 *)(vdso_hdr
+ fhdrp->e_phoff + i * phentsize);
long file_offs = phdr->p_offset + phdr->p_filesz;
if (max_offs < file_offs) max_offs = file_offs;
}
int num_shdrs = fhdrp->e_shnum;
int shentsize = fhdrp->e_shentsize;
for (int i = 0; i < num_shdrs; i++) {
struct elf_shdr_64 * shdr = (struct elf_shdr_64 *)(vdso_hdr
+ fhdrp->e_shoff + i * shentsize);
long file_offs = shdr->sh_offset + shdr->sh_size;
if (max_offs < file_offs) max_offs = file_offs;
}
// section table:
int section_table_max = fhdrp->e_shoff + (num_shdrs * shentsize);
if (max_offs < section_table_max) max_offs = section_table_max;
// phdrs table:
int phdr_table_max = fhdrp->e_phoff + (num_phdrs * phentsize);
if (max_offs < phdr_table_max) max_offs = section_table_max;
FILE * outfile = fopen("test-vdso.so", "wb");
if (outfile) {
fwrite((void *) vdso_hdr, 1, max_offs, outfile);
fclose(outfile);
}
return 0;
}

Related

Which Linux kernel version support uffdio_writeprotect structure and how to compile and install that linux kernel?

I have try to compile a program which manage userfault-fd to collect some dirty pages on memory. In this program i have used the uffdio_writeprotect structure to survey a memory region where program try to access. I am using Ubuntu 18.04 with linux kernel version 5.14.0. But when i compile a program with the command gcc -o with_userfault -I. with_userfault.c
i still have the same errors such as: with_userfault.c:108:34: error: storage size of ‘wp’ isn’t known
struct uffdio_writeprotect wp;
with_userfault.c:114:21: error: ‘UFFDIO_WRITEPROTECT’ undeclared (first use in this function); did you mean ‘UFFDIO_REGISTER’?
if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) == -1)
I don't know how to solve this problem. Please I need your help to solve this issue because i don't know what i am suppose to do now. thanks!!!!!
this is a source code:
The header file:
#ifndef __RDTSC_H_DEFINED__
#define __RDTSC_H_DEFINED__
#if defined(__i386__)
static __inline__ unsigned long long rdtsc(void)
{
unsigned long long int x;
__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
return x;
}
#elif defined(__x86_64__)
static __inline__ unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
#elif defined(__powerpc__)
static __inline__ unsigned long long rdtsc(void)
{
unsigned long long int result=0;
unsigned long int upper, lower,tmp;
__asm__ volatile(
"0: \n"
"\tmftbu %0 \n"
"\tmftb %1 \n"
"\tmftbu %2 \n"
"\tcmpw %2,%0 \n"
"\tbne 0b \n"
: "=r"(upper),"=r"(lower),"=r"(tmp)
);
result = upper;
result = result<<32;
result = result|lower;
return(result);
}
#else
#error "No tick counter is available!"
#endif
/* $RCSfile: $ $Author: kazutomo $
* $Revision: 1.6 $ $Date: 2005/04/13 18:49:58 $
*/
#endif
The source file
/*
* Example program about using userfaultfd(2) for garbage collection.
*
* This establishes a couple pages, all of which are filled from
* compressed files on disk when first accessed. For simplicity
* these are
* one file per page. Files are written at the beginning of the
* program.
*
* Later, this program demonstrates the use of write protection to
* get
* a notification on write access, analogous to using
* mprotect(!PROT_WRITE)
* and doing the bookkeeping in a SIGSEGV handler.
*
*/
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <time.h>
#include <math.h>
#include <unistd.h>
#include <strings.h>
#include <string.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <poll.h>
#include <pthread.h>
#include "rdtsc.h"
//#define size 102400
#define FILE_BUF 25
#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
#define n_pages 102400
//102400
#define iterations 1
pthread_t tracker;
void *tracker_task();
int i;
unsigned long main_tsc_start, main_tsc_end;
// This is doing the work in the uffd handler thread
void *tracker_task(void *data)
{
int fd = *(int *)(data);
for (;;)
{
struct uffd_msg msg;
struct pollfd pollfd;
int pollres, readret;
unsigned long addr, page_begin, whichpage;
unsigned long handler_tsc_start, handler_tsc_end;
pollfd.fd = fd;
pollfd.events = POLLIN;
pollres = poll(&pollfd, 1, -1);
if( pollres == -1 )
perror("poll");
if (pollfd.revents & POLLERR)
{
fprintf(stderr, "POLLERR on userfaultfd\n");
exit(1);
}
readret = read(fd, &msg, sizeof(msg));
if (readret == -1)
perror("read userfaultfd");
if (readret != sizeof(msg))
{
fprintf(stderr, "short read, not expected, exiting\n");
exit(1);
}
/*
* Proper sequence is important here.
*
* For the GC we expect that write-protected pages can only
* be pages already backed by physical pages.
* Regular writes into unprotected pages that come before
* reads need the page be filled.
*
* So we do the WP case first and get it out of the way.
* Then both of the other cases need the page read.
*/
if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)
{
// send write unlock
struct uffdio_writeprotect wp;
wp.range.start = msg.arg.pagefault.address;
wp.range.len = PAGE_SIZE;
wp.mode = 0;
//printf("sending !UFFDIO_WRITEPROTECT event to
//userfaultfd\n");
if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) == -1)
perror("ioctl(UFFDIO_WRITEPROTECT)");
//continue;
}
}
printf("end\n");
return NULL;
}
int main(int argc, char *argv[])
{
unsigned long *region;
int uffd, uffd_flags, expected, t_create;
void *status;
struct uffdio_writeprotect wp;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
char *clear = "3", *drop_caches_path = "/proc/sys/vm/drop_caches";
FILE *drop_caches_file;
main_tsc_start = rdtsc();
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
{
perror("syscall");
exit(2);
}
uffdio_api.api = UFFD_API;
uffdio_api.features = 1;
if (ioctl(uffd, UFFDIO_API, &uffdio_api))
{
fprintf(stderr, "UFFDIO_API\n");
return 1;
}
//printf("Features: 0x%llx\n", uffdio_api.features);
if (uffdio_api.api != UFFD_API)
{
fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
return 1;
}
/* Allocate memory that will be tracked */
region = (unsigned long *) mmap(NULL, PAGE_SIZE * n_pages, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (!region)
{
perror("mmap");
exit(2);
}
/* Force alignment of contiguous pages */
if (posix_memalign((void **)region, PAGE_SIZE, PAGE_SIZE * (n_pages - 1)))
{
fprintf(stderr, "cannot align by PAGE_SIZE %ld\n", PAGE_SIZE);
exit(1);
}
uffdio_register.range.start = (unsigned long)region;
uffdio_register.range.len = PAGE_SIZE * n_pages;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
//main_tsc_start = rdtsc();
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
{
perror("ioctl(UFFDIO_REGISTER)");
exit(1);
}
expected = UFFD_API_RANGE_IOCTLS;
if ((uffdio_register.ioctls & expected) != expected)
{
fprintf(stderr, "ioctl set is incorrect\n");
exit(1);
}
if( (t_create = pthread_create(&tracker, NULL, tracker_task,
&uffd)) != 0 )
{
errno = t_create;
perror("pthread_create");
}
//printf("mainline writing writable pages.\n");
for (unsigned long i = 0; i < n_pages; i++)
{
unsigned long entry = (i * PAGE_SIZE / sizeof(unsigned long)) + ((rand()%10000) % 512);//(i % 512);
region[entry] = i;
}
drop_caches_file = fopen(drop_caches_path, "w");
for (i = 0; i < iterations; i++)//
{
/* Indicate the range of pages to be write-protected */
//sync();
fwrite(clear, sizeof(clear), 1, drop_caches_file);
wp.range.start = (unsigned long long)region;
wp.range.len = PAGE_SIZE * n_pages;
wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
/* Write-protect pages */
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1)
{
perror("ioctl(UFFDIO_WRITEPROTECT)");
exit(1);
}
/* Now try to "touch" the pages to trigger page faults and handling by the tracker thread */
for (unsigned long i = 0; i < n_pages; i++)
{
unsigned long entry = (i * PAGE_SIZE / sizeof(unsigned long)) + ((rand()%10000) % 512);//(i % 512);
region[entry] = i;
}
}
fclose(drop_caches_file);
if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
fprintf(stderr, "ioctl unregister failure\n");
return 1;
}
main_tsc_end = rdtsc();
printf("Apllication TSC : %lu\n", (unsigned long) (main_tsc_end-main_tsc_start));
return 0;
}

Strange byte alignment between windows and Linux gcc compiler

I see a strange behavior of reading a bin file and mapping in to the structure in the windows compared to linux gcc compiler.
Below is my c code:
#include <inttypes.h>
#include <stdio.h>
#include <fcntl.h>
#define IV_MAX_LEN 32
#define HASH_MAX_LEN 64
#define MAX_NUM_IMGS 8
#define MAX_NUM_SRK_RECORDS 4
#define SKIP_OFFSET_1K 0x400
typedef struct {
uint8_t version;
uint16_t length;
uint8_t tag;
uint16_t srk_table_offset;
uint16_t cert_offset;
uint16_t blob_offset;
uint16_t signature_offset;
uint32_t reserved;
} __attribute__((packed)) blk_hdr_t;
typedef struct {
uint32_t offset;
uint32_t size;
uint64_t dst;
uint64_t entry;
uint32_t hab_flags;
uint32_t meta;
uint8_t hash[HASH_MAX_LEN];
uint8_t iv[IV_MAX_LEN];
} __attribute__((packed)) boot_t;
typedef struct {
uint8_t version;
uint16_t length;
uint8_t tag;
uint32_t flags;
uint16_t sw_version;
uint8_t fuse_version;
uint8_t num_images;
uint16_t sig_blk_offset;
uint16_t reserved;
boot_t img[MAX_NUM_IMGS];
blk_hdr_t blk_hdr;
uint32_t sigblk_size;
uint32_t padding;
} __attribute__((packed)) headerMain_t;
int main()
{
int ofd =1;
char filename[] = "sample.bin";
int readSizes= 0;
int headerSizes= 0;
headerMain_t header;
uint8_t *byte;
ofd = open(filename, O_RDONLY);
printf("\nOPENING File: %s!\n", filename);
printf("\n STRUCT sizes: %d, %d, %d,\n", sizeof(headerMain_t), sizeof(boot_t), sizeof(blk_hdr_t));
#if 0
if(lseek(ofd, SKIP_OFFSET_1K, SEEK_SET) < 0) {
printf("Error Read \n");
}
#endif
readSizes = read(ofd, &header, sizeof(header)) ;
headerSizes = sizeof(header);
printf("\n Read SIZE: %d / %d !",readSizes,headerSizes);
printf("\n Read Bytes: \n");
byte = (uint8_t*)&header;
for(int i=0; i<readSizes; i++)
{
printf("0x%02x, ",*byte);
byte++;
if(0 ==i%20 )
printf("\n");
}
return 0;
}
and here is the input binary file it reads. (This sample bin file is 0x01 (20 times)... 0xFF(20 times) = so 255 x 20 = 5100 bytes)
The same code is compiled and run in windows mingW-gcc and linux gcc.
Following are the strange observations seen in the windows run:
blk_hdr_t struct although 4 byte aligned and 16 bytes it total: 22 bytes
(Update: I found that this solution worked with -mno-ms-bitfields option)
Although 5100 bytes are available, read() function could read only 1000 bytes. what choked it further to read ?
read() also puts the "holes" with "0x00" bytes value in it. I don't understand this strange behaviour.
Enabling the lseek to skip first 1024 bytes shall reach the end of the file.
Everything looks perfect on the linux result (although point 3) does exist on the linux side too: although this is trivial for me, i'm curious on the behaviour)
so finally, How can make this code on windows gcc with the Exact result as in the linux gcc ?
could anyone please enlighten me ?
(The parameters in the structures cannot be reshuffled)

accessing i2c platform device from userspace program

I'm trying to access an 24c256 eeprom content from user space in a am335x_starter_kit.
I dont have to add eeprom driver into kernel and make modifications in board.c file because board already uses eeprom to access some board configuration and Mac address information.
I just want to access eeprom content from user space.
I used read and write functions for character devices before but i2c platform devices doesnt have these functions.
struct i2c_driver {
unsigned int class;
int (* attach_adapter) (struct i2c_adapter *);
int (* probe) (struct i2c_client *, const struct i2c_device_id *);
int (* remove) (struct i2c_client *);
void (* shutdown) (struct i2c_client *);
void (* alert) (struct i2c_client *, unsigned int data);
int (* command) (struct i2c_client *client, unsigned int cmd, void *arg);
struct device_driver driver;
const struct i2c_device_id * id_table;
int (* detect) (struct i2c_client *, struct i2c_board_info *);
const unsigned short * address_list;
struct list_head clients;
};
This is the eeprom driver. Board file uses it from kernel to get mac address and board configuration data.
/*
* at24.c - handle most I2C EEPROMs
*
* Copyright (C) 2005-2007 David Brownell
* Copyright (C) 2008 Wolfram Sang, Pengutronix
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/sysfs.h>
#include <linux/mod_devicetable.h>
#include <linux/log2.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/of.h>
#include <linux/i2c.h>
#include <linux/i2c/at24.h>
/*
* I2C EEPROMs from most vendors are inexpensive and mostly interchangeable.
* Differences between different vendor product lines (like Atmel AT24C or
* MicroChip 24LC, etc) won't much matter for typical read/write access.
* There are also I2C RAM chips, likewise interchangeable. One example
* would be the PCF8570, which acts like a 24c02 EEPROM (256 bytes).
*
* However, misconfiguration can lose data. "Set 16-bit memory address"
* to a part with 8-bit addressing will overwrite data. Writing with too
* big a page size also loses data. And it's not safe to assume that the
* conventional addresses 0x50..0x57 only hold eeproms; a PCF8563 RTC
* uses 0x51, for just one example.
*
* Accordingly, explicit board-specific configuration data should be used
* in almost all cases. (One partial exception is an SMBus used to access
* "SPD" data for DRAM sticks. Those only use 24c02 EEPROMs.)
*
* So this driver uses "new style" I2C driver binding, expecting to be
* told what devices exist. That may be in arch/X/mach-Y/board-Z.c or
* similar kernel-resident tables; or, configuration data coming from
* a bootloader.
*
* Other than binding model, current differences from "eeprom" driver are
* that this one handles write access and isn't restricted to 24c02 devices.
* It also handles larger devices (32 kbit and up) with two-byte addresses,
* which won't work on pure SMBus systems.
*/
struct at24_data {
struct at24_platform_data chip;
struct memory_accessor macc;
int use_smbus;
/*
* Lock protects against activities from other Linux tasks,
* but not from changes by other I2C masters.
*/
struct mutex lock;
struct bin_attribute bin;
u8 *writebuf;
unsigned write_max;
unsigned num_addresses;
/*
* Some chips tie up multiple I2C addresses; dummy devices reserve
* them for us, and we'll use them with SMBus calls.
*/
struct i2c_client *client[];
};
/*
* This parameter is to help this driver avoid blocking other drivers out
* of I2C for potentially troublesome amounts of time. With a 100 kHz I2C
* clock, one 256 byte read takes about 1/43 second which is excessive;
* but the 1/170 second it takes at 400 kHz may be quite reasonable; and
* at 1 MHz (Fm+) a 1/430 second delay could easily be invisible.
*
* This value is forced to be a power of two so that writes align on pages.
*/
static unsigned io_limit = 128;
module_param(io_limit, uint, 0);
MODULE_PARM_DESC(io_limit, "Maximum bytes per I/O (default 128)");
/*
* Specs often allow 5 msec for a page write, sometimes 20 msec;
* it's important to recover from write timeouts.
*/
static unsigned write_timeout = 25;
module_param(write_timeout, uint, 0);
MODULE_PARM_DESC(write_timeout, "Time (in ms) to try writes (default 25)");
#define AT24_SIZE_BYTELEN 5
#define AT24_SIZE_FLAGS 8
#define AT24_BITMASK(x) (BIT(x) - 1)
/* create non-zero magic value for given eeprom parameters */
#define AT24_DEVICE_MAGIC(_len, _flags) \
((1 << AT24_SIZE_FLAGS | (_flags)) \
<< AT24_SIZE_BYTELEN | ilog2(_len))
static const struct i2c_device_id at24_ids[] = {
/* needs 8 addresses as A0-A2 are ignored */
{ "24c00", AT24_DEVICE_MAGIC(128 / 8, AT24_FLAG_TAKE8ADDR) },
/* old variants can't be handled with this generic entry! */
{ "24c01", AT24_DEVICE_MAGIC(1024 / 8, 0) },
{ "24c02", AT24_DEVICE_MAGIC(2048 / 8, 0) },
/* spd is a 24c02 in memory DIMMs */
{ "spd", AT24_DEVICE_MAGIC(2048 / 8,
AT24_FLAG_READONLY | AT24_FLAG_IRUGO) },
{ "24c04", AT24_DEVICE_MAGIC(4096 / 8, 0) },
/* 24rf08 quirk is handled at i2c-core */
{ "24c08", AT24_DEVICE_MAGIC(8192 / 8, 0) },
{ "24c16", AT24_DEVICE_MAGIC(16384 / 8, 0) },
{ "24c32", AT24_DEVICE_MAGIC(32768 / 8, AT24_FLAG_ADDR16) },
{ "24c64", AT24_DEVICE_MAGIC(65536 / 8, AT24_FLAG_ADDR16) },
{ "24c128", AT24_DEVICE_MAGIC(131072 / 8, AT24_FLAG_ADDR16) },
{ "24c256", AT24_DEVICE_MAGIC(262144 / 8, AT24_FLAG_ADDR16) },
{ "24c512", AT24_DEVICE_MAGIC(524288 / 8, AT24_FLAG_ADDR16) },
{ "24c1024", AT24_DEVICE_MAGIC(1048576 / 8, AT24_FLAG_ADDR16) },
{ "at24", 0 },
{ /* END OF LIST */ }
};
MODULE_DEVICE_TABLE(i2c, at24_ids);
/*-------------------------------------------------------------------------*/
/*
* This routine supports chips which consume multiple I2C addresses. It
* computes the addressing information to be used for a given r/w request.
* Assumes that sanity checks for offset happened at sysfs-layer.
*/
static struct i2c_client *at24_translate_offset(struct at24_data *at24,
unsigned *offset)
{
unsigned i;
if (at24->chip.flags & AT24_FLAG_ADDR16) {
i = *offset >> 16;
*offset &= 0xffff;
} else {
i = *offset >> 8;
*offset &= 0xff;
}
return at24->client[i];
}
static ssize_t at24_eeprom_read(struct at24_data *at24, char *buf,
unsigned offset, size_t count)
{
struct i2c_msg msg[2];
u8 msgbuf[2];
struct i2c_client *client;
unsigned long timeout, read_time;
int status, i;
memset(msg, 0, sizeof(msg));
/*
* REVISIT some multi-address chips don't rollover page reads to
* the next slave address, so we may need to truncate the count.
* Those chips might need another quirk flag.
*
* If the real hardware used four adjacent 24c02 chips and that
* were misconfigured as one 24c08, that would be a similar effect:
* one "eeprom" file not four, but larger reads would fail when
* they crossed certain pages.
*/
/*
* Slave address and byte offset derive from the offset. Always
* set the byte address; on a multi-master board, another master
* may have changed the chip's "current" address pointer.
*/
client = at24_translate_offset(at24, &offset);
if (count > io_limit)
count = io_limit;
switch (at24->use_smbus) {
case I2C_SMBUS_I2C_BLOCK_DATA:
/* Smaller eeproms can work given some SMBus extension calls */
if (count > I2C_SMBUS_BLOCK_MAX)
count = I2C_SMBUS_BLOCK_MAX;
break;
case I2C_SMBUS_WORD_DATA:
count = 2;
break;
case I2C_SMBUS_BYTE_DATA:
count = 1;
break;
default:
/*
* When we have a better choice than SMBus calls, use a
* combined I2C message. Write address; then read up to
* io_limit data bytes. Note that read page rollover helps us
* here (unlike writes). msgbuf is u8 and will cast to our
* needs.
*/
i = 0;
if (at24->chip.flags & AT24_FLAG_ADDR16)
msgbuf[i++] = offset >> 8;
msgbuf[i++] = offset;
msg[0].addr = client->addr;
msg[0].buf = msgbuf;
msg[0].len = i;
msg[1].addr = client->addr;
msg[1].flags = I2C_M_RD;
msg[1].buf = buf;
msg[1].len = count;
}
/*
* Reads fail if the previous write didn't complete yet. We may
* loop a few times until this one succeeds, waiting at least
* long enough for one entire page write to work.
*/
timeout = jiffies + msecs_to_jiffies(write_timeout);
do {
read_time = jiffies;
switch (at24->use_smbus) {
case I2C_SMBUS_I2C_BLOCK_DATA:
status = i2c_smbus_read_i2c_block_data(client, offset,
count, buf);
break;
case I2C_SMBUS_WORD_DATA:
status = i2c_smbus_read_word_data(client, offset);
if (status >= 0) {
buf[0] = status & 0xff;
buf[1] = status >> 8;
status = count;
}
break;
case I2C_SMBUS_BYTE_DATA:
status = i2c_smbus_read_byte_data(client, offset);
if (status >= 0) {
buf[0] = status;
status = count;
}
break;
default:
status = i2c_transfer(client->adapter, msg, 2);
if (status == 2)
status = count;
}
dev_dbg(&client->dev, "read %zu#%d --> %d (%ld)\n",
count, offset, status, jiffies);
if (status == count)
return count;
/* REVISIT: at HZ=100, this is sloooow */
msleep(1);
} while (time_before(read_time, timeout));
return -ETIMEDOUT;
}
static ssize_t at24_read(struct at24_data *at24,
char *buf, loff_t off, size_t count)
{
ssize_t retval = 0;
if (unlikely(!count))
return count;
/*
* Read data from chip, protecting against concurrent updates
* from this host, but not from other I2C masters.
*/
mutex_lock(&at24->lock);
while (count) {
ssize_t status;
status = at24_eeprom_read(at24, buf, off, count);
if (status <= 0) {
if (retval == 0)
retval = status;
break;
}
buf += status;
off += status;
count -= status;
retval += status;
}
mutex_unlock(&at24->lock);
return retval;
}
static ssize_t at24_bin_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr,
char *buf, loff_t off, size_t count)
{
struct at24_data *at24;
at24 = dev_get_drvdata(container_of(kobj, struct device, kobj));
return at24_read(at24, buf, off, count);
}
/*
* Note that if the hardware write-protect pin is pulled high, the whole
* chip is normally write protected. But there are plenty of product
* variants here, including OTP fuses and partial chip protect.
*
* We only use page mode writes; the alternative is sloooow. This routine
* writes at most one page.
*/
static ssize_t at24_eeprom_write(struct at24_data *at24, const char *buf,
unsigned offset, size_t count)
{
struct i2c_client *client;
struct i2c_msg msg;
ssize_t status;
unsigned long timeout, write_time;
unsigned next_page;
/* Get corresponding I2C address and adjust offset */
client = at24_translate_offset(at24, &offset);
/* write_max is at most a page */
if (count > at24->write_max)
count = at24->write_max;
/* Never roll over backwards, to the start of this page */
next_page = roundup(offset + 1, at24->chip.page_size);
if (offset + count > next_page)
count = next_page - offset;
/* If we'll use I2C calls for I/O, set up the message */
if (!at24->use_smbus) {
int i = 0;
msg.addr = client->addr;
msg.flags = 0;
/* msg.buf is u8 and casts will mask the values */
msg.buf = at24->writebuf;
if (at24->chip.flags & AT24_FLAG_ADDR16)
msg.buf[i++] = offset >> 8;
msg.buf[i++] = offset;
memcpy(&msg.buf[i], buf, count);
msg.len = i + count;
}
/*
* Writes fail if the previous one didn't complete yet. We may
* loop a few times until this one succeeds, waiting at least
* long enough for one entire page write to work.
*/
timeout = jiffies + msecs_to_jiffies(write_timeout);
do {
write_time = jiffies;
if (at24->use_smbus) {
status = i2c_smbus_write_i2c_block_data(client,
offset, count, buf);
if (status == 0)
status = count;
} else {
status = i2c_transfer(client->adapter, &msg, 1);
if (status == 1)
status = count;
}
dev_dbg(&client->dev, "write %zu#%d --> %zd (%ld)\n",
count, offset, status, jiffies);
if (status == count)
return count;
/* REVISIT: at HZ=100, this is sloooow */
msleep(1);
} while (time_before(write_time, timeout));
return -ETIMEDOUT;
}
static ssize_t at24_write(struct at24_data *at24, const char *buf, loff_t off,
size_t count)
{
ssize_t retval = 0;
if (unlikely(!count))
return count;
/*
* Write data to chip, protecting against concurrent updates
* from this host, but not from other I2C masters.
*/
mutex_lock(&at24->lock);
while (count) {
ssize_t status;
status = at24_eeprom_write(at24, buf, off, count);
if (status <= 0) {
if (retval == 0)
retval = status;
break;
}
buf += status;
off += status;
count -= status;
retval += status;
}
mutex_unlock(&at24->lock);
return retval;
}
static ssize_t at24_bin_write(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr,
char *buf, loff_t off, size_t count)
{
struct at24_data *at24;
at24 = dev_get_drvdata(container_of(kobj, struct device, kobj));
return at24_write(at24, buf, off, count);
}
/*-------------------------------------------------------------------------*/
/*
* This lets other kernel code access the eeprom data. For example, it
* might hold a board's Ethernet address, or board-specific calibration
* data generated on the manufacturing floor.
*/
static ssize_t at24_macc_read(struct memory_accessor *macc, char *buf,
off_t offset, size_t count)
{
struct at24_data *at24 = container_of(macc, struct at24_data, macc);
return at24_read(at24, buf, offset, count);
}
static ssize_t at24_macc_write(struct memory_accessor *macc, const char *buf,
off_t offset, size_t count)
{
struct at24_data *at24 = container_of(macc, struct at24_data, macc);
return at24_write(at24, buf, offset, count);
}
/*-------------------------------------------------------------------------*/
#ifdef CONFIG_OF
static void at24_get_ofdata(struct i2c_client *client,
struct at24_platform_data *chip)
{
const __be32 *val;
struct device_node *node = client->dev.of_node;
if (node) {
if (of_get_property(node, "read-only", NULL))
chip->flags |= AT24_FLAG_READONLY;
val = of_get_property(node, "pagesize", NULL);
if (val)
chip->page_size = be32_to_cpup(val);
}
}
#else
static void at24_get_ofdata(struct i2c_client *client,
struct at24_platform_data *chip)
{ }
#endif /* CONFIG_OF */
static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id)
{
struct at24_platform_data chip;
bool writable;
int use_smbus = 0;
struct at24_data *at24;
int err;
unsigned i, num_addresses;
kernel_ulong_t magic;
if (client->dev.platform_data) {
chip = *(struct at24_platform_data *)client->dev.platform_data;
} else {
if (!id->driver_data) {
err = -ENODEV;
goto err_out;
}
magic = id->driver_data;
chip.byte_len = BIT(magic & AT24_BITMASK(AT24_SIZE_BYTELEN));
magic >>= AT24_SIZE_BYTELEN;
chip.flags = magic & AT24_BITMASK(AT24_SIZE_FLAGS);
/*
* This is slow, but we can't know all eeproms, so we better
* play safe. Specifying custom eeprom-types via platform_data
* is recommended anyhow.
*/
chip.page_size = 1;
/* update chipdata if OF is present */
at24_get_ofdata(client, &chip);
chip.setup = NULL;
chip.context = NULL;
}
if (!is_power_of_2(chip.byte_len))
dev_warn(&client->dev,
"byte_len looks suspicious (no power of 2)!\n");
if (!chip.page_size) {
dev_err(&client->dev, "page_size must not be 0!\n");
err = -EINVAL;
goto err_out;
}
if (!is_power_of_2(chip.page_size))
dev_warn(&client->dev,
"page_size looks suspicious (no power of 2)!\n");
/* Use I2C operations unless we're stuck with SMBus extensions. */
if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
if (chip.flags & AT24_FLAG_ADDR16) {
err = -EPFNOSUPPORT;
goto err_out;
}
if (i2c_check_functionality(client->adapter,
I2C_FUNC_SMBUS_READ_I2C_BLOCK)) {
use_smbus = I2C_SMBUS_I2C_BLOCK_DATA;
} else if (i2c_check_functionality(client->adapter,
I2C_FUNC_SMBUS_READ_WORD_DATA)) {
use_smbus = I2C_SMBUS_WORD_DATA;
} else if (i2c_check_functionality(client->adapter,
I2C_FUNC_SMBUS_READ_BYTE_DATA)) {
use_smbus = I2C_SMBUS_BYTE_DATA;
} else {
err = -EPFNOSUPPORT;
goto err_out;
}
}
//???????????????
if (chip.flags & AT24_FLAG_TAKE8ADDR)
num_addresses = 8;
else
num_addresses = DIV_ROUND_UP(chip.byte_len, (chip.flags & AT24_FLAG_ADDR16) ? 65536 : 256);
at24 = kzalloc(sizeof(struct at24_data) + num_addresses * sizeof(struct i2c_client *), GFP_KERNEL);
if (!at24) {
err = -ENOMEM;
goto err_out;
}
mutex_init(&at24->lock);
at24->use_smbus = use_smbus;
at24->chip = chip;
at24->num_addresses = num_addresses;
/*
* Export the EEPROM bytes through sysfs, since that's convenient.
* By default, only root should see the data (maybe passwords etc)
*/
sysfs_bin_attr_init(&at24->bin);
at24->bin.attr.name = "eeprom";
at24->bin.attr.mode = chip.flags & AT24_FLAG_IRUGO ? S_IRUGO : S_IRUSR;
at24->bin.read = at24_bin_read;
at24->bin.size = chip.byte_len;
at24->macc.read = at24_macc_read;
writable = !(chip.flags & AT24_FLAG_READONLY);
if (writable) {
if (!use_smbus || i2c_check_functionality(client->adapter,
I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)) {
unsigned write_max = chip.page_size;
at24->macc.write = at24_macc_write;
at24->bin.write = at24_bin_write;
at24->bin.attr.mode |= S_IWUSR;
if (write_max > io_limit)
write_max = io_limit;
if (use_smbus && write_max > I2C_SMBUS_BLOCK_MAX)
write_max = I2C_SMBUS_BLOCK_MAX;
at24->write_max = write_max;
/* buffer (data + address at the beginning) */
at24->writebuf = kmalloc(write_max + 2, GFP_KERNEL);
if (!at24->writebuf) {
err = -ENOMEM;
goto err_struct;
}
} else {
dev_warn(&client->dev,
"cannot write due to controller restrictions.");
}
}
at24->client[0] = client;
/* use dummy devices for multiple-address chips */
for (i = 1; i < num_addresses; i++) {
at24->client[i] = i2c_new_dummy(client->adapter,
client->addr + i);
if (!at24->client[i]) {
dev_err(&client->dev, "address 0x%02x unavailable\n",
client->addr + i);
err = -EADDRINUSE;
goto err_clients;
}
}
err = sysfs_create_bin_file(&client->dev.kobj, &at24->bin);
if (err)
goto err_clients;
i2c_set_clientdata(client, at24);
dev_info(&client->dev, "%zu byte %s EEPROM, %s, %u bytes/write\n", at24->bin.size, client->name,
writable ? "writable" : "read-only", at24->write_max);
if (use_smbus == I2C_SMBUS_WORD_DATA ||
use_smbus == I2C_SMBUS_BYTE_DATA) {
dev_notice(&client->dev, "Falling back to %s reads, "
"performance will suffer\n", use_smbus ==
I2C_SMBUS_WORD_DATA ? "word" : "byte");
}
/* export data to kernel code */
if (chip.setup)
chip.setup(&at24->macc, chip.context);
return 0;
err_clients:
for (i = 1; i < num_addresses; i++)
if (at24->client[i])
i2c_unregister_device(at24->client[i]);
kfree(at24->writebuf);
err_struct:
kfree(at24);
err_out:
dev_dbg(&client->dev, "probe error %d\n", err);
return err;
}
/*-------------------------------------------------------------------------*/
static int __devexit at24_remove(struct i2c_client *client)
{
struct at24_data *at24;
int i;
at24 = i2c_get_clientdata(client);
sysfs_remove_bin_file(&client->dev.kobj, &at24->bin);
for (i = 1; i < at24->num_addresses; i++)
i2c_unregister_device(at24->client[i]);
kfree(at24->writebuf);
kfree(at24);
return 0;
}
/*-------------------------------------------------------------------------*/
static struct i2c_driver at24_driver = {
.driver = {
.name = "at24",
.owner = THIS_MODULE,
},
.probe = at24_probe,
.remove = __devexit_p(at24_remove),
.id_table = at24_ids,
};
static int __init at24_init(void)
{
if (!io_limit) {
pr_err("at24: io_limit must not be 0!\n");
return -EINVAL;
}
io_limit = rounddown_pow_of_two(io_limit);
return i2c_add_driver(&at24_driver);
}
module_init(at24_init);
static void __exit at24_exit(void)
{
i2c_del_driver(&at24_driver);
}
module_exit(at24_exit);
MODULE_DESCRIPTION("Driver for most I2C EEPROMs");
MODULE_AUTHOR("David Brownell and Wolfram Sang");
MODULE_LICENSE("GPL");
These are snippets from board file:
static struct i2c_board_info __initdata am335x_i2c0_boardinfo[] = {
{
/* Baseboard board EEPROM */
I2C_BOARD_INFO("24c256", BASEBOARD_I2C_ADDR),
.platform_data = &am335x_baseboard_eeprom_info,
},
.
.
static struct at24_platform_data am335x_baseboard_eeprom_info = {
.byte_len = (256*1024) / 8,
.page_size = 64,
.flags = AT24_FLAG_ADDR16,
.setup = am335x_evm_setup,
.context = (void *)NULL,
};
static void am335x_evm_setup(struct memory_accessor *mem_acc, void *context)
{
int ret;
char tmp[10];
struct device *mpu_dev;
/* 1st get the MAC address from EEPROM */
ret = mem_acc->read(mem_acc, (char *)&am335x_mac_addr,
EEPROM_MAC_ADDRESS_OFFSET, sizeof(am335x_mac_addr));
.
.
.
How can i read from/write into eeprom content from user space.
Should i use sysfs? What should i do?
EEPROM:
It's part of setting the MAC and serial number, but the only way to know if the EEPROM is working is to read its content.
$ cat /sys/bus/i2c/devices/2-0057/eeprom | hexdump -C

STM32F105, arm-none-eabi-gcc, Contiki: Storing float in struct and printing float in C fails

I have two typedef struct as shown below:
typedef struct{
UInt32 length;
void* data;
UInt16 value;
} my_type;
typedef struct{
UInt8 type;
UInt32 length;
void* value;
} tlv_t;
What I trying next is to allocate memory for an my_type struct, a tlv_t struct that is pointed to from the created my_type object and for a float number, which is pointed to from the tlv_t object.If I'm executing the code without the last line of the code below it is working well. I can store the value and I can access it.But as soon as I try to access it a second time the uploaded code isn't running at all anymore on the STM32F105 Contiki-based board. The odd part is that this is only the case when using floating point numbers. No problems at all with other datatypes like int. Unfortunately, I really need to use float... What am I doing wrong?
Another problem is that printf doesn't support some flags like %f or %ul. Does anybody know how to add support for it on Contiki?
my_type* t = malloc(sizeof(my_type));
t->data = malloc(sizeof(tlv_t));
tlv_t* tv = t->data;
tv->type = 10;
tv->length = sizeof(float);
tv->value = malloc(sizeof(float));
*(float*) tv->value = 212.32;
printf("tv->value: %i\n", (int) *(float*) tv->value);
printf("tv->value: %i\n", (int) *(float*) tv->value); // without this line it is working
EDIT:
I forgot to add these typedefs:
typedef unsigned char UInt8;
typedef unsigned short UInt16;
typedef unsigned long UInt32;
EDIT2:
Here is the complete code:
#include <contiki.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cfs/cfs.h>
#include <cfs/cfs-coffee.h>
#include "arg.h"
/*---------------------------------------------------------------------------*/
PROCESS(main_process, "Contiki CLV build015_1");
AUTOSTART_PROCESSES(&main_process);
/*---------------------------------------------------------------------------*/
PROCESS_THREAD(main_process, ev, data)
{
PROCESS_BEGIN();
my_type* t = malloc(sizeof(my_type));
t->data = malloc(sizeof(tlv_t));
tlv_t* tv = t->data;
tv->type = 10;
tv->length = sizeof(float);
tv->value = malloc(sizeof(float));
*(float*) tv->value = 212.32;
printf("tv->value: %i\n", (int) *(float*) tv->value);
printf("tv->value: %i\n", (int) *(float*) tv->value); // without this line it is working
while (1) {
PROCESS_YIELD();
}
PROCESS_END();
}
EDIT3:
I'm using the latest arm-none-eabi-gcc (version 4_8-2013q4-20131204). Are there any known issues when dealing with structs, floats or memory management?
Try
PROCESS_THREAD(main_process, ev, data)
{
static my_type *t;
static tlv_t *tv;
static float f = 212.32;
PROCESS_BEGIN();
t = (my_type *)malloc(sizeof(my_type));
t->data = malloc(sizeof(tlv_t));
tv = (tlv_t *)t->data;
tv->type = 10;
tv->length = sizeof(float);
tv->value = malloc(sizeof(float));
//*(float *) tv->value = 212.32;
memmove(tv->value, &f, 4);
printf("tv->value: %i\n", (int) *(float*) tv->value);
printf("tv->value: %i\n", (int) *(float*) tv->value); // without this line it is working
printf("t address: %x \n", (unsigned int)t);
while (1) {
PROCESS_YIELD();
}
PROCESS_END();
}
I suggest you fix your code, so you get no compiler warnings anymore (don't turn them off). Add casts as needed.
After I did those fixes your code worked for me, so the code is ugly but ok.
#define UInt32 unsigned int
#define UInt16 unsigned short
#define UInt8 unsigned char
typedef struct{
UInt32 length;
void* data;
UInt16 value;
} my_type;
typedef struct{
UInt8 type;
UInt32 length;
void* value;
} tlv_t;
int _tmain(int argc, _TCHAR* argv[])
{
my_type* t = (my_type*)malloc(sizeof(my_type));
t->data = malloc(sizeof(tlv_t));
tlv_t* tv = (tlv_t*)t->data;
tv->type = 10;
tv->length = sizeof(float);
tv->value = malloc(sizeof(float));
*(float*) tv->value = (float)212.32;
printf("tv->value: %i\n", (int) *(float*) tv->value);
printf("tv->value: %i\n", (int) *(float*) tv->value); // without this line it
getchar();
}
gives
tv->value: 212
tv->value: 212

How to mmap a Linux kernel buffer to user space?

Let's say the buffer is allocated using a page based scheme. One way to implement mmap would be to use remap_pfn_range but LDD3 says this does not work for conventional memory. It appears we can work around this by marking the page(s) reserved using SetPageReserved so that it gets locked in memory. But isn't all kernel memory already non-swappable i.e. already reserved? Why the need to set the reserved bit explicitly?
Does this have something to do with pages allocated from HIGH_MEM?
The simplest way to map a set of pages from the kernel in your mmap method is to use the fault handler to map the pages. Basically you end up with something like:
static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
vma->vm_ops = &my_vm_ops;
return 0;
}
static const struct file_operations my_fops = {
.owner = THIS_MODULE,
.open = nonseekable_open,
.mmap = my_mmap,
.llseek = no_llseek,
};
(where the other file operations are whatever your module needs). Also in my_mmap you do whatever range checking etc. is needed to validate the mmap parameters.
Then the vm_ops look like:
static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
vmf->page = my_page_at_index(vmf->pgoff);
get_page(vmf->page);
return 0;
}
static const struct vm_operations_struct my_vm_ops = {
.fault = my_fault
}
where you just need to figure out for a given vma / vmf passed to your fault function which page to map into userspace. This depends on exactly how your module works. For example, if you did
my_buf = vmalloc_user(MY_BUF_SIZE);
then the page you use would be something like
vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));
But you could easily create an array and allocate a page for each entry, use kmalloc, whatever.
[just noticed that my_fault is a slightly amusing name for a function]
Minimal runnable example and userland test
Kernel module:
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h> /* min */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h> /* copy_from_user, copy_to_user */
#include <linux/slab.h>
static const char *filename = "lkmc_mmap";
enum { BUFFER_SIZE = 4 };
struct mmap_info {
char *data;
};
/* After unmap. */
static void vm_close(struct vm_area_struct *vma)
{
pr_info("vm_close\n");
}
/* First page access. */
static vm_fault_t vm_fault(struct vm_fault *vmf)
{
struct page *page;
struct mmap_info *info;
pr_info("vm_fault\n");
info = (struct mmap_info *)vmf->vma->vm_private_data;
if (info->data) {
page = virt_to_page(info->data);
get_page(page);
vmf->page = page;
}
return 0;
}
/* After mmap. TODO vs mmap, when can this happen at a different time than mmap? */
static void vm_open(struct vm_area_struct *vma)
{
pr_info("vm_open\n");
}
static struct vm_operations_struct vm_ops =
{
.close = vm_close,
.fault = vm_fault,
.open = vm_open,
};
static int mmap(struct file *filp, struct vm_area_struct *vma)
{
pr_info("mmap\n");
vma->vm_ops = &vm_ops;
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_private_data = filp->private_data;
vm_open(vma);
return 0;
}
static int open(struct inode *inode, struct file *filp)
{
struct mmap_info *info;
pr_info("open\n");
info = kmalloc(sizeof(struct mmap_info), GFP_KERNEL);
pr_info("virt_to_phys = 0x%llx\n", (unsigned long long)virt_to_phys((void *)info));
info->data = (char *)get_zeroed_page(GFP_KERNEL);
memcpy(info->data, "asdf", BUFFER_SIZE);
filp->private_data = info;
return 0;
}
static ssize_t read(struct file *filp, char __user *buf, size_t len, loff_t *off)
{
struct mmap_info *info;
ssize_t ret;
pr_info("read\n");
if ((size_t)BUFFER_SIZE <= *off) {
ret = 0;
} else {
info = filp->private_data;
ret = min(len, (size_t)BUFFER_SIZE - (size_t)*off);
if (copy_to_user(buf, info->data + *off, ret)) {
ret = -EFAULT;
} else {
*off += ret;
}
}
return ret;
}
static ssize_t write(struct file *filp, const char __user *buf, size_t len, loff_t *off)
{
struct mmap_info *info;
pr_info("write\n");
info = filp->private_data;
if (copy_from_user(info->data, buf, min(len, (size_t)BUFFER_SIZE))) {
return -EFAULT;
} else {
return len;
}
}
static int release(struct inode *inode, struct file *filp)
{
struct mmap_info *info;
pr_info("release\n");
info = filp->private_data;
free_page((unsigned long)info->data);
kfree(info);
filp->private_data = NULL;
return 0;
}
static const struct file_operations fops = {
.mmap = mmap,
.open = open,
.release = release,
.read = read,
.write = write,
};
static int myinit(void)
{
proc_create(filename, 0, NULL, &fops);
return 0;
}
static void myexit(void)
{
remove_proc_entry(filename, NULL);
}
module_init(myinit)
module_exit(myexit)
MODULE_LICENSE("GPL");
GitHub upstream.
Userland test:
#define _XOPEN_SOURCE 700
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> /* uintmax_t */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h> /* sysconf */
/* Format documented at:
* https://github.com/torvalds/linux/blob/v4.9/Documentation/vm/pagemap.txt
*/
typedef struct {
uint64_t pfn : 54;
unsigned int soft_dirty : 1;
unsigned int file_page : 1;
unsigned int swapped : 1;
unsigned int present : 1;
} PagemapEntry;
/* Parse the pagemap entry for the given virtual address.
*
* #param[out] entry the parsed entry
* #param[in] pagemap_fd file descriptor to an open /proc/pid/pagemap file
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
size_t nread;
ssize_t ret;
uint64_t data;
nread = 0;
while (nread < sizeof(data)) {
ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data),
(vaddr / sysconf(_SC_PAGE_SIZE)) * sizeof(data) + nread);
nread += ret;
if (ret <= 0) {
return 1;
}
}
entry->pfn = data & (((uint64_t)1 << 54) - 1);
entry->soft_dirty = (data >> 54) & 1;
entry->file_page = (data >> 61) & 1;
entry->swapped = (data >> 62) & 1;
entry->present = (data >> 63) & 1;
return 0;
}
/* Convert the given virtual address to physical using /proc/PID/pagemap.
*
* #param[out] paddr physical address
* #param[in] pid process to convert for
* #param[in] vaddr virtual address to get entry for
* #return 0 for success, 1 for failure
*/
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
char pagemap_file[BUFSIZ];
int pagemap_fd;
snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
pagemap_fd = open(pagemap_file, O_RDONLY);
if (pagemap_fd < 0) {
return 1;
}
PagemapEntry entry;
if (pagemap_get_entry(&entry, pagemap_fd, vaddr)) {
return 1;
}
close(pagemap_fd);
*paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
return 0;
}
enum { BUFFER_SIZE = 4 };
int main(int argc, char **argv)
{
int fd;
long page_size;
char *address1, *address2;
char buf[BUFFER_SIZE];
uintptr_t paddr;
if (argc < 2) {
printf("Usage: %s <mmap_file>\n", argv[0]);
return EXIT_FAILURE;
}
page_size = sysconf(_SC_PAGE_SIZE);
printf("open pathname = %s\n", argv[1]);
fd = open(argv[1], O_RDWR | O_SYNC);
if (fd < 0) {
perror("open");
assert(0);
}
printf("fd = %d\n", fd);
/* mmap twice for double fun. */
puts("mmap 1");
address1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (address1 == MAP_FAILED) {
perror("mmap");
assert(0);
}
puts("mmap 2");
address2 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (address2 == MAP_FAILED) {
perror("mmap");
return EXIT_FAILURE;
}
assert(address1 != address2);
/* Read and modify memory. */
puts("access 1");
assert(!strcmp(address1, "asdf"));
/* vm_fault */
puts("access 2");
assert(!strcmp(address2, "asdf"));
/* vm_fault */
strcpy(address1, "qwer");
/* Also modified. So both virtual addresses point to the same physical address. */
assert(!strcmp(address2, "qwer"));
/* Check that the physical addresses are the same.
* They are, but TODO why virt_to_phys on kernel gives a different value? */
assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address1));
printf("paddr1 = 0x%jx\n", (uintmax_t)paddr);
assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address2));
printf("paddr2 = 0x%jx\n", (uintmax_t)paddr);
/* Check that modifications made from userland are also visible from the kernel. */
read(fd, buf, BUFFER_SIZE);
assert(!memcmp(buf, "qwer", BUFFER_SIZE));
/* Modify the data from the kernel, and check that the change is visible from userland. */
write(fd, "zxcv", 4);
assert(!strcmp(address1, "zxcv"));
assert(!strcmp(address2, "zxcv"));
/* Cleanup. */
puts("munmap 1");
if (munmap(address1, page_size)) {
perror("munmap");
assert(0);
}
puts("munmap 2");
if (munmap(address2, page_size)) {
perror("munmap");
assert(0);
}
puts("close");
close(fd);
return EXIT_SUCCESS;
}
GitHub upstream.
Tested on kernel 5.4.3.
Though the pages are reserved via a kernel driver, it is meant to be accessed via user space. As a result, the PTE (page table entries) do not know if the pfn belongs to user space or kernel space (even though they are allocated via kernel driver).
This is why they are marked with SetPageReserved.

Resources