linux kernel - how to get physical address (memory management)? - linux

In linux,
Page Global Directory offset address(cr3 + index) can be calculated using pgd_offset() MACRO.
Page Upper Directory offset address can be calculated using pud_offset() API.
Page Middle Directory offset address can be calculated using pmd_offset() API.
Page Table Entry offset address can be calculated using pte_offset_map() MACRO.
Then, how to get physical address? (yellow line in above picture)
Is there a function or MACRO to calculate physical address?
edit : x86-64 architecture.

The Linux kernel uses a generic four-level paging model, which is suitable not only for 32-bit systems but also for 64-bit systems. The paging unit is part of the MMU (Memory Management Unit), which converts a linear address into a physical address.
I wrote a kernel module for you that simulates the process of converting a virtual address to a physical address. I am assuming you know the principles of a paging system.
/*
 * Print the values of the paging-related macros used by this module to the
 * kernel log, one macro per line.
 */
static void get_pgtable_macro(void)
{
    printk("PAGE_OFFSET = 0x%lx\n", PAGE_OFFSET);

    /* Per-level shift values. */
    printk("PGDIR_SHIFT = %d\n", PGDIR_SHIFT);
    printk("PUD_SHIFT = %d\n", PUD_SHIFT);
    printk("PMD_SHIFT = %d\n", PMD_SHIFT);
    printk("PAGE_SHIFT = %d\n", PAGE_SHIFT);

    /* Per-level entry counts. */
    printk("PTRS_PER_PGD = %d\n", PTRS_PER_PGD);
    printk("PTRS_PER_PUD = %d\n", PTRS_PER_PUD);
    printk("PTRS_PER_PMD = %d\n", PTRS_PER_PMD);
    printk("PTRS_PER_PTE = %d\n", PTRS_PER_PTE);

    printk("PAGE_MASK = 0x%lx\n", PAGE_MASK);
}
static unsigned long vaddr2paddr(unsigned long vaddr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
unsigned long paddr = 0;
unsigned long page_addr = 0;
unsigned long page_offset = 0;
pgd = pgd_offset(current->mm, vaddr);
printk("pgd_val = 0x%lx\n", pgd_val(*pgd));
printk("pgd_index = %lu\n", pgd_index(vaddr));
if (pgd_none(*pgd)) {
printk("not mapped in pgd\n");
return -1;
}
pud = pud_offset(pgd, vaddr);
printk("pud_val = 0x%lx\n", pud_val(*pud));
if (pud_none(*pud)) {
printk("not mapped in pud\n");
return -1;
}
pmd = pmd_offset(pud, vaddr);
printk("pmd_val = 0x%lx\n", pmd_val(*pmd));
printk("pmd_index = %lu\n", pmd_index(vaddr));
if (pmd_none(*pmd)) {
printk("not mapped in pmd\n");
return -1;
}
pte = pte_offset_kernel(pmd, vaddr);
printk("pte_val = 0x%lx\n", pte_val(*pte));
printk("pte_index = %lu\n", pte_index(vaddr));
if (pte_none(*pte)) {
printk("not mapped in pte\n");
return -1;
}
/* Page frame physical address mechanism | offset */
page_addr = pte_val(*pte) & PAGE_MASK;
page_offset = vaddr & ~PAGE_MASK;
paddr = page_addr | page_offset;
printk("page_addr = %lx, page_offset = %lx\n", page_addr, page_offset);
printk("vaddr = %lx, paddr = %lx\n", vaddr, paddr);
return paddr;
}
/*
 * Addresses allocated in v2p_init() and released in v2p_exit(). They are kept
 * at file scope, and separately, because each must be freed by the allocator
 * that produced it.
 */
static unsigned long vmalloc_vaddr;
static unsigned long page_vaddr;

/*
 * Module entry point: print the paging macros, then allocate one vmalloc
 * area and one free page and translate the address of each with
 * vaddr2paddr().
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; on failure
 * nothing stays allocated.
 */
static int __init v2p_init(void)
{
    printk("vaddr to paddr module is running..\n");
    get_pgtable_macro();
    printk("\n");

    vmalloc_vaddr = (unsigned long)vmalloc(1000 * sizeof(char));
    if (vmalloc_vaddr == 0) {
        printk("vmalloc failed..\n");
        return -ENOMEM;
    }
    printk("vmalloc_vaddr=0x%lx\n", vmalloc_vaddr);
    vaddr2paddr(vmalloc_vaddr);
    printk("\n\n");

    page_vaddr = __get_free_page(GFP_KERNEL);
    if (page_vaddr == 0) {
        printk("__get_free_page failed..\n");
        /* Undo the first allocation; v2p_exit() is not run when init fails. */
        vfree((void *)vmalloc_vaddr);
        vmalloc_vaddr = 0;
        return -ENOMEM;
    }
    printk("get_page_vaddr=0x%lx\n", page_vaddr);
    vaddr2paddr(page_vaddr);
    return 0;
}

/*
 * Module exit point: release both allocations made by v2p_init().
 */
static void __exit v2p_exit(void)
{
    printk("vaddr to paddr module is leaving..\n");
    vfree((void *)vmalloc_vaddr);
    free_page(page_vaddr);
}
get_pgtable_macro() prints some of the macros used by the current system's paging mechanism.
Memory is allocated in kernel space with vmalloc(), and vaddr2paddr() is called to translate its virtual address into a physical address.
A page frame is then allocated in kernel space with __get_free_page(), and vaddr2paddr() is called to translate its virtual address into a physical address as well.
The requested memory is released through vfree() and free_page(), respectively.
vaddr2paddr() is executed as follows:
Calculate pgd, the linear address of the page global directory entry, with pgd_offset, passing in the memory descriptor mm and the linear address vaddr. Next, print the page global directory entry pointed to by pgd.
Calculate pud, the linear address of the page upper directory entry, with pud_offset, passing in pgd (the linear address of the page global directory entry) and the linear address vaddr. Then print the page upper directory entry pointed to by pud.
Calculate pmd, the linear address of the page middle directory entry, with pmd_offset, passing in pud (the linear address of the page upper directory entry) and the linear address vaddr. Then print the page middle directory entry pointed to by pmd.
Calculate pte, the linear address of the page table entry, with pte_offset_kernel, passing in pmd (the linear address of the page middle directory entry) and the linear address vaddr. Then print the page table entry pointed to by pte.
pte_val(*pte) fetches the value of the page table entry; AND-ing it with PAGE_MASK gives the physical address of the page being accessed. vaddr & ~PAGE_MASK extracts the offset field of the linear address. OR-ing the two yields the final physical address.
Print the physical address

Related

what does deallocation function in xv6's allocation function

in case 1 and 2, what does the deallocation function do in an allocation function?
case 1: if(mem == 0)
// does this condition mean physical memory has not space?
case 2: if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0)
// does this condtion mean pagetable entry has not allocate in physical memory?
I attached the deallocation function and the allocation function.
reference:
https://github.com/fernandabonetti/xv6/blob/master/vm.c
// Grow user memory from oldsz to newsz, one page at a time.
// Each new page is obtained from kalloc(), zero-filled, and mapped into
// pgdir at the next page-aligned address with PTE_W|PTE_U.
// Returns newsz on success, oldsz if newsz < oldsz (nothing to do), and 0
// if newsz >= KERNBASE or if an allocation or mapping fails. On such a
// failure deallocuvm(pgdir, newsz, oldsz) is called before returning.
int
allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
{
  char *page;
  uint va;

  if(newsz >= KERNBASE)
    return 0;
  if(newsz < oldsz)
    return oldsz;

  for(va = PGROUNDUP(oldsz); va < newsz; va += PGSIZE){
    page = kalloc();
    if(page == 0){
      cprintf("allocuvm out of memory\n");
      deallocuvm(pgdir, newsz, oldsz);
      return 0;
    }
    memset(page, 0, PGSIZE);
    if(mappages(pgdir, (char*)va, PGSIZE, V2P(page), PTE_W|PTE_U) >= 0)
      continue;
    // Mapping failed: the page just allocated was never entered into pgdir,
    // so it is freed here explicitly after the earlier pages are undone.
    cprintf("allocuvm out of memory (2)\n");
    deallocuvm(pgdir, newsz, oldsz);
    kfree(page);
    return 0;
  }
  return newsz;
}
// Shrink user memory from oldsz to newsz by freeing the pages in between.
// For every page-aligned address in [PGROUNDUP(newsz), oldsz) that has a
// present page table entry, the backing page is passed to kfree() and the
// entry is cleared.
// Returns newsz, or oldsz unchanged when newsz >= oldsz.
int
deallocuvm(pde_t *pgdir, uint oldsz, uint newsz)
{
  pte_t *entry;
  uint va, frame;

  if(newsz >= oldsz)
    return oldsz;

  for(va = PGROUNDUP(newsz); va < oldsz; va += PGSIZE){
    entry = walkpgdir(pgdir, (char*)va, 0);
    if(!entry){
      // No page table here: jump to the last page covered by this
      // directory entry, so the loop increment lands on the next one.
      va = PGADDR(PDX(va) + 1, 0, 0) - PGSIZE;
      continue;
    }
    if((*entry & PTE_P) == 0)
      continue;
    frame = PTE_ADDR(*entry);
    if(frame == 0)
      panic("kfree");
    char *kva = P2V(frame);
    kfree(kva);
    *entry = 0;
  }
  return newsz;
}
allocuvm is short for Allocate User Virtual Memory. This function is responsible for increasing the user's virtual memory in a specific page directory.
There are indeed 2 cases where this function can fail:
Case 1: the kalloc function failed. kalloc is short for kernel allocation. This function is responsible for returning the address of a new, currently unused page in RAM. If it returns 0, that means there are currently no unused pages available.
Case 2: the mappages function failed. This function is responsible for making the newly allocated page accessible to the process that uses the given page directory, by mapping that page at the next available virtual address in the page directory.
If this function fails, that means it could not do so, probably due to the page directory being already full.
In both cases, allocuvm did not manage to increase the user's memory to the requested size. Therefore, it undoes all allocations made up to the point of failure, so that the virtual memory remains unchanged, and returns an error itself.

Could I/O memory access be used inside ISR under Linux (ARM)?

I'm creating a driver for communication with an FPGA under Linux. The FPGA is connected via the GPMC interface. When I tested read/write from the driver context, everything worked perfectly. But the problem is that I need to read some address on an interrupt. So I created an interrupt handler, registered it and put the I/O memory read in it (the readw function). But when the interrupt fires, only zeros are read. I tested every part of the driver from top to bottom and it seems like the problem is the I/O memory access inside the ISR. When I replaced the I/O access with a constant value, it was successfully passed to the user-level application.
ARM version: armv7a (Cortex ARM-A8 (DM3730))
Compiler: CodeSourcery 2014.05
Here is some code from driver which represents performed actions:
// Request physical memory region for FPGA address IO
//
// Claims [addr, addr + size) with request_mem_region() under moduleName and
// maps it with ioremap(). A failed claim is logged but does not stop the
// mapping attempt.
// Returns the mapped virtual address, or NULL if ioremap() failed; in that
// case a region claimed by this call is released again before returning.
void* uni_PhysMem_request(const unsigned long addr, const unsigned long size) {
    // Handle to be returned
    void* handle;
    // Whether this call claimed the region, so it knows what to undo on failure
    int requested = 1;

    // Check if memory region successfully requested (mapped to module)
    if (!request_mem_region(addr, size, moduleName)) {
        printk(KERN_ERR "\t\t\t\t%s() failed to request_mem_region(0x%p, %lu)\n", __func__, (void*)addr, size);
        requested = 0;
    }

    // Remap physical memory
    handle = ioremap(addr, size);
    if (!handle) {
        printk(KERN_ERR "\t\t\t\t%s() failed to ioremap(0x%p, %lu)\n", __func__, (void*)addr, size);
        // Do not leave the region claimed when the caller gets no mapping
        if (requested) {
            release_mem_region(addr, size);
        }
    }

    // Return virtual address;
    return handle;
}
// ...
// ISR
// For an interrupt matching irqNumber: logs the level of irqGPIOPin, then
// reads data through uni_ReadBuffer_IRQ() and logs the outcome.
// Always returns IRQ_HANDLED.
static irqreturn_t uni_IRQ_handler(int irq, void *dev_id) {
    // Signed, matching uni_ReadBuffer_IRQ()'s ssize_t return, so that the
    // "< 0" error check below can actually be true
    ssize_t readed = 0;

    if (irq == irqNumber) {
        printk(KERN_DEBUG "\t\t\t\tIRQ handling...\n");
        printk(KERN_DEBUG "\t\t\t\tGPIO %d pin is %s\n", irqGPIOPin, ((gpio_get_value(irqGPIOPin) == 0) ? "LOW" : "HIGH"));
        // gUniAddr is a struct which holds GPMC remapped virtual address (from uni_PhysMem_request), offset and read size
        readed = uni_ReadBuffer_IRQ(gUniAddr.gpmc.addr, gUniAddr.gpmc.offset, gUniAddr.size);
        if (readed < 0) {
            printk(KERN_ERR "\t\t\t\tunable to read data\n");
        }
        else {
            printk(KERN_INFO "\t\t\t\tdata readed success (%zd bytes)\n", readed);
        }
    }
    return IRQ_HANDLED;
}
// ...
// Read buffer by IRQ
// Performs one 16-bit readw() per two bytes of buffSize, always from the same
// location (physAddr + physOffset), and hands each value to uni_RB_write().
// Returns the sum of the uni_RB_write() return values.
ssize_t uni_ReadBuffer_IRQ(void* physAddr, unsigned long physOffset, size_t buffSize) {
    void* src = physAddr + physOffset;
    size_t total = 0;
    size_t pos = 0;

    while (pos < buffSize) {
        // The value read is sent to the ring buffer. (As reported: when
        // "readw" is replaced with any constant, everything is OK.)
        total += uni_RB_write(readw(src));
        pos += 2;
    }
    return total;
}
Looks like the problem was in code optimizations. I changed uni_RB_write function to pass physical address and data size, also read now performed via ioread16_rep function. So now everything works just fine.

Is there any way to know the physical address of attached shared memory?

I want to know the "physical" address of a newly attached shared memory on Linux kernel.
As far as I know, do_shmat() returns the "virtual" address of shared memory.
So I tried to translate the return value of do_shmat() using the page tables, by modifying shmat in the kernel as shown below.
/*
 * shmat system call, extended with a debug print of the page table entry
 * that backs the newly attached address.
 *
 * Returns the attach address from do_shmat(), or its error code.
 *
 * NOTE(review): the walk below takes no locks and assumes pgd/pud/pmd/pte
 * levels; confirm both against the target kernel. A freshly attached segment
 * is presumably populated lazily on first access, so finding no entry right
 * after the attach is not necessarily an error.
 */
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
    unsigned long ret;
    long err;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
    if (err)
        return err;
    force_successful_syscall_return();

    /* Every level must be checked before it is used to reach the next one. */
    pgd = pgd_offset(current->mm, ret);
    if (pgd_none(*pgd) || pgd_bad(*pgd))
        goto not_mapped;
    pud = pud_offset(pgd, ret);
    if (pud_none(*pud) || pud_bad(*pud))
        goto not_mapped;
    pmd = pmd_offset(pud, ret);
    if (pmd_none(*pmd) || pmd_bad(*pmd))
        goto not_mapped;
    pte = pte_offset_map(pmd, ret);
    if (!pte)
        goto not_mapped;
    printk("*pte = 0x%lx\n", (unsigned long)pte_val(*pte));
    pte_unmap(pte);
    return (long)ret;

not_mapped:
    printk("no page table entry for 0x%lx\n", ret);
    return (long)ret;
}
But pte points to an address which has 0 so I cannot actually get the physical address.
Why can't I get the right pte in my code?
Try this:
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/cma.h>
#include <linux/dma-contiguous.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/highmem.h>
/***************************************************************************************
* phys_addr_t getPhysicalPageAddress(unsigned long va)
*
* Description
* Virtual to Physical address translation method.
* Performs a page walk to translate the given virtual address
* to its physical page address.
*
***************************************************************************************/
phys_addr_t getPhysicalPageAddress(unsigned long va)
{
phys_addr_t pa;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep , pte;
struct page *pagina;
struct mm_struct * mm;
int pfn;
pa = 0;
mm = current->mm;
// Variable initialization
pagina = NULL;
pgd = NULL;
pmd = NULL;
ptep = NULL;
// Using Page Tables (this mechanism is known as "Page Walk"), we find the page that corresponds to Virtual Address
pgd = pgd_offset(mm, va);
if (!pgd_none(*pgd) || !pgd_bad(*pgd))
{
pud = pud_offset(pgd , va);
if (!pud_none(*pud) || !pud_bad(*pud))
{
pmd = pmd_offset(pud, va);
if (!pmd_none(*pmd) || !pmd_bad(*pmd))
{
ptep = pte_offset_map(pmd, va);
if (ptep)
{
pte = *ptep;
pte_unmap(ptep);
pagina = pte_page(pte);
// The page has been found
// Seek Page Frame Number for this page
pfn = page_to_pfn(pagina);
// Seek Physical Address for this page, using "page_to_phys()" macro
pa = page_to_phys(pagina);
} else printk(KERN_ERR, "Page Walk exception at pte entry. The Virtual Address 0x%lx cannot be translated for this process", va );
} else printk(KERN_ERR, "Page Walk exception at pmd entry. The Virtual Address 0x%lx cannot be translated for this process", va );
} else printk(KERN_ERR, "Page Walk exception at pud entry. The Virtual Address 0x%lx cannot be translated for this process", va );
} else printk(KERN_ERR, "Page Walk exception at pgd entry. The Virtual Address 0x%lx cannot be translated for this process", va );
return pa;
}

Arm64 Linux Page Table Walk

Currently I'm developing some research-related programs and I need to find the pte of some specific addresses. My development environment is Juno r1 board (CPUs are A53 and A57 ) and it's running arm64 Linux kernel.
I use some typical page table walk codes like this:
/*
 * Walk the page tables of current->mm for @addr, logging the table pointer
 * and the entry value at each level.
 *
 * Returns 1 when the pte level is reached, -1/-2/-3 when the pgd/pud/pmd
 * entry is empty or bad, and -4 when no pte pointer is obtained.
 *
 * NOTE(review): for a kernel-space address the walk presumably has to start
 * from the kernel's own tables rather than current->mm - confirm.
 */
int find_physical_pte(void *addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *ptep;
    unsigned long long address;

    address = (unsigned long long)addr;

    /* Entries are printed through the *_val() accessors: the entry types are
     * structs and cannot be passed to a %llx conversion directly. */
    pgd = pgd_offset(current->mm, address);
    printk(KERN_INFO "\npgd is: %p\n", (void *)pgd);
    printk(KERN_INFO "pgd value: %llx\n", (unsigned long long)pgd_val(*pgd));
    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return -1;

    pud = pud_offset(pgd, address);
    printk(KERN_INFO "\npud is: %p\n", (void *)pud);
    printk(KERN_INFO "pud value: %llx\n", (unsigned long long)pud_val(*pud));
    if (pud_none(*pud) || pud_bad(*pud))
        return -2;

    pmd = pmd_offset(pud, address);
    printk(KERN_INFO "\npmd is: %p\n", (void *)pmd);
    printk(KERN_INFO "pmd value: %llx\n", (unsigned long long)pmd_val(*pmd));
    if (pmd_none(*pmd) || pmd_bad(*pmd))
        return -3;

    ptep = pte_offset_kernel(pmd, address);
    printk(KERN_INFO "\npte is: %p\n", (void *)ptep);
    /* Check the pointer before reading through it. */
    if (!ptep)
        return -4;
    printk(KERN_INFO "pte value: %llx\n", (unsigned long long)pte_val(*ptep));
    return 1;
}
However, when the program checks the pte for the address(0xffffffc0008b2000), it always returns an empty pmd.
My guess is that I got the wrong pgd in the first step. I saw Tims Notes said that using current->mm only could get the pgd of TTBR0 (user space pgd) while the address I checked is a kernel space address so I should try to get the pgd of TTBR1.
So my question is: If I want to get the pte of a kernel space address, can I use current->mm to get the pgd?
If I can't, is there anything else I could try instead?
Any suggestion is welcome! Thank you.
Simon
I finally solved the problem.
Actually, my code is correct. The only part I missed is a page table entry check.
According to the page table design of ARMv8, ARM uses 4 levels page table for 4kb granule case. Each level (level 0-3 defined in the link) is implemented as pgd, pud, pmd, and ptep in Linux code.
In the ARM architecture, each level can be either block entry or the table entry (see the AArch64 Descriptor Format Section in the link).
If the memory address belongs to a 4kb table entry, then it needs to be traced down till level 3 entry (ptep). However, for the address belongs to a larger chunk, the corresponding table entry may save in the pgd, pud, or pmd level.
By checking the last 2 bits of the entry at each level, you know whether it is a block entry or a table entry, and you only keep tracing down for a table entry.
Here is how to improve my code above:
Retrieving the descriptor based on the page table pointer desc = *pgd and then checking the last 2 bits of the descriptor.
If the descriptor is a table entry (0b11), then you need to extract the lower-level entry, as my code above shows.
If you get a block entry (0b01) at any level, then you can stop there and translate the VA to the PA based on the descriptor desc you just got.
/*
 * Walk the page tables of current->mm for @addr, logging the table pointer
 * and the entry value at each level, and stopping early at a block entry.
 *
 * Returns 1 when a block entry is found at the pud or pmd level or when the
 * pte level is reached, -1/-2/-3 when the pgd/pud/pmd entry is empty or bad,
 * and -4 when no pte pointer is obtained.
 *
 * NOTE(review): pud_sect()/pmd_sect() are the arm64 block-entry tests; other
 * architectures and newer kernels name them differently - confirm. For a
 * kernel-space address the walk presumably has to start from the kernel's
 * own tables rather than current->mm - confirm.
 */
int find_physical_pte(void *addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *ptep;
    unsigned long long address;

    address = (unsigned long long)addr;

    /* Entries are printed through the *_val() accessors: the entry types are
     * structs and cannot be passed to a %llx conversion directly. */
    pgd = pgd_offset(current->mm, address);
    printk(KERN_INFO "\npgd is: %p\n", (void *)pgd);
    printk(KERN_INFO "pgd value: %llx\n", (unsigned long long)pgd_val(*pgd));
    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return -1;

    pud = pud_offset(pgd, address);
    printk(KERN_INFO "\npud is: %p\n", (void *)pud);
    printk(KERN_INFO "pud value: %llx\n", (unsigned long long)pud_val(*pud));
    if (pud_none(*pud))
        return -2;
    /* Block entry: the translation ends here, there is no lower table.
     * Tested before pud_bad(), which would otherwise reject it. */
    if (pud_sect(*pud))
        return 1;
    if (pud_bad(*pud))
        return -2;

    pmd = pmd_offset(pud, address);
    printk(KERN_INFO "\npmd is: %p\n", (void *)pmd);
    printk(KERN_INFO "pmd value: %llx\n", (unsigned long long)pmd_val(*pmd));
    if (pmd_none(*pmd))
        return -3;
    /* Block entry at this level: stop, as above. */
    if (pmd_sect(*pmd))
        return 1;
    if (pmd_bad(*pmd))
        return -3;

    ptep = pte_offset_kernel(pmd, address);
    printk(KERN_INFO "\npte is: %p\n", (void *)ptep);
    /* Check the pointer before reading through it. */
    if (!ptep)
        return -4;
    printk(KERN_INFO "pte value: %llx\n", (unsigned long long)pte_val(*ptep));
    return 1;
}
I think the problem you are having is that you are passing the struct mm_struct * pointer of the current process, but the address you are passing is from the kernel virtual address space. You need to pass the kernel's own mm (&init_mm) instead:
pgd = pgd_offset(&init_mm, address);
I think the rest should be fine, but I haven't tested it. You can also look at how it is done in the kernel in the file arch/arm64/mm/dump.c

How to get the physical address from the logical one in a Linux kernel module?

Is there any suitable way to get the physical address by the logical one except to walk through page directory entries by hand? I've looked for this functionality in kernel's sources and found that there is a follow_page function that do it well with built-in huge and transparent-huge pages support. But it's not exported to kernel modules (why???)...
So, I don't want to invent the wheel and I think that it's not very good to reimplement the follow_page functionality by hand.
Well, it might looks as something like that (follow PTE from an virtual address):
/*
 * Walk the page tables of @mm for @address and store the raw value of the
 * entry that maps it in @entry->pte, logging each level on the way.
 *
 * @entry->pte is left at 0 when nothing is mapped or when a level is bad.
 * For huge mappings the value of the pud or pmd entry itself is stored.
 *
 * NOTE(review): no locks are taken here; the caller presumably has to keep
 * @mm's mappings stable during the walk - confirm.
 */
void follow_pte(struct mm_struct * mm, unsigned long address, pte_t * entry)
{
    pgd_t *pgd = pgd_offset(mm, address);
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    struct vm_area_struct *vma;
    int in_hugetlb_vma;

    printk("follow_pte() for %lx\n", address);
    entry->pte = 0;
    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return;

    pud = pud_offset(pgd, address);
    /*
     * find_vma() returns NULL when no VMA ends above the address, and may
     * return a VMA that starts above it; only a VMA that really contains
     * the address may be consulted for its flags.
     */
    vma = find_vma(mm, address);
    in_hugetlb_vma = vma && address >= vma->vm_start &&
                     (vma->vm_flags & VM_HUGETLB);
    printk(" pgd = %lx\n", pgd_val(*pgd));
    if (pud_none(*pud)) {
        printk(" pud = empty\n");
        return;
    }
    if (pud_huge(*pud) && in_hugetlb_vma) {
        entry->pte = pud_val(*pud);
        printk(" pud = huge\n");
        return;
    }
    if (pud_bad(*pud))
        return;

    pmd = pmd_offset(pud, address);
    printk(" pud = %lx\n", pud_val(*pud));
    if (pmd_none(*pmd)) {
        printk(" pmd = empty\n");
        return;
    }
    if (pmd_huge(*pmd) && in_hugetlb_vma) {
        entry->pte = pmd_val(*pmd);
        printk(" pmd = huge\n");
        return;
    }
    if (pmd_trans_huge(*pmd)) {
        entry->pte = pmd_val(*pmd);
        printk(" pmd = trans_huge\n");
        return;
    }
    if (pmd_bad(*pmd))
        return;

    pte = pte_offset_map(pmd, address);
    printk(" pmd = %lx\n", pmd_val(*pmd));
    /* Some kernel versions can fail to map the page table; do not read through NULL. */
    if (!pte)
        return;
    if (!pte_none(*pte)) {
        entry->pte = pte_val(*pte);
        printk(" pte = %lx\n", pte_val(*pte));
    } else {
        printk(" pte = empty\n");
    }
    pte_unmap(pte);
}
I think you can achieve virtual->physical translation indirectly by combining /proc/[pid]/maps (which gives the virtual mappings of a process) and /proc/[pid]/pagemap (which gives the virtual-page-to-physical-page mapping for every addressable page). First, find the mapping of your process's virtual addresses in maps (this is done so that you don't have to search every entry in pagemap). Then look up the physical mapping of the desired virtual address in pagemap (pagemap is not in text format; here is a detailed explanation of the format: Pagemap).
This should give you the exact virtual-->physical mapping
It sounds like you're looking for virt_to_phys.

Resources