I have an assembly source file, named: helloworld.s:
// Minimal AArch64 Linux program: one write-style syscall followed by exit.
.global _start
_start: mov X0, #1 // first syscall argument (file descriptor 1)
ldr X1, =helloworld // address of the data, loaded from a literal pool
mov X2, #13 // byte count passed to the syscall (13; the first string below is 21 bytes long)
mov X8, #64 // syscall number 64 (write on AArch64 Linux)
svc 0
mov X0, #0 // exit status 0
mov X8, #93 // syscall number 93 (exit on AArch64 Linux)
svc 0
.data
// .ascii emits the bytes with no terminating NUL, so the two strings
// below end up back-to-back in .data with nothing separating them.
helloworld: .ascii "Hello nice warm World"
.ascii "Hello nice warm World2"
I created an executable file:
/usr/aarch64-linux-gnu/bin/as -o helloworld.o helloworld.s
/usr/aarch64-linux-gnu/bin/ld -o helloworld helloworld.o
and then, created an objdump output of the executable file:
/usr/aarch64-linux-gnu/bin/objdump -s -D helloworld > objdum_output_helloworld.txt
This gives:
helloworld: file format elf64-littleaarch64
Contents of section .text:
...
4000b0 200080d2 e1000058 a20180d2 080880d2 ......X........
4000c0 010000d4 000080d2 a80b80d2 010000d4 ................
4000d0 d8004100 00000000 ..A.....
...
Contents of section .data:
...
4100d8 48656c6c 6f206e69 63652077 61726d20 Hello nice warm
4100e8 576f726c 6448656c 6c6f206e 69636520 WorldHello nice
4100f8 7761726d 20576f72 6c6432 warm World2
...
Disassembly of section .text:
00000000004000b0 <_start>:
...
4000b0: d2800020 mov x0, #0x1 // #1
4000b4: 580000e1 ldr x1, 4000d0 <_start+0x20>
4000b8: d28001a2 mov x2, #0xd // #13
4000bc: d2800808 mov x8, #0x40 // #64
4000c0: d4000001 svc #0x0
4000c4: d2800000 mov x0, #0x0 // #0
4000c8: d2800ba8 mov x8, #0x5d // #93
4000cc: d4000001 svc #0x0
4000d0: 004100d8 .inst 0x004100d8 ; undefined
4000d4: 00000000 .inst 0x00000000 ; undefined
...
Disassembly of section .data:
00000000004100d8 <helloworld>:
...
4100d8: 6c6c6548 ldnp d8, d25, [x10, #-320]
4100dc: 696e206f ldpsw x15, x8, [x3, #-144]
4100e0: 77206563 .inst 0x77206563 ; undefined
4100e4: 206d7261 .inst 0x206d7261 ; undefined
4100e8: 6c726f57 ldnp d23, d27, [x26, #-224]
4100ec: 6c654864 ldnp d4, d18, [x3, #-432]
4100f0: 6e206f6c umin v12.16b, v27.16b, v0.16b
4100f4: 20656369 .inst 0x20656369 ; undefined
4100f8: 6d726177 ldp d23, d24, [x11, #-224]
4100fc: 726f5720 .inst 0x726f5720 ; undefined
410100: Address 0x0000000000410100 is out of bounds.
The question:
How can I tell, from the objdump output alone, that there are two separate strings:
"Hello nice warm World"
and
"Hello nice warm World2" ?
Thanks
When we look at the assembler for an x86 CPU, a syscall looks like:
0F 05 syscall ; LINUX - sys_nanosleep
48 3D 01 F0 FF FF cmp rax, 0FFFFFFFFFFFFF001h
For an ARM CPU, what is the convention: what does a syscall look like in assembler?
The source code for the musl libc library may help: all supported architectures have a small header file implementing the 'syscalls'.
x86_64:
/*
 * Issue a Linux x86_64 system call that takes no arguments.
 *
 * n is the syscall number; it is placed in rax, and the value the kernel
 * leaves in rax is returned. rcx and r11 are listed as clobbered, and the
 * "memory" clobber keeps the compiler from caching memory across the call.
 */
static __inline long __syscall0(long n)
{
	unsigned long result;

	__asm__ __volatile__ (
		"syscall"
		: "=a"(result)
		: "a"(n)
		: "rcx", "r11", "memory"
	);

	return result;
}
Arm:
/* 32-bit Arm: two variants of the syscall macro, selected by __thumb__. */
#ifdef __thumb__
/* Avoid use of r7 in asm constraints when producing thumb code,
* since it's reserved as frame pointer and might not be supported. */
/* In this variant __ASM____R7__ expands to nothing, so no variable is pinned to r7. */
#define __ASM____R7__
/* Thumb variant: copy r7 into a scratch output register (%1), move the first
 * input operand (%2, presumably the syscall number) into r7, issue svc 0,
 * then restore r7 from the scratch register. Returns r0 from the enclosing
 * function. */
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "mov %1,r7 ; mov r7,%2 ; svc 0 ; mov r7,%1" \
: "=r"(r0), "=&r"((int){0}) : __VA_ARGS__ : "memory"); \
return r0; \
} while (0)
#else
/* Non-thumb variant: __ASM____R7__ expands to a register binding to r7. */
#define __ASM____R7__ __asm__("r7")
/* Issue svc 0 with the caller-supplied input operands and return r0 from
 * the enclosing function. */
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "svc 0" \
: "=r"(r0) : __VA_ARGS__ : "memory"); \
return r0; \
} while (0)
#endif
Aarch64:
/* AArch64: issue svc 0 with the caller-supplied input operands, then return
 * x0 from the enclosing function. Declares memory and condition flags as
 * clobbered. Expects a variable named x0 to be in scope at the expansion site. */
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "svc 0" \
: "=r"(x0) : __VA_ARGS__ : "memory", "cc"); \
return x0; \
} while (0)
Example for generated code:
/* syscall.c */
/* Issue svc 0 with the given input operands and return x0 from the enclosing
 * function; a variable named x0 must be in scope where the macro is used. */
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "svc 0" \
: "=r"(x0) : __VA_ARGS__ : "memory", "cc"); \
return x0; \
} while (0)
/* Zero-argument syscall wrapper: n is the syscall number; the value left in
 * x0 after svc 0 is returned (the return statement comes from __asm_syscall). */
static inline long __syscall0(long n)
{
register long x8 __asm__("x8") = n; /* syscall number, pinned to register x8 */
register long x0 __asm__("x0"); /* result variable, pinned to register x0 */
__asm_syscall("r"(x8)); /* expands to the svc 0 asm followed by 'return x0' */
}
/* Driver used only to make the compiler emit a call to __syscall0; the
 * result is discarded. */
void test(void) {
__syscall0(1); /* syscall number 1 */
}
/opt/arm/9/gcc-arm-9.2-2019.12-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-objdump -D syscall.o
/opt/arm/9/gcc-arm-9.2-2019.12-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc -c -o syscall.o syscall.c
Disassembly of section .text:
0000000000000000 <__syscall0>:
0: d10043ff sub sp, sp, #0x10
4: f90007e0 str x0, [sp, #8]
8: f94007e8 ldr x8, [sp, #8]
c: d4000001 svc #0x0
10: 910043ff add sp, sp, #0x10
14: d65f03c0 ret
0000000000000018 <test>:
18: a9bf7bfd stp x29, x30, [sp, #-16]!
1c: 910003fd mov x29, sp
20: d2800020 mov x0, #0x1 // #1
24: 97fffff7 bl 0 <__syscall0>
28: d503201f nop
2c: a8c17bfd ldp x29, x30, [sp], #16
30: d65f03c0 ret
That being said, the Arm documentation cannot really be called garbage, even though you found it difficult to find the exact information you were looking for: the Exploration Tools section of their web site is excellent IMHO.
You can find the pseudo-code for the SVC instructions and their exact encodings here and here, and you even could simulate the instructions: Alastair Reid wrote a couple of fascinating articles here regarding ISA formal specifications.
I am working on a Linux kernel module (KVM and KVM_intel have already been unloaded) to test Intel's VMX function.
Now I am wondering about the prerequisites for entering VMX root mode.
My kernel module uses the Linux file system interface to provide a device interface that lets a user-space program perform VMX operations.
Here is the code for reference (it was from https://www.cs.usfca.edu/~cruse/cs686s07/nmiexits.c, and changed to work with my Linux-2.6.32, and other needed files were also from that link)
//-------------------------------------------------------------------
// nmiexits.c (A modification of our 'linuxvmm.c' module)
//
// This Linux kernel module implements a device-driver (named
// '/dev/vmm') which lets an application program execute some
// real-mode code in Virtual-8086 mode within a guest virtual
// machine, assuming the cpu supports Intel VMX instructions.
//
// This modification sets the pin-based VM Execution Controls
// so that control passes to our Virtual Machine Manager when
// any external interrupt (or non-maskable interrupt) occurs.
// These asynchronous events are then serviced by Linux ISRs,
// and our guest VM is resumed. For the case of non-maskable
// interrupts, the host executes the 'int $0x02' instruction;
// for the case of external interrupts, the appropriate Linux
// interrupt service routine automatically gets executed when
// the host executes 'sti', which allows the CPU to recognize
// the still-pending external interrupt-request.
//
// compile using: $ mmake nmiexits
// install using: $ /sbin/insmod nmiexits.ko
//
// NOTE: Written and tested using Linux x86_64 kernel 2.6.17.
//
// programmer: ALLAN CRUSE
// date begun: 29 APR 2007
// completion: 03 MAY 2007 -- our initial driver-prototype
// revised on: 14 MAY 2007 -- sets 'interrupt-exiting' control
// revised on: 24 MAY 2007 -- sets the 'NMI-exiting' control
// revised on: 21 JUL 2008 -- for Linux kernel version 2.6.26.
//-------------------------------------------------------------------
#include <linux/kernel.h>
#include <linux/module.h> // for init_module()
#include <linux/proc_fs.h> // for create_proc_read_entry()
#include <linux/fs.h> // for struct file_operations
#include <asm/io.h> // for virt_to_phys()
#include <asm/uaccess.h> // for copy_from_user()
#include <linux/slab.h> // for init_module()
#include <linux/mm.h> // for remap_pfn_range()
#include <linux/seq_file.h>
#include "machine.h" // for our VMCS fields
#include "myvmx.h" // for 'regs_ia32'
#define N_ARENAS 11 // number of 64KB memory allocations
#define ARENA_LENGTH (64<<10) // size of each allocated memory-arena
#define IA32_VMX_BASIC 0x0480
#define IA32_VMX_PINBASED_CTLS 0x0481
#define IA32_VMX_PROCBASED_CTLS 0x0482
#define IA32_VMX_EXIT_CTLS 0x0483
#define IA32_VMX_ENTRY_CTLS 0x0484
#define IA32_VMX_MISC 0x0485
#define IA32_VMX_CR0_FIXED0 0x0486
#define IA32_VMX_CR0_FIXED1 0x0487
#define IA32_VMX_CR4_FIXED0 0x0488
#define IA32_VMX_CR4_FIXED1 0x0489
#define IA32_VMX_VMCS_ENUM 0x048A
#define IA32_VMX_PROCBASED_CTLS2 0x048B
#define IA32_VMX_EPT_VPID_CAP 0x048C
#define IA32_VMX_TRUE_PINBASED_CTLS 0x048D
#define IA32_VMX_TRUE_PROCBASED_CTLS 0x048E
#define IA32_VMX_TRUE_EXIT_CTLS 0x048F
#define IA32_VMX_TRUE_ENTRY_CTLS 0x0490
#define NUM_VMX_MSR (IA32_VMX_TRUE_ENTRY_CTLS - IA32_VMX_BASIC + 1)
#define LEGACY_REACH 0x110000 // end of 'real-addressible' memory
#define PAGE_DIR_OFFSET 0x2000
#define PAGE_TBL_OFFSET 0x3000
#define IDT_KERN_OFFSET 0x4000
#define GDT_KERN_OFFSET 0x4800
#define LDT_KERN_OFFSET 0x4A00
#define TSS_KERN_OFFSET 0x4C00
#define TOS_KERN_OFFSET 0x8000
#define ISR_KERN_OFFSET 0x8000
#define __SELECTOR_TASK 0x0008
#define __SELECTOR_LDTR 0x0010
#define __SELECTOR_CODE 0x0004
#define __SELECTOR_DATA 0x000C
#define __SELECTOR_VRAM 0x0014
#define __SELECTOR_FLAT 0x001C
char modname[] = "nmiexits";
int my_major = 88;
char cpu_oem[16];
int cpu_features;
void *kmem[ N_ARENAS ];
unsigned long msr0x480[ NUM_VMX_MSR ];
unsigned long cr0, cr4;
unsigned long msr_efer;
unsigned long vmxon_region;
unsigned long guest_region;
unsigned long pgdir_region;
unsigned long pgtbl_region;
unsigned long g_IDT_region;
unsigned long g_GDT_region;
unsigned long g_LDT_region;
unsigned long g_TSS_region;
unsigned long g_TOS_region;
unsigned long g_ISR_region;
//============================================================
long my_ioctl( struct file *, unsigned int, unsigned long );
int my_release ( struct inode *inode, struct file *file );
// my_mmap: maps the driver's memory arenas into the client's address space.
//
// The client must request a mapping of exactly LEGACY_REACH bytes starting
// at user virtual address zero; anything else is rejected with -EINVAL.
// The region is filled in ARENA_LENGTH-sized pieces: pieces 0x0..0x9 are
// backed by kmem[0..9], pieces 0xA..0xF are mapped to the physical address
// equal to their user virtual address, and the final piece (index 16) wraps
// around to kmem[0] again.  Returns 0 on success, -EAGAIN if a remap fails.
int my_mmap( struct file *file, struct vm_area_struct *vma )
{
	unsigned long	map_addr = vma->vm_start;
	unsigned long	map_len = vma->vm_end - vma->vm_start;
	int		piece;

	// we require prescribed parameter-values from our client
	if ( map_addr != 0x00000000L || map_len != LEGACY_REACH )
		return -EINVAL;

	// (the original author left the VM_RESERVED flag-setting disabled here)

	// ask the kernel to add page-table entries to 'map' these arenas
	for (piece = 0; piece < N_ARENAS+6; piece++, map_addr += ARENA_LENGTH)
	{
		int		slot = piece % 16;
		unsigned long	phys = ( slot < 0xA )
					? virt_to_phys( kmem[ slot ] )
					: map_addr;

		if ( remap_pfn_range( vma, map_addr, phys >> PAGE_SHIFT,
				ARENA_LENGTH, vma->vm_page_prot ) )
			return -EAGAIN;
	}

	// copy page-frame 0x000 to bottom of arena 0x0 (for IVT and BDA)
	memcpy( kmem[0], phys_to_virt( 0x00000 ), PAGE_SIZE );

	// copy page-frames 0x90 to 0x9F to arena 0x9 (for EBDA)
	memcpy( kmem[9], phys_to_virt( 0x90000 ), ARENA_LENGTH );

	return 0; // SUCCESS
}
// File-operations table registered for this driver's character device.
// All four members use C99 designated initializers (the obsolete GNU
// 'field:' form previously used for two of them has been replaced so the
// table is consistent).
struct file_operations
my_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= my_ioctl,
	.mmap		= my_mmap,
	.release	= my_release,
};
// Sets bit 13 (VMXE) in control register CR4 on the CPU that executes it.
// The 'dummy' argument is unused; the signature matches what
// smp_call_function() expects, so it can be run on the other CPUs too.
// RAX is used as scratch and is declared as clobbered.
void set_CR4_vmxe( void *dummy )
{
asm( " mov %%cr4, %%rax \n"\
" bts $13, %%rax \n"\
" mov %%rax, %%cr4 " ::: "ax" );
}
// Clears bit 13 (VMXE) in control register CR4 on the CPU that executes it.
// The 'dummy' argument is unused; the signature matches what
// smp_call_function() expects, so it can be run on the other CPUs too.
// RAX is used as scratch and is declared as clobbered.
void clear_CR4_vmxe( void *dummy )
{
asm( " mov %%cr4, %%rax \n"\
" btr $13, %%rax \n"\
" mov %%rax, %%cr4 " ::: "ax" );
}
// Reads the model-specific register whose index is 'ecx' and returns its
// 64-bit value; RDMSR delivers the upper half in EDX and the lower in EAX.
static inline u64
vmx_rdmsr (u32 ecx)
{
	u32 hi, lo;

	asm volatile ("rdmsr" : "=d" (hi), "=a" (lo) : "c" (ecx));

	return ((u64) lo) | (((u64) hi) << 32);
}
// Writes the 64-bit value 'val' to the model-specific register whose index
// is 'ecx'; WRMSR takes the upper half in EDX and the lower half in EAX.
static inline void
vmx_wrmsr (u32 ecx, u64 val)
{
	u32 lo = (u32) val;
	u32 hi = (u32) (val >> 32);

	asm volatile ("wrmsr" : : "d" (hi), "a" (lo), "c" (ecx));
}
// init_module: module entry point.
//
// Checks that the CPU reports VMX support, snapshots the VMX capability
// MSRs and a few system registers into globals, allocates the N_ARENAS
// memory arenas, derives the physical addresses of the VMX/guest data
// structures inside arena 10, and registers the character device.
//
// Returns 0 on success, -ENODEV if VMX is not reported, -ENOMEM if an arena
// cannot be allocated, or the (negative) error from register_chrdev().
// On every failure path all arenas allocated so far are released.
int init_module( void )
{
	int	i, j, rc;

	// confirm installation and show device-major number
	printk( "<1>\nInstalling \'%s\' module ", modname );
	printk( "(major=%d) \n", my_major );

	// verify processor supports Intel Virtualization Technology
	asm(	" xor %%eax, %%eax \n"\
		" cpuid \n"\
		" mov %%ebx, cpu_oem+0 \n"\
		" mov %%edx, cpu_oem+4 \n"\
		" mov %%ecx, cpu_oem+8 \n"\
		::: "ax", "bx", "cx", "dx" );
	printk( " processor is \'%s\' \n", cpu_oem );

	// cpu_features stays zero unless the vendor string matches, so a
	// non-Intel processor falls into the 'unsupported' branch below
	if ( strncmp( cpu_oem, "GenuineIntel", 12 ) == 0 )
		asm(	" mov $1, %%eax \n"\
			" cpuid \n"\
			" mov %%ecx, cpu_features \n"\
			::: "ax", "bx", "cx", "dx" );
	if ( ( cpu_features & (1<<5) ) == 0 )
	{
		printk( " Virtualization Technology is unsupported \n" );
		return -ENODEV;
	}
	else printk( " Virtualization Technology is supported \n" );

	// read contents of the VMX-Capability Model-Specific Registers
	// (17 consecutive MSRs starting at IA32_VMX_BASIC; 17 == NUM_VMX_MSR)
	asm(	" xor %%rbx, %%rbx \n"\
		" mov %0, %%rcx \n"\
		"nxcap: \n"\
		" rdmsr \n"\
		" mov %%eax, msr0x480+0(, %%rbx, 8) \n"\
		" mov %%edx, msr0x480+4(, %%rbx, 8) \n"\
		" inc %%rcx \n"\
		" inc %%rbx \n"\
		" cmp $17, %%rbx \n"\
		" jb nxcap \n"\
		:: "i" (IA32_VMX_BASIC) : "ax", "bx", "cx", "dx" );

	// preserve the initial values in relevant system registers
	asm( " mov %%cr0, %%rax \n mov %%rax, cr0 " ::: "ax" );
	asm( " mov %%cr4, %%rax \n mov %%rax, cr4 " ::: "ax" );
	asm(	" mov %0, %%ecx \n"\
		" rdmsr \n"\
		" mov %%eax, msr_efer+0 \n"\
		" mov %%edx, msr_efer+4 \n"\
		:: "i" (MSR_EFER) : "ax", "cx", "dx" );

	// allocate blocks of non-pageable kernel memory
	for (i = 0; i < N_ARENAS; i++)
	{
		kmem[ i ] = kmalloc( ARENA_LENGTH, GFP_KERNEL );
		if ( kmem[ i ] == NULL )
		{
			// undo the allocations that did succeed
			for (j = 0; j < i; j++) kfree( kmem[ j ] );
			return -ENOMEM;
		}
		else memset( kmem[ i ], 0x00, ARENA_LENGTH );
	}

	// assign usages to allocated kernel memory areas
	vmxon_region = virt_to_phys( kmem[ 10 ] + 0x0000 );
	guest_region = virt_to_phys( kmem[ 10 ] + 0x1000 );
	pgdir_region = virt_to_phys( kmem[ 10 ] + PAGE_DIR_OFFSET );
	pgtbl_region = virt_to_phys( kmem[ 10 ] + PAGE_TBL_OFFSET );
	g_IDT_region = virt_to_phys( kmem[ 10 ] + IDT_KERN_OFFSET );
	g_GDT_region = virt_to_phys( kmem[ 10 ] + GDT_KERN_OFFSET );
	g_LDT_region = virt_to_phys( kmem[ 10 ] + LDT_KERN_OFFSET );
	g_TSS_region = virt_to_phys( kmem[ 10 ] + TSS_KERN_OFFSET );
	g_TOS_region = virt_to_phys( kmem[ 10 ] + TOS_KERN_OFFSET );
	g_ISR_region = virt_to_phys( kmem[ 10 ] + ISR_KERN_OFFSET );

	// register the device; if that fails the module is not loaded and
	// cleanup_module() never runs, so the arenas must be freed here
	rc = register_chrdev( my_major, modname, &my_fops );
	if ( rc < 0 )
	{
		for (i = 0; i < N_ARENAS; i++) kfree( kmem[ i ] );
	}
	return rc;
}
// cleanup_module: module exit point.
//
// Clears CR4.VMXE via smp_call_function() and then on the calling CPU,
// unregisters the character device, frees every memory arena, and logs
// the removal.
void cleanup_module( void )
{
	int	arena;

	smp_call_function( clear_CR4_vmxe, NULL, 1 );
	clear_CR4_vmxe( NULL );

	unregister_chrdev( my_major, modname );

	arena = 0;
	while ( arena < N_ARENAS )
	{
		kfree( kmem[ arena ] );
		arena++;
	}

	printk( "<1>Removing \'%s\' module\n", modname );
}
MODULE_LICENSE("GPL");
unsigned short _gdtr[ 5 ], _idtr[ 5 ];
unsigned int _eax, _ebx, _ecx, _edx, _esp, _ebp, _esi, _edi;
int retval = -1;
int nmiints = 0;
int extints = 0;
regs_ia32 vm;
// my_ioctl: runs the client's register image as a Virtual-8086 guest.
//
// 'count' (the ioctl command word) must equal sizeof( regs_ia32 ) and 'buf'
// is the user-space address of a regs_ia32 image.  The routine rebuilds the
// VMXON/VMCS regions and the guest's page tables, descriptor tables and
// TSS, captures the host state, fills in the VMX controls, enters VMX
// operation and launches the guest.  VM exits caused by NMIs or external
// interrupts are counted and the guest is resumed; any other exit ends the
// session.  The final guest register values are copied back to 'buf'.
//
// Returns -EINVAL for a wrong 'count', -EFAULT if the user buffer cannot be
// copied, otherwise the global 'retval' (the VM-exit reason when 'retval'
// is non-negative after the run).
//
// Ordering note: CR4.VMXE is enabled BEFORE the host control registers are
// captured.  The value stored in host_CR4 is validated at VM entry, and a
// snapshot taken while VMXE is still clear makes the first VMLAUNCH fail
// with VM-instruction error 8 (VM entry with invalid host-state fields).
long my_ioctl( struct file *file, unsigned int count, unsigned long buf)
{
	unsigned long	*gdt, *ldt, *idt;
	unsigned int	*pgtbl, *pgdir, *tss, phys_addr = 0;
	signed long	desc = 0;
	int		i, j;

	// sanity check: we require the client-process to pass an
	// exact amount of data representing CPU's register-state
	if ( count != sizeof( regs_ia32 ) ) return -EINVAL;

	// reinitialize the Virtual Machine Control Structures
	// (each region starts with the first 4 bytes of msr0x480[0])
	memset( phys_to_virt( vmxon_region ), 0x00, PAGE_SIZE );
	memset( phys_to_virt( guest_region ), 0x00, PAGE_SIZE );
	memcpy( phys_to_virt( vmxon_region ), msr0x480, 4 );
	memcpy( phys_to_virt( guest_region ), msr0x480, 4 );

	// initialize our guest-task's page-table and page-directory
	pgtbl = (unsigned int*)phys_to_virt( pgtbl_region );
	for (i = 0; i < 18; i++) {
		switch ( i ) {
		case 0: case 1: case 2: case 3: case 4:
		case 5: case 6: case 7: case 8: case 9:
			phys_addr = virt_to_phys( kmem[ i ] ); break;
		case 10: case 11: case 12: case 13: case 14: case 15:
			phys_addr = i * ARENA_LENGTH; break;
		case 16:
			phys_addr = virt_to_phys( kmem[ 0 ] ); break;
		case 17:
			phys_addr = virt_to_phys( kmem[ 10 ] ); break;
		}
		// 16 page-table entries per 64KB arena; low bits 7 are or'ed
		// into every entry
		for (j = 0; j < 16; j++)
			pgtbl[ i*16 + j ] = phys_addr + (j << PAGE_SHIFT) + 7;
	}
	pgdir = (unsigned int*)phys_to_virt( pgdir_region );
	pgdir[ 0 ] = (unsigned int)pgtbl_region + 7;

	// copy the client's virtual-machine register-values
	if ( copy_from_user( &vm, (void*)buf, count ) ) return -EFAULT;
	guest_ES_selector = vm.es;
	guest_CS_selector = vm.cs;
	guest_SS_selector = vm.ss;
	guest_DS_selector = vm.ds;
	guest_FS_selector = vm.fs;
	guest_GS_selector = vm.gs;
	_eax = vm.eax;
	_ebx = vm.ebx;
	_ecx = vm.ecx;
	_edx = vm.edx;
	_ebp = vm.ebp;
	_esi = vm.esi;
	_edi = vm.edi;
	guest_RSP = vm.esp;
	guest_RIP = vm.eip;
	guest_RFLAGS = vm.eflags;
	guest_RFLAGS |= (1 << 17); // VM=1 (for Virtual-8086 mode)
	guest_RFLAGS |= (1 << 1); // it's essential to set bit #1

	// setup other guest-state fields (for Virtual-8086 mode)
	guest_ES_base = (guest_ES_selector << 4);
	guest_CS_base = (guest_CS_selector << 4);
	guest_SS_base = (guest_SS_selector << 4);
	guest_DS_base = (guest_DS_selector << 4);
	guest_FS_base = (guest_FS_selector << 4);
	guest_GS_base = (guest_GS_selector << 4);
	guest_ES_limit = 0xFFFF;
	guest_CS_limit = 0xFFFF;
	guest_SS_limit = 0xFFFF;
	guest_DS_limit = 0xFFFF;
	guest_FS_limit = 0xFFFF;
	guest_GS_limit = 0xFFFF;
	guest_ES_access_rights = 0xF3;
	guest_CS_access_rights = 0xF3;
	guest_SS_access_rights = 0xF3;
	guest_DS_access_rights = 0xF3;
	guest_FS_access_rights = 0xF3;
	guest_GS_access_rights = 0xF3;
	guest_CR0 = 0x80000031;
	guest_CR4 = 0x00002011;
	guest_CR3 = pgdir_region;
	guest_VMCS_link_pointer_full = 0xFFFFFFFF;
	guest_VMCS_link_pointer_high = 0xFFFFFFFF;
	guest_IDTR_base = LEGACY_REACH + IDT_KERN_OFFSET;
	guest_GDTR_base = LEGACY_REACH + GDT_KERN_OFFSET;
	guest_LDTR_base = LEGACY_REACH + LDT_KERN_OFFSET;
	guest_TR_base = LEGACY_REACH + TSS_KERN_OFFSET;
	guest_IDTR_limit = (256 * 8) - 1;
	guest_GDTR_limit = (3 * 8) - 1;
	guest_LDTR_limit = (4 * 8) - 1;
	guest_TR_limit = (26 * 4) + 0x20 + 0x2000;
	guest_LDTR_access_rights = 0x82;
	guest_TR_access_rights = 0x8B;
	guest_LDTR_selector = __SELECTOR_LDTR;
	guest_TR_selector = __SELECTOR_TASK;

	// provisionally initialize our guest-task's LDTR
	ldt = (unsigned long*)phys_to_virt( g_LDT_region );
	ldt[ __SELECTOR_CODE >> 3 ] = 0x00CF9B000000FFFF;
	ldt[ __SELECTOR_DATA >> 3 ] = 0x00CF93000000FFFF;
	ldt[ __SELECTOR_VRAM >> 3 ] = 0x0000920B8000FFFF;
	ldt[ __SELECTOR_FLAT >> 3 ] = 0x008F92000000FFFF;

	// Adjust the CODE and DATA descriptors here
	desc = LEGACY_REACH + ISR_KERN_OFFSET;
	desc <<= 16;
	desc &= 0x000000FFFFFF0000;
	ldt[ __SELECTOR_CODE >> 3 ] |= desc;
	ldt[ __SELECTOR_DATA >> 3 ] |= desc;

	// initialize our guest-task's GDTR
	gdt = (unsigned long*)phys_to_virt( g_GDT_region );
	desc = 0x00008B0000000000;
	desc |= (guest_TR_base << 32)&0xFF00000000000000;
	desc |= (guest_TR_base << 16)&0x000000FFFFFF0000;
	desc |= (guest_TR_limit & 0xFFFF);
	gdt[ __SELECTOR_TASK >> 3 ] = desc;
	desc = 0x0000820000000000;
	desc |= ( guest_LDTR_base << 32)&0xFF00000000000000;
	desc |= ( guest_LDTR_base << 16)&0x000000FFFFFF0000;
	desc |= ( guest_LDTR_limit & 0xFFFF );
	gdt[ __SELECTOR_LDTR >> 3 ] = desc;

	// initialize our guest's IDT
	idt = (unsigned long*)phys_to_virt( g_IDT_region );
	desc = 0; // offset-address for GPF isr
	desc &= 0x00000000FFFFFFFF;
	desc |= (desc << 32);
	desc &= 0xFFFF00000000FFFF;
	desc |= ( __SELECTOR_CODE << 16);
	desc |= 0x00008E0000000000;
	idt[ 13 ] = desc;

	// initialize our guest's Task-State Segment
	tss = (unsigned int*)phys_to_virt( g_TSS_region );
	tss[ 1 ] = TOS_KERN_OFFSET;
	tss[ 2 ] = __SELECTOR_DATA;
	tss[ 25 ] = 0x00880000;
	tss[ guest_TR_limit >> 2 ] = 0xFF;

	// enable virtual machine extensions (bit 13 in CR4) on this CPU and
	// on the others.  This has to happen before host_CR4 is read below,
	// so that the captured host state already has VMXE set.
	set_CR4_vmxe( NULL );
	smp_call_function( set_CR4_vmxe, NULL, 1 );

	//----------------------------------------------------
	// initialize the global variables for the host state
	//----------------------------------------------------
	asm(" mov %%cr0, %%rax \n mov %%rax, host_CR0 " ::: "ax" );
	asm(" mov %%cr4, %%rax \n mov %%rax, host_CR4 " ::: "ax" );
	asm(" mov %%cr3, %%rax \n mov %%rax, host_CR3 " ::: "ax" );
	asm(" str host_TR_selector ");
	asm(" mov %es, host_ES_selector ");
	asm(" mov %cs, host_CS_selector ");
	asm(" mov %ss, host_SS_selector ");
	asm(" mov %ds, host_DS_selector ");
	asm(" mov %fs, host_FS_selector ");
	asm(" mov %gs, host_GS_selector ");
	asm(" sgdt _gdtr \n sidt _idtr ");
	// the base address follows the 16-bit limit in the stored image
	host_GDTR_base = *(unsigned long*)( _gdtr+1 );
	host_IDTR_base = *(unsigned long*)( _idtr+1 );

	// extract the host TR base from its (two-quadword) GDT descriptor
	gdt = (unsigned long*)host_GDTR_base;
	desc = gdt[ (host_TR_selector >> 3) + 0 ];
	host_TR_base = ((desc >> 16)&0x00FFFFFF)|((desc >> 32)&0xFF000000);
	desc = gdt[ (host_TR_selector >> 3) + 1 ];
	desc <<= 48; // maneuver to insure 'canonical' address
	host_TR_base |= (desc >> 16)&0xFFFFFFFF00000000;

	// MSRs 0x174..0x176: SYSENTER CS / ESP / EIP
	asm(	" mov $0x174, %%ecx \n"\
		" rdmsr \n"\
		" mov %%eax, host_SYSENTER_CS \n"\
		" inc %%ecx \n"\
		" rdmsr \n"\
		" mov %%eax, host_SYSENTER_ESP+0 \n"\
		" mov %%edx, host_SYSENTER_ESP+4 \n"\
		" inc %%ecx \n"\
		" rdmsr \n"\
		" mov %%eax, host_SYSENTER_EIP+0 \n"\
		" mov %%edx, host_SYSENTER_EIP+4 \n"\
		::: "ax", "cx", "dx" );
	// MSR 0xC0000100 supplies the host FS base
	asm(	" mov %0, %%ecx \n"\
		" rdmsr \n"\
		" mov %%eax, host_FS_base+0 \n"\
		" mov %%edx, host_FS_base+4 \n"\
		:: "i" (0xC0000100) : "ax", "cx", "dx" );
	// MSR 0xC0000101 supplies the host GS base
	asm(	" mov %0, %%ecx \n"\
		" rdmsr \n"\
		" mov %%eax, host_GS_base+0 \n"\
		" mov %%edx, host_GS_base+4 \n"\
		:: "i" (0xC0000101) : "ax", "cx", "dx" );

	//------------------------------------------------------
	// initialize the global variables for the VMX controls
	//------------------------------------------------------
	control_VMX_pin_based = msr0x480[ 1 ];
	control_VMX_cpu_based = msr0x480[ 2 ];
	control_VM_exit_controls = msr0x480[ 3 ];
	control_VM_entry_controls = msr0x480[ 4 ];
	control_VMX_pin_based |= (1 << 0); // exit on interrupts
	control_VMX_pin_based |= (1 << 3); // NMI-exiting
	control_VMX_cpu_based |= (1 << 7) | (1 << 29); // Hlt + Monitor exit
	control_pagefault_errorcode_match = 0xFFFFFFFF;
	control_VM_exit_controls |= (1 << 9); // exit to 64-bit host
	control_CR0_mask = 0x80000021;
	control_CR4_mask = 0x00002000;
	control_CR0_shadow = 0x80000021;
	control_CR4_shadow = 0x00002000;
	control_CR3_target_count = 2;
	control_CR3_target0 = guest_CR3; // guest's directory
	control_CR3_target1 = host_CR3; // host's directory

	// initialize our counters for NMIs and external interrupts
	nmiints = 0;
	extints = 0;

	//---------------------
	// launch the guest VM
	//---------------------
	// 'retval' is stepped 1..5 through the sequence so that a failure
	// can be located; 'my_vmm' is the host entry point after a VM exit.
	asm volatile (" .type my_vmm, #function \n"\
		" pushfq \n"\
		" push %rax \n"\
		" push %rbx \n"\
		" push %rcx \n"\
		" push %rdx \n"\
		" push %rbp \n"\
		" push %rsi \n"\
		" push %rdi \n"\
		" push %r11 \n"\
		" \n"\
		" lea my_vmm, %rax \n"\
		" \n"\
		" mov %rax, host_RIP \n"\
		" mov %rsp, host_RSP \n"\
		" \n"\
		" vmxon vmxon_region \n"\
		" jc fail \n"\
		" jz over \n"\
		" \n"\
		" movl $1, retval \n"\
		" vmclear guest_region \n"\
		" \n"\
		" movl $2, retval \n"\
		" vmptrld guest_region \n"\
		" \n"\
		" movl $3, retval \n"\
		" \n"\
		" xor %rdx, %rdx \n"\
		" mov elements, %rcx \n"\
		"nxwr: \n"\
		" mov machine+0(%rdx), %rax \n"\
		" mov machine+8(%rdx), %rbx \n"\
		" vmwrite (%rbx), %rax \n"\
		" add $16, %rdx \n"\
		" loop nxwr \n"\
		" \n"\
		" movl $4, retval \n"\
		" mov _eax, %eax \n"\
		" mov _ebx, %ebx \n"\
		" mov _ecx, %ecx \n"\
		" mov _edx, %edx \n"\
		" mov _ebp, %ebp \n"\
		" mov _esi, %esi \n"\
		" mov _edi, %edi \n"\
		" vmlaunch \n"\
		" movl $5, retval \n"\
		" jmp read \n"\
		"my_vmm: \n"\
		" \n"\
		" mov %eax, _eax \n"\
		" mov %ebx, _ebx \n"\
		" mov %ecx, _ecx \n"\
		" mov %edx, _edx \n"\
		" mov %ebp, _ebp \n"\
		" mov %esi, _esi \n"\
		" mov %edi, _edi \n"\
		"read: \n"\
		" xor %rdx, %rdx \n"\
		" mov rocount, %rcx \n"\
		"nxrd: \n"\
		" mov results+0(%rdx), %rax \n"\
		" mov results+8(%rdx), %rbx \n"\
		" vmread %rax, (%rbx) \n"\
		" add $16, %rdx \n"\
		" loop nxrd \n"\
		" \n"\
		" cmpl $0, info_vmexit_reason \n"\
		" je was_nmi \n"\
		" \n"\
		" cmpl $1, info_vmexit_reason \n"\
		" je was_extint \n"\
		" \n"\
		" jmp over \n"\
		" \n"\
		"was_nmi: \n"\
		" incl nmiints \n"\
/*		" int $0x02 \n"\
*/		" jmp resume_guest \n"\
		" \n"\
		"was_extint: \n"\
		" sti \n"\
		" incl extints \n"\
		" \n"\
		"resume_guest: \n"\
		" mov _eax, %eax \n"\
		" mov _ebx, %ebx \n"\
		" mov _ecx, %ecx \n"\
		" mov _edx, %edx \n"\
		" mov _ebp, %ebp \n"\
		" mov _esi, %esi \n"\
		" mov _edi, %edi \n"\
		" vmresume \n"\
		" \n"\
		" movl $-1, retval \n"\
		"over: \n"\
		" vmxoff \n"\
		"fail: \n"\
		" pop %r11 \n"\
		" pop %rdi \n"\
		" pop %rsi \n"\
		" pop %rbp \n"\
		" pop %rdx \n"\
		" pop %rcx \n"\
		" pop %rbx \n"\
		" pop %rax \n"\
		" popfq \n"\
		);

	// show why the VMentry failed, or else why the VMexit occurred
	printk( "\n VM-instruction error: %d ", info_vminstr_error );
	printk( " Exit Reason: %d \n", info_vmexit_reason );
	printk( " VMexit-interruption-information: %08X \n",
		info_vmexit_interrupt_information );
	printk( " VMexit-interruption-error-code: %08X \n",
		info_vmexit_interrupt_error_code );
	if (retval >= 0) {
		retval = info_vmexit_reason;
	}

	// display the number of external interruption-exits
	printk( "\n" );
	printk( " number of external interrupts = %d \n", extints );
	printk( " number of non-maskable interrupts = %d \n", nmiints );

	// copy the guest's final register-values back to the client
	vm.eflags = (unsigned int)guest_RFLAGS;
	vm.eip = (unsigned int)guest_RIP;
	vm.esp = (unsigned int)guest_RSP;
	vm.eax = _eax;
	vm.ebx = _ebx;
	vm.ecx = _ecx;
	vm.edx = _edx;
	vm.ebp = _ebp;
	vm.esi = _esi;
	vm.edi = _edi;
	vm.es = guest_ES_selector;
	vm.cs = guest_CS_selector;
	vm.ss = guest_SS_selector;
	vm.ds = guest_DS_selector;
	vm.fs = guest_FS_selector;
	vm.gs = guest_GS_selector;
	if ( copy_to_user( (void*)buf, &vm, count ) ) return -EFAULT;

	return retval;
}
// my_release: called when the device file is closed.  Logs the call and
// resets the global 'retval' to 0; always returns 0.
int my_release ( struct inode *inode, struct file *file )
{
pr_info("Calling %s\n", __func__);
// The block below (clearing CR4.VMXE on every CPU at close time) is
// intentionally left disabled by the author.
/*
smp_call_function( clear_CR4_vmxe, NULL, 1 );
clear_CR4_vmxe( NULL );
*/
retval = 0;
return 0;
}
Testing the above code with delay.cpp, I found that the very first VMLAUNCH fails with VM-instruction error 8 (invalid host state), while all subsequent VMX operations work fine.
After debugging it, I found that it is related to where VMXE (bit 13) in CR4 is set and cleared.
If the VMXE bit is set in init_module, the VM launches correctly every time, with no error 8.
Then, if the VMXE bit is cleared in my_release, the VMX operation fails every time (so I commented out that operation).
I must be missing something important about the sequence for entering VMX root operation.
My testing environment is VMware WS, and a bare-metal Ubuntu Linux host.
I tested in SMP host, and non-SMP host, got the same result.
I started with code from a Raspberry Pi assembly language book. It prints out 15 in binary as so:
00000000000000000000000000001111pi#raspberrypi:$
I wanted to add a newline at the end, so I implemented the _newline: and new: .ascii "\n" portion of the code.
I reassembled it, but the output remains the same. Did I miss something in outputting the newline?
@ Prints the 32 bits of a value (15) as ASCII '0'/'1' characters, one
@ write syscall per bit, then writes the byte at 'new' and exits.
.global _start
_start:
mov r6, #15 @ value whose bits are printed
mov r10, #1
mov r9, r10, lsl #31 @ r9 = single-bit mask, starting at bit 31
ldr r1, =string @ r1 = address of the output buffer
_bits:
tst r6, r9 @ test the bit currently selected by the mask
moveq r0, #48 @ bit clear -> ASCII '0'
movne r0, #49 @ bit set   -> ASCII '1'
str r0, [r1] @ word store: writes all 4 bytes of r0 starting at 'string' (string itself is 1 byte long)
mov r8, r6 @ keep a copy of the value across the call
bl _write
mov r6, r8 @ restore the value
movs r9, r9, lsr #1 @ move the mask to the next lower bit, setting flags
bne _bits @ loop until the mask has shifted out to zero
_newline:
mov r0, #1 @ fd 1
mov r2, #1 @ length 1
mov r7, #4 @ syscall 4 (write)
ldr r1, =new @ buffer = the byte at 'new'
swi 0
_exit:
mov r7, #1 @ syscall 1 (exit); r0 still holds whatever the previous syscall returned
swi 0
@ _write: writes 1 byte from the address already in r1 to fd 1.
_write:
mov r0, #1
mov r2, #1
mov r7, #4
swi 0
bx lr
.data
@ 'string' and 'new' are adjacent single bytes in .data
string: .ascii " "
new: .ascii "\n"
The last few lines of strace output are:
write(1, "1", 11) = 1
write(1, "1", 11) = 1
write(1, "1", 11) = 1
write(1, "1", 11) = 1
write(1, "\0", 11) = 1
exit(1) =?
+++ exited with 1 +++
Your strace output is the clue: write(1, "\0", 11) = 1 shows us that you wrote a 0 byte instead of the ASCII encoding of \n.
When you str r0, [r1], you're storing 4 bytes.
The destination of that store is
.data
string: .ascii " "
new: .ascii "\n"
which is really:
.data
string: .byte ' '
new: .byte '\n'
So each time you store '0' or '1' to string, you're also writing 3 more zero bytes, clobbering your '\n' and 2 more bytes beyond the end of your data section. (It doesn't segfault because you're not right at the end of a page.)
The simplest fix is to use a single-byte store: strb r0, [r1] instead of the word-sized str.
I have two files which are assembled/compiled/linked into minimalistic kernel.
start.s:
/* Boot stub: enables FP/SIMD access, sets up a stack in .bss and calls __boot. */
.set CPACR_EL1_FPEN, 0b11 << 20
.set BOOT_STACK_SIZE, 8 * 1024
.global __boot_stack
.global __start
.global __halt
.bss
.align 16
/* BOOT_STACK_SIZE bytes reserved for the boot stack */
__boot_stack:
.fill BOOT_STACK_SIZE
.text
__start:
/* disable FP and SIMD traps */
mov x0, #CPACR_EL1_FPEN
msr cpacr_el1, x0
/* set stack */
/* sp = __boot_stack + BOOT_STACK_SIZE, i.e. the end of the reserved area */
adr x0, __boot_stack
add sp, x0, #BOOT_STACK_SIZE
/* call the Rust entry point */
bl __boot
/* execution falls through to __halt when __boot returns */
__halt:
/* halt CPU */
wfi
b __halt
boot.rs:
/// Entry point called from the assembly boot stub.
///
/// Writes the byte `'!'` to address `0x9000000` (the serial port, per the
/// surrounding description). The store is performed with a volatile write
/// because the target is a device register: a volatile access is guaranteed
/// not to be elided or merged by the optimizer, whereas a plain store
/// through a raw pointer carries no such guarantee.
#[no_mangle]
pub extern fn __boot() {
    let uart = 0x9000000 as *mut u8;

    // SAFETY: relies on 0x9000000 being a valid, writable device address on
    // the target machine; nothing in this crate can check that.
    unsafe {
        ::core::ptr::write_volatile(uart, b'!');
    }
}
For opt-level=3 the resulting code outputs single '!' to a serial port (as was intended). For opt-level=0 I have a strange infinite loop (e.g. '!!!!!!!!!....'). Here is the disassembled dump of the problematic code:
0000000000000000 <__kernel_begin>:
0: d2a00600 mov x0, #0x300000 // #3145728
4: d5181040 msr cpacr_el1, x0
8: 100007c0 adr x0, 100 <__boot_stack>
c: 9140081f add sp, x0, #0x2, lsl #12
10: 94000003 bl 1c <__boot>
0000000000000014 <__halt>:
14: d503207f wfi
18: 17ffffff b 14 <__halt>
000000000000001c <__boot>:
1c: a9bf7bfd stp x29, x30, [sp,#-16]!
20: 910003fd mov x29, sp
24: 94000003 bl 30 <aarch64::boot::__boot::__rust_abi>
28: a8c17bfd ldp x29, x30, [sp],#16
2c: d65f03c0 ret
0000000000000030 <aarch64::boot::__boot::__rust_abi>:
30: d10043ff sub sp, sp, #0x10
34: 52a12008 mov w8, #0x9000000 // #150994944
38: 2a0803e9 mov w9, w8
3c: f90007e9 str x9, [sp,#8]
40: 52800428 mov w8, #0x21 // #33
44: 39000128 strb w8, [x9]
48: 910043ff add sp, sp, #0x10
4c: d65f03c0 ret
The code is tested using qemu-system-aarch64. I don't see serious problems with it (except redundancy). Can you suggest a possible cause of such abnormal behaviour?
P.S. This is the optimised version which works properly:
0000000000000000 <__kernel_begin>:
0: d2a00600 mov x0, #0x300000 // #3145728
4: d5181040 msr cpacr_el1, x0
8: 1007ffc0 adr x0, 10000 <__boot_stack>
c: 9140081f add sp, x0, #0x2, lsl #12
10: 94000003 bl 1c <__boot>
0000000000000014 <__halt>:
14: d503207f wfi
18: 17ffffff b 14 <__halt>
000000000000001c <__boot>:
1c: 52a12008 mov w8, #0x9000000 // #150994944
20: 52800429 mov w9, #0x21 // #33
24: 39000109 strb w9, [x8]
28: d65f03c0 ret
I've managed to run the non-optimised code without abnormalities, thanks to Notlikethat's idea: my stack was simply mapped into read-only memory.
So I added an offset statement to my linker script (". = 1024M;") so that all the symbols start at 1 GiB (where RAM begins). After this modification the code started to work properly.