What is the right sequence to start VMX root operation in Linux
I am working on a Linux kernel module (the kvm and kvm_intel modules have already been unloaded) to test Intel's VMX feature.
Now I am wondering what the prerequisites are for entering VMX root operation.
My kernel module uses the Linux file-system interface to expose a character device through which a user-space program can perform VMX operations (a sketch of the user-space side appears after the module code below).
Here is the code for reference (it is based on https://www.cs.usfca.edu/~cruse/cs686s07/nmiexits.c, modified to work with my Linux 2.6.32 kernel; the other required files also come from that link):
//-------------------------------------------------------------------
// nmiexits.c (A modification of our 'linuxvmm.c' module)
//
// This Linux kernel module implements a device-driver (named
// '/dev/vmm') which lets an application program execute some
// real-mode code in Virtual-8086 mode within a guest virtual
// machine, assuming the cpu supports Intel VMX instructions.
//
// This modification sets the pin-based VM Execution Controls
// so that control passes to our Virtual Machine Manager when
// any external interrupt (or non-maskable interrupt) occurs.
// These asynchronous events are then serviced by Linux ISRs,
// and our guest VM is resumed. For the case of non-maskable
// interrupts, the host executes the 'int $0x02' instruction;
// for the case of external interrupts, the appropriate Linux
// interrupt service routine automatically gets executed when
// the host executes 'sti', which allows the CPU to recognize
// the still-pending external interrupt-request.
//
// compile using: $ mmake nmiexits
// install using: $ /sbin/insmod nmiexits.ko
//
// NOTE: Written and tested using Linux x86_64 kernel 2.6.17.
//
// programmer: ALLAN CRUSE
// date begun: 29 APR 2007
// completion: 03 MAY 2007 -- our initial driver-prototype
// revised on: 14 MAY 2007 -- sets 'interrupt-exiting' control
// revised on: 24 MAY 2007 -- sets the 'NMI-exiting' control
// revised on: 21 JUL 2008 -- for Linux kernel version 2.6.26.
//-------------------------------------------------------------------
#include <linux/kernel.h>
#include <linux/module.h> // for init_module()
#include <linux/proc_fs.h> // for create_proc_read_entry()
#include <linux/fs.h> // for struct file_operations
#include <asm/io.h> // for virt_to_phys()
#include <asm/uaccess.h> // for copy_from_user()
#include <linux/slab.h> // for init_module()
#include <linux/mm.h> // for remap_pfn_range()
#include <linux/seq_file.h>
#include "machine.h" // for our VMCS fields
#include "myvmx.h" // for 'regs_ia32'
#define N_ARENAS 11 // number of 64KB memory allocations
#define ARENA_LENGTH (64<<10) // size of each allocated memory-arena
#define IA32_VMX_BASIC 0x0480
#define IA32_VMX_PINBASED_CTLS 0x0481
#define IA32_VMX_PROCBASED_CTLS 0x0482
#define IA32_VMX_EXIT_CTLS 0x0483
#define IA32_VMX_ENTRY_CTLS 0x0484
#define IA32_VMX_MISC 0x0485
#define IA32_VMX_CR0_FIXED0 0x0486
#define IA32_VMX_CR0_FIXED1 0x0487
#define IA32_VMX_CR4_FIXED0 0x0488
#define IA32_VMX_CR4_FIXED1 0x0489
#define IA32_VMX_VMCS_ENUM 0x048A
#define IA32_VMX_PROCBASED_CTLS2 0x048B
#define IA32_VMX_EPT_VPID_CAP 0x048C
#define IA32_VMX_TRUE_PINBASED_CTLS 0x048D
#define IA32_VMX_TRUE_PROCBASED_CTLS 0x048E
#define IA32_VMX_TRUE_EXIT_CTLS 0x048F
#define IA32_VMX_TRUE_ENTRY_CTLS 0x0490
#define NUM_VMX_MSR (IA32_VMX_TRUE_ENTRY_CTLS - IA32_VMX_BASIC + 1)
#define LEGACY_REACH 0x110000 // end of 'real-addressable' memory
#define PAGE_DIR_OFFSET 0x2000
#define PAGE_TBL_OFFSET 0x3000
#define IDT_KERN_OFFSET 0x4000
#define GDT_KERN_OFFSET 0x4800
#define LDT_KERN_OFFSET 0x4A00
#define TSS_KERN_OFFSET 0x4C00
#define TOS_KERN_OFFSET 0x8000
#define ISR_KERN_OFFSET 0x8000
#define __SELECTOR_TASK 0x0008
#define __SELECTOR_LDTR 0x0010
#define __SELECTOR_CODE 0x0004
#define __SELECTOR_DATA 0x000C
#define __SELECTOR_VRAM 0x0014
#define __SELECTOR_FLAT 0x001C
char modname[] = "nmiexits";
int my_major = 88;
char cpu_oem[16];
int cpu_features;
void *kmem[ N_ARENAS ];
unsigned long msr0x480[ NUM_VMX_MSR ];
unsigned long cr0, cr4;
unsigned long msr_efer;
unsigned long vmxon_region;
unsigned long guest_region;
unsigned long pgdir_region;
unsigned long pgtbl_region;
unsigned long g_IDT_region;
unsigned long g_GDT_region;
unsigned long g_LDT_region;
unsigned long g_TSS_region;
unsigned long g_TOS_region;
unsigned long g_ISR_region;
//============================================================
long my_ioctl( struct file *, unsigned int, unsigned long );
int my_release ( struct inode *inode, struct file *file );
int my_mmap( struct file *file, struct vm_area_struct *vma )
{
unsigned long user_virtaddr = vma->vm_start;
unsigned long region_length = vma->vm_end - vma->vm_start;
unsigned long physical_addr, pfn;
int i;
// we require prescribed parameter-values from our client
if ( user_virtaddr != 0x00000000L ) return -EINVAL;
if ( region_length != LEGACY_REACH ) return -EINVAL;
// let the kernel know not to try swapping out this region
/// vma->vm_flags |= VM_RESERVED;
// ask the kernel to add page-table entries to 'map' these arenas
for (i = 0; i < N_ARENAS+6; i++)
{
int j = i % 16;
if ( j < 0xA ) physical_addr = virt_to_phys( kmem[ j ] );
else physical_addr = user_virtaddr;
pfn = ( physical_addr >> PAGE_SHIFT );
if ( remap_pfn_range( vma, user_virtaddr, pfn,
ARENA_LENGTH, vma->vm_page_prot ) ) return -EAGAIN;
user_virtaddr += ARENA_LENGTH;
}
// copy page-frame 0x000 to bottom of arena 0x0 (for IVT and BDA)
memcpy( kmem[0], phys_to_virt( 0x00000 ), PAGE_SIZE );
// copy page-frames 0x90 to 0x9F to arena 0x9 (for EBDA)
memcpy( kmem[9], phys_to_virt( 0x90000 ), ARENA_LENGTH );
return 0; // SUCCESS
}
struct file_operations
my_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = my_ioctl,
.mmap = my_mmap,
.release = my_release,
};
void set_CR4_vmxe( void *dummy )
{
asm( " mov %%cr4, %%rax \n"\
" bts $13, %%rax \n"\
" mov %%rax, %%cr4 " ::: "ax" );
}
void clear_CR4_vmxe( void *dummy )
{
asm( " mov %%cr4, %%rax \n"\
" btr $13, %%rax \n"\
" mov %%rax, %%cr4 " ::: "ax" );
}
static inline u64
vmx_rdmsr (u32 ecx)
{
u32 edx, eax;
asm volatile ("rdmsr":"=d" (edx), "=a" (eax):"c" (ecx));
return (((u64) edx) << 32) | ((u64) eax);
}
static inline void
vmx_wrmsr (u32 ecx, u64 val)
{
u32 edx, eax;
edx = (u32) (val >> 32);
eax = (u32) val;
asm volatile ("wrmsr"::"d" (edx), "a" (eax), "c" (ecx));
}
int init_module( void )
{
int i, j;
// confirm installation and show device-major number
printk( "<1>\nInstalling \'%s\' module ", modname );
printk( "(major=%d) \n", my_major );
// verify processor supports Intel Virtualization Technology
asm( " xor %%eax, %%eax \n"\
" cpuid \n"\
" mov %%ebx, cpu_oem+0 \n"\
" mov %%edx, cpu_oem+4 \n"\
" mov %%ecx, cpu_oem+8 \n"\
::: "ax", "bx", "cx", "dx" );
printk( " processor is \'%s\' \n", cpu_oem );
if ( strncmp( cpu_oem, "GenuineIntel", 12 ) == 0 )
asm( " mov $1, %%eax \n"\
" cpuid \n"\
" mov %%ecx, cpu_features \n"\
::: "ax", "bx", "cx", "dx" );
if ( ( cpu_features & (1<<5) ) == 0 )
{
printk( " Virtualization Technology is unsupported \n" );
return -ENODEV;
}
else printk( " Virtualization Technology is supported \n" );
// read contents of the VMX-Capability Model-Specific Registers
asm( " xor %%rbx, %%rbx \n"\
" mov %0, %%rcx \n"\
"nxcap: \n"\
" rdmsr \n"\
" mov %%eax, msr0x480+0(, %%rbx, 8) \n"\
" mov %%edx, msr0x480+4(, %%rbx, 8) \n"\
" inc %%rcx \n"\
" inc %%rbx \n"\
" cmp $17, %%rbx \n"\
" jb nxcap \n"\
:: "i" (IA32_VMX_BASIC) : "ax", "bx", "cx", "dx" );
// preserve the initial values in relevant system registers
asm( " mov %%cr0, %%rax \n mov %%rax, cr0 " ::: "ax" );
asm( " mov %%cr4, %%rax \n mov %%rax, cr4 " ::: "ax" );
asm( " mov %0, %%ecx \n"\
" rdmsr \n"\
" mov %%eax, msr_efer+0 \n"\
" mov %%edx, msr_efer+4 \n"\
:: "i" (MSR_EFER) : "ax", "cx", "dx" );
// allocate page-aligned blocks of non-pageable kernel memory
for (i = 0; i < N_ARENAS; i++)
{
kmem[ i ] = kmalloc( ARENA_LENGTH, GFP_KERNEL );
if ( kmem[ i ] == NULL )
{
for (j = 0; j < i; j++) kfree( kmem[ j ] );
return -ENOMEM;
}
else memset( kmem[ i ], 0x00, ARENA_LENGTH );
}
// assign usages to allocated kernel memory areas
vmxon_region = virt_to_phys( kmem[ 10 ] + 0x0000 );
guest_region = virt_to_phys( kmem[ 10 ] + 0x1000 );
pgdir_region = virt_to_phys( kmem[ 10 ] + PAGE_DIR_OFFSET );
pgtbl_region = virt_to_phys( kmem[ 10 ] + PAGE_TBL_OFFSET );
g_IDT_region = virt_to_phys( kmem[ 10 ] + IDT_KERN_OFFSET );
g_GDT_region = virt_to_phys( kmem[ 10 ] + GDT_KERN_OFFSET );
g_LDT_region = virt_to_phys( kmem[ 10 ] + LDT_KERN_OFFSET );
g_TSS_region = virt_to_phys( kmem[ 10 ] + TSS_KERN_OFFSET );
g_TOS_region = virt_to_phys( kmem[ 10 ] + TOS_KERN_OFFSET );
g_ISR_region = virt_to_phys( kmem[ 10 ] + ISR_KERN_OFFSET );
return register_chrdev( my_major, modname, &my_fops );
}
void cleanup_module( void )
{
int i;
smp_call_function( clear_CR4_vmxe, NULL, 1 );
clear_CR4_vmxe( NULL );
unregister_chrdev( my_major, modname );
for (i = 0; i < N_ARENAS; i++) kfree( kmem[ i ] );
printk( "<1>Removing \'%s\' module\n", modname );
}
MODULE_LICENSE("GPL");
unsigned short _gdtr[ 5 ], _idtr[ 5 ];
unsigned int _eax, _ebx, _ecx, _edx, _esp, _ebp, _esi, _edi;
int retval = -1;
int nmiints = 0;
int extints = 0;
regs_ia32 vm;
long my_ioctl( struct file *file, unsigned int count, unsigned long buf)
{
unsigned long *gdt, *ldt, *idt;
unsigned int *pgtbl, *pgdir, *tss, phys_addr = 0;
signed long desc = 0;
int i, j;
// sanity check: we require the client-process to pass an
// exact amount of data representing CPU's register-state
if ( count != sizeof( regs_ia32 ) ) return -EINVAL;
// reinitialize the Virtual Machine Control Structures
memset( phys_to_virt( vmxon_region ), 0x00, PAGE_SIZE );
memset( phys_to_virt( guest_region ), 0x00, PAGE_SIZE );
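// the first 32 bits of the VMXON region and of each VMCS must hold the
// VMCS revision identifier reported in the low dword of IA32_VMX_BASIC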
memcpy( phys_to_virt( vmxon_region ), msr0x480, 4 );
memcpy( phys_to_virt( guest_region ), msr0x480, 4 );
// initialize our guest-task's page-table and page-directory
pgtbl = (unsigned int*)phys_to_virt( pgtbl_region );
for (i = 0; i < 18; i++) {
switch ( i ) {
case 0: case 1: case 2: case 3: case 4:
case 5: case 6: case 7: case 8: case 9:
phys_addr = virt_to_phys( kmem[ i ] ); break;
case 10: case 11: case 12: case 13: case 14: case 15:
phys_addr = i * ARENA_LENGTH; break;
case 16:
phys_addr = virt_to_phys( kmem[ 0 ] ); break;
case 17:
phys_addr = virt_to_phys( kmem[ 10 ] ); break;
}
for (j = 0; j < 16; j++)
pgtbl[ i*16 + j ] = phys_addr + (j << PAGE_SHIFT) + 7;
}
pgdir = (unsigned int*)phys_to_virt( pgdir_region );
pgdir[ 0 ] = (unsigned int)pgtbl_region + 7;
// copy the client's virtual-machine register-values
if ( copy_from_user( &vm, (void*)buf, count ) ) return -EFAULT;
guest_ES_selector = vm.es;
guest_CS_selector = vm.cs;
guest_SS_selector = vm.ss;
guest_DS_selector = vm.ds;
guest_FS_selector = vm.fs;
guest_GS_selector = vm.gs;
_eax = vm.eax;
_ebx = vm.ebx;
_ecx = vm.ecx;
_edx = vm.edx;
_ebp = vm.ebp;
_esi = vm.esi;
_edi = vm.edi;
guest_RSP = vm.esp;
guest_RIP = vm.eip;
guest_RFLAGS = vm.eflags;
guest_RFLAGS |= (1 << 17); // VM=1 (for Virtual-8086 mode)
guest_RFLAGS |= (1 << 1); // it's essential to set bit #1
// setup other guest-state fields (for Virtual-8086 mode)
guest_ES_base = (guest_ES_selector << 4);
guest_CS_base = (guest_CS_selector << 4);
guest_SS_base = (guest_SS_selector << 4);
guest_DS_base = (guest_DS_selector << 4);
guest_FS_base = (guest_FS_selector << 4);
guest_GS_base = (guest_GS_selector << 4);
guest_ES_limit = 0xFFFF;
guest_CS_limit = 0xFFFF;
guest_SS_limit = 0xFFFF;
guest_DS_limit = 0xFFFF;
guest_FS_limit = 0xFFFF;
guest_GS_limit = 0xFFFF;
guest_ES_access_rights = 0xF3;
guest_CS_access_rights = 0xF3;
guest_SS_access_rights = 0xF3;
guest_DS_access_rights = 0xF3;
guest_FS_access_rights = 0xF3;
guest_GS_access_rights = 0xF3;
guest_CR0 = 0x80000031;
guest_CR4 = 0x00002011;
guest_CR3 = pgdir_region;
guest_VMCS_link_pointer_full = 0xFFFFFFFF;
guest_VMCS_link_pointer_high = 0xFFFFFFFF;
guest_IDTR_base = LEGACY_REACH + IDT_KERN_OFFSET;
guest_GDTR_base = LEGACY_REACH + GDT_KERN_OFFSET;
guest_LDTR_base = LEGACY_REACH + LDT_KERN_OFFSET;
guest_TR_base = LEGACY_REACH + TSS_KERN_OFFSET;
guest_IDTR_limit = (256 * 8) - 1;
guest_GDTR_limit = (3 * 8) - 1;
guest_LDTR_limit = (4 * 8) - 1;
guest_TR_limit = (26 * 4) + 0x20 + 0x2000;
guest_LDTR_access_rights = 0x82;
guest_TR_access_rights = 0x8B;
guest_LDTR_selector = __SELECTOR_LDTR;
guest_TR_selector = __SELECTOR_TASK;
// provisionally initialize our guest-task's LDTR
ldt = (unsigned long*)phys_to_virt( g_LDT_region );
ldt[ __SELECTOR_CODE >> 3 ] = 0x00CF9B000000FFFF;
ldt[ __SELECTOR_DATA >> 3 ] = 0x00CF93000000FFFF;
ldt[ __SELECTOR_VRAM >> 3 ] = 0x0000920B8000FFFF;
ldt[ __SELECTOR_FLAT >> 3 ] = 0x008F92000000FFFF;
// Adjust the CODE and DATA descriptors here
desc = LEGACY_REACH + ISR_KERN_OFFSET;
desc <<= 16;
desc &= 0x000000FFFFFF0000;
ldt[ __SELECTOR_CODE >> 3 ] |= desc;
ldt[ __SELECTOR_DATA >> 3 ] |= desc;
// initialize our guest-task's GDTR
gdt = (unsigned long*)phys_to_virt( g_GDT_region );
desc = 0x00008B0000000000;
desc |= (guest_TR_base << 32)&0xFF00000000000000;
desc |= (guest_TR_base << 16)&0x000000FFFFFF0000;
desc |= (guest_TR_limit & 0xFFFF);
gdt[ __SELECTOR_TASK >> 3 ] = desc;
desc = 0x0000820000000000;
desc |= ( guest_LDTR_base << 32)&0xFF00000000000000;
desc |= ( guest_LDTR_base << 16)&0x000000FFFFFF0000;
desc |= ( guest_LDTR_limit & 0xFFFF );
gdt[ __SELECTOR_LDTR >> 3 ] = desc;
// initialize our guest's IDT
idt = (unsigned long*)phys_to_virt( g_IDT_region );
desc = 0; // offset-address for GPF isr
desc &= 0x00000000FFFFFFFF;
desc |= (desc << 32);
desc &= 0xFFFF00000000FFFF;
desc |= ( __SELECTOR_CODE << 16);
desc |= 0x00008E0000000000;
idt[ 13 ] = desc;
// initialize our guest's Task-State Segment
tss = (unsigned int*)phys_to_virt( g_TSS_region );
tss[ 1 ] = TOS_KERN_OFFSET;
tss[ 2 ] = __SELECTOR_DATA;
tss[ 25 ] = 0x00880000;
tss[ guest_TR_limit >> 2 ] = 0xFF;
//----------------------------------------------------
// initialize the global variables for the host state
//----------------------------------------------------
asm(" mov %%cr0, %%rax \n mov %%rax, host_CR0 " ::: "ax" );
asm(" mov %%cr4, %%rax \n mov %%rax, host_CR4 " ::: "ax" );
asm(" mov %%cr3, %%rax \n mov %%rax, host_CR3 " ::: "ax" );
asm(" str host_TR_selector ");
asm(" mov %es, host_ES_selector ");
asm(" mov %cs, host_CS_selector ");
asm(" mov %ss, host_SS_selector ");
asm(" mov %ds, host_DS_selector ");
asm(" mov %fs, host_FS_selector ");
asm(" mov %gs, host_GS_selector ");
asm(" sgdt _gdtr \n sidt _idtr ");
host_GDTR_base = *(unsigned long*)( _gdtr+1 );
host_IDTR_base = *(unsigned long*)( _idtr+1 );
gdt = (unsigned long*)host_GDTR_base;
desc = gdt[ (host_TR_selector >> 3) + 0 ];
host_TR_base = ((desc >> 16)&0x00FFFFFF)|((desc >> 32)&0xFF000000);
desc = gdt[ (host_TR_selector >> 3) + 1 ];
desc <<= 48; // maneuver to ensure 'canonical' address
host_TR_base |= (desc >> 16)&0xFFFFFFFF00000000;
asm( " mov $0x174, %%ecx \n"\
" rdmsr \n"\
" mov %%eax, host_SYSENTER_CS \n"\
" inc %%ecx \n"\
" rdmsr \n"\
" mov %%eax, host_SYSENTER_ESP+0 \n"\
" mov %%edx, host_SYSENTER_ESP+4 \n"\
" inc %%ecx \n"\
" rdmsr \n"\
" mov %%eax, host_SYSENTER_EIP+0 \n"\
" mov %%edx, host_SYSENTER_EIP+4 \n"\
::: "ax", "cx", "dx" );
asm( " mov %0, %%ecx \n"\
" rdmsr \n"\
" mov %%eax, host_FS_base+0 \n"\
" mov %%edx, host_FS_base+4 \n"\
:: "i" (0xC0000100) : "ax", "cx", "dx" );
asm( " mov %0, %%ecx \n"\
" rdmsr \n"\
" mov %%eax, host_GS_base+0 \n"\
" mov %%edx, host_GS_base+4 \n"\
:: "i" (0xC0000101) : "ax", "cx", "dx" );
//------------------------------------------------------
// initialize the global variables for the VMX controls
//------------------------------------------------------
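// (the low dword of each VMX capability MSR reports the control bits that
// must be 1 -- the "allowed-0" settings -- so it serves as a safe baseline)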
control_VMX_pin_based = msr0x480[ 1 ];
control_VMX_cpu_based = msr0x480[ 2 ];
control_VM_exit_controls = msr0x480[ 3 ];
control_VM_entry_controls = msr0x480[ 4 ];
control_VMX_pin_based |= (1 << 0); // exit on interrupts
control_VMX_pin_based |= (1 << 3); // NMI-exiting
control_VMX_cpu_based |= (1 << 7) | (1 << 29); // Hlt + Monitor exit
control_pagefault_errorcode_match = 0xFFFFFFFF;
control_VM_exit_controls |= (1 << 9); // exit to 64-bit host
control_CR0_mask = 0x80000021;
control_CR4_mask = 0x00002000;
control_CR0_shadow = 0x80000021;
control_CR4_shadow = 0x00002000;
control_CR3_target_count = 2;
control_CR3_target0 = guest_CR3; // guest's directory
control_CR3_target1 = host_CR3; // host's directory
// initialize our counters for NMIs and external interrupts
nmiints = 0;
extints = 0;
// enable virtual machine extensions (bit 13 in CR4)
set_CR4_vmxe( NULL );
smp_call_function( set_CR4_vmxe, NULL, 1 );
//---------------------
// launch the guest VM
//---------------------
asm volatile (" .type my_vmm, #function \n"\
" pushfq \n"\
" push %rax \n"\
" push %rbx \n"\
" push %rcx \n"\
" push %rdx \n"\
" push %rbp \n"\
" push %rsi \n"\
" push %rdi \n"\
" push %r11 \n"\
" \n"\
" lea my_vmm, %rax \n"\
" \n"\
" mov %rax, host_RIP \n"\
" mov %rsp, host_RSP \n"\
" \n"\
" vmxon vmxon_region \n"\
" jc fail \n"\
" jz over \n"\
" \n"\
" movl $1, retval \n"\
" vmclear guest_region \n"\
" \n"\
" movl $2, retval \n"\
" vmptrld guest_region \n"\
" \n"\
" movl $3, retval \n"\
" \n"\
" xor %rdx, %rdx \n"\
" mov elements, %rcx \n"\
"nxwr: \n"\
" mov machine+0(%rdx), %rax \n"\
" mov machine+8(%rdx), %rbx \n"\
" vmwrite (%rbx), %rax \n"\
" add $16, %rdx \n"\
" loop nxwr \n"\
" \n"\
" movl $4, retval \n"\
" mov _eax, %eax \n"\
" mov _ebx, %ebx \n"\
" mov _ecx, %ecx \n"\
" mov _edx, %edx \n"\
" mov _ebp, %ebp \n"\
" mov _esi, %esi \n"\
" mov _edi, %edi \n"\
" vmlaunch \n"\
" movl $5, retval \n"\
" jmp read \n"\
"my_vmm: \n"\
" \n"\
" mov %eax, _eax \n"\
" mov %ebx, _ebx \n"\
" mov %ecx, _ecx \n"\
" mov %edx, _edx \n"\
" mov %ebp, _ebp \n"\
" mov %esi, _esi \n"\
" mov %edi, _edi \n"\
"read: \n"\
" xor %rdx, %rdx \n"\
" mov rocount, %rcx \n"\
"nxrd: \n"\
" mov results+0(%rdx), %rax \n"\
" mov results+8(%rdx), %rbx \n"\
" vmread %rax, (%rbx) \n"\
" add $16, %rdx \n"\
" loop nxrd \n"\
" \n"\
" cmpl $0, info_vmexit_reason \n"\
" je was_nmi \n"\
" \n"\
" cmpl $1, info_vmexit_reason \n"\
" je was_extint \n"\
" \n"\
" jmp over \n"\
" \n"\
"was_nmi: \n"\
" incl nmiints \n"\
/* " int $0x02 \n"\
*/ " jmp resume_guest \n"\
" \n"\
"was_extint: \n"\
" sti \n"\
" incl extints \n"\
" \n"\
"resume_guest: \n"\
" mov _eax, %eax \n"\
" mov _ebx, %ebx \n"\
" mov _ecx, %ecx \n"\
" mov _edx, %edx \n"\
" mov _ebp, %ebp \n"\
" mov _esi, %esi \n"\
" mov _edi, %edi \n"\
" vmresume \n"\
" \n"\
" movl $-1, retval \n"\
"over: \n"\
" vmxoff \n"\
"fail: \n"\
" pop %r11 \n"\
" pop %rdi \n"\
" pop %rsi \n"\
" pop %rbp \n"\
" pop %rdx \n"\
" pop %rcx \n"\
" pop %rbx \n"\
" pop %rax \n"\
" popfq \n"\
);
// show why the VMentry failed, or else why the VMexit occurred
printk( "\n VM-instruction error: %d ", info_vminstr_error );
printk( " Exit Reason: %d \n", info_vmexit_reason );
printk( " VMexit-interruption-information: %08X \n",
info_vmexit_interrupt_information );
printk( " VMexit-interruption-error-code: %08X \n",
info_vmexit_interrupt_error_code );
if (retval >= 0) {
retval = info_vmexit_reason;
}
// display the number of external interruption-exits
printk( "\n" );
printk( " number of external interrupts = %d \n", extints );
printk( " number of non-maskable interrupts = %d \n", nmiints );
// copy the client's virtual-machine register-values
vm.eflags = (unsigned int)guest_RFLAGS;
vm.eip = (unsigned int)guest_RIP;
vm.esp = (unsigned int)guest_RSP;
vm.eax = _eax;
vm.ebx = _ebx;
vm.ecx = _ecx;
vm.edx = _edx;
vm.ebp = _ebp;
vm.esi = _esi;
vm.edi = _edi;
vm.es = guest_ES_selector;
vm.cs = guest_CS_selector;
vm.ss = guest_SS_selector;
vm.ds = guest_DS_selector;
vm.fs = guest_FS_selector;
vm.gs = guest_GS_selector;
if ( copy_to_user( (void*)buf, &vm, count ) ) return -EFAULT;
return retval;
}
int my_release ( struct inode *inode, struct file *file )
{
pr_info("Calling %s\n", __func__);
/*
smp_call_function( clear_CR4_vmxe, NULL, 1 );
clear_CR4_vmxe( NULL );
*/
retval = 0;
return 0;
}
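The user-space side is not reproduced here; my actual test client is the delay.cpp that accompanies the same material. A minimal, hypothetical client (the code below is only my illustration of how the driver is meant to be driven, not the real delay.cpp) boils down to roughly this:
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "myvmx.h"  // for regs_ia32, same header as the module
int main(void)
{
    regs_ia32 vm;
    // the device node must exist first: mknod /dev/vmm c 88 0
    int fd = open("/dev/vmm", O_RDWR);
    if (fd < 0) return 1;
    // my_mmap() requires exactly LEGACY_REACH (0x110000) bytes mapped at
    // address 0, so vm.mmap_min_addr must permit low mappings
    if (mmap((void *)0, 0x110000, PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED) return 1;
    memset(&vm, 0, sizeof(vm));
    // ... copy real-mode code into the mapped arena, set vm.cs:vm.eip and vm.ss:vm.esp ...
    // my_ioctl() interprets the ioctl 'cmd' argument as sizeof(regs_ia32)
    return ioctl(fd, sizeof(regs_ia32), &vm);
}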
Testing the above code with delay.cpp, I found that the very first VMLAUNCH fails with VM-instruction error 8 (VM entry with invalid host-state fields), while all subsequent VMX operations work fine.
After debugging, I found it is related to where the VMXE bit (bit 13) of CR4 is set and cleared.
If I set the VMXE bit in init_module, the VM launches correctly every time, with no error 8.
Conversely, if I clear the VMXE bit in my_release, the VMX operations fail every time (so I commented that call out).
I must be missing something important about the sequence for entering VMX root operation.
My test environments are a VMware Workstation guest and a bare-metal Ubuntu Linux host.
I tested on both SMP and non-SMP hosts and got the same result.
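For completeness, here is the sequence for entering VMX root operation as I currently understand it from the Intel SDM. This is only my own sketch (for example, my module does not check IA32_FEATURE_CONTROL at all), so please point out anything that is wrong or misordered:
// 1. CPUID.1:ECX.VMX [bit 5] must be 1 (checked in init_module above).
// 2. IA32_FEATURE_CONTROL (MSR 0x3A) must have the lock bit (bit 0) set and
//    "VMX outside SMX operation" (bit 2) enabled, otherwise VMXON raises #GP.
// 3. CR4.VMXE (bit 13) must be set, and CR0/CR4 must agree with the
//    IA32_VMX_CR0_FIXED0/1 and IA32_VMX_CR4_FIXED0/1 MSRs.
// 4. The 4 KB-aligned VMXON region must begin with the VMCS revision
//    identifier from the low dword of IA32_VMX_BASIC.
// 5. Execute VMXON with the physical address of that region (per CPU, with
//    preemption disabled).
static int enter_vmx_root(void)
{
    unsigned char fail;
    u64 feature_control = vmx_rdmsr(0x3A);            // IA32_FEATURE_CONTROL
    if ((feature_control & 1) == 0 || (feature_control & 4) == 0)
        return -EIO;                                  // VMX locked off by firmware
    set_CR4_vmxe(NULL);                               // CR4.VMXE = 1 on this CPU
    memcpy(phys_to_virt(vmxon_region), msr0x480, 4);  // VMCS revision identifier
    asm volatile("vmxon %1; setna %0"                 // CF or ZF set => VMXON failed
                 : "=q"(fail)
                 : "m"(vmxon_region)
                 : "cc", "memory");
    return fail ? -EIO : 0;
}
Is anything missing from, or misordered in, this list?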