diff --git a/.vscode/settings.json b/.vscode/settings.json
index 4560338a..ede91604 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -144,7 +144,7 @@
     "rust-analyzer.checkOnSave.allTargets": false,
     "rust-analyzer.linkedProjects": [
         "./kernel/Cargo.toml",
-        "./tools/Cargo.toml",
+        //"./tools/Cargo.toml",
     ],
     // "rust-analyzer.cargo.target": "riscv64gc-unknown-none-elf",
@@ -154,4 +154,5 @@
         "check",
     ],
+    "makefile.configureOnOpen": false,
 }
\ No newline at end of file
diff --git a/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs b/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs
index 8ab7ae2c..65ef94fc 100644
--- a/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs
+++ b/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs
@@ -31,6 +31,7 @@ impl CFilesArch for X86_64CFilesArch {
         files.insert(PathBuf::from("src/arch/x86_64/asm/head.S"));
         files.insert(PathBuf::from("src/arch/x86_64/asm/entry.S"));
         files.insert(PathBuf::from("src/arch/x86_64/asm/apu_boot.S"));
+        files.insert(PathBuf::from("src/arch/x86_64/vm/vmx/vmenter.S"));
     }
 
     fn setup_global_flags(&self, c: &mut Build) {
diff --git a/kernel/crates/bitmap/src/alloc_bitmap.rs b/kernel/crates/bitmap/src/alloc_bitmap.rs
index 4e323866..ae0a78e3 100644
--- a/kernel/crates/bitmap/src/alloc_bitmap.rs
+++ b/kernel/crates/bitmap/src/alloc_bitmap.rs
@@ -4,7 +4,7 @@ use alloc::vec::Vec;
 
 use crate::{bitmap_core::BitMapCore, traits::BitMapOps};
 
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub struct AllocBitmap {
     elements: usize,
     data: Vec<usize>,
@@ -26,6 +26,10 @@ impl AllocBitmap {
             self.data[i] &= rhs.data[i];
         }
     }
+
+    pub fn data(&self) -> &[usize] {
+        &self.data
+    }
 }
 
 impl BitMapOps for AllocBitmap {
diff --git a/kernel/crates/bitmap/src/bitmap_core.rs b/kernel/crates/bitmap/src/bitmap_core.rs
index 20babb27..384e3277 100644
--- a/kernel/crates/bitmap/src/bitmap_core.rs
+++ b/kernel/crates/bitmap/src/bitmap_core.rs
@@ -3,7 +3,7 @@ use core::{intrinsics::unlikely, marker::PhantomData};
 use crate::traits::BitOps;
 
 #[derive(Debug, Clone)]
-pub(crate) struct BitMapCore<T: BitOps> {
+pub struct BitMapCore<T: BitOps> {
     phantom: PhantomData<T>,
 }
 
@@ -15,7 +15,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Get a single bit from the bitmap
-    pub(crate) fn get(&self, n: usize, data: &[T], index: usize) -> Option<bool> {
+    pub fn get(&self, n: usize, data: &[T], index: usize) -> Option<bool> {
         if unlikely(index >= n) {
             return None;
         }
@@ -30,7 +30,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Set a single bit in the bitmap
-    pub(crate) fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option<bool> {
+    pub fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option<bool> {
         if unlikely(index >= n) {
             return None;
         }
@@ -43,7 +43,7 @@ impl<T: BitOps> BitMapCore<T> {
         Some(bit)
     }
 
-    pub(crate) fn set_all(&self, n: usize, data: &mut [T], value: bool) {
+    pub fn set_all(&self, n: usize, data: &mut [T], value: bool) {
         let val = if value { T::max() } else { T::zero() };
         for element in data.iter_mut() {
             *element = val;
@@ -58,7 +58,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Get the first bit set to 1 in the bitmap
-    pub(crate) fn first_index(&self, data: &[T]) -> Option<usize> {
+    pub fn first_index(&self, data: &[T]) -> Option<usize> {
         for (i, element) in data.iter().enumerate() {
             let bit = <T as BitOps>::first_index(element);
             if let Some(b) = bit {
@@ -70,7 +70,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Get the first bit set to 0 in the bitmap
-    pub(crate) fn first_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
+    pub fn first_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
         for (i, element) in data.iter().enumerate() {
             if let Some(bit) = <T as BitOps>::first_false_index(element) {
                 return self.make_index(n, i * T::bit_size() + bit);
@@ -81,7 +81,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Get the last bit set to 1 in the bitmap
-    pub(crate) fn last_index(&self, n: usize, data: &[T]) -> Option<usize> {
+    pub fn last_index(&self, n: usize, data: &[T]) -> Option<usize> {
         for (i, element) in data.iter().enumerate().rev() {
             if let Some(bit) = <T as BitOps>::last_index(element) {
                 return self.make_index(n, i * T::bit_size() + bit);
@@ -97,7 +97,7 @@ impl<T: BitOps> BitMapCore<T> {
     ///
     /// - `data`: bitmap data
    /// - `n`: number of valid bits in the bitmap
-    pub(crate) fn last_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
+    pub fn last_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
         let mut iter = data.iter().rev();
         let mut last_element = *iter.next()?;
 
@@ -123,7 +123,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Get the next bit set to 1 in the bitmap
-    pub(crate) fn next_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn next_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         if unlikely(index >= n) {
             return None;
         }
@@ -146,7 +146,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Get the next bit set to 0 in the bitmap
-    pub(crate) fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         if unlikely(index >= n) {
             return None;
         }
@@ -169,7 +169,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// Get the previous bit set to 1 in the bitmap
-    pub(crate) fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         if unlikely(index >= n) {
             return None;
         }
@@ -190,7 +190,7 @@ impl<T: BitOps> BitMapCore<T> {
         None
     }
 
-    pub(crate) fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         let element_index = index / T::bit_size();
         let bit_index = index % T::bit_size();
 
@@ -208,7 +208,7 @@ impl<T: BitOps> BitMapCore<T> {
         None
     }
 
-    pub(crate) fn invert(&self, n: usize, data: &mut [T]) {
+    pub fn invert(&self, n: usize, data: &mut [T]) {
         for element in data.iter_mut() {
             <T as BitOps>::invert(element);
         }
@@ -222,7 +222,7 @@ impl<T: BitOps> BitMapCore<T> {
         }
     }
 
-    pub(crate) fn is_full(&self, n: usize, data: &[T]) -> bool {
+    pub fn is_full(&self, n: usize, data: &[T]) -> bool {
         let mut iter = data.iter().peekable();
         while let Some(element) = iter.next() {
             if iter.peek().is_none() {
@@ -245,7 +245,7 @@ impl<T: BitOps> BitMapCore<T> {
         return false;
     }
 
-    pub(crate) fn is_empty(&self, data: &[T]) -> bool {
+    pub fn is_empty(&self, data: &[T]) -> bool {
         for element in data.iter() {
             if element != &T::zero() {
                 return false;
diff --git a/kernel/crates/bitmap/src/lib.rs b/kernel/crates/bitmap/src/lib.rs
index 4d799131..27436bd8 100644
--- a/kernel/crates/bitmap/src/lib.rs
+++ b/kernel/crates/bitmap/src/lib.rs
@@ -13,4 +13,5 @@ mod bitmap_core;
 mod static_bitmap;
 pub mod traits;
 pub use alloc_bitmap::AllocBitmap;
+pub use bitmap_core::BitMapCore;
 pub use static_bitmap::StaticBitmap;
diff --git a/kernel/src/arch/x86_64/kvm/vmx/mmu.rs b/kernel/src/arch/x86_64/kvm/vmx/mmu.rs
index c05ef9bb..b1f89ff6 100644
--- a/kernel/src/arch/x86_64/kvm/vmx/mmu.rs
+++ b/kernel/src/arch/x86_64/kvm/vmx/mmu.rs
@@ -88,7 +88,7 @@ fn tdp_get_cr3(_vcpu: &VmxVcpu) -> u64 {
     return guest_cr3;
 }
 
-fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> {
+pub fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> {
     // Set the permission bits; currently hardcoded to readable, writable and executable
     // EPT paging-structure memory type: Uncacheable
     let mut eptp = 0x0_u64;
diff --git a/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs b/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs
index ae6f99ba..3d3216c1 100644
--- a/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs
+++ b/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs
@@ -501,7 +501,7 @@ pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u
 // }
 
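+/// Adjust a set of VMX controls against the capability MSR `msr`: per the
+/// Intel SDM's VMX capability-MSR layout, the MSR's low dword holds the
+/// allowed-0 settings (bits that must be 1) and its high dword holds the
+/// allowed-1 settings (bits that may be 1).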
 pub fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32, result: &mut u32) {
     let vmx_msr_low: u32 = unsafe { (msr::rdmsr(msr) & 0x0000_0000_FFFF_FFFF) as u32 };
-    let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) << 32) as u32 };
+    let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) >> 32) as u32 };
     let mut ctl: u32 = ctl_min | ctl_opt;
     ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
     ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
diff --git a/kernel/src/arch/x86_64/kvm/vmx/vmexit.rs b/kernel/src/arch/x86_64/kvm/vmx/vmexit.rs
index b95d51df..48d9e054 100644
--- a/kernel/src/arch/x86_64/kvm/vmx/vmexit.rs
+++ b/kernel/src/arch/x86_64/kvm/vmx/vmexit.rs
@@ -264,7 +264,7 @@ extern "C" fn vmexit_handler() {
 }
 
 #[no_mangle]
-fn adjust_rip(rip: u64) -> Result<(), SystemError> {
+pub fn adjust_rip(rip: u64) -> Result<(), SystemError> {
     let instruction_length = vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32)?;
     vmx_vmwrite(VmcsFields::GUEST_RIP as u32, rip + instruction_length)?;
     Ok(())
diff --git a/kernel/src/arch/x86_64/mm/mod.rs b/kernel/src/arch/x86_64/mm/mod.rs
index 85753f9c..6415731f 100644
--- a/kernel/src/arch/x86_64/mm/mod.rs
+++ b/kernel/src/arch/x86_64/mm/mod.rs
@@ -439,6 +439,15 @@ impl X86_64MMArch {
         // Unsupported because the page-level XD bit does not seem to be set correctly yet, which triggers page faults
         return true;
     }
+
+    pub unsafe fn read_array<T>(addr: VirtAddr, count: usize) -> Vec<T> {
+        // Read `count` elements of type `T` starting at `addr`
+        let mut vec = Vec::with_capacity(count);
+        for i in 0..count {
+            vec.push(Self::read(addr + i * core::mem::size_of::<T>()));
+        }
+        vec
+    }
 }
 
 impl VirtAddr {
diff --git a/kernel/src/arch/x86_64/mod.rs b/kernel/src/arch/x86_64/mod.rs
index 2bd97fe0..d825e7cc 100644
--- a/kernel/src/arch/x86_64/mod.rs
+++ b/kernel/src/arch/x86_64/mod.rs
@@ -20,6 +20,7 @@ pub mod sched;
 pub mod smp;
 pub mod syscall;
 pub mod time;
+pub mod vm;
 
 pub use self::pci::pci::X86_64PciArch as PciArch;
 
@@ -40,3 +41,12 @@ pub use crate::arch::elf::X86_64ElfArch as CurrentElfArch;
 pub use crate::arch::smp::X86_64SMPArch as CurrentSMPArch;
 
 pub use crate::arch::sched::X86_64SchedArch as CurrentSchedArch;
+
+pub use crate::arch::vm::KvmArchManager as CurrentKvmManager;
+
+pub use crate::arch::vm::kvm_host::X86KvmArch as KvmArch;
+
+pub use crate::arch::vm::x86_kvm_ops as kvm_arch_ops;
+
+pub use crate::arch::vm::kvm_host::vcpu::X86VcpuArch as VirtCpuArch;
+pub use crate::arch::vm::kvm_host::KvmVcpuStat as VirtCpuStat;
diff --git a/kernel/src/arch/x86_64/vm/asm.rs b/kernel/src/arch/x86_64/vm/asm.rs
new file mode 100644
index 00000000..d3466470
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/asm.rs
@@ -0,0 +1,592 @@
+use core::arch::asm;
+
+use alloc::slice;
+use log::{debug, error};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::{
+    bits64::vmx::vmxon,
+    controlregs::{cr0, cr0_write, cr4, cr4_write, Cr0, Cr4},
+    msr::{
+        rdmsr, wrmsr, IA32_FEATURE_CONTROL, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1,
+        IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1,
+    },
+    vmx::vmcs::ro,
+};
+
+use crate::{
+    arch::{mm::barrier, MMArch},
+    mm::{MemoryManagementArch, PhysAddr},
+};
+
+use super::vmx::vmx_info;
+
+pub struct KvmX86Asm;
+
+impl KvmX86Asm {
+    pub fn read_pkru() -> u32 {
+        let cpuid = CpuId::new();
+        if let Some(feat) = cpuid.get_extended_feature_info() {
+            if feat.has_ospke() {
+                return Self::rdpkru();
+            }
+        }
+        return 0;
+    }
+
+    pub fn write_pkru(_val: u32) {
+        let cpuid = CpuId::new();
+        if let Some(feat) = cpuid.get_extended_feature_info() {
+            if feat.has_ospke() {
+                todo!();
+            }
+        }
+    }
+
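+    /// RDPKRU: with ECX = 0, it returns the protection-key rights register
+    /// in EAX and clears EDX.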
+    fn rdpkru() -> u32 {
+        let ecx: u32 = 0;
+        let pkru: u32;
+        let _edx: u32;
+
+        unsafe {
+            asm!(
+                "rdpkru",
+                out("eax") pkru,
+                out("edx") _edx,
+                in("ecx") ecx,
+            );
+        }
+
+        pkru
+    }
+
+    pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u16) -> u64 {
+        let table = segment_selector & 0x0004; // get the table indicator (TI) bit of the selector
+        let index = (segment_selector >> 3) as usize; // get the index field of the selector
+        if table == 0 && index == 0 {
+            return 0;
+        }
+        let descriptor_table = unsafe { slice::from_raw_parts(gdt_base, gdt_size.into()) };
+        let descriptor = descriptor_table[index];
+
+        let base_high = (descriptor & 0xFF00_0000_0000_0000) >> 32;
+        let base_mid = (descriptor & 0x0000_00FF_0000_0000) >> 16;
+        let base_low = (descriptor & 0x0000_0000_FFFF_0000) >> 16;
+        let segment_base = (base_high | base_mid | base_low) & 0xFFFFFFFF;
+        let virtaddr = unsafe {
+            MMArch::phys_2_virt(PhysAddr::new(segment_base as usize))
+                .unwrap()
+                .data() as u64
+        };
+        return virtaddr;
+    }
+}
+
+pub struct VmxAsm;
+
+impl VmxAsm {
+    pub fn vmclear(phys_addr: PhysAddr) {
+        debug!("vmclear addr {phys_addr:?}");
+        match unsafe { x86::bits64::vmx::vmclear(phys_addr.data() as u64) } {
+            Ok(_) => {}
+            Err(e) => {
+                panic!("[VMX] vmclear failed! reason: {e:?}");
+            }
+        }
+    }
+
+    pub fn vmcs_load(phys_addr: PhysAddr) {
+        match unsafe { x86::bits64::vmx::vmptrld(phys_addr.data() as u64) } {
+            Ok(_) => {}
+            Err(e) => {
+                panic!("[VMX] vmptrld failed! reason: {e:?}");
+            }
+        }
+    }
+
+    /// vmwrite to the current VMCS.
+    pub fn vmx_vmwrite(vmcs_field: u32, value: u64) {
+        unsafe {
+            x86::bits64::vmx::vmwrite(vmcs_field, value)
+                .unwrap_or_else(|_| panic!("vmcs_field: {:x} vmx_write fail", vmcs_field))
+        }
+    }
+
+    /// vmread the current VMCS.
+    pub fn vmx_vmread(vmcs_field: u32) -> u64 {
+        unsafe { x86::bits64::vmx::vmread(vmcs_field).expect("vmx_read fail: ") }
+    }
+
+    pub fn kvm_cpu_vmxon(phys_addr: PhysAddr) -> Result<(), SystemError> {
+        unsafe {
+            let mut cr4 = cr4();
+            cr4.insert(Cr4::CR4_ENABLE_VMX);
+            cr4_write(cr4);
+
+            Self::vmx_set_lock_bit()?;
+            Self::vmx_set_cr0_bits();
+            Self::vmx_set_cr4_bits();
+            debug!("vmxon addr {phys_addr:?}");
+
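+            // At this point CR4.VMXE is set, the IA32_FEATURE_CONTROL lock
+            // has been validated and CR0/CR4 match their FIXED0/FIXED1 MSRs,
+            // so VMXON can be executed.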
reason"); + + barrier::mfence(); + + Ok(()) + } + } + + #[allow(dead_code)] + const VMX_VPID_EXTENT_INDIVIDUAL_ADDR: u64 = 0; + const VMX_VPID_EXTENT_SINGLE_CONTEXT: u64 = 1; + #[allow(dead_code)] + const VMX_VPID_EXTENT_ALL_CONTEXT: u64 = 2; + #[allow(dead_code)] + const VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: u64 = 3; + #[allow(dead_code)] + const VMX_EPT_EXTENT_CONTEXT: u64 = 1; + const VMX_EPT_EXTENT_GLOBAL: u64 = 2; + #[allow(dead_code)] + const VMX_EPT_EXTENT_SHIFT: u64 = 24; + + pub fn ept_sync_global() { + Self::invept(Self::VMX_EPT_EXTENT_GLOBAL, 0, 0); + } + #[allow(dead_code)] + pub fn ept_sync_context(eptp: u64) { + if vmx_info().has_vmx_invept_context() { + Self::invept(Self::VMX_EPT_EXTENT_CONTEXT, eptp, 0); + } else { + Self::ept_sync_global(); + } + } + + pub fn sync_vcpu_single(vpid: u16) { + if vpid == 0 { + return; + } + + Self::invvpid(Self::VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0) + } + + pub fn sync_vcpu_global() { + Self::invvpid(Self::VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); + } + + #[inline(always)] + fn invept(ext: u64, eptp: u64, gpa: u64) { + #[repr(C)] + struct InveptDescriptor { + eptp: u64, + gpa: u64, + } + + let descriptor = InveptDescriptor { eptp, gpa }; + + unsafe { + asm!( + "invept {0}, [{1}]", + in(reg) ext, + in(reg) &descriptor, + options(nostack) + ); + } + } + + #[inline(always)] + fn invvpid(ext: u64, vpid: u16, gva: u64) { + #[repr(C)] + struct InvvpidDescriptor { + vpid: u16, + rsvd: u64, + gva: u64, + } + + let descriptor = InvvpidDescriptor { vpid, rsvd: 0, gva }; + + unsafe { + asm!( + "invvpid {0}, [{1}]", + in(reg) ext, + in(reg) &descriptor, + options(nostack) + ); + } + } + + /// Set the mandatory bits in CR4 and clear bits that are mandatory zero + /// (Intel Manual: 24.8 Restrictions on VMX Operation) + fn vmx_set_cr4_bits() { + let ia32_vmx_cr4_fixed0 = unsafe { rdmsr(IA32_VMX_CR4_FIXED0) }; + let ia32_vmx_cr4_fixed1 = unsafe { rdmsr(IA32_VMX_CR4_FIXED1) }; + + let mut cr4 = unsafe { cr4() }; + + cr4 |= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed0 as usize); + cr4 &= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed1 as usize); + + unsafe { cr4_write(cr4) }; + } + + /// Check if we need to set bits in IA32_FEATURE_CONTROL + // (Intel Manual: 24.7 Enabling and Entering VMX Operation) + fn vmx_set_lock_bit() -> Result<(), SystemError> { + const VMX_LOCK_BIT: u64 = 1 << 0; + const VMXON_OUTSIDE_SMX: u64 = 1 << 2; + + let ia32_feature_control = unsafe { rdmsr(IA32_FEATURE_CONTROL) }; + + if (ia32_feature_control & VMX_LOCK_BIT) == 0 { + unsafe { + wrmsr( + IA32_FEATURE_CONTROL, + VMXON_OUTSIDE_SMX | VMX_LOCK_BIT | ia32_feature_control, + ) + }; + } else if (ia32_feature_control & VMXON_OUTSIDE_SMX) == 0 { + return Err(SystemError::EPERM); + } + + Ok(()) + } + + /// Set the mandatory bits in CR0 and clear bits that are mandatory zero + /// (Intel Manual: 24.8 Restrictions on VMX Operation) + fn vmx_set_cr0_bits() { + let ia32_vmx_cr0_fixed0 = unsafe { rdmsr(IA32_VMX_CR0_FIXED0) }; + let ia32_vmx_cr0_fixed1 = unsafe { rdmsr(IA32_VMX_CR0_FIXED1) }; + + let mut cr0 = unsafe { cr0() }; + + cr0 |= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed0 as usize); + cr0 &= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed1 as usize); + + unsafe { cr0_write(cr0) }; + } +} + +#[no_mangle] +unsafe extern "C" fn vmx_vmlaunch() { + if let Err(e) = x86::bits64::vmx::vmlaunch() { + error!( + "vmx_launch fail: {:?}, err code {}", + e, + VmxAsm::vmx_vmread(ro::VM_INSTRUCTION_ERROR) + ); + } +} + +bitflags! 
+bitflags! {
+    pub struct IntrInfo: u32 {
+        const INTR_INFO_VECTOR_MASK = 0xff;
+        const INTR_INFO_INTR_TYPE_MASK = 0x700;
+        const INTR_INFO_DELIVER_CODE_MASK = 0x800;
+        const INTR_INFO_UNBLOCK_NMI = 0x1000;
+        const INTR_INFO_VALID_MASK = 0x80000000;
+        const INTR_INFO_RESVD_BITS_MASK = 0x7ffff000;
+    }
+
+    pub struct IntrType: u32 {
+        /// external interrupt
+        const INTR_TYPE_EXT_INTR = (0 << 8);
+        /// reserved
+        const INTR_TYPE_RESERVED = (1 << 8);
+        /// NMI
+        const INTR_TYPE_NMI_INTR = (2 << 8);
+        /// processor exception
+        const INTR_TYPE_HARD_EXCEPTION = (3 << 8);
+        /// software interrupt
+        const INTR_TYPE_SOFT_INTR = (4 << 8);
+        /// ICE breakpoint - undocumented
+        const INTR_TYPE_PRIV_SW_EXCEPTION = (5 << 8);
+        /// software exception
+        const INTR_TYPE_SOFT_EXCEPTION = (6 << 8);
+        /// other event
+        const INTR_TYPE_OTHER_EVENT = (7 << 8);
+    }
+
+    pub struct MiscEnable: u64 {
+        const MSR_IA32_MISC_ENABLE_FAST_STRING = 1 << 0;
+        const MSR_IA32_MISC_ENABLE_TCC = 1 << 1;
+        const MSR_IA32_MISC_ENABLE_EMON = 1 << 7;
+        const MSR_IA32_MISC_ENABLE_BTS_UNAVAIL = 1 << 11;
+        const MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL = 1 << 12;
+        const MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP = 1 << 16;
+        const MSR_IA32_MISC_ENABLE_MWAIT = 1 << 18;
+        const MSR_IA32_MISC_ENABLE_LIMIT_CPUID = 1 << 22;
+        const MSR_IA32_MISC_ENABLE_XTPR_DISABLE = 1 << 23;
+        const MSR_IA32_MISC_ENABLE_XD_DISABLE = 1 << 34;
+    }
+
+    pub struct ArchCapabilities: u64 {
+        /// Not susceptible to Meltdown
+        const ARCH_CAP_RDCL_NO = 1 << 0;
+        /// Enhanced IBRS support
+        const ARCH_CAP_IBRS_ALL = 1 << 1;
+        /// RET may use alternative branch predictors
+        const ARCH_CAP_RSBA = 1 << 2;
+        /// Skip L1D flush on vmentry
+        const ARCH_CAP_SKIP_VMENTRY_L1DFLUSH = 1 << 3;
+        /// Not susceptible to Speculative Store Bypass attack, so no
+        /// Speculative Store Bypass control required.
+        const ARCH_CAP_SSB_NO = 1 << 4;
+        /// Not susceptible to Microarchitectural Data Sampling (MDS)
+        /// vulnerabilities.
+        const ARCH_CAP_MDS_NO = 1 << 5;
+        /// The processor is not susceptible to a machine check error due to
+        /// modifying the code page size along with either the physical
+        /// address or cache type without TLB invalidation.
+        const ARCH_CAP_PSCHANGE_MC_NO = 1 << 6;
+        /// MSR for TSX control is available.
+        const ARCH_CAP_TSX_CTRL_MSR = 1 << 7;
+        /// Not susceptible to TSX Async Abort (TAA) vulnerabilities.
+        const ARCH_CAP_TAA_NO = 1 << 8;
+        /// Not susceptible to SBDR and SSDP variants of Processor MMIO
+        /// stale data vulnerabilities.
+        const ARCH_CAP_SBDR_SSDP_NO = 1 << 13;
+        /// Not susceptible to FBSDP variant of Processor MMIO stale data
+        /// vulnerabilities.
+        const ARCH_CAP_FBSDP_NO = 1 << 14;
+        /// Not susceptible to PSDP variant of Processor MMIO stale data
+        /// vulnerabilities.
+        const ARCH_CAP_PSDP_NO = 1 << 15;
+        /// VERW clears CPU fill buffer even on MDS_NO CPUs.
+        const ARCH_CAP_FB_CLEAR = 1 << 17;
+        /// MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS] bit available to control
+        /// VERW behavior.
+        const ARCH_CAP_FB_CLEAR_CTRL = 1 << 18;
+        /// Indicates RET may use predictors other than the RSB. With eIBRS
+        /// enabled, predictions in kernel mode are restricted to targets in
+        /// the kernel.
+        const ARCH_CAP_RRSBA = 1 << 19;
+        /// Not susceptible to Post-Barrier Return Stack Buffer Predictions.
+        const ARCH_CAP_PBRSB_NO = 1 << 24;
+        /// CPU is vulnerable to Gather Data Sampling (GDS) and has controls
+        /// for mitigation.
+        const ARCH_CAP_GDS_CTRL = 1 << 25;
+        /// CPU is not vulnerable to Gather Data Sampling (GDS).
+        const ARCH_CAP_GDS_NO = 1 << 26;
+        /// IA32_XAPIC_DISABLE_STATUS MSR supported
+        const ARCH_CAP_XAPIC_DISABLE = 1 << 21;
+
+        const KVM_SUPPORTED_ARCH_CAP = ArchCapabilities::ARCH_CAP_RDCL_NO.bits
+            | ArchCapabilities::ARCH_CAP_IBRS_ALL.bits
+            | ArchCapabilities::ARCH_CAP_RSBA.bits
+            | ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH.bits
+            | ArchCapabilities::ARCH_CAP_SSB_NO.bits
+            | ArchCapabilities::ARCH_CAP_MDS_NO.bits
+            | ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO.bits
+            | ArchCapabilities::ARCH_CAP_TSX_CTRL_MSR.bits
+            | ArchCapabilities::ARCH_CAP_TAA_NO.bits
+            | ArchCapabilities::ARCH_CAP_SBDR_SSDP_NO.bits
+            | ArchCapabilities::ARCH_CAP_FBSDP_NO.bits
+            | ArchCapabilities::ARCH_CAP_PSDP_NO.bits
+            | ArchCapabilities::ARCH_CAP_FB_CLEAR.bits
+            | ArchCapabilities::ARCH_CAP_RRSBA.bits
+            | ArchCapabilities::ARCH_CAP_PBRSB_NO.bits
+            | ArchCapabilities::ARCH_CAP_GDS_NO.bits;
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct MsrData {
+    pub host_initiated: bool,
+    pub index: u32,
+    pub data: u64,
+}
+
+#[repr(C, align(16))]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct VmxMsrEntry {
+    pub index: u32,
+    pub reserved: u32,
+    pub data: u64,
+}
+
+#[allow(dead_code)]
+pub mod hyperv {
+    /* Hyper-V specific model specific registers (MSRs) */
+
+    /* MSR used to identify the guest OS. */
+    pub const HV_X64_MSR_GUEST_OS_ID: u32 = 0x40000000;
+
+    /* MSR used to setup pages used to communicate with the hypervisor. */
+    pub const HV_X64_MSR_HYPERCALL: u32 = 0x40000001;
+
+    /* MSR used to provide vcpu index */
+    pub const HV_REGISTER_VP_INDEX: u32 = 0x40000002;
+
+    /* MSR used to reset the guest OS. */
+    pub const HV_X64_MSR_RESET: u32 = 0x40000003;
+
+    /* MSR used to provide vcpu runtime in 100ns units */
+    pub const HV_X64_MSR_VP_RUNTIME: u32 = 0x40000010;
+
+    /* MSR used to read the per-partition time reference counter */
+    pub const HV_REGISTER_TIME_REF_COUNT: u32 = 0x40000020;
+
+    /* A partition's reference time stamp counter (TSC) page */
+    pub const HV_REGISTER_REFERENCE_TSC: u32 = 0x40000021;
+
+    /* MSR used to retrieve the TSC frequency */
+    pub const HV_X64_MSR_TSC_FREQUENCY: u32 = 0x40000022;
+
+    /* MSR used to retrieve the local APIC timer frequency */
+    pub const HV_X64_MSR_APIC_FREQUENCY: u32 = 0x40000023;
+
+    /* Define the virtual APIC registers */
+    pub const HV_X64_MSR_EOI: u32 = 0x40000070;
+    pub const HV_X64_MSR_ICR: u32 = 0x40000071;
+    pub const HV_X64_MSR_TPR: u32 = 0x40000072;
+    pub const HV_X64_MSR_VP_ASSIST_PAGE: u32 = 0x40000073;
+
+    /* Define synthetic interrupt controller model specific registers. */
+    pub const HV_REGISTER_SCONTROL: u32 = 0x40000080;
+    pub const HV_REGISTER_SVERSION: u32 = 0x40000081;
+    pub const HV_REGISTER_SIEFP: u32 = 0x40000082;
+    pub const HV_REGISTER_SIMP: u32 = 0x40000083;
+    pub const HV_REGISTER_EOM: u32 = 0x40000084;
+    pub const HV_REGISTER_SINT0: u32 = 0x40000090;
+    pub const HV_REGISTER_SINT1: u32 = 0x40000091;
+    pub const HV_REGISTER_SINT2: u32 = 0x40000092;
+    pub const HV_REGISTER_SINT3: u32 = 0x40000093;
+    pub const HV_REGISTER_SINT4: u32 = 0x40000094;
+    pub const HV_REGISTER_SINT5: u32 = 0x40000095;
+    pub const HV_REGISTER_SINT6: u32 = 0x40000096;
+    pub const HV_REGISTER_SINT7: u32 = 0x40000097;
+    pub const HV_REGISTER_SINT8: u32 = 0x40000098;
+    pub const HV_REGISTER_SINT9: u32 = 0x40000099;
+    pub const HV_REGISTER_SINT10: u32 = 0x4000009A;
+    pub const HV_REGISTER_SINT11: u32 = 0x4000009B;
+    pub const HV_REGISTER_SINT12: u32 = 0x4000009C;
+    pub const HV_REGISTER_SINT13: u32 = 0x4000009D;
+    pub const HV_REGISTER_SINT14: u32 = 0x4000009E;
+    pub const HV_REGISTER_SINT15: u32 = 0x4000009F;
+
+    /*
+     * Define synthetic interrupt controller model specific registers for
+     * nested hypervisor.
+     */
+    pub const HV_REGISTER_NESTED_SCONTROL: u32 = 0x40001080;
+    pub const HV_REGISTER_NESTED_SVERSION: u32 = 0x40001081;
+    pub const HV_REGISTER_NESTED_SIEFP: u32 = 0x40001082;
+    pub const HV_REGISTER_NESTED_SIMP: u32 = 0x40001083;
+    pub const HV_REGISTER_NESTED_EOM: u32 = 0x40001084;
+    pub const HV_REGISTER_NESTED_SINT0: u32 = 0x40001090;
+
+    /*
+     * Synthetic Timer MSRs. Four timers per vcpu.
+     */
+    pub const HV_REGISTER_STIMER0_CONFIG: u32 = 0x400000B0;
+    pub const HV_REGISTER_STIMER0_COUNT: u32 = 0x400000B1;
+    pub const HV_REGISTER_STIMER1_CONFIG: u32 = 0x400000B2;
+    pub const HV_REGISTER_STIMER1_COUNT: u32 = 0x400000B3;
+    pub const HV_REGISTER_STIMER2_CONFIG: u32 = 0x400000B4;
+    pub const HV_REGISTER_STIMER2_COUNT: u32 = 0x400000B5;
+    pub const HV_REGISTER_STIMER3_CONFIG: u32 = 0x400000B6;
+    pub const HV_REGISTER_STIMER3_COUNT: u32 = 0x400000B7;
+
+    /* Hyper-V guest idle MSR */
+    pub const HV_X64_MSR_GUEST_IDLE: u32 = 0x400000F0;
+
+    /* Hyper-V guest crash notification MSRs */
+    pub const HV_REGISTER_CRASH_P0: u32 = 0x40000100;
+    pub const HV_REGISTER_CRASH_P1: u32 = 0x40000101;
+    pub const HV_REGISTER_CRASH_P2: u32 = 0x40000102;
+    pub const HV_REGISTER_CRASH_P3: u32 = 0x40000103;
+    pub const HV_REGISTER_CRASH_P4: u32 = 0x40000104;
+    pub const HV_REGISTER_CRASH_CTL: u32 = 0x40000105;
+
+    /* TSC emulation after migration */
+    pub const HV_X64_MSR_REENLIGHTENMENT_CONTROL: u32 = 0x40000106;
+    pub const HV_X64_MSR_TSC_EMULATION_CONTROL: u32 = 0x40000107;
+    pub const HV_X64_MSR_TSC_EMULATION_STATUS: u32 = 0x40000108;
+
+    /* TSC invariant control */
+    pub const HV_X64_MSR_TSC_INVARIANT_CONTROL: u32 = 0x40000118;
+
+    /*
+     * The defines related to the synthetic debugger are required by KDNet, but
+     * they are not documented in the Hyper-V TLFS because the synthetic debugger
+     * functionality has been deprecated and is subject to removal in future
+     * versions of Windows.
+     */
+    pub const HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: u32 = 0x40000080;
+    pub const HYPERV_CPUID_SYNDBG_INTERFACE: u32 = 0x40000081;
+    pub const HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: u32 = 0x40000082;
+
+    /*
+     * Hyper-V synthetic debugger platform capabilities
+     * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits.
+     */
+ */ + pub const HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING: u32 = 1 << 1; + + /* Hyper-V Synthetic debug options MSR */ + pub const HV_X64_MSR_SYNDBG_CONTROL: u32 = 0x400000F1; + pub const HV_X64_MSR_SYNDBG_STATUS: u32 = 0x400000F2; + pub const HV_X64_MSR_SYNDBG_SEND_BUFFER: u32 = 0x400000F3; + pub const HV_X64_MSR_SYNDBG_RECV_BUFFER: u32 = 0x400000F4; + pub const HV_X64_MSR_SYNDBG_PENDING_BUFFER: u32 = 0x400000F5; + pub const HV_X64_MSR_SYNDBG_OPTIONS: u32 = 0x400000FF; +} + +#[allow(dead_code)] +pub mod kvm_msr { + pub const MSR_KVM_WALL_CLOCK: u32 = 0x11; + pub const MSR_KVM_SYSTEM_TIME: u32 = 0x12; + + /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ + pub const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b564d00; + pub const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01; + pub const MSR_KVM_ASYNC_PF_EN: u32 = 0x4b564d02; + pub const MSR_KVM_STEAL_TIME: u32 = 0x4b564d03; + pub const MSR_KVM_PV_EOI_EN: u32 = 0x4b564d04; + pub const MSR_KVM_POLL_CONTROL: u32 = 0x4b564d05; + pub const MSR_KVM_ASYNC_PF_INT: u32 = 0x4b564d06; + pub const MSR_KVM_ASYNC_PF_ACK: u32 = 0x4b564d07; + pub const MSR_KVM_MIGRATION_CONTROL: u32 = 0x4b564d08; + + pub const PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00000016; + pub const CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x0401e172; + pub const VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00036dff; + pub const VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x000011ff; +} + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum VcpuSegment { + ES, + CS, + SS, + DS, + FS, + GS, + TR, + LDTR, +} + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum SegmentCacheField { + SEL = 0, + BASE = 1, + LIMIT = 2, + AR = 3, + NR = 4, +} diff --git a/kernel/src/arch/x86_64/vm/cpuid.rs b/kernel/src/arch/x86_64/vm/cpuid.rs new file mode 100644 index 00000000..4fd447a6 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/cpuid.rs @@ -0,0 +1,59 @@ +use alloc::vec::Vec; + +#[derive(Debug, Default, Clone, Copy)] +#[allow(dead_code)] +pub struct KvmCpuidEntry2 { + pub function: u32, + pub index: u32, + pub flags: KvmCpuidFlag, + pub eax: u32, + pub ebx: u32, + pub ecx: u32, + pub edx: u32, + padding: [u32; 3], +} + +impl KvmCpuidEntry2 { + pub fn find( + entries: &Vec, + function: u32, + index: Option, + ) -> Option { + for e in entries { + if e.function != function { + continue; + } + + if !e + .flags + .contains(KvmCpuidFlag::KVM_CPUID_FLAG_SIGNIFCANT_INDEX) + || Some(e.index) == index + { + return Some(*e); + } + + if index.is_none() { + return Some(*e); + } + } + + None + } +} + +bitflags! 
+bitflags! {
+    pub struct KvmCpuidFlag: u32 {
+        /// The input index (subleaf) of this CPUID function is significant:
+        /// it affects the function's behavior or return values
+        const KVM_CPUID_FLAG_SIGNIFCANT_INDEX = 1 << 0;
+        /// The CPUID function is stateful: its behavior may depend on
+        /// earlier CPUID invocations
+        const KVM_CPUID_FLAG_STATEFUL_FUNC = 1 << 1;
+        /// The function's state should be read on the next CPUID invocation
+        const KVM_CPUID_FLAG_STATE_READ_NEXT = 1 << 2;
+    }
+}
+
+impl Default for KvmCpuidFlag {
+    fn default() -> Self {
+        Self::empty()
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/exit.rs b/kernel/src/arch/x86_64/vm/exit.rs
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/exit.rs
@@ -0,0 +1 @@
+
diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs
new file mode 100644
index 00000000..90a4bbda
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs
@@ -0,0 +1,62 @@
+use alloc::boxed::Box;
+
+use crate::{
+    arch::kvm_arch_ops,
+    virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
+};
+
+const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000;
+#[allow(dead_code)]
+const MSR_IA32_APICBASE: u64 = 0x0000001b;
+const MSR_IA32_APICBASE_BSP: u64 = 1 << 8;
+const MSR_IA32_APICBASE_ENABLE: u64 = 1 << 11;
+#[allow(dead_code)]
+const MSR_IA32_APICBASE_BASE: u64 = 0xfffff << 12;
+
+#[derive(Debug)]
+pub struct KvmLapic {
+    pub apicv_active: bool,
+    pub regs: Box<[u8]>,
+}
+
+impl VirtCpu {
+    pub fn lapic_reset(&mut self, vm: &Vm, init_event: bool) {
+        kvm_arch_ops().apicv_pre_state_restore(self);
+
+        if !init_event {
+            let mut msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
+            if vm.arch.bsp_vcpu_id == self.vcpu_id {
+                msr_val |= MSR_IA32_APICBASE_BSP;
+            }
+
+            self.lapic_set_base(msr_val);
+        }
+
+        if self.arch.apic.is_none() {
+            return;
+        }
+
+        todo!()
+    }
+
+    fn lapic_set_base(&mut self, value: u64) {
+        let old_val = self.arch.apic_base;
+        // Record whether a local APIC exists up front, so the checks below
+        // do not hold an immutable borrow across the mutation of apic_base.
+        let has_apic = self.arch.apic.is_some();
+
+        self.arch.apic_base = value;
+
+        if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 {
+            // TODO: kvm_update_cpuid_runtime(vcpu);
+        }
+
+        if !has_apic {
+            return;
+        }
+
+        if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 {
+            // if value & MSR_IA32_APICBASE_ENABLE != 0 {}
+        }
+
+        todo!()
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs
new file mode 100644
index 00000000..40ca696b
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs
@@ -0,0 +1,463 @@
+use core::{fmt::Debug, sync::atomic::AtomicU32};
+
+use alloc::{boxed::Box, vec::Vec};
+use bit_field::BitField;
+use bitmap::{traits::BitMapOps, AllocBitmap};
+use system_error::SystemError;
+use x86::{
+    bits64::rflags::RFlags,
+    controlregs::{Cr0, Cr4},
+    dtables::DescriptorTablePointer,
+};
+use x86_64::registers::control::EferFlags;
+
+use crate::{
+    smp::cpu::ProcessorId,
+    virt::vm::{
+        kvm_host::{
+            vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID,
+        },
+        user_api::UapiKvmSegment,
+    },
+};
+
+use crate::arch::VirtCpuArch;
+
+use super::{
+    asm::{MsrData, VcpuSegment, VmxMsrEntry},
+    vmx::{exit::ExitFastpathCompletion, vmx_info},
+    x86_kvm_manager, x86_kvm_ops,
+};
+
+pub mod lapic;
+pub mod page;
+pub mod vcpu;
+#[allow(dead_code)]
+pub const TSS_IOPB_BASE_OFFSET: usize = 0x66;
+pub const TSS_BASE_SIZE: usize = 0x68;
+pub const TSS_IOPB_SIZE: usize = 65536 / 8;
+pub const TSS_REDIRECTION_SIZE: usize = 256 / 8;
+pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1;
+
+pub const KVM_PFN_NOSLOT: u64 = 0x1 << 63;
+
+#[allow(dead_code)]
+#[derive(Debug, Default)]
+pub struct X86KvmArch {
+    /// Interrupt chip mode
+    pub irqchip_mode: KvmIrqChipMode,
+    /// vcpu_id of the vCPU responsible for bootstrapping the VM
+    bsp_vcpu_id: usize,
+    pub pause_in_guest: bool,
+    pub cstate_in_guest: bool,
+    pub mwait_in_guest: bool,
+    pub hlt_in_guest: bool,
+    pub bus_lock_detection_enabled: bool,
+    irq_sources_bitmap: u64,
+    default_tsc_khz: u64,
+    guest_can_read_msr_platform_info: bool,
+    apicv_inhibit_reasons: usize,
+
+    pub max_vcpu_ids: usize,
+
+    pub notify_vmexit_flags: NotifyVmExitFlags,
+    pub notify_window: u32,
+
+    msr_fliter: Option<Box<KvmX86MsrFilter>>,
+
+    pub noncoherent_dma_count: AtomicU32,
+
+    pub active_mmu_pages: Vec,
+
+    pub n_max_mmu_pages: usize,
+    pub n_used_mmu_pages: usize,
+}
+
+impl X86KvmArch {
+    pub fn init(kvm_type: usize) -> Result<Self, SystemError> {
+        if kvm_type != 0 {
+            return Err(SystemError::EINVAL);
+        }
+        let mut arch = x86_kvm_ops().vm_init();
+
+        // Set up the interrupt source bitmap
+        arch.irq_sources_bitmap
+            .set_bit(KVM_USERSAPCE_IRQ_SOURCE_ID, true)
+            .set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, true);
+
+        arch.default_tsc_khz = x86_kvm_manager().max_tsc_khz;
+        arch.guest_can_read_msr_platform_info = true;
+
+        arch.apicv_init();
+        Ok(arch)
+    }
+
+    fn apicv_init(&mut self) {
+        self.apicv_inhibit_reasons
+            .set_bit(KvmApicvInhibit::ABSENT, true);
+
+        if !vmx_info().enable_apicv {
+            self.apicv_inhibit_reasons
+                .set_bit(KvmApicvInhibit::DISABLE, true);
+        }
+    }
+
+    pub fn msr_allowed(&self, msr: u32, ftype: MsrFilterType) -> bool {
+        // x2APIC MSRs
+        if (0x800..=0x8ff).contains(&msr) {
+            return true;
+        }
+
+        if let Some(msr_filter) = &self.msr_fliter {
+            let mut allowed = msr_filter.default_allow;
+
+            for i in 0..msr_filter.count as usize {
+                let range = &msr_filter.ranges[i];
+                let start = range.base;
+                let end = start + range.nmsrs;
+                let flags = range.flags;
+                let bitmap = &range.bitmap;
+                if msr >= start && msr < end && flags.contains(ftype) {
+                    allowed = bitmap.get((msr - start) as usize).unwrap_or(false);
+                    break;
+                }
+            }
+
+            return allowed;
+        } else {
+            return true;
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(dead_code)]
+pub enum KvmIrqChipMode {
+    None,
+    Kernel,
+    Split,
+}
+
+impl Default for KvmIrqChipMode {
+    fn default() -> Self {
+        Self::None
+    }
+}
+#[allow(dead_code)]
+pub trait KvmInitFunc {
+    fn hardware_setup(&self) -> Result<(), SystemError>;
+    fn handle_intel_pt_intr(&self) -> u32;
+    fn runtime_funcs(&self) -> &'static dyn KvmFunc;
+}
+
+pub trait KvmFunc: Send + Sync + Debug {
+    /// Returns the name of this hardware support, e.g. "Vmx"
+    fn name(&self) -> &'static str;
+
+    /// Enable hardware virtualization support
+    fn hardware_enable(&self) -> Result<(), SystemError>;
+
+    fn vm_init(&self) -> X86KvmArch;
+
+    fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError>;
+
+    fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm);
+
+    fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId);
+
+    fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, vm: &Vm, root_hpa: u64, root_level: u32);
+
+    fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg);
+
+    fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu);
+
+    fn set_msr(&self, vcpu: &mut VirtCpu, msr: MsrData) -> Result<(), SystemError>;
+
+    fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: RFlags);
+
+    fn get_rflags(&self, vcpu: &mut VirtCpu) -> RFlags;
+
+    fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: Cr0);
+
+    fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool;
+
+    fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: Cr4);
+
+    fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64);
+
+    fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool;
+
+    fn set_efer(&self, vcpu: &mut VirtCpu, efer: EferFlags);
+
+    fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment);
+
+    fn get_segment(
+        &self,
+        vcpu: &mut VirtCpu,
+        var: UapiKvmSegment,
+        seg: VcpuSegment,
+    ) -> UapiKvmSegment;
+
+    /// The vCPU itself is not used here; it is taken only to ensure that the
+    /// caller holds the lock one level up
+    fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer);
+
+    fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer);
+
+    fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer);
+
+    fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer);
+
+    fn update_exception_bitmap(&self, vcpu: &mut VirtCpu);
+
+    fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool);
+
+    fn has_emulated_msr(&self, msr: u32) -> bool;
+
+    fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool;
+
+    fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu);
+
+    fn flush_tlb_all(&self, vcpu: &mut VirtCpu);
+
+    fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion;
+
+    fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu);
+
+    fn handle_exit(
+        &self,
+        vcpu: &mut VirtCpu,
+        vm: &Vm,
+        fastpath: ExitFastpathCompletion,
+    ) -> Result<i32, SystemError>;
+}
+
+/// ## APIC-virtualization inhibit reason bits
+#[derive(Debug)]
+pub struct KvmApicvInhibit;
+
+#[allow(dead_code)]
+impl KvmApicvInhibit {
+    // Shared between Intel and AMD
+
+    /// APIC acceleration is disabled by a module parameter, or the hardware
+    /// does not support it
+    pub const DISABLE: usize = 0;
+
+    /// A Hyper-V guest is using the AutoEOI feature, so APIC acceleration is
+    /// disabled.
+    pub const HYPERV: usize = 1;
+
+    /// APIC acceleration is disabled because userspace has not yet enabled
+    /// an in-kernel or split interrupt controller.
+    pub const ABSENT: usize = 2;
+
+    /// KVM_GUESTDBG_BLOCKIRQ (a debug measure that blocks all interrupts on
+    /// this vCPU) is enabled, and AVIC/APICv is disabled so it cannot be
+    /// bypassed.
+    pub const BLOCKIRQ: usize = 3;
+
+    /// APIC acceleration is disabled when the 1:1 mapping between APIC IDs
+    /// and vCPUs is changed and KVM has not applied its x2APIC hotplug
+    /// workaround.
+    pub const PHYSICAL_ID_ALIASED: usize = 4;
+
+    /// APIC acceleration is disabled the first time a vCPU's APIC ID is
+    /// changed from its reset value.
+    pub const APIC_ID_MODIFIED: usize = 5;
+    /// APIC acceleration is disabled the first time a vCPU's APIC base is
+    /// changed from its reset value.
+    pub const APIC_BASE_MODIFIED: usize = 6;
+
+    // AMD (AVIC) only
+
+    /// AVIC is disabled while a vCPU runs a nested guest: unlike APICv, the
+    /// vCPU's siblings cannot use the doorbell mechanism to signal
+    /// interrupts via AVIC while it is running nested.
+    pub const NESTED: usize = 7;
+
+    /// On SVM, waiting for an IRQ window is implemented with a pending
+    /// virtual interrupt, which cannot be injected while KVM is waiting for
+    /// the window, so AVIC is disabled during the wait.
+    pub const IRQWIN: usize = 8;
+
+    /// The PIT's (i8254) "reinject" mode relies on EOI interception, and
+    /// AVIC does not support EOI interception for edge-triggered interrupts.
+    pub const PIT_REINJ: usize = 9;
+
+    /// SEV does not support AVIC, so AVIC is disabled.
+    pub const SEV: usize = 10;
+
+    /// AVIC is disabled when the 1:1 mapping between logical IDs and vCPUs
+    /// (for all vCPUs with a valid LDR) is changed.
+    pub const LOGICAL_ID_ALIASED: usize = 11;
+}
+
+#[derive(Debug)]
+pub struct KvmX86MsrFilter {
+    count: u8,
+    default_allow: bool,
+    ranges: Vec<KernelMsrRange>,
+}
+
+#[derive(Debug)]
+pub struct KernelMsrRange {
+    pub flags: MsrFilterType,
+    pub nmsrs: u32,
+    pub base: u32,
+    pub bitmap: AllocBitmap,
+}
+
+#[repr(C)]
+#[allow(dead_code)]
+pub struct PosixMsrFilterRange {
+    pub flags: u32,
+    pub nmsrs: u32,
+    pub base: u32,
+    pub bitmap: *const u8,
+}
+
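+// Filter semantics (as used by msr_allowed above): a set bit in a range's
+// bitmap allows the access, and MSRs matched by no range fall back to
+// `default_allow`. E.g. a KVM_MSR_FILTER_READ range at base 0x10 spanning
+// two MSRs with bitmap 0b01 permits reads of MSR 0x10 but denies MSR 0x11.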
+bitflags! {
+    pub struct MsrFilterType: u8 {
+        const KVM_MSR_FILTER_READ = 1 << 0;
+        const KVM_MSR_FILTER_WRITE = 1 << 1;
+    }
+
+    pub struct NotifyVmExitFlags: u8 {
+        const KVM_X86_NOTIFY_VMEXIT_ENABLED = 1 << 0;
+        const KVM_X86_NOTIFY_VMEXIT_USER = 1 << 1;
+    }
+}
+
+impl Default for NotifyVmExitFlags {
+    fn default() -> Self {
+        NotifyVmExitFlags::empty()
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum KvmReg {
+    VcpuRegsRax = 0,
+    VcpuRegsRcx = 1,
+    VcpuRegsRdx = 2,
+    VcpuRegsRbx = 3,
+    VcpuRegsRsp = 4,
+    VcpuRegsRbp = 5,
+    VcpuRegsRsi = 6,
+    VcpuRegsRdi = 7,
+
+    VcpuRegsR8 = 8,
+    VcpuRegsR9 = 9,
+    VcpuRegsR10 = 10,
+    VcpuRegsR11 = 11,
+    VcpuRegsR12 = 12,
+    VcpuRegsR13 = 13,
+    VcpuRegsR14 = 14,
+    VcpuRegsR15 = 15,
+
+    VcpuRegsRip = 16,
+    NrVcpuRegs = 17,
+
+    //VcpuExregPdptr = NrVcpuRegs,
+    VcpuExregCr0,
+    VcpuExregCr3,
+    VcpuExregCr4,
+    VcpuExregRflags,
+    VcpuExregSegments,
+    VcpuExregExitInfo1, //EXITINFO1 provides the linear address of the memory operand.
+    VcpuExregExitInfo2, //EXITINFO2 provides the contents of the register operand.
+}
+
+bitflags! {
+    pub struct HFlags: u8 {
+        const HF_GUEST_MASK = 1 << 0; /* VCPU is in guest-mode */
+        const HF_SMM_MASK = 1 << 1;
+        const HF_SMM_INSIDE_NMI_MASK = 1 << 2;
+    }
+}
+
+/// ### General-purpose registers of a virtual machine
+#[derive(Debug, Default, Clone, Copy)]
+#[repr(C)]
+pub struct KvmCommonRegs {
+    rax: u64,
+    rbx: u64,
+    rcx: u64,
+    rdx: u64,
+    rsi: u64,
+    rdi: u64,
+    rsp: u64,
+    rbp: u64,
+    r8: u64,
+    r9: u64,
+    r10: u64,
+    r11: u64,
+    r12: u64,
+    r13: u64,
+    r14: u64,
+    r15: u64,
+    rip: u64,
+    rflags: u64,
+}
+
+impl Vm {
+    pub fn vcpu_precreate(&mut self, id: usize) -> Result<(), SystemError> {
+        if self.arch.max_vcpu_ids == 0 {
+            self.arch.max_vcpu_ids = 1024 * 4;
+        }
+
+        if id >= self.arch.max_vcpu_ids {
+            return Err(SystemError::EINVAL);
+        }
+
+        return x86_kvm_ops().vcpu_precreate(self);
+    }
+}
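+// These correspond to Linux KVM's EMULTYPE_* flags, which qualify how an
+// instruction-emulation request was raised (decode already done, #UD trap,
+// retryable page fault, VMware #GP quirk, ...).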
+bitflags! {
+    pub struct EmulType: u32 {
+        const NO_DECODE = 1 << 0;
+        const TRAP_UD = 1 << 1;
+        const SKIP = 1 << 2;
+        const ALLOW_RETRY_PF = 1 << 3;
+        const TRAP_UD_FORCED = 1 << 4;
+        const VMWARE_GP = 1 << 5;
+        const PF = 1 << 6;
+        const COMPLETE_USER_EXIT = 1 << 7;
+        const WRITE_PF_TO_SP = 1 << 8;
+    }
+}
+#[allow(dead_code)]
+#[derive(Default, Debug)]
+/// Tracks and records various statistics for a vCPU.
+pub struct KvmVcpuStat {
+    //pub generic: KvmVcpuStatGeneric,
+    pub pf_taken: u64,
+    pub pf_fixed: u64,
+    pub pf_emulate: u64,
+    pub pf_spurious: u64,
+    pub pf_fast: u64,
+    pub pf_mmio_spte_created: u64,
+    pub pf_guest: u64,
+    pub tlb_flush: u64,
+    pub invlpg: u64,
+    pub exits: u64,
+    pub io_exits: u64,
+    pub mmio_exits: u64,
+    pub signal_exits: u64,
+    pub irq_window_exits: u64,
+    pub nmi_window_exits: u64,
+    pub l1d_flush: u64,
+    pub halt_exits: u64,
+    pub request_irq_exits: u64,
+    pub irq_exits: u64,
+    pub host_state_reload: u64,
+    pub fpu_reload: u64,
+    pub insn_emulation: u64,
+    pub insn_emulation_fail: u64,
+    pub hypercalls: u64,
+    pub irq_injections: u64,
+    pub nmi_injections: u64,
+    pub req_event: u64,
+    pub nested_run: u64,
+    pub directed_yield_attempted: u64,
+    pub directed_yield_successful: u64,
+    pub preemption_reported: u64,
+    pub preemption_other: u64,
+    pub guest_mode: u64,
+    pub notify_window_exits: u64,
+}
+#[inline]
+/// Convert a guest frame number (GFN) to a guest physical address (GPA)
+pub fn gfn_to_gpa(gfn: u64) -> u64 {
+    gfn << 12
+}
+#[allow(dead_code)]
+#[inline]
+/// Convert a guest physical address (GPA) to a guest frame number (GFN)
+pub fn gpa_to_gfn(gpa: u64) -> u64 {
+    gpa >> 12
+}
diff --git a/kernel/src/arch/x86_64/vm/kvm_host/page.rs b/kernel/src/arch/x86_64/vm/kvm_host/page.rs
new file mode 100644
index 00000000..e4bdec89
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/kvm_host/page.rs
@@ -0,0 +1 @@
+pub const KVM_MIN_FREE_MMU_PAGES: usize = 5;
diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs
new file mode 100644
index 00000000..dd8143c1
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs
@@ -0,0 +1,1697 @@
+use core::intrinsics::likely;
+use core::{arch::x86_64::_xsetbv, intrinsics::unlikely};
+
+use alloc::{boxed::Box, sync::Arc, vec::Vec};
+use bitmap::{traits::BitMapOps, AllocBitmap, BitMapCore};
+use log::warn;
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::vmx::vmcs::guest;
+use x86::{
+    bits64::rflags::RFlags,
+    controlregs::{Cr0, Cr4, Xcr0},
+    dtables::DescriptorTablePointer,
+    msr::{self, wrmsr},
+    vmx::vmcs::control::SecondaryControls,
+};
+use x86_64::registers::control::EferFlags;
+
+use crate::arch::vm::asm::VmxAsm;
+use crate::arch::vm::vmx::exit::ExitFastpathCompletion;
+use crate::virt::vm::kvm_host::mem::KvmMmuMemoryCache;
+use crate::virt::vm::kvm_host::vcpu::VcpuMode;
+use crate::{
+    arch::{
+        kvm_arch_ops,
+        mm::barrier,
+        vm::{
+            asm::{hyperv, kvm_msr, KvmX86Asm, MiscEnable, MsrData, VcpuSegment},
+            cpuid::KvmCpuidEntry2,
+            kvm_host::KvmReg,
+            mmu::kvm_mmu::LockedKvmMmu,
+            uapi::{UapiKvmSegmentRegs, KVM_SYNC_X86_VALID_FIELDS},
+            vmx::{vmcs::ControlsType, vmx_info},
+            x86_kvm_manager, x86_kvm_manager_mut, x86_kvm_ops,
+        },
+    },
+    mm::VirtAddr,
+    smp::{core::smp_get_processor_id, cpu::ProcessorId},
+    virt::vm::{
+        kvm_host::{
+            mem::GfnToHvaCache,
+            vcpu::{GuestDebug, VirtCpu},
+            MutilProcessorState, Vm,
+        },
+        user_api::{UapiKvmRun, UapiKvmSegment},
+    },
+};
+
+use super::{lapic::KvmLapic, HFlags, KvmCommonRegs, KvmIrqChipMode};
+const MSR_IA32_CR_PAT_DEFAULT: u64 = 0x0007_0406_0007_0406;
+#[allow(dead_code)]
+#[derive(Debug)]
+pub struct X86VcpuArch {
+    /// Host CPU on which the most recent VM-entry was attempted
+    pub last_vmentry_cpu: ProcessorId,
+    /// Bitmap of registers whose cached value is valid (available)
+    pub regs_avail: AllocBitmap,
+    /// Bitmap of dirty registers
+    pub regs_dirty: AllocBitmap,
+    /// Multiprocessor state
+    mp_state: MutilProcessorState,
+    pub apic_base: u64,
+    /// Local APIC
+    pub apic: Option<KvmLapic>,
+    /// Host PKRU register
+    host_pkru: u32,
+    pkru: u32,
+    /// hflags
+    hflags: HFlags,
+
+    pub microcode_version: u64,
+
+    arch_capabilities: u64,
+
+    perf_capabilities: u64,
+
+    ia32_xss: u64,
+
+    pub guest_state_protected: bool,
+
+    pub cpuid_entries: Vec<KvmCpuidEntry2>,
+
+    pub exception: KvmQueuedException,
+    pub exception_vmexit: KvmQueuedException,
+    pub apf: KvmAsyncPageFault,
+
+    pub emulate_regs_need_sync_from_vcpu: bool,
+    pub emulate_regs_need_sync_to_vcpu: bool,
+
+    pub smbase: u64,
+
+    pub interrupt: KvmQueuedInterrupt,
+
+    pub tsc_offset_adjustment: u64,
+
+    pub mmu: Option<Arc<LockedKvmMmu>>,
+    pub root_mmu: Option<Arc<LockedKvmMmu>>,
+    pub guset_mmu: Option<Arc<LockedKvmMmu>>,
+    pub walk_mmu: Option<Arc<LockedKvmMmu>>,
+    pub nested_mmu: Option<Arc<LockedKvmMmu>>,
+
+    pub mmu_pte_list_desc_cache: KvmMmuMemoryCache,
+    pub mmu_shadow_page_cache: KvmMmuMemoryCache,
+    pub mmu_shadowed_info_cache: KvmMmuMemoryCache,
+    pub mmu_page_header_cache: KvmMmuMemoryCache,
+
+    pub max_phyaddr: usize,
+
+    pub pat: u64,
+
+    pub regs: [u64; KvmReg::NrVcpuRegs as usize],
+
+    pub cr0: Cr0,
+    pub cr0_guest_owned_bits: Cr0,
+    pub cr2: u64,
+    pub cr3: u64,
+    pub cr4: Cr4,
+    pub cr4_guest_owned_bits: Cr4,
+    pub cr4_guest_rsvd_bits: Cr4,
+    pub cr8: u64,
+    pub efer: EferFlags,
+
+    pub xcr0: Xcr0,
+
+    pub dr6: usize,
+    pub dr7: usize,
+
+    pub single_step_rip: usize,
+
+    pub msr_misc_features_enables: u64,
+    pub ia32_misc_enable_msr: MiscEnable,
+
+    pub smi_pending: bool,
+    pub smi_count: u64,
+    pub nmi_queued: usize,
+    /// Number of NMIs pending injection, not including hardware vNMIs.
+    pub nmi_pending: u32,
+    pub nmi_injected: bool,
+
+    pub handling_intr_from_guest: KvmIntrType,
+
+    pub xfd_no_write_intercept: bool,
+
+    pub l1tf_flush_l1d: bool,
+
+    pub at_instruction_boundary: bool,
+
+    pub db: [usize; Self::KVM_NR_DB_REGS],
+
+    /* set at EPT violation at this point */
+    pub exit_qual: u64,
+}
+
+impl X86VcpuArch {
+    const KVM_NR_DB_REGS: usize = 4;
+
+    #[inline(never)]
+    pub fn new() -> Self {
+        let mut ret: Box<Self> = unsafe { Box::new_zeroed().assume_init() };
+        ret.last_vmentry_cpu = ProcessorId::INVALID;
+        ret.regs_avail = AllocBitmap::new(32);
+        ret.regs_dirty = AllocBitmap::new(32);
+        ret.mp_state = MutilProcessorState::Runnable;
+
+        ret.apic = None;
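+        // The remaining fields keep the all-zero bit pattern produced by
+        // `new_zeroed` above and are expected to be initialized (pat, mmu,
+        // cpuid_entries, ...) during vCPU setup before first use.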
+        //max_phyaddr=?? fztodo
+        *ret
+    }
+
+    pub fn clear_dirty(&mut self) {
+        self.regs_dirty.set_all(false);
+    }
+
+    pub fn vcpu_apicv_active(&self) -> bool {
+        self.lapic_in_kernel() && self.lapic().apicv_active
+    }
+
+    pub fn lapic_in_kernel(&self) -> bool {
+        if x86_kvm_manager().has_noapic_vcpu {
+            return self.apic.is_some();
+        }
+        true
+    }
+
+    pub fn is_bsp(&self) -> bool {
+        // MSR_IA32_APICBASE_BSP: bit 8 of the APIC-base MSR value marks the
+        // bootstrap processor.
+        return self.apic_base & (1 << 8) != 0;
+    }
+
+    #[inline]
+    pub fn lapic(&self) -> &KvmLapic {
+        self.apic.as_ref().unwrap()
+    }
+
+    pub fn queue_interrupt(&mut self, vec: u8, soft: bool) {
+        self.interrupt.injected = true;
+        self.interrupt.soft = soft;
+        self.interrupt.nr = vec;
+    }
+
+    pub fn read_cr0_bits(&mut self, mask: Cr0) -> Cr0 {
+        let tmask = mask & (Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT);
+        // Refresh the cached CR0 if any requested bit is guest-owned and the
+        // cache is not marked available.
+        if tmask.intersects(self.cr0_guest_owned_bits)
+            && !self
+                .regs_avail
+                .get(KvmReg::VcpuExregCr0 as usize)
+                .unwrap_or_default()
+        {
+            x86_kvm_ops().cache_reg(self, KvmReg::VcpuExregCr0);
+        }
+
+        return self.cr0 & mask;
+    }
+
+    pub fn read_cr4_bits(&mut self, mask: Cr4) -> Cr4 {
+        let tmask = mask
+            & (Cr4::CR4_VIRTUAL_INTERRUPTS
+                | Cr4::CR4_DEBUGGING_EXTENSIONS
+                | Cr4::CR4_ENABLE_PPMC
+                | Cr4::CR4_ENABLE_SSE
+                | Cr4::CR4_UNMASKED_SSE
+                | Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_TIME_STAMP_DISABLE
+                | Cr4::CR4_ENABLE_FSGSBASE);
+
+        if tmask.intersects(self.cr4_guest_owned_bits)
+            && !self
+                .regs_avail
+                .get(KvmReg::VcpuExregCr4 as usize)
+                .unwrap_or_default()
+        {
+            x86_kvm_ops().cache_reg(self, KvmReg::VcpuExregCr4)
+        }
+
+        return self.cr4 & mask;
+    }
+
+    pub fn get_cr8(&self) -> u64 {
+        if self.lapic_in_kernel() {
+            todo!()
+        } else {
+            return self.cr8;
+        }
+    }
+
+    #[inline]
+    pub fn is_smm(&self) -> bool {
+        self.hflags.contains(HFlags::HF_SMM_MASK)
+    }
+
+    #[inline]
+    pub fn is_guest_mode(&self) -> bool {
+        self.hflags.contains(HFlags::HF_GUEST_MASK)
+    }
+
+    #[inline]
+    pub fn is_long_mode(&self) -> bool {
+        self.efer.contains(EferFlags::LONG_MODE_ACTIVE)
+    }
+
+    #[inline]
+    #[allow(dead_code)]
+    pub fn is_pae_paging(&mut self) -> bool {
+        let flag1 = self.is_long_mode();
+        let flag2 = self.is_pae();
+        let flag3 = self.is_paging();
+
+        !flag1 && flag2 && flag3
+    }
+
+    #[inline]
+    pub fn is_pae(&mut self) -> bool {
+        !self.read_cr4_bits(Cr4::CR4_ENABLE_PAE).is_empty()
+    }
+    #[inline]
+    pub fn is_paging(&mut self) -> bool {
+        //return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG));
+        !self.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty()
+    }
+
+    #[inline]
+    pub fn is_portected_mode(&mut self) -> bool {
+        !self.read_cr0_bits(Cr0::CR0_PROTECTED_MODE).is_empty()
+    }
+
+    #[inline]
+    fn clear_interrupt_queue(&mut self) {
+        self.interrupt.injected = false;
+    }
+
+    #[inline]
+    fn clear_exception_queue(&mut self) {
+        self.exception.pending = false;
+        self.exception.injected = false;
+        self.exception_vmexit.pending = false;
+    }
+
+    #[allow(dead_code)]
+    pub fn update_cpuid_runtime(&mut self, entries: &Vec<KvmCpuidEntry2>) {
+        let cpuid = CpuId::new();
+        let feat = cpuid.get_feature_info().unwrap();
+        let base = KvmCpuidEntry2::find(entries, 1, None);
+        if let Some(_base) = base {
+            if feat.has_xsave() {}
+        }
+
+        todo!()
+    }
+
+    #[inline]
+    pub fn test_and_mark_available(&mut self, reg: KvmReg) -> bool {
+        let old = self.regs_avail.get(reg as usize).unwrap_or_default();
+        self.regs_avail.set(reg as usize, true);
+        return old;
+    }
+
+    #[inline]
+    pub fn mark_register_dirty(&mut self, reg: KvmReg) {
+        self.regs_avail.set(reg as usize, true);
+        self.regs_dirty.set(reg as usize, true);
+    }
+
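+    // Register-cache protocol (mirroring Linux KVM): `regs_avail` marks
+    // cached registers that are in sync with the hardware state, while
+    // `regs_dirty` marks cached values that must be flushed back before the
+    // next VM-entry.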
+    #[inline]
+    pub fn mark_register_available(&mut self, reg: KvmReg) {
+        self.regs_avail.set(reg as usize, true);
+    }
+
+    #[inline]
+    pub fn is_register_dirty(&self, reg: KvmReg) -> bool {
+        self.regs_dirty.get(reg as usize).unwrap()
+    }
+
+    #[inline]
+    pub fn is_register_available(&self, reg: KvmReg) -> bool {
+        self.regs_avail.get(reg as usize).unwrap()
+    }
+
+    #[inline]
+    pub fn write_reg(&mut self, reg: KvmReg, data: u64) {
+        self.regs[reg as usize] = data;
+    }
+
+    #[inline]
+    pub fn write_reg_raw(&mut self, reg: KvmReg, data: u64) {
+        self.regs[reg as usize] = data;
+        self.mark_register_dirty(reg);
+    }
+
+    #[inline]
+    pub fn read_reg(&self, reg: KvmReg) -> u64 {
+        return self.regs[reg as usize];
+    }
+
+    #[inline]
+    pub fn read_reg_raw(&mut self, reg: KvmReg) -> u64 {
+        // Refresh the cache from hardware only when this register is not
+        // marked available yet.
+        if self.regs_avail.get(reg as usize) != Some(true) {
+            kvm_arch_ops().cache_reg(self, reg);
+        }
+
+        return self.regs[reg as usize];
+    }
+
+    #[inline]
+    fn get_linear_rip(&mut self) -> u64 {
+        if self.guest_state_protected {
+            return 0;
+        }
+        return self.read_reg_raw(KvmReg::VcpuRegsRip);
+    }
+
+    pub fn set_msr_common(&mut self, msr_info: &MsrData) {
+        let msr = msr_info.index;
+        let data = msr_info.data;
+
+        match msr {
+            // MSR_AMD64_NB_CFG
+            0xc001001f => {
+                return;
+            }
+            // MSR_VM_HSAVE_PA
+            0xc0010117 => {
+                return;
+            }
+            // MSR_AMD64_PATCH_LOADER
+            0xc0010020 => {
+                return;
+            }
+            // MSR_AMD64_BU_CFG2
+            0xc001102a => {
+                return;
+            }
+            // MSR_AMD64_DC_CFG
+            0xc0011022 => {
+                return;
+            }
+            // MSR_AMD64_TW_CFG
+            0xc0011023 => {
+                return;
+            }
+            // MSR_F15H_EX_CFG
+            0xc001102c => {
+                return;
+            }
+            msr::IA32_BIOS_UPDT_TRIG => {
+                return;
+            }
+            msr::IA32_BIOS_SIGN_ID => {
+                // MSR_IA32_UCODE_REV
+                if msr_info.host_initiated {
+                    self.microcode_version = data;
+                }
+                return;
+            }
+            // MSR_IA32_ARCH_CAPABILITIES
+            0x0000010a => {
+                if !msr_info.host_initiated {
+                    return;
+                }
+
+                self.arch_capabilities = data;
+            }
+            msr::MSR_PERF_CAPABILITIES => {
+                if !msr_info.host_initiated {
+                    return;
+                }
+
+                if data & (!x86_kvm_manager().kvm_caps.supported_perf_cap) != 0 {
+                    return;
+                }
+
+                if self.perf_capabilities == data {
+                    return;
+                }
+
+                self.perf_capabilities = data;
+                // todo: kvm_pmu_refresh
+                return;
+            }
+            // MSR_IA32_FLUSH_CMD
+            0x0000010b => {
+                todo!()
+            }
+            msr::IA32_EFER => {
+                todo!()
+            }
+            // MSR_K7_HWCR
+            0xc0010015 => {
+                todo!()
+            }
+            // MSR_FAM10H_MMIO_CONF_BASE
+            0xc0010058 => {
+                todo!()
+            }
+            msr::IA32_PAT => {
+                todo!()
+            }
+            // MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000 | MSR_MTRRdefType
+            0x200..=0x26f | 0x2ff => {
+                todo!()
+            }
+            msr::APIC_BASE => {
+                todo!()
+            }
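+            // 0x800..=0x8ff is the x2APIC MSR window: MSR 0x800 + n maps
+            // the xAPIC register at MMIO offset n << 4.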
+            // APIC_BASE_MSR ... APIC_BASE_MSR + 0xff
+            0x800..=0x8ff => {
+                todo!()
+            }
+            msr::IA32_TSC_DEADLINE => {
+                todo!()
+            }
+            msr::IA32_TSC_ADJUST => {
+                todo!()
+            }
+            msr::IA32_MISC_ENABLE => {
+                todo!()
+            }
+            msr::IA32_SMBASE => {
+                todo!()
+            }
+            msr::TSC => {
+                todo!()
+            }
+            // MSR_IA32_XSS
+            msr::MSR_C5_PMON_BOX_CTRL => {
+                if !msr_info.host_initiated {
+                    return;
+                }
+                if data & (!x86_kvm_manager().kvm_caps.supported_xss) != 0 {
+                    return;
+                }
+
+                self.ia32_xss = data;
+                // TODO: kvm_update_cpuid_runtime
+                return;
+            }
+            msr::MSR_SMI_COUNT => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_WALL_CLOCK_NEW => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_WALL_CLOCK => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_SYSTEM_TIME => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_ASYNC_PF_EN => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_ASYNC_PF_INT => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_ASYNC_PF_ACK => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_STEAL_TIME => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_PV_EOI_EN => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_POLL_CONTROL => {
+                todo!()
+            }
+            msr::MCG_CTL
+            | msr::MCG_STATUS
+            | msr::MC0_CTL..=msr::MSR_MC26_MISC
+            | msr::IA32_MC0_CTL2..=msr::IA32_MC21_CTL2 => {
+                todo!()
+            }
+            // MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3
+            // MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3
+            // MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1
+            0xc0010004..=0xc0010007
+            | 0xc1..=0xc2
+            | 0xc0010000..=0xc0010003
+            | 0x00000186..=0x00000187 => {
+                todo!()
+            }
+
+            // MSR_K7_CLK_CTL
+            0xc001001b => {
+                return;
+            }
+
+            hyperv::HV_X64_MSR_GUEST_OS_ID..=hyperv::HV_REGISTER_SINT15
+            | hyperv::HV_X64_MSR_SYNDBG_CONTROL..=hyperv::HV_X64_MSR_SYNDBG_PENDING_BUFFER
+            | hyperv::HV_X64_MSR_SYNDBG_OPTIONS
+            | hyperv::HV_REGISTER_CRASH_P0..=hyperv::HV_REGISTER_CRASH_P4
+            | hyperv::HV_REGISTER_CRASH_CTL
+            | hyperv::HV_REGISTER_STIMER0_CONFIG..=hyperv::HV_REGISTER_STIMER3_COUNT
+            | hyperv::HV_X64_MSR_REENLIGHTENMENT_CONTROL
+            | hyperv::HV_X64_MSR_TSC_EMULATION_CONTROL
+            | hyperv::HV_X64_MSR_TSC_EMULATION_STATUS
+            | hyperv::HV_X64_MSR_TSC_INVARIANT_CONTROL => {
+                todo!()
+            }
+
+            msr::MSR_BBL_CR_CTL3 => {
+                todo!()
+            }
+
+            // MSR_AMD64_OSVW_ID_LENGTH
+            0xc0010140 => {
+                todo!()
+            }
+            // MSR_AMD64_OSVW_STATUS
+            0xc0010141 => {
+                todo!()
+            }
+
+            msr::MSR_PLATFORM_INFO => {
+                todo!()
+            }
+            // MSR_MISC_FEATURES_ENABLES
+            0x00000140 => {
+                todo!()
+            }
+            // MSR_IA32_XFD
+            0x000001c4 => {
+                todo!()
+            }
+            // MSR_IA32_XFD_ERR
+            0x000001c5 => {
+                todo!()
+            }
+            _ => {
+                todo!()
+            }
+        }
+    }
+
+    pub fn kvm_before_interrupt(&mut self, intr: KvmIntrType) {
+        barrier::mfence();
+        self.handling_intr_from_guest = intr;
+        barrier::mfence();
+    }
+
+    pub fn kvm_after_interrupt(&mut self) {
+        barrier::mfence();
+        self.handling_intr_from_guest = KvmIntrType::None;
+        barrier::mfence();
+    }
+}
+
+impl VirtCpu {
+    pub fn init_arch(&mut self, vm: &mut Vm, id: usize) -> Result<(), SystemError> {
+        //kvm_arch_vcpu_create
+        vm.vcpu_precreate(id)?;
+
+        self.arch.last_vmentry_cpu = ProcessorId::INVALID;
+        self.arch.regs_avail.set_all(true);
+        self.arch.regs_dirty.set_all(true);
+
+        if vm.arch.irqchip_mode != KvmIrqChipMode::None || vm.arch.bsp_vcpu_id == self.vcpu_id {
+            self.arch.mp_state = MutilProcessorState::Runnable;
+        } else {
+            self.arch.mp_state = MutilProcessorState::Uninitialized;
+        }
+
+        self.arch.vcpu_arch_mmu_create();
+
+        if vm.arch.irqchip_mode != KvmIrqChipMode::None {
+            todo!()
+        } else {
+            x86_kvm_manager_mut().has_noapic_vcpu = true;
+        }
+
+        x86_kvm_ops().vcpu_create(self, vm);
+
+        //lots of todo!!!
+
+        self.arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+        self.load();
+        self.vcpu_reset(vm, false)?;
+        self.arch.kvm_init_mmu();
+
+        Ok(())
+    }
+
+    #[inline]
+    pub fn kvm_run(&self) -> &UapiKvmRun {
+        self.run.as_ref().unwrap()
+    }
+
+    #[inline]
+    pub fn kvm_run_mut(&mut self) -> &mut Box<UapiKvmRun> {
+        self.run.as_mut().unwrap()
+    }
+
+    pub fn run(&mut self) -> Result<usize, SystemError> {
+        self.load();
+
+        if unlikely(self.arch.mp_state == MutilProcessorState::Uninitialized) {
+            todo!()
+        }
+
+        if self.kvm_run().kvm_valid_regs & !KVM_SYNC_X86_VALID_FIELDS != 0
+            || self.kvm_run().kvm_dirty_regs & !KVM_SYNC_X86_VALID_FIELDS != 0
+        {
+            return Err(SystemError::EINVAL);
+        }
+
+        if self.kvm_run().kvm_dirty_regs != 0 {
+            todo!()
+        }
+
+        if !self.arch.lapic_in_kernel() {
+            self.kvm_set_cr8(self.kvm_run().cr8);
+        }
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#11174 - 11196
+
+        if self.kvm_run().immediate_exit != 0 {
+            return Err(SystemError::EINTR);
+        }
+
+        // vmx_vcpu_pre_run
+
+        self.vcpu_run(&self.kvm().lock())?;
+
+        Ok(0)
+    }
+
+    fn vcpu_run(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        self.arch.l1tf_flush_l1d = true;
+
+        loop {
+            self.arch.at_instruction_boundary = false;
+            if self.can_running() {
+                self.enter_guest(vm)?;
+            } else {
+                todo!()
+            };
+        }
+    }
+
+    fn enter_guest(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let req_immediate_exit = false;
+
+        warn!("request {:?}", self.request);
+        if !self.request.is_empty() {
+            if self.check_request(VirtCpuRequest::KVM_REQ_VM_DEAD) {
+                return Err(SystemError::EIO);
+            }
+
+            // TODO: kvm_dirty_ring_check_request
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MMU_FREE_OBSOLETE_ROOTS) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MIGRATE_TIMER) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MASTERCLOCK_UPDATE) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_GLOBAL_CLOCK_UPDATE) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_CLOCK_UPDATE) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MMU_SYNC) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_LOAD_MMU_PGD) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH) {
+                self.flush_tlb_all();
+            }
+
+            self.service_local_tlb_flush_requests();
+
+            // TODO: KVM_REQ_HV_TLB_FLUSH) && kvm_hv_vcpu_flush_tlb(vcpu)
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_REPORT_TPR_ACCESS) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_TRIPLE_FAULT) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_STEAL_UPDATE) {
+                // todo!()
+                warn!("VirtCpuRequest::KVM_REQ_STEAL_UPDATE TODO!");
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_SMI) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_NMI) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_PMU) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_PMI) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_IOAPIC_EOI_EXIT) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_SCAN_IOAPIC) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_LOAD_EOI_EXITMAP) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_APIC_PAGE_RELOAD) {
+                // todo!()
+                warn!("VirtCpuRequest::KVM_REQ_APIC_PAGE_RELOAD TODO!");
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_HV_CRASH) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_HV_RESET) {
+                todo!()
+            }
+
+            if
self.check_request(VirtCpuRequest::KVM_REQ_HV_EXIT) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_HV_STIMER) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_APICV_UPDATE) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_APF_READY) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_MSR_FILTER_CHANGED) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_UPDATE_CPU_DIRTY_LOGGING) { + todo!() + } + } + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10661 + if self.check_request(VirtCpuRequest::KVM_REQ_EVENT) { + // TODO + } + + self.kvm_mmu_reload(vm)?; + + x86_kvm_ops().prepare_switch_to_guest(self); + // warn!( + // "mode {:?} req {:?} mode_cond {} !is_empty {} cond {}", + // self.mode, + // self.request, + // self.mode == VcpuMode::ExitingGuestMode, + // !self.request.is_empty(), + // (self.mode == VcpuMode::ExitingGuestMode) || (!self.request.is_empty()) + // ); + warn!( + "req bit {} empty bit {}", + self.request.bits, + VirtCpuRequest::empty().bits + ); + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10730 + if self.mode == VcpuMode::ExitingGuestMode || !self.request.is_empty() { + self.mode = VcpuMode::OutsideGuestMode; + return Err(SystemError::EINVAL); + } + + if req_immediate_exit { + self.request(VirtCpuRequest::KVM_REQ_EVENT); + todo!(); + } + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10749 - 10766 + + let exit_fastpath; + loop { + exit_fastpath = x86_kvm_ops().vcpu_run(self); + if likely(exit_fastpath != ExitFastpathCompletion::ExitHandled) { + break; + } + + todo!(); + } + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10799 - 10814 + + self.arch.last_vmentry_cpu = self.cpu; + + // TODO: last_guest_tsc + + self.mode = VcpuMode::OutsideGuestMode; + + barrier::mfence(); + + // TODO: xfd + + x86_kvm_ops().handle_exit_irqoff(self); + + // todo: xfd + + // TODO: 一些中断或者tsc操作 + + match x86_kvm_ops().handle_exit(self, vm, exit_fastpath) { + Err(err) => return Err(err), + Ok(_) => Ok(()), + } + } + + fn flush_tlb_all(&mut self) { + x86_kvm_ops().flush_tlb_all(self); + self.clear_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_CURRENT); + } + + fn service_local_tlb_flush_requests(&mut self) { + if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_CURRENT) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_GUEST) { + todo!() + } + } + + pub fn request(&mut self, req: VirtCpuRequest) { + // self.request.set( + // (req.bits() & VirtCpuRequest::KVM_REQUEST_MASK.bits()) as usize, + // true, + // ); + self.request.insert(req); + } + + fn check_request(&mut self, req: VirtCpuRequest) -> bool { + if self.test_request(req) { + self.clear_request(req); + + barrier::mfence(); + return true; + } + + return false; + } + + fn test_request(&self, req: VirtCpuRequest) -> bool { + // self.request + // .get((req.bits & VirtCpuRequest::KVM_REQUEST_MASK.bits) as usize) + // .unwrap_or_default() + self.request.contains(req) + } + + fn clear_request(&mut self, req: VirtCpuRequest) { + // self.request.set( + // (req.bits & VirtCpuRequest::KVM_REQUEST_MASK.bits) as usize, + // false, + // ); + self.request.remove(req); + } + + pub fn can_running(&self) -> bool { + return self.arch.mp_state == MutilProcessorState::Runnable && !self.arch.apf.halted; + } + + #[inline] + fn load(&mut self) { + self.arch_vcpu_load(smp_get_processor_id()) + } + + fn arch_vcpu_load(&mut self, cpu: ProcessorId) 
{ + x86_kvm_ops().vcpu_load(self, cpu); + + self.arch.host_pkru = KvmX86Asm::read_pkru(); + + // 下列两个TODO为处理时钟信息 + if unlikely(self.arch.tsc_offset_adjustment != 0) { + todo!() + } + + if unlikely(self.cpu != cpu) { + // TODO: 设置tsc + self.cpu = cpu; + } + + self.request(VirtCpuRequest::KVM_REQ_STEAL_UPDATE) + } + + pub fn set_msr( + &mut self, + index: u32, + data: u64, + host_initiated: bool, + ) -> Result<(), SystemError> { + match index { + msr::IA32_FS_BASE + | msr::IA32_GS_BASE + | msr::IA32_KERNEL_GSBASE + | msr::IA32_CSTAR + | msr::IA32_LSTAR => { + if VirtAddr::new(data as usize).is_canonical() { + return Ok(()); + } + } + + msr::IA32_SYSENTER_EIP | msr::IA32_SYSENTER_ESP => { + // 需要将Data转为合法地址,但是现在先这样写 + assert!(VirtAddr::new(data as usize).is_canonical()); + } + msr::IA32_TSC_AUX => { + if x86_kvm_manager() + .find_user_return_msr_idx(msr::IA32_TSC_AUX) + .is_none() + { + return Ok(()); + } + + todo!() + } + _ => {} + } + + let msr_data = MsrData { + host_initiated, + index, + data, + }; + + return kvm_arch_ops().set_msr(self, msr_data); + } + + pub fn vcpu_reset(&mut self, vm: &Vm, init_event: bool) -> Result<(), SystemError> { + let old_cr0 = self.arch.read_cr0_bits(Cr0::all()); + + if self.arch.is_guest_mode() { + todo!() + } + + self.lapic_reset(vm, init_event); + + self.arch.hflags = HFlags::empty(); + + self.arch.smi_pending = false; + self.arch.smi_count = 0; + self.arch.nmi_queued = 0; + self.arch.nmi_pending = 0; + self.arch.nmi_injected = false; + + self.arch.clear_exception_queue(); + self.arch.clear_interrupt_queue(); + + for i in &mut self.arch.db { + *i = 0; + } + + // TODO: kvm_update_dr0123(vcpu); + + // DR6_ACTIVE_LOW + self.arch.dr6 = 0xffff0ff0; + // DR7_FIXED_1 + self.arch.dr7 = 0x00000400; + + // TODO: kvm_update_dr7(vcpu); + + self.arch.cr2 = 0; + + self.request(VirtCpuRequest::KVM_REQ_EVENT); + + self.arch.apf.msr_en_val = 0; + self.arch.apf.msr_int_val = 0; + // TODO:st + + // TODO: kvmclock_reset(vcpu); + + // TODO: kvm_clear_async_pf_completion_queue(vcpu); + + for i in &mut self.arch.apf.gfns { + *i = u64::MAX; + } + + self.arch.apf.halted = false; + + // TODO: fpu + + if !init_event { + // TODO:pmu + self.arch.smbase = 0x30000; + + self.arch.msr_misc_features_enables = 0; + self.arch.ia32_misc_enable_msr = MiscEnable::MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL + | MiscEnable::MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; + + // TODO: __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); + // 0xda0: MSR_IA32_XSS + self.set_msr(0xda0, 0, true)?; + } + + for reg in &mut self.arch.regs { + *reg = 0; + } + + self.arch.mark_register_dirty(KvmReg::VcpuRegsRsp); + + let cpuid_0x1 = KvmCpuidEntry2::find(&self.arch.cpuid_entries, 1, None); + let val = if let Some(cpuid) = cpuid_0x1 { + cpuid.eax + } else { + 0x600 + }; + self.arch.write_reg(KvmReg::VcpuRegsRdx, val as u64); + + kvm_arch_ops().vcpu_reset(self, vm, init_event); + + self.set_rflags(RFlags::FLAGS_A1); + self.arch.write_reg_raw(KvmReg::VcpuRegsRip, 0xfff0); + + self.arch.cr3 = 0; + self.arch.mark_register_dirty(KvmReg::VcpuExregCr3); + + let mut new_cr0 = Cr0::CR0_EXTENSION_TYPE; + if init_event { + new_cr0.insert(old_cr0 & (Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE)); + } else { + new_cr0.insert(Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE); + } + + kvm_arch_ops().set_cr0(vm, self, new_cr0); + kvm_arch_ops().set_cr4(self, Cr4::empty()); + kvm_arch_ops().set_efer(self, EferFlags::empty()); + kvm_arch_ops().update_exception_bitmap(self); + + if old_cr0.contains(Cr0::CR0_ENABLE_PAGING) { + 
self.request(VirtCpuRequest::MAKE_KVM_REQ_TLB_FLUSH_GUEST); + self.arch.reset_mmu_context(); + } + + if init_event { + self.request(VirtCpuRequest::MAKE_KVM_REQ_TLB_FLUSH_GUEST); + } + + Ok(()) + } + + fn set_rflags(&mut self, rflags: RFlags) { + self._set_rflags(rflags); + self.request(VirtCpuRequest::KVM_REQ_EVENT); + } + + fn _set_rflags(&mut self, mut rflags: RFlags) { + if self.guest_debug.contains(GuestDebug::SINGLESTEP) + && self.is_linear_rip(self.arch.single_step_rip) + { + rflags.insert(RFlags::FLAGS_TF); + } + + kvm_arch_ops().set_rflags(self, rflags); + } + + fn get_rflags(&mut self) -> RFlags { + let mut rflags = kvm_arch_ops().get_rflags(self); + if self.guest_debug.contains(GuestDebug::SINGLESTEP) { + rflags.insert(RFlags::FLAGS_TF); + } + return rflags; + } + + fn is_linear_rip(&mut self, linear_rip: usize) -> bool { + return self.arch.get_linear_rip() == linear_rip as u64; + } + + pub fn get_regs(&mut self) -> KvmCommonRegs { + self.load(); + return self._get_regs(); + } + + fn _get_regs(&mut self) -> KvmCommonRegs { + KvmCommonRegs { + rax: self.arch.read_reg(KvmReg::VcpuRegsRax), + rbx: self.arch.read_reg(KvmReg::VcpuRegsRbx), + rcx: self.arch.read_reg(KvmReg::VcpuRegsRcx), + rdx: self.arch.read_reg(KvmReg::VcpuRegsRdx), + rsi: self.arch.read_reg(KvmReg::VcpuRegsRsi), + rdi: self.arch.read_reg(KvmReg::VcpuRegsRdi), + rsp: self.arch.read_reg(KvmReg::VcpuRegsRsp), + rbp: self.arch.read_reg(KvmReg::VcpuRegsRbp), + r8: self.arch.read_reg(KvmReg::VcpuRegsR8), + r9: self.arch.read_reg(KvmReg::VcpuRegsR9), + r10: self.arch.read_reg(KvmReg::VcpuRegsR10), + r11: self.arch.read_reg(KvmReg::VcpuRegsR11), + r12: self.arch.read_reg(KvmReg::VcpuRegsR12), + r13: self.arch.read_reg(KvmReg::VcpuRegsR13), + r14: self.arch.read_reg(KvmReg::VcpuRegsR14), + r15: self.arch.read_reg(KvmReg::VcpuRegsR15), + rip: self.arch.read_reg_raw(KvmReg::VcpuRegsRip), + rflags: self.get_rflags().bits(), + } + } + + pub fn get_segment_regs(&mut self) -> UapiKvmSegmentRegs { + self.load(); + return self._get_segment_regs(); + } + + fn _get_segment_regs(&mut self) -> UapiKvmSegmentRegs { + let mut sregs = self._get_segment_regs_common(); + + if self.arch.guest_state_protected { + return sregs; + } + + if self.arch.interrupt.injected && !self.arch.interrupt.soft { + BitMapCore::new().set( + sregs.interrupt_bitmap.len() * core::mem::size_of::(), + &mut sregs.interrupt_bitmap, + self.arch.interrupt.nr as usize, + true, + ); + } + + return sregs; + } + + fn read_cr3(&mut self) -> u64 { + if !self.arch.is_register_available(KvmReg::VcpuExregCr3) { + x86_kvm_ops().cache_reg(&mut self.arch, KvmReg::VcpuExregCr3); + } + return self.arch.cr3; + } + + fn kvm_get_segment(&mut self, segment: &mut UapiKvmSegment, seg: VcpuSegment) { + *segment = x86_kvm_ops().get_segment(self, *segment, seg); + } + + fn _get_segment_regs_common(&mut self) -> UapiKvmSegmentRegs { + let mut sregs = UapiKvmSegmentRegs::default(); + + if !self.arch.guest_state_protected { + let mut dt = DescriptorTablePointer::default(); + + self.kvm_get_segment(&mut sregs.cs, VcpuSegment::CS); + self.kvm_get_segment(&mut sregs.ds, VcpuSegment::DS); + self.kvm_get_segment(&mut sregs.es, VcpuSegment::ES); + self.kvm_get_segment(&mut sregs.fs, VcpuSegment::FS); + self.kvm_get_segment(&mut sregs.gs, VcpuSegment::GS); + self.kvm_get_segment(&mut sregs.ss, VcpuSegment::SS); + + self.kvm_get_segment(&mut sregs.tr, VcpuSegment::TR); + self.kvm_get_segment(&mut sregs.ldt, VcpuSegment::LDTR); + + x86_kvm_ops().get_idt(self, &mut dt); + sregs.idt.limit = dt.limit; + 
sregs.idt.base = dt.base as usize as u64; + + x86_kvm_ops().get_gdt(self, &mut dt); + sregs.gdt.limit = dt.limit; + sregs.gdt.base = dt.base as usize as u64; + + sregs.cr2 = self.arch.cr2; + sregs.cr3 = self.read_cr3(); + } + + sregs.cr0 = self.arch.read_cr0_bits(Cr0::all()).bits() as u64; + sregs.cr4 = self.arch.read_cr4_bits(Cr4::all()).bits() as u64; + sregs.cr8 = self.arch.get_cr8(); + sregs.efer = self.arch.efer.bits(); + sregs.apic_base = self.arch.apic_base; + + return sregs; + } + + pub fn set_segment_regs(&mut self, sregs: &mut UapiKvmSegmentRegs) -> Result<(), SystemError> { + self.load(); + self._set_segmenet_regs(&self.kvm().lock(), sregs)?; + Ok(()) + } + + fn _set_segmenet_regs( + &mut self, + vm: &Vm, + sregs: &mut UapiKvmSegmentRegs, + ) -> Result<(), SystemError> { + let mut mmu_reset_needed = false; + self._set_segmenet_regs_common(vm, sregs, &mut mmu_reset_needed, true)?; + + if mmu_reset_needed { + todo!() + } + + // KVM_NR_INTERRUPTS + let max_bits = 256; + + let pending_vec = BitMapCore::new().first_index(&sregs.interrupt_bitmap); + if let Some(pending) = pending_vec { + if pending < max_bits { + self.arch.queue_interrupt(pending as u8, false); + + self.request(VirtCpuRequest::KVM_REQ_EVENT); + } + } + + Ok(()) + } + + /// 设置段寄存器 + fn _set_segmenet_regs_common( + &mut self, + vm: &Vm, + sregs: &mut UapiKvmSegmentRegs, + mmu_reset_needed: &mut bool, + update_pdptrs: bool, + ) -> Result<(), SystemError> { + let mut apic_base_msr = MsrData::default(); + + if !self.is_valid_segment_regs(sregs) { + return Err(SystemError::EINVAL); + } + + apic_base_msr.data = sregs.apic_base; + apic_base_msr.host_initiated = true; + + // TODO: kvm_set_apic_base + + if self.arch.guest_state_protected { + return Ok(()); + } + + let mut dt: DescriptorTablePointer = DescriptorTablePointer { + limit: sregs.idt.limit, + base: sregs.idt.base as usize as *const u8, + }; + + x86_kvm_ops().set_idt(self, &dt); + + dt.limit = sregs.gdt.limit; + dt.base = sregs.gdt.base as usize as *const u8; + x86_kvm_ops().set_gdt(self, &dt); + + self.arch.cr2 = sregs.cr2; + *mmu_reset_needed |= self.read_cr3() != sregs.cr3; + + self.arch.cr3 = sregs.cr3; + + self.arch.mark_register_dirty(KvmReg::VcpuExregCr3); + + x86_kvm_ops().post_set_cr3(self, sregs.cr3); + + //debug!("_set_segmenet_regs_common 2:: cr3: {:#x}", self.arch.cr3); + + self.kvm_set_cr8(sregs.cr8); + + let efer = EferFlags::from_bits_truncate(sregs.efer); + *mmu_reset_needed |= self.arch.efer != efer; + x86_kvm_ops().set_efer(self, efer); + + let cr0 = Cr0::from_bits_truncate(sregs.cr0 as usize); + *mmu_reset_needed |= self.arch.cr0 != cr0; + x86_kvm_ops().set_cr0(vm, self, cr0); + self.arch.cr0 = cr0; + + let cr4 = Cr4::from_bits_truncate(sregs.cr4 as usize); + *mmu_reset_needed |= self.arch.read_cr4_bits(Cr4::all()) != cr4; + x86_kvm_ops().set_cr4(self, cr4); + + if update_pdptrs { + //todo!() + } + + x86_kvm_ops().set_segment(self, &mut sregs.cs, VcpuSegment::CS); + x86_kvm_ops().set_segment(self, &mut sregs.ds, VcpuSegment::DS); + x86_kvm_ops().set_segment(self, &mut sregs.es, VcpuSegment::ES); + x86_kvm_ops().set_segment(self, &mut sregs.fs, VcpuSegment::FS); + x86_kvm_ops().set_segment(self, &mut sregs.gs, VcpuSegment::GS); + x86_kvm_ops().set_segment(self, &mut sregs.ss, VcpuSegment::SS); + + x86_kvm_ops().set_segment(self, &mut sregs.tr, VcpuSegment::TR); + x86_kvm_ops().set_segment(self, &mut sregs.ldt, VcpuSegment::LDTR); + + // TODO: update_cr8_intercept + + if self.arch.is_bsp() + && self.arch.read_reg_raw(KvmReg::VcpuRegsRip) == 0xfff0 + 
&& sregs.cs.selector == 0xf000 + && sregs.cs.base == 0xffff0000 + && !self.arch.is_portected_mode() + { + self.arch.mp_state = MutilProcessorState::Runnable; + } + + Ok(()) + } + + pub fn kvm_set_cr8(&mut self, cr8: u64) { + // 先这样写 + self.arch.cr8 = cr8; + } + + fn is_valid_segment_regs(&self, sregs: &UapiKvmSegmentRegs) -> bool { + let efer = EferFlags::from_bits_truncate(sregs.efer); + let cr4 = Cr4::from_bits_truncate(sregs.cr4 as usize); + let cr0 = Cr0::from_bits_truncate(sregs.cr0 as usize); + + if efer.contains(EferFlags::LONG_MODE_ENABLE) && cr0.contains(Cr0::CR0_ENABLE_PAGING) { + if !cr4.contains(Cr4::CR4_ENABLE_PAE) || !efer.contains(EferFlags::LONG_MODE_ACTIVE) { + return false; + } + + // TODO: legal gpa? + } else if efer.contains(EferFlags::LONG_MODE_ACTIVE) || sregs.cs.l != 0 { + return false; + } + let ret = self.kvm_is_vaild_cr0(cr0) && self.kvm_is_vaild_cr4(cr4); + return ret; + } + + fn kvm_is_vaild_cr0(&self, cr0: Cr0) -> bool { + if cr0.contains(Cr0::CR0_NOT_WRITE_THROUGH) && !cr0.contains(Cr0::CR0_CACHE_DISABLE) { + return false; + } + + if cr0.contains(Cr0::CR0_ENABLE_PAGING) && !cr0.contains(Cr0::CR0_PROTECTED_MODE) { + return false; + } + let ret = x86_kvm_ops().is_vaild_cr0(self, cr0); + return ret; + } + + fn __kvm_is_valid_cr4(&self, cr4: Cr4) -> bool { + if cr4.contains(self.arch.cr4_guest_rsvd_bits) { + //debug!("__kvm_is_valid_cr4::here"); + //return false; + } + + return true; + } + + fn kvm_is_vaild_cr4(&self, cr4: Cr4) -> bool { + return self.__kvm_is_valid_cr4(cr4) && x86_kvm_ops().is_vaild_cr4(self, cr4); + } + + pub fn is_unrestricted_guest(&self) -> bool { + let guard = self.vmx().loaded_vmcs(); + return vmx_info().enable_unrestricted_guest + && (!self.arch.is_guest_mode() + || SecondaryControls::from_bits_truncate( + guard.controls_get(ControlsType::SecondaryExec) as u32, + ) + .contains(SecondaryControls::UNRESTRICTED_GUEST)); + } + + pub fn set_regs(&mut self, regs: &KvmCommonRegs) -> Result<(), SystemError> { + self.load(); + self._set_regs(regs); + Ok(()) + } + + fn _set_regs(&mut self, regs: &KvmCommonRegs) { + self.arch.emulate_regs_need_sync_from_vcpu = true; + self.arch.emulate_regs_need_sync_to_vcpu = false; + + self.arch.write_reg(KvmReg::VcpuRegsRax, regs.rax); + self.arch.write_reg(KvmReg::VcpuRegsRbx, regs.rbx); + self.arch.write_reg(KvmReg::VcpuRegsRcx, regs.rcx); + self.arch.write_reg(KvmReg::VcpuRegsRdx, regs.rdx); + self.arch.write_reg(KvmReg::VcpuRegsRsi, regs.rsi); + self.arch.write_reg(KvmReg::VcpuRegsRdi, regs.rdi); + self.arch.write_reg(KvmReg::VcpuRegsRsp, regs.rsp); + self.arch.write_reg(KvmReg::VcpuRegsRbp, regs.rbp); + + self.arch.write_reg(KvmReg::VcpuRegsR8, regs.r8); + self.arch.write_reg(KvmReg::VcpuRegsR9, regs.r9); + self.arch.write_reg(KvmReg::VcpuRegsR10, regs.r10); + self.arch.write_reg(KvmReg::VcpuRegsR11, regs.r11); + self.arch.write_reg(KvmReg::VcpuRegsR12, regs.r12); + self.arch.write_reg(KvmReg::VcpuRegsR13, regs.r13); + self.arch.write_reg(KvmReg::VcpuRegsR14, regs.r14); + self.arch.write_reg(KvmReg::VcpuRegsR15, regs.r15); + + self.arch.write_reg_raw(KvmReg::VcpuRegsRip, regs.rip); + + self.set_rflags(RFlags::from_bits_truncate(regs.rflags) | RFlags::FLAGS_A1); + + self.arch.exception.pending = false; + self.arch.exception_vmexit.pending = false; + + self.request(VirtCpuRequest::KVM_REQ_EVENT); + } + + pub fn load_guest_xsave_state(&mut self) { + if self.arch.guest_state_protected { + return; + } + + if !self.arch.read_cr4_bits(Cr4::CR4_ENABLE_OS_XSAVE).is_empty() { + if self.arch.xcr0 != 
x86_kvm_manager().host_xcr0 { + unsafe { _xsetbv(0, self.arch.xcr0.bits()) }; + } + + if self.arch.ia32_xss != x86_kvm_manager().host_xss { + // XSS + unsafe { wrmsr(0xda0, self.arch.ia32_xss) }; + } + } + + if CpuId::new().get_extended_feature_info().unwrap().has_pku() + && self.arch.pkru != self.arch.host_pkru + && (self.arch.xcr0.contains(Xcr0::XCR0_PKRU_STATE) + || !self + .arch + .read_cr4_bits(Cr4::CR4_ENABLE_PROTECTION_KEY) + .is_empty()) + { + KvmX86Asm::write_pkru(self.arch.pkru); + } + } + + pub fn load_pdptrs(&mut self) { + //let mmu = self.arch.mmu(); + if !self.arch.is_register_dirty(KvmReg::VcpuExregCr3) { + return; + } + //if self.arch.is_pae_paging(){ + let mmu = self.arch.mmu(); + + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[0]); + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[1]); + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[2]); + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[3]); + //}else{ + // debug!("load_pdptrs: not pae paging"); + //} + } +} + +bitflags! { + // pub struct VirtCpuRequest: u64 { + // const KVM_REQUEST_MASK = 0xFF; + + // const KVM_REQ_TLB_FLUSH = 0 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; + // const KVM_REQ_VM_DEAD = 1 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; + + // const KVM_REQUEST_NO_WAKEUP = 1 << 8; + // const KVM_REQUEST_WAIT = 1 << 9; + // const KVM_REQUEST_NO_ACTION = 1 << 10; + + // const KVM_REQ_MIGRATE_TIMER = kvm_arch_req(0); + // const KVM_REQ_REPORT_TPR_ACCESS = kvm_arch_req(1); + // const KVM_REQ_TRIPLE_FAULT = kvm_arch_req(2); + // const KVM_REQ_MMU_SYNC = kvm_arch_req(3); + // const KVM_REQ_CLOCK_UPDATE = kvm_arch_req(4); + // const KVM_REQ_LOAD_MMU_PGD = kvm_arch_req(5); + // const KVM_REQ_EVENT = kvm_arch_req(6); + // const KVM_REQ_APF_HALT = kvm_arch_req(7); + // const KVM_REQ_STEAL_UPDATE = kvm_arch_req(8); + // const KVM_REQ_NMI = kvm_arch_req(9); + // const KVM_REQ_PMU = kvm_arch_req(10); + // const KVM_REQ_PMI = kvm_arch_req(11); + // const KVM_REQ_SMI = kvm_arch_req(12); + + // const KVM_REQ_MASTERCLOCK_UPDATE = kvm_arch_req(13); + // const KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req_flags(14, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_SCAN_IOAPIC = kvm_arch_req_flags(15, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_GLOBAL_CLOCK_UPDATE = kvm_arch_req(16); + // const KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req_flags(17, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_HV_CRASH = kvm_arch_req(18); + // const KVM_REQ_IOAPIC_EOI_EXIT = kvm_arch_req(19); + // const KVM_REQ_HV_RESET = kvm_arch_req(20); + // const KVM_REQ_HV_EXIT = kvm_arch_req(21); + // const KVM_REQ_HV_STIMER = kvm_arch_req(22); + // const KVM_REQ_LOAD_EOI_EXITMAP = kvm_arch_req(23); + // const KVM_REQ_GET_NESTED_STATE_PAGES = kvm_arch_req(24); + // const KVM_REQ_APICV_UPDATE = kvm_arch_req_flags(25, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_TLB_FLUSH_CURRENT = kvm_arch_req(26); + + // const KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req_flags(27, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_APF_READY = kvm_arch_req(28); + // const KVM_REQ_MSR_FILTER_CHANGED = kvm_arch_req(29); + // const KVM_REQ_UPDATE_CPU_DIRTY_LOGGING = kvm_arch_req_flags(30, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req_flags(31, Self::KVM_REQUEST_WAIT.bits | 
Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_HV_TLB_FLUSH = kvm_arch_req_flags(32, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // } + + pub struct VirtCpuRequest: u64 { + // const KVM_REQUEST_MASK = 0xFF; + + const KVM_REQ_TLB_FLUSH = Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; + const KVM_REQ_VM_DEAD = 1; + + const KVM_REQUEST_NO_WAKEUP = 1 << 8; + const KVM_REQUEST_WAIT = 1 << 9; + const KVM_REQUEST_NO_ACTION = 1 << 10; + + const KVM_REQ_MIGRATE_TIMER = kvm_arch_req(0); + const KVM_REQ_REPORT_TPR_ACCESS = kvm_arch_req(1); + const KVM_REQ_TRIPLE_FAULT = kvm_arch_req(2); + const KVM_REQ_MMU_SYNC = kvm_arch_req(3); + const KVM_REQ_CLOCK_UPDATE = kvm_arch_req(4); + const KVM_REQ_LOAD_MMU_PGD = kvm_arch_req(5); + const KVM_REQ_EVENT = kvm_arch_req(6); + const KVM_REQ_APF_HALT = kvm_arch_req(7); + const KVM_REQ_STEAL_UPDATE = kvm_arch_req(8); + const KVM_REQ_NMI = kvm_arch_req(9); + const KVM_REQ_PMU = kvm_arch_req(10); + const KVM_REQ_PMI = kvm_arch_req(11); + const KVM_REQ_SMI = kvm_arch_req(12); + + const KVM_REQ_MASTERCLOCK_UPDATE = kvm_arch_req(13); + + const KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req(14); + const MAKE_KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req_flags(14, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_SCAN_IOAPIC = kvm_arch_req(15); + const MAKE_KVM_REQ_SCAN_IOAPIC = kvm_arch_req_flags(15, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + + const KVM_REQ_GLOBAL_CLOCK_UPDATE = kvm_arch_req(16); + + const KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req(17); + const MAKE_KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req_flags(17, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_HV_CRASH = kvm_arch_req(18); + const KVM_REQ_IOAPIC_EOI_EXIT = kvm_arch_req(19); + const KVM_REQ_HV_RESET = kvm_arch_req(20); + const KVM_REQ_HV_EXIT = kvm_arch_req(21); + const KVM_REQ_HV_STIMER = kvm_arch_req(22); + const KVM_REQ_LOAD_EOI_EXITMAP = kvm_arch_req(23); + const KVM_REQ_GET_NESTED_STATE_PAGES = kvm_arch_req(24); + + const KVM_REQ_APICV_UPDATE = kvm_arch_req(25); + const MAKE_KVM_REQ_APICV_UPDATE = kvm_arch_req_flags(25, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_TLB_FLUSH_CURRENT = kvm_arch_req(26); + + const KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req(27); + const MAKE_KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req_flags(27, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_APF_READY = kvm_arch_req(28); + const KVM_REQ_MSR_FILTER_CHANGED = kvm_arch_req(29); + + const KVM_REQ_UPDATE_CPU_DIRTY_LOGGING = kvm_arch_req(30); + const MAKE_KVM_REQ_UPDATE_CPU_DIRTY_LOGGING = kvm_arch_req_flags(30, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req(31); + const MAKE_KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req_flags(31, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_HV_TLB_FLUSH = kvm_arch_req(32); + const MAKE_KVM_REQ_HV_TLB_FLUSH = kvm_arch_req_flags(32, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + } +} + +// const KVM_REQUEST_ARCH_BASE: u64 = 8; +const KVM_REQUEST_ARCH_BASE: u64 = 11; + +const fn kvm_arch_req(nr: u64) -> u64 { + return kvm_arch_req_flags(nr, 0); +} + +const fn kvm_arch_req_flags(nr: u64, flags: u64) -> u64 { + 1 << (nr + KVM_REQUEST_ARCH_BASE) | flags +} + +#[derive(Debug, Default)] +pub struct KvmQueuedInterrupt { + pub injected: bool, + pub soft: bool, + pub nr: u8, +} + 
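+// Illustrative note (added for exposition; not part of the original change): with
+// KVM_REQUEST_ARCH_BASE = 11, kvm_arch_req(nr) above gives every architecture
+// request its own bit starting at bit 11, and kvm_arch_req_flags() ORs the
+// WAIT / NO_WAKEUP flag bits on top. For example:
+//
+//     const _: () = assert!(kvm_arch_req(0) == 1 << 11); // KVM_REQ_MIGRATE_TIMER
+//     const _: () = assert!(
+//         // MAKE_KVM_REQ_TLB_FLUSH_GUEST
+//         kvm_arch_req_flags(27, (1 << 9) | (1 << 8)) == (1 << 38) | (1 << 9) | (1 << 8)
+//     );
+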
+#[derive(Debug, Default)]
+#[allow(dead_code)]
+pub struct KvmQueuedException {
+    pending: bool,
+    injected: bool,
+    has_error_code: bool,
+    vector: u8,
+    error_code: u32,
+    payload: usize,
+    has_payload: bool,
+}
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct KvmAsyncPageFault {
+    /// Whether the vCPU is halted
+    halted: bool,
+    /// GFNs (Guest Frame Numbers) of outstanding async page faults
+    gfns: [u64; Self::ASYNC_PF_PER_VCPU],
+    /// Cache for GFN to HVA (Host Virtual Address) translation
+    data: GfnToHvaCache,
+    /// Value of the MSR_KVM_ASYNC_PF_EN register
+    msr_en_val: u64,
+    /// Value of the MSR_KVM_ASYNC_PF_INT register
+    msr_int_val: u64,
+    /// Async PF vector
+    vec: u16,
+    /// Async PF id
+    id: u32,
+    /// Whether faults are sent to user space only
+    send_user_only: bool,
+    /// Host APF flags
+    host_apf_flags: u32,
+    /// Whether the fault is delivered as a page-fault VM exit
+    delivery_as_pf_vmexit: bool,
+    /// Whether a page-ready event is pending
+    pageready_pending: bool,
+}
+
+impl KvmAsyncPageFault {
+    pub const ASYNC_PF_PER_VCPU: usize = 64;
+}
+
+#[derive(Debug)]
+pub enum KvmIntrType {
+    None,
+    Irq,
+    // Nmi,
+}
diff --git a/kernel/src/arch/x86_64/vm/mem.rs b/kernel/src/arch/x86_64/vm/mem.rs
new file mode 100644
index 00000000..b80be87d
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mem.rs
@@ -0,0 +1,24 @@
+use alloc::sync::Arc;
+use log::warn;
+use system_error::SystemError;
+
+use crate::virt::vm::kvm_host::{
+    mem::{KvmMemoryChangeMode, LockedKvmMemSlot},
+    Vm,
+};
+
+#[allow(dead_code)]
+pub struct KvmArchMemorySlot {}
+
+impl Vm {
+    pub fn arch_prepare_memory_region(
+        &self,
+        _old: Option<&Arc<LockedKvmMemSlot>>,
+        _new: Option<&Arc<LockedKvmMemSlot>>,
+        _change: KvmMemoryChangeMode,
+    ) -> Result<(), SystemError> {
+        // todo
+        warn!("arch_prepare_memory_region TODO");
+        Ok(())
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/mmu/kvm_mmu.rs b/kernel/src/arch/x86_64/vm/mmu/kvm_mmu.rs
new file mode 100644
index 00000000..7f5fa770
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mmu/kvm_mmu.rs
@@ -0,0 +1,648 @@
+use crate::arch::mm::X86_64MMArch;
+use crate::arch::vm::asm::VmxAsm;
+use crate::arch::vm::kvm_host::page::KVM_MIN_FREE_MMU_PAGES;
+use crate::mm::PhysAddr;
+use crate::virt::kvm::host_mem::PAGE_SHIFT;
+use crate::{
+    arch::{mm::LockedFrameAllocator, MMArch, VirtCpuArch},
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    mm::{page::PageMapper, MemoryManagementArch, PageTableKind},
+    virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
+};
+use alloc::{sync::Arc, vec::Vec};
+use bitfield_struct::bitfield;
+use core::intrinsics::likely;
+use core::ops::{Add, Sub};
+use log::{debug, error, warn};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::controlregs::{Cr0, Cr4};
+use x86::vmx::vmcs::guest;
+use x86_64::registers::control::EferFlags;
+
+use super::super::{vmx::vmx_info, x86_kvm_ops};
+use super::mmu_internal::KvmPageFault;
+
+const PT64_ROOT_5LEVEL: usize = 5;
+const PT64_ROOT_4LEVEL: usize = 4;
+const PT32_ROOT_LEVEL: usize = 2;
+const PT32E_ROOT_LEVEL: usize = 3;
+
+static mut TDP_ENABLED: bool = false;
+static mut TDP_MMU_ENABLED: bool = true;
+static mut TDP_MMU_ALLOWED: bool = unsafe { TDP_MMU_ENABLED };
+
+static mut TDP_ROOT_LEVEL: usize = 0;
+static mut MAX_TDP_LEVEL: usize = 0;
+static mut SHADOW_ACCESSED_MASK: usize = 0;
+
+static mut MAX_HUGE_PAGE_LEVEL: PageLevel = PageLevel::None;
+
+pub const PAGE_SIZE: u64 = 1 << PAGE_SHIFT;
+
+pub fn is_tdp_mmu_enabled() -> bool {
+    unsafe { TDP_MMU_ENABLED }
+}
+
+#[allow(dead_code)]
+#[repr(u8)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub enum PageLevel {
+    None,
+    Level4K,
+    Level2M,
+    Level1G,
+    Level512G,
+    LevelNum,
+}
+// Implement the Add trait
+impl Add<usize> for PageLevel {
+    type Output = Self;
+
+    fn add(self, other: usize) -> Self {
+        let result = self as usize + other;
+        match result {
+            0 => PageLevel::None,
+            1 => PageLevel::Level4K,
+            2 => PageLevel::Level2M,
+            3 => PageLevel::Level1G,
+            4 => PageLevel::Level512G,
+            5 => PageLevel::LevelNum,
+            _ => PageLevel::LevelNum, // saturate to LevelNum when out of range
+        }
+    }
+}
+// Implement the Sub trait
+impl Sub<usize> for PageLevel {
+    type Output = Self;
+
+    fn sub(self, other: usize) -> Self {
+        let result = self as isize - other as isize;
+        match result {
+            0 => PageLevel::None,
+            1 => PageLevel::Level4K,
+            2 => PageLevel::Level2M,
+            3 => PageLevel::Level1G,
+            4 => PageLevel::Level512G,
+            5 => PageLevel::LevelNum,
+            _ => PageLevel::None, // saturate to None when out of range
+        }
+    }
+}
+impl PageLevel {
+    fn kvm_hpage_gfn_shift(level: u8) -> u32 {
+        ((level - 1) * 9) as u32
+    }
+
+    fn kvm_hpage_shift(level: u8) -> u32 {
+        PAGE_SHIFT + Self::kvm_hpage_gfn_shift(level)
+    }
+
+    fn kvm_hpage_size(level: u8) -> u64 {
+        1 << Self::kvm_hpage_shift(level)
+    }
+    /// Compute the number of pages contained in a huge page
+    ///
+    /// # Arguments
+    /// - `level`: the page level
+    ///
+    /// # Returns
+    /// The number of pages per huge page at that level
+    pub fn kvm_pages_per_hpage(level: u8) -> u64 {
+        Self::kvm_hpage_size(level) / PAGE_SIZE
+    }
+}
+/// Round a GFN (Guest Frame Number) down to the alignment of the given level
+pub fn gfn_round_for_level(gfn: u64, level: u8) -> u64 {
+    gfn & !(PageLevel::kvm_pages_per_hpage(level) - 1)
+}
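+
+// Illustrative note (added for exposition; not part of the original change):
+// assuming PAGE_SHIFT == 12, the helpers above give, for level 2 (2 MiB pages):
+//
+//     kvm_hpage_gfn_shift(2) == 9
+//     kvm_hpage_shift(2)     == 21
+//     kvm_hpage_size(2)      == 1 << 21
+//     kvm_pages_per_hpage(2) == 512
+//
+// so gfn_round_for_level(0x12345, 2) == 0x12345 & !0x1ff == 0x12200.
+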
+#[derive(Debug)]
+pub struct LockedKvmMmu {
+    inner: SpinLock<KvmMmu>,
+}
+
+impl LockedKvmMmu {
+    pub fn new(mmu: KvmMmu) -> Arc<Self> {
+        Arc::new(Self {
+            inner: SpinLock::new(mmu),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<KvmMmu> {
+        self.inner.lock()
+    }
+}
+
+pub type KvmMmuPageFaultHandler =
+    fn(vcpu: &mut VirtCpu, page_fault: &KvmPageFault) -> Result<i32, SystemError>;
+
+#[derive(Debug, Default)]
+#[allow(dead_code)]
+pub struct KvmMmu {
+    pub root: KvmMmuRootInfo,
+    pub cpu_role: KvmCpuRole,
+    pub root_role: KvmMmuPageRole,
+    pub page_fault: Option<KvmMmuPageFaultHandler>,
+
+    pkru_mask: u32,
+
+    prev_roots: [KvmMmuRootInfo; Self::KVM_MMU_NUM_PREV_ROOTS],
+
+    pae_root: Vec<u64>,
+
+    pub pdptrs: [u64; 4],
+}
+
+impl KvmMmu {
+    pub fn _save_pdptrs(&mut self) {
+        self.pdptrs[0] = VmxAsm::vmx_vmread(guest::PDPTE0_FULL);
+        self.pdptrs[1] = VmxAsm::vmx_vmread(guest::PDPTE1_FULL);
+        self.pdptrs[2] = VmxAsm::vmx_vmread(guest::PDPTE2_FULL);
+        self.pdptrs[3] = VmxAsm::vmx_vmread(guest::PDPTE3_FULL);
+    }
+    const KVM_MMU_NUM_PREV_ROOTS: usize = 3;
+    pub const INVALID_PAGE: u64 = u64::MAX;
+
+    #[inline]
+    pub fn tdp_enabled() -> bool {
+        unsafe { TDP_ENABLED }
+    }
+
+    #[inline]
+    pub fn tdp_root_level() -> usize {
+        unsafe { TDP_ROOT_LEVEL }
+    }
+
+    #[inline]
+    pub fn max_tdp_level() -> usize {
+        unsafe { MAX_TDP_LEVEL }
+    }
+
+    #[inline]
+    pub fn ad_enabled() -> bool {
+        unsafe { SHADOW_ACCESSED_MASK != 0 }
+    }
+
+    /// Initialize the MMU configuration. The statics touched here are not locked,
+    /// so this function must only be called while initializing VMX.
+    pub fn kvm_configure_mmu(
+        enable_tdp: bool,
+        tdp_forced_root_level: usize,
+        tdp_max_root_level: usize,
+        tdp_huge_page_level: PageLevel,
+    ) {
+        unsafe {
+            TDP_ENABLED = enable_tdp;
+            TDP_ROOT_LEVEL = tdp_forced_root_level;
+            MAX_TDP_LEVEL = tdp_max_root_level;
+
+            TDP_MMU_ENABLED = TDP_MMU_ALLOWED && TDP_ENABLED;
+
+            if TDP_ENABLED {
+                MAX_HUGE_PAGE_LEVEL = tdp_huge_page_level;
+            } else if CpuId::new()
+                .get_extended_processor_and_feature_identifiers()
+                .unwrap()
+                .has_1gib_pages()
+            {
+                MAX_HUGE_PAGE_LEVEL = PageLevel::Level1G;
+            } else {
+                MAX_HUGE_PAGE_LEVEL = PageLevel::Level2M;
+            }
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct KvmMmuRootInfo {
+    pub pgd: u64,
+    pub hpa: u64,
+}
+
+#[derive(Debug, Default, Clone, Copy)]
+pub struct KvmCpuRole {
+    base: KvmMmuPageRole,
+    extend: KvmMmuExtenedRole,
+}
+
+impl PartialEq for KvmCpuRole {
+    fn eq(&self, other: &Self) -> bool {
+        self.base.0 == other.base.0 && self.extend.0 == other.extend.0
+    }
+}
+
+/// ### Tracks the properties of shadow pages (including TDP pages) to decide
+/// whether a page can be reused in a given MMU context.
+#[bitfield(u32)]
+pub struct KvmMmuPageRole {
+    /// Page-table level, 4 bits. For ordinary page tables the value is 2
+    /// (two-level), 3 (three-level), 4 (four-level) or 5 (five-level).
+    #[bits(4)]
+    pub level: u32,
+    /// Whether page-table entries are 4 bytes wide, 1 bit. Set under non-PAE paging.
+    has_4_byte_gpte: bool,
+    /// Quadrant of the page-table entry, 2 bits. Only meaningful when
+    /// has_4_byte_gpte is set.
+    #[bits(2)]
+    quadrant: u32,
+    /// Whether the page is direct-mapped
+    direct: bool,
+    /// Access permissions of the page
+    #[bits(3)]
+    access: u32,
+    /// Whether the page is invalid
+    invalid: bool,
+    /// Whether the page enables the NX (no-execute) bit
+    efer_nx: bool,
+    /// Whether the write-protect bit (WP) of CR0 is set
+    cr0_wp: bool,
+    /// SMEP (Supervisor Mode Execution Protection) combined with cleared WP
+    smep_andnot_wp: bool,
+    /// SMAP (Supervisor Mode Access Prevention) combined with cleared WP
+    smap_andnot_wp: bool,
+    /// Whether the accessed bit is disabled for the page
+    ad_disabled: bool,
+    /// Whether the page belongs to guest mode
+    guest_mode: bool,
+    /// Whether the page is passed through to the guest
+    passthrough: bool,
+    /// Unused bits
+    #[bits(5)]
+    unused: u32,
+    /// SMM (System Management Mode) indicator
+    #[bits(8)]
+    pub smm: u32,
+}
+
+impl KvmMmuPageRole {
+    pub fn is_cr0_pg(&self) -> bool {
+        self.level() > 0
+    }
+
+    pub fn is_cr4_pae(&self) -> bool {
+        !self.has_4_byte_gpte()
+    }
+    pub fn get_direct(&self) -> bool {
+        self.direct()
+    }
+}
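+
+// Illustrative note (added for exposition; not part of the original change):
+// `#[bitfield(u32)]` packs the fields above starting from the least significant
+// bit, so `level` occupies bits 0..=3, `has_4_byte_gpte` bit 4, `quadrant`
+// bits 5..=6, `direct` bit 7, `access` bits 8..=10 and `smm` bits 24..=31.
+// A quick sketch of the packing:
+//
+//     let mut role = KvmMmuPageRole::default();
+//     role.set_level(4);
+//     role.set_direct(true);
+//     assert_eq!(role.0, 0x84); // level = 4 in bits 0..=3, direct = bit 7
+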
+#[bitfield(u32)]
+pub struct KvmMmuExtenedRole {
+    valid: bool,
+    execonly: bool,
+    cr4_pse: bool,
+    cr4_pke: bool,
+    cr4_smap: bool,
+    cr4_smep: bool,
+    cr4_la57: bool,
+    efer_lma: bool,
+    #[bits(24)]
+    unused: u32,
+}
+
+pub struct KvmMmuRoleRegs {
+    pub cr0: Cr0,
+    pub cr4: Cr4,
+    pub efer: EferFlags,
+}
+
+/// Return values for page-fault handling, used by handle_mmio_page_fault(),
+/// mmu.page_fault(), fast_page_fault(), kvm_mmu_do_page_fault(), etc.
+#[derive(Debug, Eq, PartialEq, FromPrimitive, Clone)]
+#[repr(u32)]
+pub enum PFRet {
+    Continue,       // RET_PF_CONTINUE: so far so good, keep handling the page fault.
+    Retry,          // RET_PF_RETRY: let the CPU fault again on the address.
+    Emulate,        // RET_PF_EMULATE: MMIO page fault, emulate the instruction directly.
+    Invalid,        // RET_PF_INVALID: the SPTE is invalid, let the real page-fault path update it.
+    Fixed,          // RET_PF_FIXED: the faulting entry has been fixed.
+    Spurious,       // RET_PF_SPURIOUS: the faulting entry was already fixed, e.g. by another vCPU.
+    Err = u32::MAX, // error
+}
+impl From<PFRet> for i32 {
+    fn from(pf_ret: PFRet) -> Self {
+        pf_ret as i32
+    }
+}
+impl From<i32> for PFRet {
+    fn from(value: i32) -> Self {
+        match value {
+            0 => PFRet::Continue,
+            1 => PFRet::Retry,
+            2 => PFRet::Emulate,
+            3 => PFRet::Invalid,
+            4 => PFRet::Fixed,
+            5 => PFRet::Spurious,
+            _ => PFRet::Err, // everything else maps to Err
+        }
+    }
+}
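+
+// Illustrative note (added for exposition; not part of the original change):
+// the two conversions above invert each other on the named discriminants, and
+// `Err` (u32::MAX) shows up as a negative i32, which is what the `< 0` checks
+// in the fault handlers rely on:
+//
+//     assert_eq!(i32::from(PFRet::Emulate), 2);
+//     assert_eq!(PFRet::from(2), PFRet::Emulate);
+//     assert_eq!(i32::from(PFRet::Err), -1);
+//     assert_eq!(PFRet::from(-1), PFRet::Err);
+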
+impl VirtCpuArch {
+    pub fn kvm_init_mmu(&mut self) {
+        let regs = self.role_regs();
+        let cpu_role = self.calc_cpu_role(&regs);
+
+        if self.walk_mmu.is_some()
+            && self.nested_mmu.is_some()
+            && Arc::ptr_eq(
+                self.walk_mmu.as_ref().unwrap(),
+                self.nested_mmu.as_ref().unwrap(),
+            )
+        {
+            todo!()
+        } else if KvmMmu::tdp_enabled() {
+            self.init_tdp_mmu(cpu_role);
+        } else {
+            todo!()
+        }
+    }
+
+    fn unload_mmu(&mut self) {
+        // TODO
+    }
+
+    pub fn reset_mmu_context(&mut self) {
+        self.unload_mmu();
+        self.kvm_init_mmu();
+    }
+
+    fn role_regs(&mut self) -> KvmMmuRoleRegs {
+        KvmMmuRoleRegs {
+            cr0: self.read_cr0_bits(Cr0::CR0_ENABLE_PAGING | Cr0::CR0_WRITE_PROTECT),
+            cr4: self.read_cr4_bits(
+                Cr4::CR4_ENABLE_PSE
+                    | Cr4::CR4_ENABLE_PAE
+                    | Cr4::CR4_ENABLE_LA57
+                    | Cr4::CR4_ENABLE_SMEP
+                    | Cr4::CR4_ENABLE_SMAP
+                    | Cr4::CR4_ENABLE_PROTECTION_KEY,
+            ),
+            efer: self.efer,
+        }
+    }
+
+    fn calc_cpu_role(&self, regs: &KvmMmuRoleRegs) -> KvmCpuRole {
+        let mut role = KvmCpuRole::default();
+        let base = &mut role.base;
+        let ext = &mut role.extend;
+        base.set_access(0b111);
+        base.set_smm(self.is_smm() as u32);
+        base.set_guest_mode(self.is_guest_mode());
+        ext.set_valid(true);
+
+        if !regs.cr0.contains(Cr0::CR0_ENABLE_PAGING) {
+            base.set_direct(true);
+            return role;
+        }
+
+        base.set_efer_nx(regs.efer.contains(EferFlags::NO_EXECUTE_ENABLE));
+        base.set_cr0_wp(regs.cr0.contains(Cr0::CR0_WRITE_PROTECT));
+        base.set_smep_andnot_wp(
+            regs.cr4.contains(Cr4::CR4_ENABLE_SMEP) && !regs.cr0.contains(Cr0::CR0_WRITE_PROTECT),
+        );
+        base.set_smap_andnot_wp(
+            regs.cr4.contains(Cr4::CR4_ENABLE_SMAP) && !regs.cr0.contains(Cr0::CR0_WRITE_PROTECT),
+        );
+
+        base.set_has_4_byte_gpte(!regs.cr4.contains(Cr4::CR4_ENABLE_PAE));
+
+        if regs.efer.contains(EferFlags::LONG_MODE_ACTIVE) {
+            let level = if regs.cr4.contains(Cr4::CR4_ENABLE_LA57) {
+                PT64_ROOT_5LEVEL as u32
+            } else {
+                PT64_ROOT_4LEVEL as u32
+            };
+            base.set_level(level);
+        } else if regs.cr4.contains(Cr4::CR4_ENABLE_PAE) {
+            base.set_level(PT32E_ROOT_LEVEL as u32);
+        } else {
+            base.set_level(PT32_ROOT_LEVEL as u32);
+        }
+
+        ext.set_cr4_smep(regs.cr4.contains(Cr4::CR4_ENABLE_SMEP));
+        ext.set_cr4_smap(regs.cr4.contains(Cr4::CR4_ENABLE_SMAP));
+        ext.set_cr4_pse(regs.cr4.contains(Cr4::CR4_ENABLE_PSE));
+        ext.set_cr4_pke(
+            regs.efer.contains(EferFlags::LONG_MODE_ACTIVE)
+                && regs.cr4.contains(Cr4::CR4_ENABLE_PROTECTION_KEY),
+        );
+        ext.set_cr4_la57(
+            regs.efer.contains(EferFlags::LONG_MODE_ACTIVE)
+                && regs.cr4.contains(Cr4::CR4_ENABLE_LA57),
+        );
+        ext.set_efer_lma(regs.efer.contains(EferFlags::LONG_MODE_ACTIVE));
+
+        role
+    }
+
+    /// https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/mmu/mmu.c#6019
+    pub fn vcpu_arch_mmu_create(&mut self) {
+        if vmx_info().tdp_enabled() {
+            self.guset_mmu = Some(self._mmu_create());
+        }
+
+        self.root_mmu = Some(self._mmu_create());
+        self.mmu = self.root_mmu.clone();
+        self.walk_mmu = self.root_mmu.clone();
+    }
+
+    fn _mmu_create(&self) -> Arc<LockedKvmMmu> {
+        let mut mmu = KvmMmu::default();
+
+        mmu.root.hpa = KvmMmu::INVALID_PAGE;
+        mmu.root.pgd = 0;
+
+        for role in &mut mmu.prev_roots {
+            role.hpa = KvmMmu::INVALID_PAGE;
+            role.pgd = KvmMmu::INVALID_PAGE;
+        }
+
+        if KvmMmu::tdp_enabled() && self.mmu_get_tdp_level() > PT32E_ROOT_LEVEL {
+            return LockedKvmMmu::new(mmu);
+        }
+
+        mmu.pae_root
+            .resize(MMArch::PAGE_SIZE / core::mem::size_of::<u64>(), 0);
+
+        return LockedKvmMmu::new(mmu);
+    }
+
+    fn mmu_get_tdp_level(&self) -> usize {
+        if KvmMmu::tdp_root_level() != 0 {
+            return KvmMmu::tdp_root_level();
+        }
+
+        if KvmMmu::max_tdp_level() == 5 && self.max_phyaddr <= 48 {
+            return 4;
+        }
+
+        return KvmMmu::max_tdp_level();
+    }
+
+    pub fn init_tdp_mmu(&mut self, cpu_role: KvmCpuRole) {
+        let context = self.root_mmu();
+        let mut context = context.lock();
+        let root_role = self.calc_tdp_mmu_root_page_role(cpu_role);
+
+        if cpu_role == context.cpu_role && root_role.0 == context.root_role.0 {
+            return;
+        }
+
+        context.cpu_role = cpu_role;
+        context.root_role = root_role;
+
+        // TODO: install the function table
+
+        if !context.cpu_role.base.is_cr0_pg() {
+            // todo: context->gva_to_gpa = nonpaging_gva_to_gpa;
+            warn!("context->gva_to_gpa = nonpaging_gva_to_gpa todo!");
+        } else if context.cpu_role.base.is_cr4_pae() {
+            // todo: context->gva_to_gpa = paging64_gva_to_gpa;
+            warn!("context->gva_to_gpa = paging64_gva_to_gpa todo!");
+        } else {
+            // todo: context->gva_to_gpa = paging32_gva_to_gpa;
+            warn!("context->gva_to_gpa = paging32_gva_to_gpa todo!");
+        }
+
+        // todo:
+        // reset_guest_paging_metadata(vcpu, context);
+        // reset_tdp_shadow_zero_bits_mask(context);
+    }
+    #[inline]
+    pub fn root_mmu(&self) -> &Arc<LockedKvmMmu> {
+        self.root_mmu.as_ref().unwrap()
+    }
+
+    #[inline]
+    pub fn mmu(&self) -> SpinLockGuard<KvmMmu> {
+        self.mmu.as_ref().unwrap().lock()
+    }
+
+    fn calc_tdp_mmu_root_page_role(&self, cpu_role: KvmCpuRole) -> KvmMmuPageRole {
+        let mut role = KvmMmuPageRole::default();
+
+        role.set_access(0b111);
+        role.set_cr0_wp(true);
+        role.set_efer_nx(true);
+        role.set_smm(cpu_role.base.smm());
+        role.set_guest_mode(cpu_role.base.guest_mode());
+        role.set_ad_disabled(!KvmMmu::ad_enabled());
+        role.set_level(self.mmu_get_tdp_level() as u32);
+        role.set_direct(true);
+        role.set_has_4_byte_gpte(false);
+
+        role
+    }
+}
+
+impl VirtCpu {
+    pub fn kvm_mmu_reload(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        if likely(self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE) {
+            return Ok(());
+        }
+
+        return self.kvm_mmu_load(vm);
+    }
+
+    pub fn kvm_mmu_load(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let direct = self.arch.mmu().root_role.direct();
+        self.mmu_topup_memory_caches(!direct)?;
+        self.mmu_alloc_special_roots()?;
+
+        if direct {
+            self.mmu_alloc_direct_roots(vm)?;
+        } else {
+            self.mmu_alloc_shadow_roots(vm)?;
+        }
+
+        // TODO: kvm_mmu_sync_roots
+
+        self.kvm_mmu_load_pgd(vm);
+
+        Ok(())
+    }
+
+    pub fn kvm_mmu_load_pgd(&mut self, vm: &Vm) {
+        let root_hpa = self.arch.mmu().root.hpa;
+        debug!("kvm_mmu_load_pgd::root_hpa = {:#x}", root_hpa);
+        if root_hpa == KvmMmu::INVALID_PAGE {
+            return;
+        }
+
+        let level = self.arch.mmu().root_role.level();
+        x86_kvm_ops().load_mmu_pgd(self, vm, root_hpa, level);
+    }
+
+    fn mmu_topup_memory_caches(&mut self, _maybe_indirect: bool) -> Result<(), SystemError> {
+        // TODO
+        Ok(())
+    }
+
+    fn mmu_alloc_special_roots(&mut self) -> Result<(), SystemError> {
+        // TODO
+        Ok(())
+    }
+
+    fn mmu_alloc_direct_roots(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let shadow_root_level = self.arch.mmu().root_role.level();
+        let _r: Result<(), SystemError> = self.make_mmu_pages_available(vm);
+        let root: PhysAddr;
+        if KvmMmu::tdp_enabled() {
+            root = self.kvm_tdp_mmu_get_vcpu_root_hpa().unwrap();
+            let mut mmu = self.arch.mmu();
+            mmu.root.hpa = root.data() as u64;
+        } else if shadow_root_level >= PT64_ROOT_4LEVEL as u32 {
+            todo!()
+        } else if shadow_root_level == PT32E_ROOT_LEVEL as u32 {
+            todo!()
+        } else {
+            error!("Bad TDP root level = {}", shadow_root_level);
+            return Err(SystemError::EIO);
+        }
+        /* root.pgd is ignored for direct MMUs. */
+        self.arch.mmu().root.pgd = 0;
+        Ok(())
+    }
+
+    fn mmu_alloc_shadow_roots(&mut self, _vm: &Vm) -> Result<(), SystemError> {
+        todo!();
+    }
+    fn make_mmu_pages_available(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let avail = Self::kvm_mmu_available_pages(vm);
+        if likely(avail >= KVM_MIN_FREE_MMU_PAGES) {
+            return Ok(());
+        }
+        //kvm_mmu_zap_oldest_mmu_pages(vm, KVM_REFILL_PAGES - avail);
+        if Self::kvm_mmu_available_pages(vm) == 0 {
+            return Err(SystemError::ENOSPC);
+        }
+        Ok(())
+    }
+    fn kvm_mmu_available_pages(vm: &Vm) -> usize {
+        if vm.arch.n_max_mmu_pages > vm.arch.n_used_mmu_pages {
+            return vm.arch.n_max_mmu_pages - vm.arch.n_used_mmu_pages;
+        }
+        return 0;
+    }
+    fn kvm_tdp_mmu_get_vcpu_root_hpa(&self) -> Result<PhysAddr, SystemError> {
+        // todo: Check for an existing root before allocating a new one. Note, the
+        // role check prevents consuming an invalid root.
+        let root = self.tdp_mmu_alloc_sp().unwrap();
+        Ok(PhysAddr::new(root as usize))
+    }
+    fn tdp_mmu_alloc_sp(&self) -> Result<u64, SystemError> {
+        // Allocate and build a new page table
+        let mapper: crate::mm::page::PageMapper = unsafe {
+            PageMapper::create(PageTableKind::EPT, LockedFrameAllocator)
+                .ok_or(SystemError::ENOMEM)?
+        };
+
+        let ept_root_hpa = mapper.table().phys();
+
+        self.arch.mmu().root.hpa = ept_root_hpa.data() as u64;
+
+        debug!("ept_root_hpa:{:x}!", ept_root_hpa.data() as u64);
+
+        return Ok(self.arch.mmu().root.hpa);
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs
new file mode 100644
index 00000000..876b4c64
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs
@@ -0,0 +1,396 @@
+use crate::mm::page::EntryFlags;
+use alloc::sync::Arc;
+use core::{intrinsics::unlikely, ops::Index};
+use log::{debug, warn};
+use x86::vmx::vmcs::{guest, host};
+
+use system_error::SystemError;
+
+use crate::{
+    arch::{
+        vm::{
+            asm::VmxAsm,
+            kvm_host::{EmulType, KVM_PFN_NOSLOT},
+            mmu::kvm_mmu::{PFRet, PageLevel},
+            mtrr::kvm_mtrr_check_gfn_range_consistency,
+            vmx::{ept::EptPageMapper, PageFaultErr},
+        },
+        MMArch,
+    },
+    mm::PhysAddr,
+    virt::{
+        kvm::host_mem::PAGE_SHIFT,
+        vm::kvm_host::{
+            mem::{LockedKvmMemSlot, LockedVmMemSlotSet, UserMemRegionFlag, __gfn_to_pfn_memslot},
+            search_memslots,
+            vcpu::VirtCpu,
+            Vm,
+        },
+    },
+};
+
+use super::kvm_mmu::{gfn_round_for_level, is_tdp_mmu_enabled, KvmMmuPageRole};
+
+#[allow(dead_code)]
+#[derive(Debug, Default)]
+pub struct KvmMmuPage {
+    pub tdp_mmu_page: bool, // whether this is a TDP (Two-Dimensional Paging) page-table page
+    pub gfn: u64,           // guest frame number (GFN)
+
+    /*
+     * The following two entries are used to key the shadow page in the
+     * hash table. (How they are used is not yet clear in this port.)
+     */
+    pub role: KvmMmuPageRole,
+    pub spt: u64, // pointer to the page-table entries (SPTEs)
+    pub mmu_seq: u64,
+    pub map_writable: bool,
+    pub write_fault_to_shadow_pgtable: bool,
+}
+#[allow(dead_code)]
+#[derive(Debug, Default)]
+pub struct KvmPageFault {
+    // Arguments to vcpu.do_page_fault
+
+    // addr is the GPA handed in by the guest OS
+    addr: PhysAddr,
+    error_code: u32,
+    prefetch: bool,
+
+    // Derived from error_code
+    exec: bool,
+    write: bool,
+    present: bool,
+    rsvd: bool,
+    user: bool,
+
+    // Derived from the MMU and global state
+    is_tdp: bool,
+    nx_huge_page_workaround_enabled: bool,
+
+    // Whether a mapping larger than 4KB can be created, or whether it is
+    // forbidden due to NX huge pages
+    huge_page_disallowed: bool,
+
+    // The largest page size that can be created for this fault
+    max_level: u8,
+
+    // The page size that can be created based on max_level and the page size
+    // used by the host mapping
+    req_level: u8,
+
+    // The page size that will be created based on req_level and huge_page_disallowed
+    goal_level: u8,
+
+    // The shifted addr, or the result of the guest page-table walk if addr is a GVA
+    gfn: u64, // gfn_t is usually a 64-bit value
+
+    // The memslot containing gfn. May be None.
+    slot: Option<Arc<LockedKvmMemSlot>>,
+
+    // Outputs of kvm_faultin_pfn
+    mmu_seq: u64,
+
+    // kvm_pfn_t is usually a 64-bit value; knowing it amounts to knowing the HPA
+    pfn: u64,
+    hva: u64, // hva_t is usually a 64-bit value
+    map_writable: bool,
+
+    // Indicates the guest is trying to write a gfn that contains one or more
+    // PTEs used to translate the write itself
+    write_fault_to_shadow_pgtable: bool,
+}
+#[allow(dead_code)]
+impl KvmPageFault {
+    pub fn pfn(&self) -> u64 {
+        self.pfn
+    }
+    pub fn gfn(&self) -> u64 {
+        self.gfn
+    }
+    pub fn gpa(&self) -> u64 {
+        self.addr.data() as u64
+    }
+    pub fn hva(&self) -> u64 {
+        self.hva
+    }
+}
+
+impl VirtCpu {
+    #[inline(never)]
+    pub fn page_fault(
+        &mut self,
+        vm: &Vm,
+        cr2_or_gpa: u64,
+        mut error_code: u64,
+        _insn: Option,
+        _insn_len: usize,
+    ) -> Result<i32, SystemError> {
+        let emulation_type = EmulType::PF;
+        let _direct = self.arch.mmu().root_role.get_direct();
+        if error_code & PageFaultErr::PFERR_IMPLICIT_ACCESS.bits() != 0 {
+            warn!("Implicit access error code detected");
+            error_code &= !PageFaultErr::PFERR_IMPLICIT_ACCESS.bits();
+        }
+
+        //if self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE {
+        //    return Ok(PFRet::Retry as u64);
+        //}
+
+        let mut r = PFRet::Invalid;
+        if unlikely(error_code & PageFaultErr::PFERR_RSVD.bits() != 0) {
+            todo!();
+            // r = self.handle_mmio_page_fault(cr2_or_gpa, direct)?;
+            // if r == PFRet::Emulate {
+            //     return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len);
+            // }
+        }
+
+        if r == PFRet::Invalid {
+            r = self
+                .do_page_fault(
+                    vm,
+                    cr2_or_gpa,
+                    (error_code & 0xFFFFFFFF) as u32,
+                    false,
+                    emulation_type,
+                )?
+                .into();
+            if r == PFRet::Invalid {
+                return Err(SystemError::EIO);
+            }
+        }
+
+        if i32::from(r.clone()) < 0 {
+            return Ok(i32::from(r));
+        }
+        if r != PFRet::Emulate {
+            return Ok(1);
+        }
+
+        // Before emulating the instruction, check whether the error code was due
+        // to a read-only (RO) violation while translating the guest page. This can
+        // happen with nested virtualization plus nested paging; if so, simply
+        // unprotect the page and resume the guest.
+        let pferr_nested_guest_page = PageFaultErr::PFERR_GUEST_PAGE
+            | PageFaultErr::PFERR_WRITE
+            | PageFaultErr::PFERR_PRESENT;
+        if self.arch.mmu().root_role.get_direct()
+            && (error_code & pferr_nested_guest_page.bits()) == pferr_nested_guest_page.bits()
+        {
+            todo!()
+        }
+
+        // self.arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
+        // optimistically try to unprotect the page and let the processor re-execute
+        // the instruction that caused the page fault. Retrying MMIO emulation is not
+        // allowed, since it is not only pointless but could also put us in an
+        // infinite loop: the processor would keep faulting on the non-existent MMIO
+        // address. Retrying an instruction from a nested guest is also pointless and
+        // dangerous, because we only explicitly shadow L1's page tables, i.e.
+        // unprotecting something for L1 is not going to magically fix whatever made
+        // L2 fail.
+        // if !self.mmio_info_in_cache(cr2_or_gpa, direct) && !self.arch.is_guest_mode() {
+        //     emulation_type |= EmulType::ALLOW_RETRY_PF;
+        // }
+
+        // self.emulate_instruction(cr2_or_gpa, emulation_type, insn, insn_len)
+        todo!("emulate_instruction")
+    }
+    fn do_page_fault(
+        &mut self,
+        vm: &Vm,
+        cr2_or_gpa: u64,
+        error_code: u32,
+        prefetch: bool,
+        mut emultype: EmulType,
+    ) -> Result<i32, SystemError> {
+        // Initialize the page fault
+        let mut page_fault = KvmPageFault {
+            addr: PhysAddr::new(cr2_or_gpa as usize),
+            error_code,
+            exec: error_code & PageFaultErr::PFERR_FETCH.bits() as u32 != 0,
+            write: error_code & PageFaultErr::PFERR_WRITE.bits() as u32 != 0,
+            present: error_code & PageFaultErr::PFERR_PRESENT.bits() as u32 != 0,
+            rsvd: error_code & PageFaultErr::PFERR_RSVD.bits() as u32 != 0,
+            user: error_code & PageFaultErr::PFERR_USER.bits() as u32 != 0,
+            prefetch,
+            is_tdp: true,
+            nx_huge_page_workaround_enabled: false, //todo
+            max_level: PageLevel::Level1G as u8,
+            req_level: PageLevel::Level4K as u8,
+            goal_level: PageLevel::Level4K as u8,
+            ..Default::default()
+        };
+        // Handle the direct-map case
+        if self.arch.mmu().root_role.get_direct() {
+            page_fault.gfn = (page_fault.addr.data() >> PAGE_SHIFT) as u64;
+            debug!("page_fault.addr.data() : 0x{:x}", page_fault.addr.data());
+            debug!("do_page_fault : gfn = 0x{:x}", page_fault.gfn);
+            page_fault.slot = self.gfn_to_memslot(page_fault.gfn, vm); // kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); incomplete
+        }
+        // Async page faults (async #PF), also known as prefetch faults, are not
+        // faults from the guest's point of view and have already been counted
+        // when the original fault occurred.
+        if !prefetch {
+            self.stat.pf_taken += 1;
+        }
+
+        let r = if page_fault.is_tdp {
+            self.tdp_page_fault(vm, &mut page_fault).unwrap()
+        } else {
+            // tdp_page_fault is currently the only page-fault handler, so this
+            // branch is never taken.
+            let handle = self.arch.mmu().page_fault.unwrap();
+            handle(self, &page_fault).unwrap()
+        };
+
+        if page_fault.write_fault_to_shadow_pgtable {
+            emultype |= EmulType::WRITE_PF_TO_SP;
+        }
+        // As above, prefetch faults aren't truly spurious, and the async
+        // page-fault path doesn't do emulation. Do, however, count faults that
+        // are fixed by the async page-fault handler, otherwise they would never
+        // be counted.
+        match PFRet::from(r) {
+            PFRet::Fixed => self.stat.pf_fixed += 1,
+            PFRet::Emulate => self.stat.pf_emulate += 1,
+            PFRet::Spurious => self.stat.pf_spurious += 1,
+            _ => {}
+        }
+        debug!("do_page_fault return r = {}", r);
+        Ok(r)
+    }
+
+    pub fn gfn_to_memslot(&self, gfn: u64, vm: &Vm) -> Option<Arc<LockedKvmMemSlot>> {
+        let slot_set: Arc<LockedVmMemSlotSet> = self.kvm_vcpu_memslots(vm);
+        //...todo
+
+        search_memslots(slot_set, gfn)
+    }
+    pub fn kvm_vcpu_memslots(&self, vm: &Vm) -> Arc<LockedVmMemSlotSet> {
+        vm.memslots.index(0).clone()
+    }
+    fn tdp_page_fault(
+        &mut self,
+        vm: &Vm,
+        page_fault: &mut KvmPageFault,
+    ) -> Result<i32, SystemError> {
+        // If shadow_memtype_mask is set and the VM has non-coherent DMA:
+        //if shadow_memtype_mask != 0 && self.kvm().lock().arch.noncoherent_dma_count > 0 {
+        while page_fault.max_level > PageLevel::Level4K as u8 {
+            let page_num = PageLevel::kvm_pages_per_hpage(page_fault.max_level);
+
+            // Align the GFN down to this level
+            let base = gfn_round_for_level(page_fault.gfn, page_fault.max_level);
+
+            // Check that memory types are consistent across the GFN range (not yet implemented)
+            if kvm_mtrr_check_gfn_range_consistency(self, base, page_num) {
+                break;
+            }
+
+            page_fault.max_level -= 1;
+        }
+        //}
+
+        if is_tdp_mmu_enabled() {
+            return self.kvm_tdp_mmu_page_fault(vm, page_fault);
+        }
+
+        // Normally unreachable, since we support EPT
+        self.direct_page_fault(page_fault)
+    }
+    fn kvm_tdp_mmu_page_fault(
+        &self,
+        vm: &Vm,
+        page_fault: &mut KvmPageFault,
+    ) -> Result<i32, SystemError> {
+        //page_fault_handle_page_track(page_fault)
+        //fast_page_fault(page_fault);
+        //mmu_topup_memory_caches(false);
+        let mut r = self
+            .kvm_faultin_pfn(vm, page_fault, 1 | 1 << 1 | 1 << 2)
+            .unwrap();
+        if r != PFRet::Continue {
+            return Ok(r.into());
+        }
+
+        //r = PFRet::Retry;
+
+        //if self.is_page_fault_stale(page_fault) {return;}
+
+        // The actual mapping
+        r = self.tdp_mmu_map(page_fault)?.into();
+
+        Ok(r.into())
+    }
+    // Huge-page handling is not implemented
+    fn tdp_mmu_map(&self, page_fault: &mut KvmPageFault) -> Result<i32, SystemError> {
+        // let ret = PFRet::Retry; // The logic below differs from Linux; checking
+        // the return value may be problematic.
+        let mut mapper = EptPageMapper::lock();
+        debug!("{:?}", &page_fault);
+        // flags: rwx
+        let page_flags: EntryFlags = unsafe { EntryFlags::from_data(0xb77) };
+        mapper.map(PhysAddr::new(page_fault.gpa() as usize), page_flags);
+        //debug_eptp();
+
+        debug!("The ept_root_addr is {:?}", EptPageMapper::root_page_addr());
+        // TODO: update some parameters
+        Ok(PFRet::Fixed.into())
+        //todo!()
+    }
+
+    fn direct_page_fault(&self, _page_fault: &KvmPageFault) -> Result<i32, SystemError> {
+        todo!()
+    }
+
+    fn kvm_faultin_pfn(
+        &self,
+        vm: &Vm,
+        page_fault: &mut KvmPageFault,
+        _access: u32,
+    ) -> Result<PFRet, SystemError> {
+        page_fault.mmu_seq = vm.mmu_invalidate_seq;
+        self.__kvm_faultin_pfn(page_fault)
+    }
+    fn __kvm_faultin_pfn(&self, page_fault: &mut KvmPageFault) -> Result<PFRet, SystemError> {
+        let slot = &page_fault.slot;
+        let mut is_async = false;
+        if slot.is_none() {
+            return Err(SystemError::KVM_HVA_ERR_BAD);
+        }
+        let slot = slot.as_ref().unwrap().read();
+
+        if slot.get_flags().bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 {
+            return Ok(PFRet::Retry);
+        }
+        if !slot.is_visible() {
+            /* Don't expose private memslots to L2. */
+            if self.arch.is_guest_mode() {
+                drop(slot);
+                page_fault.slot = None;
+                page_fault.pfn = KVM_PFN_NOSLOT;
+                page_fault.map_writable = false;
+                return Ok(PFRet::Continue);
+            }
+        }
+
+        // Try to translate the GFN to a PFN
+        let guest_cr3 = VmxAsm::vmx_vmread(guest::CR3);
+        let host_cr3 = VmxAsm::vmx_vmread(host::CR3);
+        debug!("guest_cr3={:x}, host_cr3={:x}", guest_cr3, host_cr3);
+        page_fault.pfn = __gfn_to_pfn_memslot(
+            Some(&slot),
+            page_fault.gfn,
+            (false, &mut is_async),
+            false,
+            page_fault.write,
+            &mut page_fault.map_writable,
+            &mut page_fault.hva,
+        )?;
+
+        if !is_async {
+            return Ok(PFRet::Continue); /* *pfn already has the correct page */
+        }
+
+        // if !page_fault.prefetch && self.kvm_can_do_async_pf() {
+        //     self.trace_kvm_try_async_get_page(page_fault.addr, page_fault.gfn);
+        //     if self.kvm_find_async_pf_gfn(page_fault.gfn) {
+        //         self.trace_kvm_async_pf_repeated_fault(page_fault.addr, page_fault.gfn);
+        //         self.kvm_make_request(KVM_REQ_APF_HALT);
+        //         return Ok(PFRet::Retry);
+        //     } else if self.kvm_arch_setup_async_pf(page_fault.addr,
page_fault.gfn) { + // return Ok(PFRet::Retry); + // } + // } + Ok(PFRet::Continue) + } +} diff --git a/kernel/src/arch/x86_64/vm/mmu/mod.rs b/kernel/src/arch/x86_64/vm/mmu/mod.rs new file mode 100644 index 00000000..8e486f3a --- /dev/null +++ b/kernel/src/arch/x86_64/vm/mmu/mod.rs @@ -0,0 +1,3 @@ +pub mod kvm_mmu; +pub mod mmu_internal; +pub mod tdp_iter; diff --git a/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs new file mode 100644 index 00000000..76d305b7 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs @@ -0,0 +1,219 @@ +// use crate::{ +// arch::vm::mmu::mmu::gfn_round_for_level, +// mm::{virt_2_phys, PhysAddr, VirtAddr}, +// time::sleep, +// virt::kvm::host_mem::PAGE_SHIFT, +// }; + +// use super::{ +// mmu::{PageLevel, PAGE_SIZE}, +// mmu_internal::KvmMmuPage, +// }; + +// pub const PT64_ROOT_MAX_LEVEL: usize = 5; //通常只用到4级,但是确实有5级的情况 +// pub const PT_LEVEL_BITS: u8 = 9; // 每个页表级别的位数 +// pub const PT64_ENT_PER_PAGE: u32 = 1 << 9; +// pub const PTE_LEN: usize = 64; + +// //Bits 51:12 are from the EPT PDPTE +// pub const PT64_BASE_ADDR_MASK: u64 = ((1u64 << 52) - 1) & !(PAGE_SIZE - 1); + +// pub fn shadow_pt_index(addr: u64, level: u8) -> u64 { +// (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1) +// } +// pub fn is_last_spte(pte: u64, level: u8) -> bool { +// level == PageLevel::Level4K as u8 || is_large_pte(pte) +// } +// pub fn is_shadow_present_pte(pte: u64) -> bool { +// pte & 1 << 11 != 0 //在intel手冊中:ept PTE:11 Ignored.不是很懂 +// } +// pub fn is_large_pte(pte: u64) -> bool { +// pte & 1 << 7 != 0 //在intel手冊中:ept PTE:7 Ignored. +// } +// ///Bits 51:12 are from the EPT PDPTE +// pub fn spte_to_pfn(pte: u64) -> u64 { +// (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT +// } + +// #[derive(Default)] +// pub struct TdpIter { +// inner: TdpIterInner, +// } + +// impl TdpIter { +// pub fn start( +// &self, +// root_pt: usize, +// root_level: u8, +// min_level: u8, +// next_last_level_gfn: u64, +// ) -> Self { +// let mut inner = self.inner.clone(); +// inner.start(root_pt, root_level, min_level, next_last_level_gfn); +// TdpIter { inner } +// } +// } +// ///迭代器将遍历分页结构,直到找到此 GFN 的映射。 +// #[derive(Default, Clone)] +// pub struct TdpIterInner { +// next_last_level_gfn: u64, +// /// 线程上次让出时的 next_last_level_gfn。 +// /// 仅当 next_last_level_gfn != yielded_gfn 时让出,有助于确保前进。 +// pub yielded_gfn: u64, + +// ///指向遍历到当前 SPTE 的页表的指针 +// pt_path: [u64; PT64_ROOT_MAX_LEVEL], + +// ///指向当前 SPTE 的指针 是hva吗? +// sptep: PhysAddr, + +// /// 当前 SPTE 映射的最低 GFN hpa>>shift? 
+// pub gfn: u64, + +// ///给迭代器的根页级别 +// pub root_level: u8, + +// ///迭代器应遍历到的最低级别 +// pub min_level: u8, + +// ///迭代器在分页结构中的当前级别 +// pub level: u8, + +// ///sptep 处值的快照 +// pub old_spte: u64, + +// ///迭代器是否具有有效状态。如果迭代器走出分页结构的末端,则为 false。 +// /// +// pub valid: bool, +// } +// impl TdpIterInner { +// ///初始化ept iter +// #[inline(never)] +// pub fn start( +// &mut self, +// root_pt: usize, +// root_level: u8, +// min_level: u8, +// next_last_level_gfn: u64, +// ) { +// // if root_pt.role.level() == 0 || root_pt.role.level() > PT64_ROOT_MAX_LEVEL as u32 { +// // self.valid = false; +// // return; +// // } + +// if root_level < 1 || root_level > PT64_ROOT_MAX_LEVEL as u8 { +// self.valid = false; +// return; +// } +// self.next_last_level_gfn = next_last_level_gfn; +// self.root_level = root_level as u8; +// self.min_level = min_level as u8; +// self.pt_path[(self.root_level - 1) as usize] = root_pt as u64; +// self.yielded_gfn = self.next_last_level_gfn; +// self.level = self.root_level; + +// self.gfn = gfn_round_for_level(self.next_last_level_gfn, self.level); +// self.tdp_iter_refresh_sptep(); +// self.valid = true; +// } + +// /* +// * 重新计算当前GFN和level和SPTE指针,并重新读取SPTE。 +// */ +// fn tdp_iter_refresh_sptep(&mut self) { +// // self.sptep = PhysAddr::new( +// // (self.pt_path[self.level as usize - 1] +// // + shadow_pt_index(self.gfn << PAGE_SHIFT, self.level)) as usize, +// // ); +// // self.old_spte = read_sptep(self.sptep); +// } + +// pub fn _next(&mut self) { +// if self.try_step_down() { +// return; +// } +// loop { +// if self.try_step_side() { +// return; +// } +// if !self.try_step_up() { +// break; +// } +// } +// self.valid = false; +// } +// ///在分页结构中向目标GFN下降一级。如果迭代器能够下降一级,则返回true,否则返回false。 +// fn try_step_down(&mut self) -> bool { +// if self.level == self.min_level { +// return false; +// } +// //在下降之前重新读取SPTE,以避免遍历到不再从此条目链接的页表中。 +// self.old_spte = read_sptep(self.sptep); + +// match spte_to_child_pt(self.old_spte, self.level) { +// Some(child_pt) => { +// self.level -= 1; +// self.pt_path[self.level as usize - 1] = child_pt.data() as u64; +// self.gfn = gfn_round_for_level(self.gfn, self.level); +// self.tdp_iter_refresh_sptep(); +// true +// } +// None => false, +// } +// } +// fn try_step_up(&mut self) -> bool { +// if self.level == self.root_level { +// return false; +// } +// self.level += 1; +// self.gfn = gfn_round_for_level(self.gfn, self.level); +// self.tdp_iter_refresh_sptep(); +// true +// } +// ///在当前页表的当前级别中,移动到下一个条目。下一个条目可以指向一个page backing guest memory , +// ///或者另一个页表,或者它可能是不存在的。如果迭代器能够移动到页表中的下一个条目,则返回true, +// ///如果迭代器已经在当前页表的末尾,则返回false。 +// fn try_step_side(&mut self) -> bool { +// //检查迭代器是否已经在当前页表的末尾。 +// if shadow_pt_index(self.gfn << PAGE_SHIFT, self.level) == (PT64_ENT_PER_PAGE - 1) as u64 { +// return false; +// } + +// self.gfn += PageLevel::kvm_pages_per_hpage(self.level); +// self.next_last_level_gfn = self.gfn; +// self.sptep.add(PTE_LEN); //指向下一个spte,一个spte占64位 +// self.old_spte = read_sptep(self.sptep); +// true +// } +// } +// impl Iterator for TdpIter { +// type Item = TdpIterInner; // 返回 (gfn, spte) 元组 + +// fn next(&mut self) -> Option { +// let inner = &mut self.inner; +// if !inner.valid { +// return None; +// } +// inner._next(); +// if inner.valid { +// Some(inner.clone()) +// } else { +// None +// } +// } +// } +// ///给定一个 SPTE 及其级别,返回一个指针,该指针包含 SPTE 所引用的子页表的hva。 +// ///如果没有这样的条目,则返回 null。 +// /// +// fn spte_to_child_pt(spte: u64, level: u8) -> Option { +// //没有子页表 +// if !is_shadow_present_pte(spte) || is_last_spte(spte, level) { +// 
return None;
+// }
+// Some(VirtAddr::new(virt_2_phys //__va
+// ((spte_to_pfn(spte) << PAGE_SHIFT) as usize)))
+// }
+
+// fn read_sptep(sptep: PhysAddr) -> u64 {
+// unsafe { *(sptep.data() as *const u64) }
+// }
diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs
new file mode 100644
index 00000000..62d57c10
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mod.rs
@@ -0,0 +1,640 @@
+use alloc::vec::Vec;
+use log::{error, warn};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::{
+    controlregs::{cr4, xcr0, Cr0, Cr4, Xcr0},
+    msr::{self, rdmsr, wrmsr},
+};
+use x86_64::registers::control::{Efer, EferFlags};
+
+use crate::{
+    arch::vm::vmx::{VmxL1dFlushState, L1TF_VMX_MITIGATION},
+    libs::once::Once,
+    mm::percpu::{PerCpu, PerCpuVar},
+};
+
+use self::{
+    asm::{hyperv::*, kvm_msr::*, ArchCapabilities, VmxMsrEntry},
+    kvm_host::{KvmFunc, KvmInitFunc},
+};
+
+use super::driver::tsc::TSCManager;
+
+mod asm;
+mod cpuid;
+pub(super) mod exit;
+pub mod kvm_host;
+pub mod mem;
+pub mod mmu;
+pub mod mtrr;
+pub mod uapi;
+pub mod vmx;
+
+static mut KVM_X86_MANAGER: Option<KvmArchManager> = None;
+
+pub fn x86_kvm_ops() -> &'static dyn KvmFunc {
+    unsafe { KVM_X86_MANAGER.as_ref().unwrap().funcs() }
+}
+
+pub fn x86_kvm_manager() -> &'static KvmArchManager {
+    unsafe { KVM_X86_MANAGER.as_ref().unwrap() }
+}
+
+pub fn x86_kvm_manager_mut() -> &'static mut KvmArchManager {
+    unsafe { KVM_X86_MANAGER.as_mut().unwrap() }
+}
+
+pub fn init_kvm_arch() {
+    static ONCE: Once = Once::new();
+    ONCE.call_once(|| unsafe {
+        KVM_X86_MANAGER = Some(KvmArchManager::init());
+
+        let mut user_return_msrs = Vec::new();
+        user_return_msrs.resize(PerCpu::MAX_CPU_NUM as usize, KvmUserReturnMsrs::default());
+        USER_RETURN_MSRS = Some(PerCpuVar::new(user_return_msrs).unwrap());
+    })
+}
+
+/// fixme: Do these fields need locking?
+#[derive(Debug)]
+pub struct KvmArchManager {
+    funcs: Option<&'static dyn KvmFunc>,
+    host_xcr0: Xcr0,
+    host_efer: EferFlags,
+    host_xss: u64,
+    host_arch_capabilities: u64,
+    kvm_uret_msrs_list: Vec<u32>,
+    kvm_caps: KvmCapabilities,
+    max_tsc_khz: u64,
+    msrs_to_save: Vec<u32>,
+    emulated_msrs: Vec<u32>,
+    msr_based_features: Vec<u32>,
+
+    has_noapic_vcpu: bool,
+
+    enable_pmu: bool,
+
+    // read-only
+    possible_cr0_guest: Cr0,
+    possible_cr4_guest: Cr4,
+    cr4_tlbflush_bits: Cr4,
+    cr4_pdptr_bits: Cr4,
+}
+
+impl KvmArchManager {
+    pub fn init() -> Self {
+        Self {
+            possible_cr0_guest: Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT,
+            possible_cr4_guest: Cr4::CR4_VIRTUAL_INTERRUPTS
+                | Cr4::CR4_DEBUGGING_EXTENSIONS
+                | Cr4::CR4_ENABLE_PPMC
+                | Cr4::CR4_ENABLE_SSE
+                | Cr4::CR4_UNMASKED_SSE
+                | Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_TIME_STAMP_DISABLE
+                | Cr4::CR4_ENABLE_FSGSBASE,
+
+            cr4_tlbflush_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_ENABLE_PCID
+                | Cr4::CR4_ENABLE_PAE
+                | Cr4::CR4_ENABLE_SMEP,
+
+            cr4_pdptr_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_ENABLE_PSE
+                | Cr4::CR4_ENABLE_PAE
+                | Cr4::CR4_ENABLE_SMEP,
+
+            host_xcr0: Xcr0::empty(),
+
+            funcs: Default::default(),
+            host_efer: EferFlags::empty(),
+            host_xss: Default::default(),
+            host_arch_capabilities: Default::default(),
+            kvm_uret_msrs_list: Default::default(),
+            kvm_caps: Default::default(),
+            max_tsc_khz: Default::default(),
+            msrs_to_save: Default::default(),
+            emulated_msrs: Default::default(),
+            msr_based_features: Default::default(),
+            has_noapic_vcpu: Default::default(),
+            enable_pmu: Default::default(),
+        }
+    }
+
+    #[inline]
+    pub fn set_runtime_func(&mut self, funcs: &'static dyn KvmFunc) {
+        self.funcs = Some(funcs);
+    }
+
+    #[inline]
+    pub fn funcs(&self) -> &'static dyn KvmFunc {
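+        // Assumes a vendor module already registered itself through
+        // set_runtime_func() (done in vendor_init()); unwrap() panics otherwise.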
self.funcs.unwrap() + } + + pub fn find_user_return_msr_idx(&self, msr: u32) -> Option { + for (i, val) in self.kvm_uret_msrs_list.iter().enumerate() { + if *val == msr { + return Some(i); + } + } + + None + } + + pub fn mpx_supported(&self) -> bool { + self.kvm_caps.supported_xcr0 & (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE) + == (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE) + } + + pub const KVM_MAX_VCPUS: usize = 1024; + pub const KVM_MAX_NR_USER_RETURN_MSRS: usize = 7; + + const MSRS_TO_SAVE_BASE: &[u32] = &[ + msr::IA32_SYSENTER_CS, + msr::IA32_SYSENTER_ESP, + msr::IA32_SYSENTER_EIP, + msr::IA32_STAR, + msr::IA32_CSTAR, + msr::IA32_KERNEL_GSBASE, + msr::IA32_FMASK, + msr::IA32_LSTAR, + msr::IA32_TIME_STAMP_COUNTER, + msr::IA32_PAT, + 0xc0010117, // MSR_VM_HSAVE_PA? + msr::IA32_FEATURE_CONTROL, + msr::MSR_C1_PMON_EVNT_SEL0, + msr::IA32_TSC_AUX, + 0x48, // MSR_IA32_SPEC_CTRL + msr::MSR_IA32_TSX_CTRL, + msr::MSR_IA32_RTIT_CTL, + msr::MSR_IA32_RTIT_STATUS, + msr::MSR_IA32_CR3_MATCH, + msr::MSR_IA32_RTIT_OUTPUT_BASE, + msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS, + msr::MSR_IA32_ADDR0_START, + msr::MSR_IA32_ADDR0_END, + msr::MSR_IA32_ADDR1_START, + msr::MSR_IA32_ADDR1_END, + msr::MSR_IA32_ADDR2_START, + msr::MSR_IA32_ADDR2_END, + msr::MSR_IA32_ADDR3_START, + msr::MSR_IA32_ADDR3_END, + 0xe1, // MSR_IA32_UMWAIT_CONTROL + 0x1c4, // MSR_IA32_XFD + 0x1c5, // MSR_IA32_XFD_ERR + ]; + + const EMULATED_MSRS_ALL: &[u32] = &[ + MSR_KVM_SYSTEM_TIME, + MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, + MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, + HV_X64_MSR_HYPERCALL, + HV_REGISTER_TIME_REF_COUNT, + HV_REGISTER_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, + HV_X64_MSR_APIC_FREQUENCY, + HV_REGISTER_CRASH_P0, + HV_REGISTER_CRASH_P1, + HV_REGISTER_CRASH_P2, + HV_REGISTER_CRASH_P3, + HV_REGISTER_CRASH_P4, + HV_REGISTER_CRASH_CTL, + HV_X64_MSR_RESET, + HV_REGISTER_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_REGISTER_SCONTROL, + HV_REGISTER_STIMER0_CONFIG, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, + HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_TSC_INVARIANT_CONTROL, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, + HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, + HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, + MSR_KVM_ASYNC_PF_EN, + MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, + MSR_KVM_ASYNC_PF_INT, + MSR_KVM_ASYNC_PF_ACK, + msr::IA32_TSC_ADJUST, + msr::IA32_TSC_DEADLINE, + msr::IA32_PERF_CAPABILITIES, + 0x10a, // MSR_IA32_ARCH_CAPABILITIES, + msr::IA32_MISC_ENABLE, + msr::IA32_MCG_STATUS, + msr::IA32_MCG_CTL, + 0x4d0, // MSR_IA32_MCG_EXT_CTL, + msr::IA32_SMBASE, + msr::MSR_SMI_COUNT, + msr::MSR_PLATFORM_INFO, + 0x140, // MSR_MISC_FEATURES_ENABLES, + 0xc001011f, // MSR_AMD64_VIRT_SPEC_CTRL, + 0xc0000104, // MSR_AMD64_TSC_RATIO, + msr::MSR_POWER_CTL, + msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV, + /* + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). 
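+     * The "true" variants (IA32_VMX_TRUE_*) additionally advertise which
+     * default-1 control bits a VMM is allowed to clear.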
+ */ + msr::IA32_VMX_BASIC, + msr::IA32_VMX_TRUE_PINBASED_CTLS, + msr::IA32_VMX_TRUE_PROCBASED_CTLS, + msr::IA32_VMX_TRUE_EXIT_CTLS, + msr::IA32_VMX_TRUE_ENTRY_CTLS, + msr::IA32_VMX_MISC, + msr::IA32_VMX_CR0_FIXED0, + msr::IA32_VMX_CR4_FIXED0, + msr::IA32_VMX_VMCS_ENUM, + msr::IA32_VMX_PROCBASED_CTLS2, + msr::IA32_VMX_EPT_VPID_CAP, + msr::IA32_VMX_VMFUNC, + 0xc0010015, // MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, + ]; + + const MSR_BASED_FEATURES_ALL_EXCEPT_VMX: &[u32] = &[ + 0xc0011029, // MSR_AMD64_DE_CFG + msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV + 0x10a, // MSR_IA32_ARCH_CAPABILITIES, + msr::IA32_PERF_CAPABILITIES, + ]; + + pub fn arch_hardware_enable(&self) -> Result<(), SystemError> { + self.online_user_return_msr(); + + x86_kvm_ops().hardware_enable()?; + + // TODO: 这里需要对TSC进行一系列检测 + + Ok(()) + } + + /// ## 初始化当前cpu的kvm msr寄存器 + fn online_user_return_msr(&self) { + let user_return_msrs = user_return_msrs().get_mut(); + + for (idx, msr) in self.kvm_uret_msrs_list.iter().enumerate() { + let val = unsafe { rdmsr(*msr) }; + user_return_msrs.values[idx].host = val; + user_return_msrs.values[idx].curr = val; + } + } + + /// 厂商相关的init工作 + pub fn vendor_init(&mut self, init_ops: &'static dyn KvmInitFunc) -> Result<(), SystemError> { + let cpuid = CpuId::new(); + let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?; + let cpu_extend = cpuid.get_extended_state_info().ok_or(SystemError::ENOSYS)?; + let extend_features = cpuid + .get_extended_feature_info() + .ok_or(SystemError::ENOSYS)?; + + let kvm_x86_ops = &self.funcs; + + // 是否已经设置过 + if kvm_x86_ops.is_some() { + error!( + "[KVM] already loaded vendor module {}", + kvm_x86_ops.unwrap().name() + ); + return Err(SystemError::EEXIST); + } + + // 确保cpu支持fpu浮点数处理器 + if !cpu_feature.has_fpu() || !cpu_feature.has_fxsave_fxstor() { + error!("[KVM] inadequate fpu"); + return Err(SystemError::ENOSYS); + } + + // TODO:实时内核需要判断tsc + // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9472 + + // 读取主机page attribute table(页属性表) + let host_pat = unsafe { rdmsr(msr::IA32_PAT) }; + // PAT[0]是否为write back类型,即判断低三位是否为0b110(0x06) + if host_pat & 0b111 != 0b110 { + error!("[KVM] host PAT[0] is not WB"); + return Err(SystemError::EIO); + } + + // TODO:mmu vendor init + if cpu_feature.has_xsave() && unsafe { cr4() }.contains(Cr4::CR4_ENABLE_OS_XSAVE) { + self.host_xcr0 = unsafe { xcr0() }; + self.kvm_caps.supported_xcr0 = self.host_xcr0; + } + + // 保存efer + self.host_efer = Efer::read(); + + // 保存xss + if cpu_extend.has_xsaves_xrstors() { + self.host_xss = unsafe { rdmsr(msr::MSR_C5_PMON_BOX_CTRL) }; + } + + // TODO: 初始化性能监视单元(PMU) + // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9518 + if extend_features.has_sha() { + self.host_arch_capabilities = unsafe { + // MSR_IA32_ARCH_CAPABILITIES + rdmsr(0x10a) + } + } + + init_ops.hardware_setup()?; + + self.set_runtime_func(init_ops.runtime_funcs()); + + self.kvm_timer_init()?; + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9544 + + let kvm_caps = &mut self.kvm_caps; + if !cpu_extend.has_xsaves_xrstors() { + kvm_caps.supported_xss = 0; + } + + if kvm_caps.has_tsc_control { + kvm_caps.max_guest_tsc_khz = 0x7fffffff.min( + ((kvm_caps.max_tsc_scaling_ratio as i128 * TSCManager::tsc_khz() as i128) + >> kvm_caps.tsc_scaling_ratio_frac_bits) as u32, + ); + } + + kvm_caps.default_tsc_scaling_ratio = 1 << kvm_caps.tsc_scaling_ratio_frac_bits; + self.kvm_init_msr_lists(); + + warn!("vendor init over"); + Ok(()) + } + + fn 
kvm_init_msr_lists(&mut self) { + self.msrs_to_save.clear(); + self.emulated_msrs.clear(); + self.msr_based_features.clear(); + + for msr in Self::MSRS_TO_SAVE_BASE { + self.kvm_probe_msr_to_save(*msr); + } + + if self.enable_pmu { + todo!() + } + + for msr in Self::EMULATED_MSRS_ALL { + if !x86_kvm_ops().has_emulated_msr(*msr) { + continue; + } + self.emulated_msrs.push(*msr); + } + + for msr in msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC { + self.kvm_prove_feature_msr(msr) + } + + for msr in Self::MSR_BASED_FEATURES_ALL_EXCEPT_VMX { + self.kvm_prove_feature_msr(*msr); + } + } + + fn kvm_probe_msr_to_save(&mut self, msr: u32) { + let cpuid = CpuId::new(); + let cpu_feat = cpuid.get_feature_info().unwrap(); + let cpu_extend = cpuid.get_extended_feature_info().unwrap(); + + match msr { + msr::MSR_C1_PMON_EVNT_SEL0 => { + if !cpu_extend.has_mpx() { + return; + } + } + + msr::IA32_TSC_AUX => { + if !cpu_feat.has_tsc() { + return; + } + } + // MSR_IA32_UNWAIT_CONTROL + 0xe1 => { + if !cpu_extend.has_waitpkg() { + return; + } + } + msr::MSR_IA32_RTIT_CTL | msr::MSR_IA32_RTIT_STATUS => { + if !cpu_extend.has_processor_trace() { + return; + } + } + msr::MSR_IA32_CR3_MATCH => { + // TODO: 判断intel_pt_validate_hw_cap(PT_CAP_cr3_filtering) + if !cpu_extend.has_processor_trace() { + return; + } + } + msr::MSR_IA32_RTIT_OUTPUT_BASE | msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS => { + // TODO: 判断!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&!intel_pt_validate_hw_cap(PT_CAP_single_range_output) + if !cpu_extend.has_processor_trace() { + return; + } + } + msr::MSR_IA32_ADDR0_START..msr::MSR_IA32_ADDR3_END => { + // TODO: 判断msr_index - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) + if !cpu_extend.has_processor_trace() { + return; + } + } + msr::IA32_PMC0..msr::IA32_PMC7 => { + // TODO: 判断msr是否符合配置 + } + msr::IA32_PERFEVTSEL0..msr::IA32_PERFEVTSEL7 => { + // TODO: 判断msr是否符合配置 + } + msr::MSR_PERF_FIXED_CTR0..msr::MSR_PERF_FIXED_CTR2 => { + // TODO: 判断msr是否符合配置 + } + msr::MSR_IA32_TSX_CTRL => { + // TODO: !(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR) + // 这个寄存器目前不支持,现在先return + // return; + } + _ => {} + } + + self.msrs_to_save.push(msr); + } + + fn kvm_prove_feature_msr(&mut self, index: u32) { + let mut msr = VmxMsrEntry { + index, + reserved: Default::default(), + data: Default::default(), + }; + + if self.get_msr_feature(&mut msr) { + return; + } + + self.msr_based_features.push(index); + } + + fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool { + match msr.index { + 0x10a => { + // MSR_IA32_ARCH_CAPABILITIES, + msr.data = self.get_arch_capabilities(); + } + msr::IA32_PERF_CAPABILITIES => { + msr.data = self.kvm_caps.supported_perf_cap; + } + msr::IA32_BIOS_SIGN_ID => { + // MSR_IA32_UCODE_REV + msr.data = unsafe { rdmsr(msr.index) }; + } + _ => { + return x86_kvm_ops().get_msr_feature(msr); + } + } + + return true; + } + + fn get_arch_capabilities(&self) -> u64 { + let mut data = ArchCapabilities::from_bits_truncate(self.host_arch_capabilities) + & ArchCapabilities::KVM_SUPPORTED_ARCH_CAP; + data.insert(ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO); + + if *L1TF_VMX_MITIGATION.read() != VmxL1dFlushState::Never { + data.insert(ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH); + } + + // fixme:这里是直接赋值,这里应该是需要判断cpu是否存在某些bug + + data.insert( + ArchCapabilities::ARCH_CAP_RDCL_NO + | ArchCapabilities::ARCH_CAP_SSB_NO + | ArchCapabilities::ARCH_CAP_MDS_NO + | ArchCapabilities::ARCH_CAP_GDS_NO, + ); + + return data.bits(); + } + + pub fn add_user_return_msr(&mut self, 
msr: u32) { + assert!(self.kvm_uret_msrs_list.len() < Self::KVM_MAX_NR_USER_RETURN_MSRS); + self.kvm_uret_msrs_list.push(msr) + } + + fn kvm_timer_init(&mut self) -> Result<(), SystemError> { + let cpuid = CpuId::new(); + let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?; + if cpu_feature.has_tsc() { + self.max_tsc_khz = TSCManager::tsc_khz(); + } + + // TODO:此处未完成 + Ok(()) + } + + pub fn kvm_set_user_return_msr(&self, slot: usize, mut value: u64, mask: u64) { + let msrs = user_return_msrs().get_mut(); + + value = (value & mask) | (msrs.values[slot].host & !mask); + if value == msrs.values[slot].curr { + return; + } + + unsafe { wrmsr(self.kvm_uret_msrs_list[slot], value) }; + + msrs.values[slot].curr = value; + + if !msrs.registered { + msrs.registered = true; + } + } +} + +/// ### Kvm的功能特性 +#[derive(Debug)] +pub struct KvmCapabilities { + /// 是否支持控制客户机的 TSC(时间戳计数器)速率 + has_tsc_control: bool, + /// 客户机可以使用的 TSC 的最大速率,以khz为单位 + max_guest_tsc_khz: u32, + /// TSC 缩放比例的小数部分的位数 + tsc_scaling_ratio_frac_bits: u8, + /// TSC 缩放比例的最大允许值 + max_tsc_scaling_ratio: u64, + /// 默认的 TSC 缩放比例,其值为 1ull << tsc_scaling_ratio_frac_bits + default_tsc_scaling_ratio: u64, + /// 是否支持总线锁定的退出 + has_bus_lock_exit: bool, + /// 是否支持 VM 退出通知 + has_notify_vmexit: bool, + /// 支持的 MCE(机器检查异常)功能的位掩码 + supported_mce_cap: McgCap, + /// 支持的 XCR0 寄存器的位掩码 + supported_xcr0: Xcr0, + /// 支持的 XSS(XSAVE Extended State)寄存器的位掩码 + supported_xss: u64, + /// 支持的性能监控功能的位掩码 + supported_perf_cap: u64, +} + +impl Default for KvmCapabilities { + fn default() -> Self { + Self { + has_tsc_control: Default::default(), + max_guest_tsc_khz: Default::default(), + tsc_scaling_ratio_frac_bits: Default::default(), + max_tsc_scaling_ratio: Default::default(), + default_tsc_scaling_ratio: Default::default(), + has_bus_lock_exit: Default::default(), + has_notify_vmexit: Default::default(), + supported_mce_cap: McgCap::MCG_CTL_P | McgCap::MCG_SER_P, + supported_xcr0: Xcr0::empty(), + supported_xss: Default::default(), + supported_perf_cap: Default::default(), + } + } +} + +bitflags! 
{
+    pub struct McgCap: u64 {
+        const MCG_BANKCNT_MASK = 0xff; /* Number of Banks */
+        const MCG_CTL_P = 1 << 8; /* MCG_CTL register available */
+        const MCG_EXT_P = 1 << 9; /* Extended registers available */
+        const MCG_CMCI_P = 1 << 10; /* CMCI supported */
+        const MCG_EXT_CNT_MASK = 0xff0000; /* Number of Extended registers */
+        const MCG_EXT_CNT_SHIFT = 16;
+        const MCG_SER_P = 1 << 24; /* MCA recovery/new status bits */
+        const MCG_ELOG_P = 1 << 26; /* Extended error log supported */
+        const MCG_LMCE_P = 1 << 27; /* Local machine check supported */
+    }
+}
+
+static mut USER_RETURN_MSRS: Option<PerCpuVar<KvmUserReturnMsrs>> = None;
+
+fn user_return_msrs() -> &'static PerCpuVar<KvmUserReturnMsrs> {
+    unsafe { USER_RETURN_MSRS.as_ref().unwrap() }
+}
+
+#[derive(Debug, Default, Clone)]
+struct KvmUserReturnMsrs {
+    pub registered: bool,
+    pub values: [KvmUserReturnMsrsValues; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS],
+}
+
+#[derive(Debug, Default, Clone)]
+struct KvmUserReturnMsrsValues {
+    pub host: u64,
+    pub curr: u64,
+}
diff --git a/kernel/src/arch/x86_64/vm/mtrr.rs b/kernel/src/arch/x86_64/vm/mtrr.rs
new file mode 100644
index 00000000..c5a7560d
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mtrr.rs
@@ -0,0 +1,37 @@
+use crate::virt::vm::kvm_host::vcpu::VirtCpu;
+
+use super::kvm_host::gfn_to_gpa;
+
+pub fn kvm_mtrr_check_gfn_range_consistency(_vcpu: &mut VirtCpu, gfn: u64, page_num: u64) -> bool {
+    // let mtrr_state = &vcpu.arch.mtrr_state;
+    // let mut iter = MtrrIter {
+    //     mem_type: -1,
+    //     mtrr_disabled: false,
+    //     partial_map: false,
+    // };
+    let _start = gfn_to_gpa(gfn);
+    let _end = gfn_to_gpa(gfn + page_num);
+
+    // mtrr_for_each_mem_type(&mut iter, mtrr_state, start, end, |iter| {
+    //     if iter.mem_type == -1 {
+    //         iter.mem_type = iter.mem_type;
+    //     } else if iter.mem_type != iter.mem_type {
+    //         return false;
+    //     }
+    // });
+
+    // if iter.mtrr_disabled {
+    //     return true;
+    // }
+
+    // if !iter.partial_map {
+    //     return true;
+    // }
+
+    // if iter.mem_type == -1 {
+    //     return true;
+    // }
+
+    // iter.mem_type == mtrr_default_type(mtrr_state)
+    true
+}
diff --git a/kernel/src/arch/x86_64/vm/uapi.rs b/kernel/src/arch/x86_64/vm/uapi.rs
new file mode 100644
index 00000000..c7a8ccc2
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/uapi.rs
@@ -0,0 +1,102 @@
+#![allow(dead_code)]
+
+use crate::virt::vm::user_api::UapiKvmSegment;
+
+pub const DE_VECTOR: usize = 0;
+pub const DB_VECTOR: usize = 1;
+pub const BP_VECTOR: usize = 3;
+pub const OF_VECTOR: usize = 4;
+pub const BR_VECTOR: usize = 5;
+pub const UD_VECTOR: usize = 6;
+pub const NM_VECTOR: usize = 7;
+pub const DF_VECTOR: usize = 8;
+pub const TS_VECTOR: usize = 10;
+pub const NP_VECTOR: usize = 11;
+pub const SS_VECTOR: usize = 12;
+pub const GP_VECTOR: usize = 13;
+pub const PF_VECTOR: usize = 14;
+pub const MF_VECTOR: usize = 16;
+pub const AC_VECTOR: usize = 17;
+pub const MC_VECTOR: usize = 18;
+pub const XM_VECTOR: usize = 19;
+pub const VE_VECTOR: usize = 20;
+
+pub const KVM_SYNC_X86_REGS: u64 = 1 << 0;
+pub const KVM_SYNC_X86_SREGS: u64 = 1 << 1;
+pub const KVM_SYNC_X86_EVENTS: u64 = 1 << 2;
+
+pub const KVM_SYNC_X86_VALID_FIELDS: u64 =
+    KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS | KVM_SYNC_X86_EVENTS;
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmSegmentRegs {
+    pub cs: UapiKvmSegment,
+    pub ds: UapiKvmSegment,
+    pub es: UapiKvmSegment,
+    pub fs: UapiKvmSegment,
+    pub gs: UapiKvmSegment,
+    pub ss: UapiKvmSegment,
+    pub tr: UapiKvmSegment,
+    pub ldt: UapiKvmSegment,
+    pub gdt: UapiKvmDtable,
+    pub idt: UapiKvmDtable,
pub cr0: u64, + pub cr2: u64, + pub cr3: u64, + pub cr4: u64, + pub cr8: u64, + pub efer: u64, + pub apic_base: u64, + pub interrupt_bitmap: [u64; 4usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmDtable { + pub base: u64, + pub limit: u16, + pub padding: [u16; 3usize], +} + +#[allow(dead_code)] +pub mod kvm_exit { + pub const KVM_EXIT_UNKNOWN: u32 = 0; + pub const KVM_EXIT_EXCEPTION: u32 = 1; + pub const KVM_EXIT_IO: u32 = 2; + pub const KVM_EXIT_HYPERCALL: u32 = 3; + pub const KVM_EXIT_DEBUG: u32 = 4; + pub const KVM_EXIT_HLT: u32 = 5; + pub const KVM_EXIT_MMIO: u32 = 6; + pub const KVM_EXIT_IRQ_WINDOW_OPEN: u32 = 7; + pub const KVM_EXIT_SHUTDOWN: u32 = 8; + pub const KVM_EXIT_FAIL_ENTRY: u32 = 9; + pub const KVM_EXIT_INTR: u32 = 10; + pub const KVM_EXIT_SET_TPR: u32 = 11; + pub const KVM_EXIT_TPR_ACCESS: u32 = 12; + pub const KVM_EXIT_S390_SIEIC: u32 = 13; + pub const KVM_EXIT_S390_RESET: u32 = 14; + pub const KVM_EXIT_DCR: u32 = 15; + pub const KVM_EXIT_NMI: u32 = 16; + pub const KVM_EXIT_INTERNAL_ERROR: u32 = 17; + pub const KVM_EXIT_OSI: u32 = 18; + pub const KVM_EXIT_PAPR_HCALL: u32 = 19; + pub const KVM_EXIT_S390_UCONTROL: u32 = 20; + pub const KVM_EXIT_WATCHDOG: u32 = 21; + pub const KVM_EXIT_S390_TSCH: u32 = 22; + pub const KVM_EXIT_EPR: u32 = 23; + pub const KVM_EXIT_SYSTEM_EVENT: u32 = 24; + pub const KVM_EXIT_S390_STSI: u32 = 25; + pub const KVM_EXIT_IOAPIC_EOI: u32 = 26; + pub const KVM_EXIT_HYPERV: u32 = 27; + pub const KVM_EXIT_ARM_NISV: u32 = 28; + pub const KVM_EXIT_X86_RDMSR: u32 = 29; + pub const KVM_EXIT_X86_WRMSR: u32 = 30; + pub const KVM_EXIT_DIRTY_RING_FULL: u32 = 31; + pub const KVM_EXIT_AP_RESET_HOLD: u32 = 32; + pub const KVM_EXIT_X86_BUS_LOCK: u32 = 33; + pub const KVM_EXIT_XEN: u32 = 34; + pub const KVM_EXIT_RISCV_SBI: u32 = 35; + pub const KVM_EXIT_RISCV_CSR: u32 = 36; + pub const KVM_EXIT_NOTIFY: u32 = 37; +} diff --git a/kernel/src/arch/x86_64/vm/vmx/asm.rs b/kernel/src/arch/x86_64/vm/vmx/asm.rs new file mode 100644 index 00000000..92d85750 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/asm.rs @@ -0,0 +1,19 @@ +#![allow(dead_code)] + +pub const VMX_EPT_MT_EPTE_SHIFT: u64 = 3; +pub const VMX_EPTP_PWL_MASK: u64 = 0x38; +pub const VMX_EPTP_PWL_4: u64 = 0x18; +pub const VMX_EPTP_PWL_5: u64 = 0x20; +pub const VMX_EPTP_AD_ENABLE_BIT: u64 = 1 << 6; +pub const VMX_EPTP_MT_MASK: u64 = 0x7; +pub const VMX_EPTP_MT_WB: u64 = 0x6; +pub const VMX_EPTP_MT_UC: u64 = 0x0; +pub const VMX_EPT_READABLE_MASK: u64 = 0x1; +pub const VMX_EPT_WRITABLE_MASK: u64 = 0x2; +pub const VMX_EPT_EXECUTABLE_MASK: u64 = 0x4; +pub const VMX_EPT_IPAT_BIT: u64 = 1 << 6; +pub const VMX_EPT_ACCESS_BIT: u64 = 1 << 8; +pub const VMX_EPT_DIRTY_BIT: u64 = 1 << 9; +pub const VMX_EPT_RWX_MASK: u64 = + VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | VMX_EPT_EXECUTABLE_MASK; +pub const VMX_EPT_MT_MASK: u64 = 7 << VMX_EPT_MT_EPTE_SHIFT; diff --git a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs new file mode 100644 index 00000000..628b0ac5 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs @@ -0,0 +1,591 @@ +use raw_cpuid::CpuId; +use x86::{ + msr, + vmx::vmcs::control::{ + EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, + }, +}; + +use crate::{ + arch::vm::{ + mmu::kvm_mmu::PageLevel, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, + }, + 
virt::vm::kvm_host::vcpu::VirtCpu, +}; + +use super::{vmcs::feat::VmxFeat, Vmx}; + +#[derive(Debug)] +pub struct VmcsConfig { + pub size: u32, + pub basic_cap: u32, + pub revision_id: u32, + pub pin_based_exec_ctrl: PinbasedControls, + pub cpu_based_exec_ctrl: PrimaryControls, + pub cpu_based_2nd_exec_ctrl: SecondaryControls, + pub cpu_based_3rd_exec_ctrl: u32, + pub vmexit_ctrl: ExitControls, + pub vmentry_ctrl: EntryControls, + pub misc: u64, + pub nested: NestedVmxMsrs, +} + +impl Default for VmcsConfig { + fn default() -> Self { + Self { + size: Default::default(), + basic_cap: Default::default(), + revision_id: Default::default(), + pin_based_exec_ctrl: PinbasedControls::empty(), + cpu_based_exec_ctrl: PrimaryControls::empty(), + cpu_based_2nd_exec_ctrl: SecondaryControls::empty(), + cpu_based_3rd_exec_ctrl: Default::default(), + vmexit_ctrl: ExitControls::empty(), + vmentry_ctrl: EntryControls::empty(), + misc: Default::default(), + nested: Default::default(), + } + } +} + +#[derive(Debug, Default)] +pub struct NestedVmxMsrs { + /// 主处理器基于控制,分为低32位和高32位 + pub procbased_ctls_low: u32, + /// 主处理器基于控制,分为低32位和高32位 + pub procbased_ctls_high: u32, + /// 次要处理器控制,分为低32位和高32位 + pub secondary_ctls_low: u32, + /// 次要处理器控制,分为低32位和高32位 + pub secondary_ctls_high: u32, + /// VMX 的针脚基于控制,分为低32位和高32位 + pub pinbased_ctls_low: u32, + /// VMX 的针脚基于控制,分为低32位和高32位 + pub pinbased_ctls_high: u32, + /// VM退出控制,分为低32位和高32位 + pub exit_ctls_low: u32, + /// VM退出控制,分为低32位和高32位 + pub exit_ctls_high: u32, + /// VM进入控制,分为低32位和高32位 + pub entry_ctls_low: u32, + /// VM进入控制,分为低32位和高32位 + pub entry_ctls_high: u32, + /// VMX 的其他杂项控制,分为低32位和高32位 + pub misc_low: u32, + /// VMX 的其他杂项控制,分为低32位和高32位 + pub misc_high: u32, + /// 扩展页表(EPT)的能力信息 + pub ept_caps: u32, + /// 虚拟处理器标识(VPID)的能力信息 + pub vpid_caps: u32, + /// 基本能力 + pub basic: u64, + /// VMX 控制的CR0寄存器的固定位 + pub cr0_fixed0: u64, + /// VMX 控制的CR0寄存器的固定位 + pub cr0_fixed1: u64, + /// VMX 控制的CR4寄存器的固定位 + pub cr4_fixed0: u64, + /// VMX 控制的CR4寄存器的固定位 + pub cr4_fixed1: u64, + /// VMX 控制的VMCS寄存器的编码 + pub vmcs_enum: u64, + /// VM功能控制 + pub vmfunc_controls: u64, +} + +impl NestedVmxMsrs { + pub fn control_msr(low: u32, high: u32) -> u64 { + (high as u64) << 32 | low as u64 + } + + pub fn get_vmx_msr(&self, msr_index: u32) -> Option { + match msr_index { + msr::IA32_VMX_BASIC => { + return Some(self.basic); + } + msr::IA32_VMX_TRUE_PINBASED_CTLS | msr::IA32_VMX_PINBASED_CTLS => { + let mut data = + NestedVmxMsrs::control_msr(self.pinbased_ctls_low, self.pinbased_ctls_high); + if msr_index == msr::IA32_VMX_PINBASED_CTLS { + data |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + msr::IA32_VMX_TRUE_PROCBASED_CTLS | msr::IA32_VMX_PROCBASED_CTLS => { + let mut data = + NestedVmxMsrs::control_msr(self.procbased_ctls_low, self.procbased_ctls_high); + if msr_index == msr::IA32_VMX_PROCBASED_CTLS { + data |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + msr::IA32_VMX_TRUE_EXIT_CTLS | msr::IA32_VMX_EXIT_CTLS => { + let mut data = NestedVmxMsrs::control_msr(self.exit_ctls_low, self.exit_ctls_high); + if msr_index == msr::IA32_VMX_EXIT_CTLS { + data |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + msr::IA32_VMX_TRUE_ENTRY_CTLS | msr::IA32_VMX_ENTRY_CTLS => { + let mut data = + NestedVmxMsrs::control_msr(self.entry_ctls_low, self.entry_ctls_high); + if msr_index == msr::IA32_VMX_ENTRY_CTLS { + data |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + msr::IA32_VMX_MISC => { + return 
Some(NestedVmxMsrs::control_msr(self.misc_low, self.misc_high)); + } + msr::IA32_VMX_CR0_FIXED0 => { + return Some(self.cr0_fixed0); + } + msr::IA32_VMX_CR0_FIXED1 => { + return Some(self.cr0_fixed1); + } + msr::IA32_VMX_CR4_FIXED0 => { + return Some(self.cr4_fixed0); + } + msr::IA32_VMX_CR4_FIXED1 => { + return Some(self.cr4_fixed1); + } + msr::IA32_VMX_VMCS_ENUM => { + return Some(self.vmcs_enum); + } + msr::IA32_VMX_PROCBASED_CTLS2 => { + return Some(NestedVmxMsrs::control_msr( + self.secondary_ctls_low, + self.secondary_ctls_high, + )); + } + msr::IA32_VMX_EPT_VPID_CAP => { + return Some(self.ept_caps as u64 | ((self.vpid_caps as u64) << 32)); + } + msr::IA32_VMX_VMFUNC => { + return Some(self.vmfunc_controls); + } + _ => { + return None; + } + } + } +} + +#[derive(Debug, Default)] +pub struct VmxCapability { + pub ept: EptFlag, + pub vpid: VpidFlag, +} + +#[derive(Debug, PartialEq)] +pub enum ProcessorTraceMode { + System, + HostGuest, +} + +bitflags! { + #[derive(Default)] + pub struct VpidFlag: u32 { + /// 表示处理器支持 INVVPID 指令 + const INVVPID = 1 << 0; /* (32 - 32) */ + /// 表示 VPID 支持以单独地址方式进行范围 + const EXTENT_INDIVIDUAL_ADDR = 1 << 8; /* (40 - 32) */ + /// 表示 VPID 支持以单个上下文方式进行范围 + const EXTENT_SINGLE_CONTEXT = 1 << 9; /* (41 - 32) */ + /// 表示 VPID 支持以全局上下文方式进行范围 + const EXTENT_GLOBAL_CONTEXT = 1 << 10; /* (42 - 32) */ + /// 表示 VPID 支持以单个非全局方式进行范围 + const EXTENT_SINGLE_NON_GLOBAL = 1 << 11; /* (43 - 32) */ + } + + #[derive(Default)] + pub struct EptFlag: u32 { + /// EPT 条目是否允许执行 + const EPT_EXECUTE_ONLY = 1; + /// 处理器是否支持 4 级页表 + const EPT_PAGE_WALK_4 = 1 << 6; + /// 处理器是否支持 5 级页表 + const EPT_PAGE_WALK_5 = 1 << 7; + /// EPT 表的内存类型是否为不可缓存(uncached) + const EPTP_UC = 1 << 8; + /// EPT 表的内存类型是否为写回(write-back) + const EPTP_WB = 1 << 14; + /// 处理器是否支持 2MB 大页 + const EPT_2MB_PAGE = 1 << 16; + /// 处理器是否支持 1GB 大页 + const EPT_1GB_PAGE = 1 << 17; + /// 处理器是否支持 INV-EPT 指令,用于刷新 EPT TLB + const EPT_INVEPT = 1 << 20; + /// EPT 表是否支持访问位(Access-Dirty) + const EPT_AD = 1 << 21; + /// 处理器是否支持上下文扩展 + const EPT_EXTENT_CONTEXT = 1 << 25; + /// 处理器是否支持全局扩展 + const EPT_EXTENT_GLOBAL = 1 << 26; + } +} + +impl VmxCapability { + pub fn set_val_from_msr_val(&mut self, val: u64) { + self.ept = EptFlag::from_bits_truncate(val as u32); + self.vpid = VpidFlag::from_bits_truncate((val >> 32) as u32); + } +} + +impl Vmx { + /// 检查处理器是否支持VMX基本控制结构的输入输出功能 + #[inline] + #[allow(dead_code)] + pub fn has_basic_inout(&self) -> bool { + return ((self.vmcs_config.basic_cap as u64) << 32) & VmxFeat::VMX_BASIC_INOUT != 0; + } + + /// 检查处理器是否支持虚拟的非屏蔽中断(NMI) + #[inline] + pub fn has_virtual_nmis(&self) -> bool { + return self + .vmcs_config + .pin_based_exec_ctrl + .contains(PinbasedControls::VIRTUAL_NMIS) + && self + .vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::NMI_WINDOW_EXITING); + } + + /// 检查处理器是否支持VMX的抢占计时器功能 + #[inline] + pub fn has_preemption_timer(&self) -> bool { + return self + .vmcs_config + .pin_based_exec_ctrl + .contains(PinbasedControls::VMX_PREEMPTION_TIMER); + } + + /// 检查处理器是否支持VMX的posted interrupt功能 + #[inline] + pub fn has_posted_intr(&self) -> bool { + return self + .vmcs_config + .pin_based_exec_ctrl + .contains(PinbasedControls::POSTED_INTERRUPTS); + } + + /// 是否支持加载IA32_EFER寄存器 + #[inline] + pub fn has_load_ia32_efer(&self) -> bool { + return self + .vmcs_config + .vmentry_ctrl + .contains(EntryControls::LOAD_IA32_EFER); + } + + /// 是否支持加载IA32_PERF_GLOBAL_CTRL寄存器 + #[inline] + pub fn has_load_perf_global_ctrl(&self) -> bool { + return self + .vmcs_config + .vmentry_ctrl + 
.contains(EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL); + } + + /// 是否支持加载边界检查配置寄存器(MPX) + #[inline] + pub fn has_mpx(&self) -> bool { + return self + .vmcs_config + .vmentry_ctrl + .contains(EntryControls::LOAD_IA32_BNDCFGS); + } + + /// 是否支持虚拟处理器的任务优先级(TPR)影子 + #[inline] + pub fn has_tpr_shadow(&self) -> bool { + return self + .vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::USE_TPR_SHADOW); + } + + /// 检查处理器是否支持 VMX中的 VPID(Virtual Processor ID)功能 + /// + /// VPID 允许虚拟机监视器为每个虚拟处理器分配唯一的标识符,从而使得在不同的虚拟机之间进行快速的上下文切换和恢复成为可能。 + /// + /// 通过使用 VPID,VMM 可以更快速地识别和恢复之前保存的虚拟处理器的状态,从而提高了虚拟化性能和效率。 + #[inline] + pub fn has_vpid(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_VPID); + } + + /// 是否支持invvpid + /// + /// INVVPID 指令用于通知处理器无效化指定虚拟处理器标识符(VPID)相关的 TLB(Translation Lookaside Buffer)条目 + #[inline] + pub fn has_invvpid(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::INVVPID); + } + + /// VPID 是否支持以单独地址方式进行范围 + #[allow(dead_code)] + #[inline] + pub fn has_invvpid_individual_addr(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_INDIVIDUAL_ADDR); + } + + /// VPID 是否支持以单个上下文方式进行范围 + #[inline] + pub fn has_invvpid_single(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_SINGLE_CONTEXT); + } + + /// VPID 是否支持以全局上下文方式进行范围 + #[inline] + pub fn has_invvpid_global(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_GLOBAL_CONTEXT); + } + + /// 是否启用EPT(Extended Page Tables) + /// + /// EPT:EPT 是一种硬件虚拟化技术,允许虚拟机管理程序(例如 Hypervisor) 控制客户操作系统中虚拟地址和物理地址之间的映射。 + /// + /// 通过启用 EPT,处理器可以将虚拟地址直接映射到物理地址,从而提高虚拟机的性能和安全性。 + #[inline] + pub fn has_ept(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_EPT); + } + + /// 是否支持4级页表 + #[inline] + pub fn has_ept_4levels(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_4); + } + + /// 是否支持5级页表 + #[inline] + pub fn has_ept_5levels(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_5); + } + + pub fn get_max_ept_level(&self) -> usize { + if self.has_ept_5levels() { + return 5; + } + return 4; + } + + pub fn ept_cap_to_lpage_level(&self) -> PageLevel { + if self.vmx_cap.ept.contains(EptFlag::EPT_1GB_PAGE) { + return PageLevel::Level1G; + } + if self.vmx_cap.ept.contains(EptFlag::EPT_2MB_PAGE) { + return PageLevel::Level2M; + } + + return PageLevel::Level4K; + } + + /// 判断mt(Memory type)是否为write back + #[inline] + pub fn has_ept_mt_wb(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPTP_WB); + } + + #[inline] + pub fn has_vmx_invept_context(&self) -> bool { + self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_CONTEXT) + } + + /// EPT是否支持全局拓展 + #[inline] + pub fn has_invept_global(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_GLOBAL); + } + + /// EPT是否支持访问位 + #[inline] + pub fn has_ept_ad_bits(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_AD); + } + + /// 是否支持 VMX 中的无限制客户(unrestricted guest)功能 + /// + /// 无限制客户功能允许客户操作系统在未受到主机操作系统干预的情况下运行 + #[inline] + pub fn has_unrestricted_guest(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::UNRESTRICTED_GUEST); + } + + /// 是否支持 VMX 中的 FlexPriority 功能 + /// + /// FlexPriority 是一种功能,可以在 TPR shadow 和虚拟化 APIC 访问同时可用时启用。 + /// + /// TPR shadow 允许虚拟机管理程序(VMM)跟踪虚拟机中处理器的 TPR 值,并在需要时拦截和修改。 + /// + /// 虚拟化 APIC 访问允许 VMM 控制虚拟机中的 APIC 寄存器访问。 + #[inline] + pub fn has_flexproirity(&self) -> bool 
{ + return self.has_tpr_shadow() && self.has_virtualize_apic_accesses(); + } + + /// 是否支持 VMX 中的虚拟化 APIC 访问功能。 + /// + /// 当启用此功能时,虚拟机管理程序(VMM)可以控制虚拟机中的 APIC 寄存器访问。 + #[inline] + pub fn has_virtualize_apic_accesses(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::VIRTUALIZE_APIC); + } + + /// 是否支持 VMX 中的 ENCLS 指令导致的 VM 退出功能 + #[inline] + pub fn has_encls_vmexit(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENCLS_EXITING); + } + + /// 是否支持 VMX 中的 PLE (Pause Loop Exiting) 功能。 + #[inline] + pub fn has_ple(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::PAUSE_LOOP_EXITING); + } + + /// 是否支持 VMX 中的 APICv 功能 + #[inline] + pub fn has_apicv(&self) -> bool { + return self.has_apic_register_virt() + && self.has_posted_intr() + && self.has_virtual_intr_delivery(); + } + + /// 是否支持虚拟化的 APIC 寄存器功能 + #[inline] + pub fn has_apic_register_virt(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::VIRTUALIZE_APIC_REGISTER); + } + + /// 是否支持虚拟化的中断传递功能 + #[inline] + pub fn has_virtual_intr_delivery(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY); + } + + /// 是否支持虚拟化的中断注入(Inter-Processor Interrupt Virtualization,IPIV) + #[inline] + pub fn has_ipiv(&self) -> bool { + return false; + } + + /// 是否支持虚拟化的 TSC 缩放功能 + #[inline] + pub fn has_tsc_scaling(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::USE_TSC_SCALING); + } + + /// 是否支持虚拟化的页修改日志(Page Modification Logging) + #[inline] + pub fn has_pml(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_PML); + } + + /// 检查 CPU 是否支持使用 MSR 位图来控制 VMX + #[inline] + pub fn has_msr_bitmap(&self) -> bool { + return self + .vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::USE_MSR_BITMAPS); + } + + #[inline] + pub fn has_sceondary_exec_ctrls(&self) -> bool { + self.vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::SECONDARY_CONTROLS) + } + + #[inline] + pub fn has_rdtscp(&self) -> bool { + self.vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_RDTSCP) + } + + #[inline] + pub fn has_vmfunc(&self) -> bool { + self.vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_VM_FUNCTIONS) + } + + #[inline] + pub fn has_xsaves(&self) -> bool { + self.vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_XSAVES_XRSTORS) + } + + #[inline] + pub fn vmx_umip_emulated(&self) -> bool { + let feat = CpuId::new().get_extended_feature_info().unwrap().has_umip(); + + return !feat + && (self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::DTABLE_EXITING)); + } + + #[inline] + pub fn has_tertiary_exec_ctrls(&self) -> bool { + false + } + + #[inline] + pub fn has_bus_lock_detection(&self) -> bool { + false + } + + #[inline] + pub fn has_notify_vmexit(&self) -> bool { + false + } + + /// 是否需要拦截页面故障 + #[inline] + pub fn vmx_need_pf_intercept(&self, _vcpu: &VirtCpu) -> bool { + // if (!enable_ept) + // return true; + false + } +} diff --git a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs new file mode 100644 index 00000000..bf3cda4e --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs @@ -0,0 +1,466 @@ +use 
crate::arch::mm::LockedFrameAllocator;
+use crate::arch::vm::asm::VmxAsm;
+use crate::arch::vm::mmu::kvm_mmu::PageLevel;
+use crate::arch::vm::mmu::mmu_internal::KvmPageFault;
+use crate::arch::MMArch;
+use crate::libs::spinlock::SpinLockGuard;
+use crate::mm::allocator::page_frame::FrameAllocator;
+use crate::mm::page::{
+    page_manager_lock_irqsave, EntryFlags, PageEntry, PageFlags, PageFlush, PageManager, PageType,
+};
+use crate::mm::{MemoryManagementArch, PhysAddr, VirtAddr};
+use crate::smp::core::smp_get_processor_id;
+use crate::smp::cpu::AtomicProcessorId;
+use crate::smp::cpu::ProcessorId;
+use core::ops::Add;
+use core::sync::atomic::{compiler_fence, AtomicUsize, Ordering};
+use log::{debug, error, warn};
+use system_error::SystemError;
+use x86::msr;
+use x86::vmx::vmcs::control;
+
+// pub const VMX_EPT_MT_EPTE_SHIFT:u64 = 3;
+pub const VMX_EPT_RWX_MASK: u64 = 0x7;
+
+// Exit Qualifications for EPT Violations
+pub const EPT_VIOLATION_ACC_READ_BIT: u64 = 0;
+pub const EPT_VIOLATION_ACC_WRITE_BIT: u64 = 1;
+pub const EPT_VIOLATION_ACC_INSTR_BIT: u64 = 2;
+pub const EPT_VIOLATION_RWX_SHIFT: u64 = 3;
+pub const EPT_VIOLATION_GVA_IS_VALID_BIT: u64 = 7;
+pub const EPT_VIOLATION_GVA_TRANSLATED_BIT: u64 = 8;
+
+bitflags! {
+    pub struct EptViolationExitQual: u64 {
+        const ACC_READ = 1 << EPT_VIOLATION_ACC_READ_BIT;
+        const ACC_WRITE = 1 << EPT_VIOLATION_ACC_WRITE_BIT;
+        const ACC_INSTR = 1 << EPT_VIOLATION_ACC_INSTR_BIT;
+        const RWX_MASK = VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT;
+        const GVA_IS_VALID = 1 << EPT_VIOLATION_GVA_IS_VALID_BIT;
+        const GVA_TRANSLATED = 1 << EPT_VIOLATION_GVA_TRANSLATED_BIT;
+    }
+}
+
+// /// Global manager for EPT physical page info
+// pub static mut EPT_PAGE_MANAGER: Option<SpinLock<EptPageManager>> = None;
+
+// /// Initialize EPT_PAGE_MANAGER
+// pub fn ept_page_manager_init() {
+// kinfo!("page_manager_init");
+// let page_manager = SpinLock::new(EptPageManager::new());
+
+// compiler_fence(Ordering::SeqCst);
+// unsafe { EPT_PAGE_MANAGER = Some(page_manager) };
+// compiler_fence(Ordering::SeqCst);
+
+// kinfo!("page_manager_init done");
+// }
+
+// pub fn ept_page_manager_lock_irqsave() -> SpinLockGuard<'static, EptPageManager> {
+// unsafe { EPT_PAGE_MANAGER.as_ref().unwrap().lock_irqsave() }
+// }
+// EPT page-table data structure
+#[derive(Debug)]
+pub struct EptPageTable {
+    /// Start of the virtual address range described by this table; the kernel
+    /// accesses EPT page tables through virtual addresses as well
+    base: VirtAddr,
+    /// Physical address of this page table
+    phys: PhysAddr,
+    /// Level of this page table
+    /// PageLevel::4K = 1
+    level: PageLevel,
+}
+impl EptPageTable {
+    pub fn phys(&self) -> PhysAddr {
+        self.phys
+    }
+
+    /// Set the i-th entry of this page table
+    pub unsafe fn set_entry(&self, i: usize, entry: PageEntry<MMArch>) -> Option<()> {
+        let entry_virt = self.entry_virt(i)?;
+        MMArch::write::<PageEntry<MMArch>>(entry_virt, entry);
+        let page_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
+        debug!("Set EPT entry: {:?} , index : {:?}", page_entry, i);
+        return Some(());
+    }
+    /// Check whether the i-th entry of this page table has been filled in
+    ///
+    /// ## Returns
+    /// - Some(true) if the entry holds a value
+    /// - Some(false) if it does not
+    /// - None if i is out of range for this table
+    pub fn entry_mapped(&self, i: usize) -> Option<bool> {
+        let etv = unsafe { self.entry_virt(i) }?;
+        if unsafe { MMArch::read::<usize>(etv) } != 0 {
+            return Some(true);
+        } else {
+            return Some(false);
+        }
+    }
+
+    /// Get the level of this page table
+    #[inline(always)]
+    pub fn level(&self) -> PageLevel {
+        self.level
+    }
+
+    /// Get the virtual address range covered by the i-th entry
+    #[allow(dead_code)]
+    pub fn entry_base(&self, i: usize) -> Option<VirtAddr> {
+        if i < MMArch::PAGE_ENTRY_NUM {
+            let shift = (self.level as usize - 1) * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
+            return Some(self.base.add(i << shift));
+        } else {
+            return None;
+        }
+    }
+    /// Get the virtual address of this page table itself
+    #[inline(always)]
+    pub unsafe fn virt(&self) -> VirtAddr {
+        return MMArch::phys_2_virt(self.phys).unwrap();
+    }
+    /// Get the virtual address of the i-th entry itself (not to be confused with entry_base)
+    pub unsafe fn entry_virt(&self, i: usize) -> Option<VirtAddr> {
+        if i < MMArch::PAGE_ENTRY_NUM {
+            return Some(self.virt().add(i * MMArch::PAGE_ENTRY_SIZE));
+        } else {
+            return None;
+        }
+    }
+    /// Get the i-th entry of this page table
+    pub unsafe fn entry(&self, i: usize) -> Option<PageEntry<MMArch>> {
+        let entry_virt = self.entry_virt(i)?;
+        return Some(PageEntry::from_usize(MMArch::read::<usize>(entry_virt)));
+    }
+
+    pub fn new(base: VirtAddr, phys: PhysAddr, level: PageLevel) -> Self {
+        Self { base, phys, level }
+    }
+    /// Get the index of the page-table entry that corresponds to an address
+    ///
+    /// ## Parameters
+    ///
+    /// - gpa: guest-physical address
+    ///
+    /// ## Returns
+    ///
+    /// Index of the entry within this table, or None if the address lies outside
+    /// the address range this table describes
+    pub unsafe fn index_of(&self, gpa: PhysAddr) -> Option<usize> {
+        let addr = VirtAddr::new(gpa.data() & MMArch::PAGE_ADDRESS_MASK);
+        let shift = (self.level - 1) as usize * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
+
+        //let mask = (MMArch::PAGE_ENTRY_NUM << shift) - 1;
+        // if addr < self.base || addr >= self.base.add(mask) {
+        //     return None;
+        // } else {
+        return Some((addr.data() >> shift) & MMArch::PAGE_ENTRY_MASK);
+        //}
+    }
+
+    pub fn next_level_table(&self, index: usize) -> Option<EptPageTable> {
+        if self.level == PageLevel::Level4K {
+            return None;
+        }
+        // Return the next-level page table
+        let phys = unsafe { self.entry(index)?.address() };
+
+        let base;
+        if let Ok(phys) = phys {
+            base = unsafe { MMArch::phys_2_virt(PhysAddr::new(phys.data())).unwrap() };
+        } else {
+            base = unsafe { MMArch::phys_2_virt(PhysAddr::new(phys.unwrap_err().data())).unwrap() };
+        }
+
+        let level = self.level - 1;
+        if let Err(_phys) = phys {
+            debug!("EptPageTable::next_level_table: phys {:?}", phys);
+            // Return None for the Not Present case.
+            // The detour here exists because during guest boot the page_fault addr is
+            // not-present yet still has to be mapped.
+            // Possibly problematic, but keep it this way for now.
+            if _phys.data() & 0x7 == 0x000 {
+                return None;
+            }
+            return Some(EptPageTable::new(base, PhysAddr::new(_phys.data()), level));
+        }
+        return Some(EptPageTable::new(
+            base,
+            PhysAddr::new(phys.unwrap().data()),
+            level,
+        ));
+    }
+}
+
+// // EPT physical page manager
+// pub struct EptPageManager {
+// phys2page: HashMap,
+// }
+
+// impl EptPageManager {
+// pub fn new() -> Self {
+// Self {
+// phys2page: HashMap::new(),
+// }
+// }
+
+// }
+
+/// Check if MTRR is supported
+#[allow(dead_code)]
+pub fn check_ept_features() -> Result<(), SystemError> {
+    const MTRR_ENABLE_BIT: u64 = 1 << 11;
+    let ia32_mtrr_def_type = unsafe { msr::rdmsr(msr::IA32_MTRR_DEF_TYPE) };
+    if (ia32_mtrr_def_type & MTRR_ENABLE_BIT) == 0 {
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+    Ok(())
+}
+
+/// Marks that no processor currently holds the kernel mapper lock.
+/// This flag is needed because AtomicUsize::new(0) would treat 0 as a valid processor id.
+const EPT_MAPPER_NO_PROCESSOR: ProcessorId = ProcessorId::INVALID;
+/// Processor currently holding the kernel mapper lock
+static EPT_MAPPER_LOCK_OWNER: AtomicProcessorId = AtomicProcessorId::new(EPT_MAPPER_NO_PROCESSOR);
+/// Lock counter for the kernel mapper
+static EPT_MAPPER_LOCK_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+pub struct EptPageMapper {
+    /// EPT page-table mapper
+    //mapper: PageMapper,//PageTableKind::EPT, LockedFrameAllocator
+    /// Whether this mapper is read-only
+    readonly: bool,
+    // Root physical address of the EPT page tables
+    root_page_addr: PhysAddr,
+    /// Frame allocator
+    frame_allocator: LockedFrameAllocator,
+}
+
+impl EptPageMapper {
+    /// Return the top-level EPT page table
+    pub fn table(&self) -> EptPageTable {
+        EptPageTable::new(
+            unsafe { MMArch::phys_2_virt(self.root_page_addr).unwrap() },
+            self.root_page_addr,
+            PageLevel::Level512G,
+        )
+    }
+    pub fn root_page_addr() -> PhysAddr {
+        // Physical address of the PML4
+        let eptp = VmxAsm::vmx_vmread(control::EPTP_FULL);
+        let addr = eptp & 0xFFFF_FFFF_FFFF_F000; // strip the low 12 bits
+        PhysAddr::new(addr as usize)
+    }
+
+    fn lock_cpu(cpuid: ProcessorId) -> Self {
+        loop {
+            match EPT_MAPPER_LOCK_OWNER.compare_exchange_weak(
+                EPT_MAPPER_NO_PROCESSOR,
+                cpuid,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                // this processor already holds the lock
+                Err(id) if id == cpuid => break,
+                // either CAS failed, or some other hardware thread holds the lock
+                Err(_) => core::hint::spin_loop(),
+            }
+        }
+
+        let prev_count = EPT_MAPPER_LOCK_COUNT.fetch_add(1, Ordering::Relaxed);
+        compiler_fence(Ordering::Acquire);
+
+        // The local core already held the lock, so mark the mapper obtained by this
+        // acquisition as read-only
+        let readonly = prev_count > 0;
+        let root_page_addr = Self::root_page_addr();
+        return Self {
+            readonly,
+            root_page_addr,
+            frame_allocator: LockedFrameAllocator,
+        };
+    }
+
+    /// Lock the kernel mapper and return a mapper object.
+    /// Currently this is the only way to obtain an EptPageMapper object.
+    #[inline(always)]
+    pub fn lock() -> Self {
+        // fixme: is this the host cpu id or the vcpu id?
+        let cpuid = smp_get_processor_id();
+        return Self::lock_cpu(cpuid);
+    }
+
+    /// Check whether a gpa->hpa mapping exists
+    #[no_mangle]
+    pub fn is_mapped(&self, page_fault: &mut KvmPageFault) -> bool {
+        let gpa = page_fault.gpa() as usize;
+        let mut page_table = self.table();
+        let mut next_page_table;
+        loop {
+            let index: usize = unsafe {
+                if let Some(i) = page_table.index_of(PhysAddr::new(gpa)) {
+                    debug!("ept page table index: {:?}", i);
+                    i
+                } else {
+                    error!("ept page table index_of failed");
+                    return false;
+                }
+            };
+            debug!("EPT table: index = {:?}, value = {:?}", index, page_table);
+            if let Some(table) = page_table.next_level_table(index) {
+                if table.level() == PageLevel::Level4K {
+                    debug!("EPT table 4K: {:?}", table);
+                    return true;
+                }
+                debug!("table.level(): {:?}", table.level());
+                next_page_table = table;
+            } else {
+                return false;
+            }
+            page_table = next_page_table;
+        }
+    }
+
+    /// Allocate a physical page (hpa) from this mapper's frame allocator and map it
+    /// at the given gpa
+    pub fn map(&mut self, gpa: PhysAddr, flags: EntryFlags<MMArch>) -> Option<PageFlush<MMArch>> {
+        let gpa = PhysAddr::new(gpa.data() & (!MMArch::PAGE_NEGATIVE_MASK) & !0xFFF);
+        self.map_gpa(gpa, flags)
+    }
+
+    /// Map an hpa at the given gpa
+    pub fn map_gpa(
+        &mut self,
+        gpa: PhysAddr,
+        flags: EntryFlags<MMArch>,
+    ) -> Option<PageFlush<MMArch>> {
+        // Verify that the address is page-aligned
+        if !(gpa.check_aligned(MMArch::PAGE_SIZE)) {
+            error!("Try to map unaligned page: gpa={:?}", gpa);
+        }
+
+        // TODO: validate the flags
+
+        let mut table = self.table();
+        debug!("ept page table: {:?}", table);
+        loop {
+            let i = unsafe { table.index_of(gpa).unwrap() };
+            assert!(i < MMArch::PAGE_ENTRY_NUM);
+            if table.level() == PageLevel::Level4K {
+                // check whether this 4K page is already mapped
+                if table.entry_mapped(i).unwrap() {
+                    unsafe {
+                        let entry_virt = table.entry_virt(i)?;
+                        let _set_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
+                        warn!(
+                            "index :: {:?} , Page gpa :: {:?} already mapped,content is: {:x}",
+                            i,
+                            gpa,
+                            _set_entry.data()
+                        );
+                        return None;
+                    };
+                }
+
+                // allocate a physical page for the entry
+                compiler_fence(Ordering::SeqCst);
+                // let hpa: PhysAddr = unsafe { self.frame_allocator.allocate_one() }?;
+                // debug!("Allocate hpa: {:?}", hpa);
+                // update the global page manager
+                let mut page_manager_guard: SpinLockGuard<'static, PageManager> =
+                    page_manager_lock_irqsave();
+                let page = page_manager_guard
+                    .create_one_page(
+                        PageType::Normal,
+                        PageFlags::empty(),
+                        &mut self.frame_allocator,
+                    )
+                    .ok()?;
+                let hpa = page.phys_address();
+                drop(page_manager_guard);
+                // zero the page frame
+                unsafe {
+                    MMArch::write_bytes(MMArch::phys_2_virt(hpa).unwrap(), 0, MMArch::PAGE_SIZE)
+                };
+                let entry = PageEntry::new(hpa, flags);
+                unsafe { table.set_entry(i, entry) };
compiler_fence(Ordering::SeqCst); + + //打印页表项以进行验证 + unsafe { + let entry_virt = table.entry_virt(i)?; + let _set_entry = MMArch::read::>(entry_virt); + } + + return Some(PageFlush::new(unsafe { table.entry_virt(i)? })); + } else { + let next_table = table.next_level_table(i); + if let Some(next_table) = next_table { + table = next_table; + debug!("already next table: {:?}", table); + } else { + // 分配下一级页表 + let frame = unsafe { self.frame_allocator.allocate_one() }?; + + // 清空这个页帧 + unsafe { + MMArch::write_bytes( + MMArch::phys_2_virt(frame).unwrap(), + 0, + MMArch::PAGE_SIZE, + ) + }; + + // fixme::设置页表项的flags,可能有点问题 + let flags: EntryFlags = unsafe { EntryFlags::from_data(0x7) }; + + // 把新分配的页表映射到当前页表 + unsafe { table.set_entry(i, PageEntry::new(frame, flags)) }; + + // 获取新分配的页表 + table = table.next_level_table(i)?; + } + } + } + } +} +#[allow(dead_code)] +//调试EPT页表用,可以打印出EPT页表的值 +pub fn debug_eptp() { + let pml4_hpa: PhysAddr = EptPageMapper::lock().table().phys(); + debug!("Prepare to read EPTP address"); + let pml4_hva = unsafe { MMArch::phys_2_virt(PhysAddr::new(pml4_hpa.data())).unwrap() }; + debug!("PML4_hpa: 0x{:x}", pml4_hpa.data()); + debug!("PML4_hva: 0x{:x}", pml4_hva.data()); //Level512G + unsafe { + let entry = MMArch::read::(pml4_hva); + debug!("Value at EPTP address: 0x{:x}", entry); //Level2M + // 遍历并打印所有已分配的页面 + traverse_ept_table(pml4_hva, 4); + } +} +unsafe fn traverse_ept_table(table_addr: VirtAddr, level: u8) { + if level == (u8::MAX) { + return; + } + + let entries = MMArch::read_array::(table_addr, 511); + for (i, entry) in entries.iter().enumerate() { + //打印已分配的entry和4K页表的所有entry + if *entry & 0x7 != 0 || level == 0 { + let next_level_addr = if level != 0 { + MMArch::phys_2_virt(PhysAddr::new((*entry & 0xFFFFFFFFF000) as usize)) + } else { + //暂未分配地址 + if *entry == 0 { + continue; + } + MMArch::phys_2_virt(PhysAddr::new((*entry & 0xFFFFFFFFF000) as usize)) + }; + let entry_value = MMArch::read::(next_level_addr.unwrap()); + debug!( + "Level {} - index {}: HPA: 0x{:016x}, read_to: 0x{:016x}", + level, i, *entry, /*& 0xFFFFFFFFF000*/ entry_value, + ); + // 递归遍历下一级页表 + traverse_ept_table(next_level_addr.unwrap(), level - 1); + } + } +} diff --git a/kernel/src/arch/x86_64/vm/vmx/exit.rs b/kernel/src/arch/x86_64/vm/vmx/exit.rs new file mode 100644 index 00000000..2193d486 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/exit.rs @@ -0,0 +1,426 @@ +use bitfield_struct::bitfield; +use system_error::SystemError; +use x86::vmx::vmcs::{guest, ro}; + +use crate::{ + arch::vm::asm::{IntrInfo, VmxAsm}, + virt::vm::kvm_host::{vcpu::VirtCpu, Vm}, +}; + +use super::{ept::EptViolationExitQual, vmx_info, PageFaultErr}; +extern crate num_traits; + +#[bitfield(u32)] +pub struct VmxExitReason { + pub basic: u16, + pub reserved16: bool, + pub reserved17: bool, + pub reserved18: bool, + pub reserved19: bool, + pub reserved20: bool, + pub reserved21: bool, + pub reserved22: bool, + pub reserved23: bool, + pub reserved24: bool, + pub reserved25: bool, + pub bus_lock_detected: bool, + pub enclave_mode: bool, + pub smi_pending_mtf: bool, + pub smi_from_vmx_root: bool, + pub reserved30: bool, + pub failed_vmentry: bool, +} + +//#define VMX_EXIT_REASONS +#[derive(FromPrimitive, PartialEq, Clone, Copy)] +#[allow(non_camel_case_types)] +pub enum VmxExitReasonBasic { + EXCEPTION_OR_NMI = 0, + EXTERNAL_INTERRUPT = 1, + TRIPLE_FAULT = 2, + INIT_SIGNAL = 3, + SIPI = 4, + IO_SMI = 5, + OTHER_SMI = 6, + INTERRUPT_WINDOW = 7, + NMI_WINDOW = 8, + TASK_SWITCH = 9, + CPUID = 10, + GETSEC = 11, + HLT = 
12, + INVD = 13, + INVLPG = 14, + RDPMC = 15, + RDTSC = 16, + RSM = 17, + VMCALL = 18, + VMCLEAR = 19, + VMLAUNCH = 20, + VMPTRLD = 21, + VMPTRST = 22, + VMREAD = 23, + VMRESUME = 24, + VMWRITE = 25, + VMXOFF = 26, + VMXON = 27, + CR_ACCESS = 28, + DR_ACCESS = 29, + IO_INSTRUCTION = 30, + RDMSR = 31, + WRMSR = 32, + VM_ENTRY_FAILURE_INVALID_GUEST_STATE = 33, + VM_ENTRY_FAILURE_MSR_LOADING = 34, + MWAIT = 36, + MONITOR_TRAP_FLAG = 37, + MONITOR = 39, + PAUSE = 40, + VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41, + TPR_BELOW_THRESHOLD = 43, + APIC_ACCESS = 44, + VIRTUALIZED_EOI = 45, // "EOI_INDUCED" + ACCESS_GDTR_OR_IDTR = 46, + ACCESS_LDTR_OR_TR = 47, + EPT_VIOLATION = 48, + EPT_MISCONFIG = 49, + INVEPT = 50, + RDTSCP = 51, + VMX_PREEMPTION_TIMER_EXPIRED = 52, + INVVPID = 53, + WBINVD = 54, + XSETBV = 55, + APIC_WRITE = 56, + RDRAND = 57, + INVPCID = 58, + VMFUNC = 59, + ENCLS = 60, + RDSEED = 61, + PML_FULL = 62, + XSAVES = 63, + XRSTORS = 64, + + UMWAIT = 67, + TPAUSE = 68, + BUS_LOCK = 74, + NOTIFY = 75, + + UNKNOWN, +} + +impl From for VmxExitReasonBasic { + fn from(num: u16) -> Self { + match num { + 0 => VmxExitReasonBasic::EXCEPTION_OR_NMI, + 1 => VmxExitReasonBasic::EXTERNAL_INTERRUPT, + 2 => VmxExitReasonBasic::TRIPLE_FAULT, + 3 => VmxExitReasonBasic::INIT_SIGNAL, + 4 => VmxExitReasonBasic::SIPI, + 5 => VmxExitReasonBasic::IO_SMI, + 6 => VmxExitReasonBasic::OTHER_SMI, + 7 => VmxExitReasonBasic::INTERRUPT_WINDOW, + 8 => VmxExitReasonBasic::NMI_WINDOW, + 9 => VmxExitReasonBasic::TASK_SWITCH, + 10 => VmxExitReasonBasic::CPUID, + 11 => VmxExitReasonBasic::GETSEC, + 12 => VmxExitReasonBasic::HLT, + 13 => VmxExitReasonBasic::INVD, + 14 => VmxExitReasonBasic::INVLPG, + 15 => VmxExitReasonBasic::RDPMC, + 16 => VmxExitReasonBasic::RDTSC, + 17 => VmxExitReasonBasic::RSM, + 18 => VmxExitReasonBasic::VMCALL, + 19 => VmxExitReasonBasic::VMCLEAR, + 20 => VmxExitReasonBasic::VMLAUNCH, + 21 => VmxExitReasonBasic::VMPTRLD, + 22 => VmxExitReasonBasic::VMPTRST, + 23 => VmxExitReasonBasic::VMREAD, + 24 => VmxExitReasonBasic::VMRESUME, + 25 => VmxExitReasonBasic::VMWRITE, + 26 => VmxExitReasonBasic::VMXOFF, + 27 => VmxExitReasonBasic::VMXON, + 28 => VmxExitReasonBasic::CR_ACCESS, + 29 => VmxExitReasonBasic::DR_ACCESS, + 30 => VmxExitReasonBasic::IO_INSTRUCTION, + 31 => VmxExitReasonBasic::RDMSR, + 32 => VmxExitReasonBasic::WRMSR, + 33 => VmxExitReasonBasic::VM_ENTRY_FAILURE_INVALID_GUEST_STATE, + 34 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MSR_LOADING, + 36 => VmxExitReasonBasic::MWAIT, + 37 => VmxExitReasonBasic::MONITOR_TRAP_FLAG, + 39 => VmxExitReasonBasic::MONITOR, + 40 => VmxExitReasonBasic::PAUSE, + 41 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT, + 43 => VmxExitReasonBasic::TPR_BELOW_THRESHOLD, + 44 => VmxExitReasonBasic::APIC_ACCESS, + 45 => VmxExitReasonBasic::VIRTUALIZED_EOI, + 46 => VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR, + 47 => VmxExitReasonBasic::ACCESS_LDTR_OR_TR, + 48 => VmxExitReasonBasic::EPT_VIOLATION, + 49 => VmxExitReasonBasic::EPT_MISCONFIG, + 50 => VmxExitReasonBasic::INVEPT, + 51 => VmxExitReasonBasic::RDTSCP, + 52 => VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED, + 53 => VmxExitReasonBasic::INVVPID, + 54 => VmxExitReasonBasic::WBINVD, + 55 => VmxExitReasonBasic::XSETBV, + 56 => VmxExitReasonBasic::APIC_WRITE, + 57 => VmxExitReasonBasic::RDRAND, + 58 => VmxExitReasonBasic::INVPCID, + 59 => VmxExitReasonBasic::VMFUNC, + 60 => VmxExitReasonBasic::ENCLS, + 61 => VmxExitReasonBasic::RDSEED, + 62 => VmxExitReasonBasic::PML_FULL, + 63 => VmxExitReasonBasic::XSAVES, + 64 
=> VmxExitReasonBasic::XRSTORS, + + 67 => VmxExitReasonBasic::UMWAIT, + 68 => VmxExitReasonBasic::TPAUSE, + 74 => VmxExitReasonBasic::BUS_LOCK, + 75 => VmxExitReasonBasic::NOTIFY, + _ => VmxExitReasonBasic::UNKNOWN, + } + } +} + +#[derive(Debug, PartialEq)] +#[allow(dead_code)] +pub enum ExitFastpathCompletion { + None, + ReenterGuest, + ExitHandled, +} +pub struct VmxExitHandlers {} +// //name 代表暂时不懂含义的(name linux=name DragonOS) +// ExceptionNmi = VmxExitReasonBasic::EXCEPTION_OR_NMI as isize, +// ExternalInterrupt = VmxExitReasonBasic::EXTERNAL_INTERRUPT as isize, +// TripleFault = VmxExitReasonBasic::TRIPLE_FAULT as isize, +// NmiWindow = VmxExitReasonBasic::NMI_WINDOW as isize, +// IoInstruction = VmxExitReasonBasic::IO_INSTRUCTION as isize, +// CrAccess = VmxExitReasonBasic::CR_ACCESS as isize, +// DrAccess = VmxExitReasonBasic::DR_ACCESS as isize, +// Cpuid = VmxExitReasonBasic::CPUID as isize, +// MsrRead = VmxExitReasonBasic::RDMSR as isize, +// MsrWrite = VmxExitReasonBasic::WRMSR as isize, +// InterruptWindow = VmxExitReasonBasic::INTERRUPT_WINDOW as isize, +// Hlt = VmxExitReasonBasic::HLT as isize, +// Invd = VmxExitReasonBasic::INVD as isize, +// Invlpg = VmxExitReasonBasic::INVLPG as isize, +// Rdpmc = VmxExitReasonBasic::RDPMC as isize, +// Vmcall = VmxExitReasonBasic::VMCALL as isize, +// Vmclear = VmxExitReasonBasic::VMCLEAR as isize, +// Vmlaunch = VmxExitReasonBasic::VMLAUNCH as isize, +// Vmptrld = VmxExitReasonBasic::VMPTRLD as isize, +// Vmptrst = VmxExitReasonBasic::VMPTRST as isize, +// Vmread = VmxExitReasonBasic::VMREAD as isize, +// Vmresume = VmxExitReasonBasic::VMRESUME as isize, +// Vmwrite = VmxExitReasonBasic::VMWRITE as isize, +// Vmoff = VmxExitReasonBasic::VMXOFF as isize, +// Vmon = VmxExitReasonBasic::VMXON as isize, +// TprBelowThreshold = VmxExitReasonBasic::TPR_BELOW_THRESHOLD as isize, +// ApicAccess = VmxExitReasonBasic::APIC_ACCESS as isize, +// ApicWrite = VmxExitReasonBasic::APIC_WRITE as isize, +// EoiInduced = VmxExitReasonBasic::VIRTUALIZED_EOI as isize, //name +// Wbinvd = VmxExitReasonBasic::WBINVD as isize, +// Xsetbv = VmxExitReasonBasic::XSETBV as isize, +// TaskSwitch = VmxExitReasonBasic::TASK_SWITCH as isize, +// MceDuringVmentry = VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT as isize, //name +// GdtrIdtr = VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR as isize, +// LdtrTr = VmxExitReasonBasic::ACCESS_LDTR_OR_TR as isize, +// EptViolation = VmxExitReasonBasic::EPT_VIOLATION as isize, +// EptMisconfig = VmxExitReasonBasic::EPT_MISCONFIG as isize, +// PauseInstruction = VmxExitReasonBasic::PAUSE as isize, +// MwaitInstruction = VmxExitReasonBasic::MWAIT as isize, +// MonitorTrapFlag = VmxExitReasonBasic::MONITOR_TRAP_FLAG as isize, +// MonitorInstruction = VmxExitReasonBasic::MONITOR as isize, +// Invept = VmxExitReasonBasic::INVEPT as isize, +// Invvpid = VmxExitReasonBasic::INVVPID as isize, +// Rdrand = VmxExitReasonBasic::RDRAND as isize, +// Rdseed = VmxExitReasonBasic::RDSEED as isize, +// PmlFull = VmxExitReasonBasic::PML_FULL as isize, +// Invpcid = VmxExitReasonBasic::INVPCID as isize, +// Vmfunc = VmxExitReasonBasic::VMFUNC as isize, +// PreemptionTimer = VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED as isize, +// Encls = VmxExitReasonBasic::ENCLS as isize, +// BusLock = VmxExitReasonBasic::BUS_LOCK as isize, +// Notify = VmxExitReasonBasic::NOTIFY as isize, +// Unknown, + +impl VmxExitHandlers { + #[inline(never)] + pub fn try_handle_exit( + vcpu: &mut VirtCpu, + vm: &Vm, + basic: VmxExitReasonBasic, + ) -> Option> { 
+        // let exit_reason = vmx_vmread(VmcsFields::VMEXIT_EXIT_REASON as u32).unwrap() as u32;
+        // let exit_basic_reason = exit_reason & 0x0000_ffff;
+        // let guest_rip = vmx_vmread(VmcsFields::GUEST_RIP as u32).unwrap();
+        // let _guest_rflags = vmx_vmread(VmcsFields::GUEST_RFLAGS as u32).unwrap();
+        match basic {
+            VmxExitReasonBasic::IO_INSTRUCTION => {
+                return Some(Self::handle_io(vcpu));
+            }
+            VmxExitReasonBasic::EPT_VIOLATION => {
+                let r = Some(Self::handle_ept_violation(vcpu, vm));
+                debug();
+                r
+            }
+            VmxExitReasonBasic::EXTERNAL_INTERRUPT => {
+                return Some(Self::handle_external_interrupt(vcpu));
+            }
+            VmxExitReasonBasic::EXCEPTION_OR_NMI => {
+                todo!()
+            }
+            _ => None,
+        }
+    }
+
+    fn handle_io(_vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
+        todo!();
+    }
+
+    fn handle_external_interrupt(vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
+        vcpu.stat.irq_exits += 1;
+        Ok(1)
+    }
+
+    fn handle_ept_violation(vcpu: &mut VirtCpu, vm: &Vm) -> Result<i32, SystemError> {
+        let exit_qualification = vcpu.get_exit_qual(); // 0x184
+        // An EPT violation can be triggered while executing `iret` from an NMI;
+        // the "blocked by NMI" bit must be set before the next VM entry.
+        // Some errata can leave the bit cleared: AAK134, BY25.
+        let vmx = vcpu.vmx();
+        if vmx.idt_vectoring_info.bits() & IntrInfo::INTR_INFO_VALID_MASK.bits() != 0
+            && vmx_info().enable_vnmi
+            && exit_qualification & IntrInfo::INTR_INFO_UNBLOCK_NMI.bits() as u64 != 0
+        {
+            VmxAsm::vmx_vmwrite(guest::INTERRUPTIBILITY_STATE, 0x8); // GUEST_INTR_STATE_NMI
+        }
+        let gpa = VmxAsm::vmx_vmread(ro::GUEST_PHYSICAL_ADDR_FULL);
+        // let exit_qualification = VmxAsm::vmx_vmread(ro::EXIT_QUALIFICATION);
+        // trace_kvm_page_fault(vcpu, gpa, exit_qualification);
+
+        // Derive the page-fault error code from the kind of violation
+        let mut error_code = if exit_qualification & (EptViolationExitQual::ACC_READ.bits()) != 0 {
+            // debug!("error_code::ACC_READ");
+            PageFaultErr::PFERR_USER.bits()
+        } else {
+            0
+        };
+        error_code |= if exit_qualification & (EptViolationExitQual::ACC_WRITE.bits()) != 0 {
+            // debug!("error_code::ACC_WRITE");
+            PageFaultErr::PFERR_WRITE.bits()
+        } else {
+            0
+        };
+        error_code |= if exit_qualification & (EptViolationExitQual::ACC_INSTR.bits()) != 0 {
+            // active
+            // debug!("error_code::ACC_INSTR");
+            PageFaultErr::PFERR_FETCH.bits()
+        } else {
+            0
+        };
+        error_code |= if exit_qualification & (EptViolationExitQual::RWX_MASK.bits()) != 0 {
+            // debug!("error_code::RWX_MASK");
+            PageFaultErr::PFERR_PRESENT.bits()
+        } else {
+            0
+        };
+        if exit_qualification & (EptViolationExitQual::GVA_IS_VALID.bits()) != 0 {
+            // for debugging
+            // debug!("GVA is valid");
+        } else {
+            // debug!("GVA is invalid");
+        }
+        error_code |= if exit_qualification & (EptViolationExitQual::GVA_TRANSLATED.bits()) != 0 {
+            // debug!("error_code:GVA GVA_TRANSLATED");
+            PageFaultErr::PFERR_GUEST_FINAL.bits() // active
+        } else {
+            PageFaultErr::PFERR_GUEST_PAGE.bits()
+        };
+        // FIXME: error_code is 0x100000011 at this point
+
+        vcpu.arch.exit_qual = exit_qualification;
+
+        // Check whether the GPA exceeds the physical-memory limit, since that is
+        // a guest page fault. We would have to emulate the instruction here,
+        // because if the illegal address is that of a paging structure,
+        // EPT_VIOLATION_ACC_WRITE is set. Alternatively, where supported, we
+        // could reconstruct the page-fault error code from the advanced VM-exit
+        // information for EPT violations.
+        // if allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa) {
+        //     return kvm_emulate_instruction(vcpu, 0);
+        // }
+        // debug!("EPT violation: error_code={:#x}", error_code);
+        vcpu.page_fault(vm, gpa, error_code, None, 0)
+    }
+}
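+// Bring-up scratchpad for the exit path: every read below is intentionally left
+// commented out and can be re-enabled one by one to dump VMCS state after an
+// exit. The bare numbers in the comments (3, 0, 0x64042, ...) record values
+// observed while debugging.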
+fn debug() {
+    // // 3
+    // let info = VmxAsm::vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32);
+    // debug!("vmexit handler: VMEXIT_INSTR_LEN: 0x{:x}!", info);
+
+    // // 0
+    // let info = VmxAsm::vmx_vmread(VmcsFields::VMEXIT_INSTR_INFO as u32);
+    // debug!("vmexit handler: VMEXIT_INSTR_INFO: 0x{:x}!", info);
+
+    // // 0x64042
+    // /* 0x64042:
+    //    In binary, 0x64042 is 0b110_0100_0000_0100_0010.
+    //    Each bit stands for one exception vector (divide-by-zero, debug,
+    //    non-maskable interrupt, breakpoint, and so on).
+    //
+    //    From vmx_update_exception_bitmap we can see the specific exceptions set:
+    //
+    //    PF_VECTOR: page fault
+    //    UD_VECTOR: undefined opcode
+    //    MC_VECTOR: machine check
+    //    DB_VECTOR: debug
+    //    AC_VECTOR: alignment check
+    //
+    //    The value 0x64042 sets exactly the bits for these exceptions, meaning
+    //    that any of them occurring in the guest causes a VM exit. */
+    // let info = VmxAsm::vmx_vmread(control::EXCEPTION_BITMAP);
+    // debug!("vmexit handler: EXCEPTION_BITMAP: 0x{:x}!", info);
+
+    // // 9
+    // let info = VmxAsm::vmx_vmread(control::PAGE_FAULT_ERR_CODE_MASK);
+    // debug!("vmexit handler: PAGE_FAULT_ERR_CODE_MASK: 0x{:x}!", info);
+
+    // // 1
+    // let info = VmxAsm::vmx_vmread(control::PAGE_FAULT_ERR_CODE_MATCH);
+    // debug!("vmexit handler: PAGE_FAULT_ERR_CODE_MATCH: 0x{:x}!", info);
+
+    // // 0
+    // let info = VmxAsm::vmx_vmread(control::EPTP_LIST_ADDR_FULL);
+    // debug!("vmexit handler: EPTP_LIST_ADDR_FULL: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::VM_INSTRUCTION_ERROR);
+    // debug!("vmexit handler: VM_INSTRUCTION_ERROR: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::EXIT_REASON);
+    // debug!("vmexit handler: EXIT_REASON: 0x{:x}!", info); // EPT VIOLATION
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INTERRUPTION_INFO);
+    // debug!("vmexit handler: VMEXIT_INTERRUPTION_INFO: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INTERRUPTION_ERR_CODE);
+    // debug!("vmexit handler: VMEXIT_INTERRUPTION_ERR_CODE: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::IDT_VECTORING_INFO);
+    // debug!("vmexit handler: IDT_VECTORING_INFO: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::IDT_VECTORING_ERR_CODE);
+    // debug!("vmexit handler: IDT_VECTORING_ERR_CODE: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INSTRUCTION_LEN);
+    // debug!("vmexit handler: VMEXIT_INSTRUCTION_LEN: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INSTRUCTION_INFO);
+    // debug!("vmexit handler: VMEXIT_INSTRUCTION_INFO: 0x{:x}!", info);
+
+    // panics
+    // let info = VmxAsm::vmx_vmread(control::EPTP_INDEX);
+    // debug!("vmexit handler: EPTP_INDEX: 0x{:x}!", info);
+
+    // panics
+    // let info = VmxAsm::vmx_vmread(control::VIRT_EXCEPTION_INFO_ADDR_FULL);
+    // debug!("vmexit handler: VIRT_EXCEPTION_INFO_ADDR_FULL: 0x{:x}!", info);
+}
diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs
new file mode 100644
index 00000000..e6e361c7
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs
@@ -0,0 +1,3775 @@
+use core::intrinsics::likely;
+use core::intrinsics::unlikely;
+use core::sync::atomic::{AtomicBool, Ordering};
+use exit::VmxExitHandlers;
+use log::debug;
+use log::error;
+use log::warn;
+use x86_64::registers::control::Cr3Flags;
+use x86_64::structures::paging::PhysFrame;
+
+use crate::arch::process::table::USER_DS;
+use crate::arch::vm::mmu::kvm_mmu::KvmMmu;
+use crate::arch::vm::uapi::kvm_exit;
+use crate::arch::vm::uapi::{
+    AC_VECTOR, BP_VECTOR, DB_VECTOR, GP_VECTOR, MC_VECTOR, NM_VECTOR, PF_VECTOR, UD_VECTOR,
+};
+use crate::arch::vm::vmx::vmcs::VmcsIntrHelper;
+use crate::libs::spinlock::SpinLockGuard;
+use crate::mm::VirtAddr;
+use crate::process::ProcessManager;
+use crate::virt::vm::kvm_host::vcpu::GuestDebug;
+use crate::{
+    arch::{
+        vm::{
+            asm::KvmX86Asm,
+            kvm_host::{vcpu::VirtCpuRequest, X86KvmArch},
+            vmx::vmcs::vmx_area,
+        },
+        CurrentIrqArch, MMArch, VirtCpuArch,
+    },
+    exception::InterruptArch,
+    libs::spinlock::SpinLock,
+    mm::{
+        percpu::{PerCpu, PerCpuVar},
+        MemoryManagementArch,
+    },
smp::{core::smp_get_processor_id, cpu::ProcessorId}, + virt::vm::{kvm_dev::kvm_init, kvm_host::vcpu::VirtCpu, user_api::UapiKvmSegment}, +}; +use alloc::{alloc::Global, boxed::Box, collections::LinkedList, sync::Arc, vec::Vec}; +use asm::VMX_EPTP_AD_ENABLE_BIT; +use asm::VMX_EPTP_MT_WB; +use asm::VMX_EPTP_PWL_4; +use asm::VMX_EPTP_PWL_5; +use bitfield_struct::bitfield; +use bitmap::{traits::BitMapOps, AllocBitmap}; +use raw_cpuid::CpuId; +use system_error::SystemError; +use x86::controlregs::{cr2, cr2_write}; +use x86::dtables::ldtr; +use x86::msr::wrmsr; +use x86::segmentation::load_ds; +use x86::segmentation::load_es; +use x86::segmentation::{ds, es, fs, gs}; +use x86::vmx::vmcs::ro; +use x86::{ + bits64::rflags::RFlags, + controlregs::{cr0, cr4, Cr0, Cr4, Xcr0}, + msr::{self, rdmsr}, + segmentation::{self}, + vmx::vmcs::{ + control::{ + self, EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, + }, + guest, host, + }, +}; +use x86_64::registers::control::Cr3; +use x86_64::{instructions::tables::sidt, registers::control::EferFlags}; + +use crate::{ + arch::{ + vm::{vmx::vmcs::feat::VmxFeat, x86_kvm_manager_mut, McgCap}, + KvmArch, + }, + libs::rwlock::RwLock, + virt::vm::kvm_host::Vm, +}; + +use self::exit::ExitFastpathCompletion; +use self::exit::VmxExitReason; +use self::exit::VmxExitReasonBasic; +use self::vmcs::LoadedVmcs; +use self::{ + capabilities::{ProcessorTraceMode, VmcsConfig, VmxCapability}, + vmcs::{ + current_loaded_vmcs_list_mut, current_vmcs, current_vmcs_mut, ControlsType, + LockedLoadedVmcs, VMControlStructure, VmxMsrBitmapAccess, VmxMsrBitmapAction, + PERCPU_LOADED_VMCS_LIST, PERCPU_VMCS, VMXAREA, + }, +}; + +use super::asm::IntrInfo; +use super::asm::SegmentCacheField; +use super::kvm_host::vcpu::KvmIntrType; +use super::kvm_host::RMODE_TSS_SIZE; +use super::x86_kvm_ops; +use super::{ + asm::{VcpuSegment, VmxAsm, VmxMsrEntry}, + init_kvm_arch, + kvm_host::{KvmFunc, KvmInitFunc, KvmIrqChipMode, KvmReg, MsrFilterType, NotifyVmExitFlags}, + x86_kvm_manager, KvmArchManager, +}; + +pub mod asm; +pub mod capabilities; +pub mod ept; +pub mod exit; +pub mod vmcs; + +extern "C" { + fn vmx_vmexit(); +} + +pub struct VmxKvmInitFunc; + +impl VmxKvmInitFunc { + pub fn setup_per_cpu(&self) { + let mut vmcs_areas = Vec::new(); + vmcs_areas.resize(PerCpu::MAX_CPU_NUM as usize, VMControlStructure::new()); + unsafe { VMXAREA = PerCpuVar::new(vmcs_areas) }; + + let mut percpu_current_vmcs = Vec::new(); + percpu_current_vmcs.resize(PerCpu::MAX_CPU_NUM as usize, None); + unsafe { PERCPU_VMCS = PerCpuVar::new(percpu_current_vmcs) } + + let mut percpu_loaded_vmcs_lists = Vec::new(); + percpu_loaded_vmcs_lists.resize(PerCpu::MAX_CPU_NUM as usize, LinkedList::new()); + unsafe { PERCPU_LOADED_VMCS_LIST = PerCpuVar::new(percpu_loaded_vmcs_lists) } + } +} + +impl KvmInitFunc for VmxKvmInitFunc { + #[allow(clippy::borrow_interior_mutable_const)] + #[inline(never)] + fn hardware_setup(&self) -> Result<(), SystemError> { + let idt = sidt(); + let cpuid = CpuId::new(); + let cpu_extend_feature = cpuid + .get_extended_processor_and_feature_identifiers() + .ok_or(SystemError::ENOSYS)?; + + let mut vmx_init: Box = unsafe { + Box::try_new_zeroed_in(Global) + .map_err(|_| SystemError::ENOMEM)? 
+                .assume_init()
+        };
+
+        vmx_init.init();
+
+        vmx_init.host_idt_base = idt.base.as_u64();
+        Vmx::set_up_user_return_msrs();
+
+        Vmx::setup_vmcs_config(&mut vmx_init.vmcs_config, &mut vmx_init.vmx_cap)?;
+
+        let manager = x86_kvm_manager_mut();
+        let kvm_cap = &mut manager.kvm_caps;
+
+        if vmx_init.has_mpx() {
+            kvm_cap.supported_xcr0 &= !(Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE);
+        }
+
+        // Decide whether VPID stays enabled
+        if !vmx_init.has_vpid()
+            || !vmx_init.has_invvpid()
+            || !vmx_init.has_invvpid_single()
+            || !vmx_init.has_invvpid_global()
+        {
+            vmx_init.enable_vpid = false;
+        }
+
+        if !vmx_init.has_ept()
+            || !vmx_init.has_ept_4levels()
+            || !vmx_init.has_ept_mt_wb()
+            || !vmx_init.has_invept_global()
+        {
+            vmx_init.enable_ept = false;
+        }
+
+        // If EPT is disabled, the CPU must support Execute Disable (NX).
+        // Execute Disable is a CPU feature that prevents code from being
+        // executed out of data memory regions.
+        if !vmx_init.enable_ept && !cpu_extend_feature.has_execute_disable() {
+            error!("[KVM] NX (Execute Disable) not supported");
+            return Err(SystemError::ENOSYS);
+        }
+
+        if !vmx_init.has_ept_ad_bits() || !vmx_init.enable_ept {
+            vmx_init.enable_ept_ad = false;
+        }
+
+        if !vmx_init.has_unrestricted_guest() || !vmx_init.enable_ept {
+            vmx_init.enable_unrestricted_guest = false;
+        }
+
+        if !vmx_init.has_flexproirity() {
+            vmx_init.enable_flexpriority = false;
+        }
+
+        if !vmx_init.has_virtual_nmis() {
+            vmx_init.enable_vnmi = false;
+        }
+
+        if !vmx_init.has_encls_vmexit() {
+            vmx_init.enable_sgx = false;
+        }
+
+        if !vmx_init.enable_flexpriority {
+            VmxKvmFunc::CONFIG.write().have_set_apic_access_page_addr = false;
+        }
+
+        if !vmx_init.has_tpr_shadow() {
+            VmxKvmFunc::CONFIG.write().have_update_cr8_intercept = false;
+        }
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#8501 - 8513
+
+        if !vmx_init.has_ple() {
+            vmx_init.ple_gap = 0;
+            vmx_init.ple_window = 0;
+            vmx_init.ple_window_grow = 0;
+            vmx_init.ple_window_max = 0;
+            vmx_init.ple_window_shrink = 0;
+        }
+
+        if !vmx_init.has_apicv() {
+            vmx_init.enable_apicv = false;
+        }
+
+        if !vmx_init.enable_apicv {
+            // TODO: set sync_pir_to_irr
+        }
+
+        if !vmx_init.enable_apicv || !vmx_init.has_ipiv() {
+            vmx_init.enable_ipiv = false;
+        }
+
+        if vmx_init.has_tsc_scaling() {
+            kvm_cap.has_tsc_control = true;
+        }
+
+        kvm_cap.max_tsc_scaling_ratio = 0xffffffffffffffff;
+        kvm_cap.tsc_scaling_ratio_frac_bits = 48;
+        kvm_cap.has_bus_lock_exit = vmx_init.has_bus_lock_detection();
+        kvm_cap.has_notify_vmexit = vmx_init.has_notify_vmexit();
+
+        // vmx_init.vpid_bitmap.lock().set_all(false);
+
+        if vmx_init.enable_ept {
+            // TODO: mmu_set_ept_masks
+            warn!("mmu_set_ept_masks TODO!");
+        }
+
+        warn!("vmx_setup_me_spte_mask TODO!");
+
+        KvmMmu::kvm_configure_mmu(
+            vmx_init.enable_ept,
+            0,
+            vmx_init.get_max_ept_level(),
+            vmx_init.ept_cap_to_lpage_level(),
+        );
+
+        if !vmx_init.enable_ept || !vmx_init.enable_ept_ad || !vmx_init.has_pml() {
+            vmx_init.enable_pml = false;
+        }
+
+        if !vmx_init.enable_pml {
+            // TODO: Set cpu dirty log size
+        }
+
+        if !vmx_init.has_preemption_timer() {
+            vmx_init.enable_preemption_timer = false;
+        }
+
+        if vmx_init.enable_preemption_timer {
+            // TODO
+        }
+
+        if !vmx_init.enable_preemption_timer {
+            // TODO
+        }
+
+        kvm_cap
+            .supported_mce_cap
+            .insert(McgCap::MCG_LMCE_P | McgCap::MCG_CMCI_P);
+
+        // TODO: pt_mode
+
+        // TODO: setup_default_sgx_lepubkeyhash
+
+        // TODO: nested
+
+        // TODO: vmx_set_cpu_caps
+        init_vmx(vmx_init);
+        self.setup_per_cpu();
+
+        warn!("hardware setup finish");
+        Ok(())
+    }
+
+    fn handle_intel_pt_intr(&self) -> u32 {
+        todo!()
+    }
+
+    fn
runtime_funcs(&self) -> &'static dyn super::kvm_host::KvmFunc { + &VmxKvmFunc + } +} + +#[derive(Debug)] +pub struct VmxKvmFunc; + +pub struct VmxKvmFuncConfig { + pub have_set_apic_access_page_addr: bool, + pub have_update_cr8_intercept: bool, +} + +impl VmxKvmFunc { + #[allow(clippy::declare_interior_mutable_const)] + pub const CONFIG: RwLock = RwLock::new(VmxKvmFuncConfig { + have_set_apic_access_page_addr: true, + have_update_cr8_intercept: true, + }); + + pub fn vcpu_load_vmcs( + vcpu: &mut VirtCpu, + cpu: ProcessorId, + _buddy: Option>, + ) { + let vmx = vcpu.vmx(); + let already_loaded = vmx.loaded_vmcs.lock().cpu == cpu; + + if !already_loaded { + Self::loaded_vmcs_clear(&vmx.loaded_vmcs); + let _irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() }; + + current_loaded_vmcs_list_mut().push_back(vmx.loaded_vmcs.clone()); + } + + if let Some(prev) = current_vmcs() { + let vmcs = vmx.loaded_vmcs.lock().vmcs.clone(); + if !Arc::ptr_eq(&vmcs, prev) { + VmxAsm::vmcs_load(vmcs.phys_addr()); + *current_vmcs_mut() = Some(vmcs); + + // TODO:buddy barrier? + } + } else { + let vmcs = vmx.loaded_vmcs.lock().vmcs.clone(); + VmxAsm::vmcs_load(vmcs.phys_addr()); + *current_vmcs_mut() = Some(vmcs); + + // TODO:buddy barrier? + } + + if !already_loaded { + let mut pseudo_descriptpr: x86::dtables::DescriptorTablePointer = + Default::default(); + unsafe { + x86::dtables::sgdt(&mut pseudo_descriptpr); + }; + + vmx.loaded_vmcs.lock().cpu = cpu; + let id = vmx.loaded_vmcs.lock().vmcs.lock().revision_id(); + debug!( + "revision_id {id} req {:?}", + VirtCpuRequest::KVM_REQ_TLB_FLUSH + ); + vcpu.request(VirtCpuRequest::KVM_REQ_TLB_FLUSH); + + VmxAsm::vmx_vmwrite( + host::TR_BASE, + KvmX86Asm::get_segment_base( + pseudo_descriptpr.base, + pseudo_descriptpr.limit, + unsafe { x86::task::tr().bits() }, + ), + ); + + VmxAsm::vmx_vmwrite(host::GDTR_BASE, pseudo_descriptpr.base as usize as u64); + + VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, unsafe { + rdmsr(msr::IA32_SYSENTER_ESP) + }); + } + } + + pub fn loaded_vmcs_clear(loaded_vmcs: &Arc) { + let mut guard = loaded_vmcs.lock(); + if guard.cpu == ProcessorId::INVALID { + return; + } + + if guard.cpu == smp_get_processor_id() { + if let Some(vmcs) = current_vmcs() { + if Arc::ptr_eq(vmcs, &guard.vmcs) { + *current_vmcs_mut() = None; + } + } + + VmxAsm::vmclear(guard.vmcs.phys_addr()); + + if let Some(shadow) = &guard.shadow_vmcs { + if guard.launched { + VmxAsm::vmclear(shadow.phys_addr()); + } + } + + let _ = current_loaded_vmcs_list_mut().extract_if(|x| Arc::ptr_eq(x, loaded_vmcs)); + + guard.cpu = ProcessorId::INVALID; + guard.launched = false; + } else { + // 交由对应cpu处理 + todo!() + } + } + + pub fn seg_setup(&self, seg: VcpuSegment) { + let seg_field = &KVM_VMX_SEGMENT_FIELDS[seg as usize]; + + VmxAsm::vmx_vmwrite(seg_field.selector, 0); + VmxAsm::vmx_vmwrite(seg_field.base, 0); + VmxAsm::vmx_vmwrite(seg_field.limit, 0xffff); + + let mut ar = 0x93; + if seg == VcpuSegment::CS { + ar |= 0x08; + } + VmxAsm::vmx_vmwrite(seg_field.ar_bytes, ar); + } +} + +impl KvmFunc for VmxKvmFunc { + fn name(&self) -> &'static str { + "VMX" + } + + fn hardware_enable(&self) -> Result<(), SystemError> { + let vmcs = vmx_area().get().as_ref(); + + debug!("vmcs idx {}", vmcs.abort); + + let phys_addr = + unsafe { MMArch::virt_2_phys(VirtAddr::new(vmcs as *const _ as usize)).unwrap() }; + + // TODO: intel_pt_handle_vmx(1); + + VmxAsm::kvm_cpu_vmxon(phys_addr)?; + + Ok(()) + } + + fn vm_init(&self) -> X86KvmArch { + let vmx_init = vmx_info(); + + let mut arch = 
X86KvmArch::default(); + if vmx_init.ple_gap == 0 { + arch.pause_in_guest = true; + } + + return arch; + } + + fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm) { + VmxVCpuPriv::init(vcpu, vm); + } + + fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: crate::smp::cpu::ProcessorId) { + Self::vcpu_load_vmcs(vcpu, cpu, None); + // TODO: vmx_vcpu_pi_load + } + + fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg) { + vcpu.mark_register_available(reg); + + match reg { + KvmReg::VcpuRegsRsp => { + vcpu.regs[reg as usize] = VmxAsm::vmx_vmread(guest::RSP); + } + KvmReg::VcpuRegsRip => { + vcpu.regs[reg as usize] = VmxAsm::vmx_vmread(guest::RIP); + } + // VCPU_EXREG_PDPTR + KvmReg::NrVcpuRegs => { + if vmx_info().enable_ept { + todo!() + } + } + KvmReg::VcpuExregCr0 => { + let guest_owned = vcpu.cr0_guest_owned_bits; + + vcpu.cr0.remove(guest_owned); + vcpu.cr0.insert( + Cr0::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR0) as usize) & guest_owned, + ); + } + KvmReg::VcpuExregCr3 => { + //当拦截CR3加载时(例如用于影子分页),KVM(Kernel-based Virtual Machine)的CR3会被加载到硬件中,而不是客户机的CR3。 + //暂时先直接读寄存器 + vcpu.cr3 = VmxAsm::vmx_vmread(guest::CR3); + //todo!() + } + KvmReg::VcpuExregCr4 => { + let guest_owned = vcpu.cr4_guest_owned_bits; + + vcpu.cr4.remove(guest_owned); + vcpu.cr4.insert( + Cr4::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR4) as usize) & guest_owned, + ); + } + _ => { + todo!() + } + } + } + + fn apicv_pre_state_restore(&self, _vcpu: &mut VirtCpu) { + // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#6924 + // TODO: pi + // todo!() + } + + fn set_msr(&self, vcpu: &mut VirtCpu, msr: super::asm::MsrData) -> Result<(), SystemError> { + let vmx = vcpu.vmx_mut(); + let msr_index = msr.index; + let data = msr.data; + + match msr_index { + msr::IA32_EFER => { + todo!("IA32_EFER") + } + + msr::IA32_FS_BASE => { + todo!("IA32_FS_BASE") + } + + msr::IA32_GS_BASE => { + todo!("IA32_GS_BASE") + } + + msr::IA32_KERNEL_GSBASE => { + todo!("IA32_KERNEL_GSBASE") + } + + 0x000001c4 => { + todo!("MSR_IA32_XFD") + } + + msr::IA32_SYSENTER_CS => { + todo!("IA32_SYSENTER_CS") + } + + msr::IA32_SYSENTER_EIP => { + todo!("IA32_SYSENTER_EIP") + } + + msr::IA32_SYSENTER_ESP => { + todo!("IA32_SYSENTER_ESP") + } + + msr::IA32_DEBUGCTL => { + todo!("IA32_DEBUGCTL") + } + + msr::MSR_C1_PMON_EVNT_SEL0 => { + todo!("MSR_IA32_BNDCFGS") + } + + 0xe1 => { + todo!("MSR_IA32_UMWAIT_CONTROL ") + } + + 0x48 => { + todo!("MSR_IA32_SPEC_CTRL") + } + + msr::MSR_IA32_TSX_CTRL => { + todo!("MSR_IA32_TSX_CTRL") + } + + msr::IA32_PAT => { + todo!("IA32_PAT") + } + + 0x4d0 => { + todo!("MSR_IA32_MCG_EXT_CTL") + } + + msr::IA32_FEATURE_CONTROL => { + todo!("IA32_FEATURE_CONTROL") + } + + 0x8c..=0x8f => { + todo!("MSR_IA32_SGXLEPUBKEYHASH0 ... 
MSR_IA32_SGXLEPUBKEYHASH3 {msr_index}") + } + + msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => { + todo!("msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC") + } + + msr::MSR_IA32_RTIT_CTL => { + todo!("MSR_IA32_RTIT_CTL") + } + + msr::MSR_IA32_RTIT_STATUS => { + todo!("MSR_IA32_RTIT_STATUS") + } + + msr::MSR_IA32_RTIT_OUTPUT_BASE => { + todo!("MSR_IA32_RTIT_OUTPUT_BASE") + } + + 0x572 => { + todo!("MSR_IA32_RTIT_CR3_MATCH") + } + + msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS => { + todo!("MSR_IA32_RTIT_OUTPUT_MASK_PTRS") + } + + msr::MSR_IA32_ADDR0_START..=msr::MSR_IA32_ADDR3_END => { + todo!("msr::MSR_IA32_ADDR0_START..=msr::MSR_IA32_ADDR3_END") + } + + msr::MSR_PERF_CAPABILITIES => { + todo!("MSR_PERF_CAPABILITIES") + } + + _ => { + let uret_msr = vmx.find_uret_msr(msr_index); + + if let Some((idx, _msr)) = uret_msr { + vmx.set_guest_uret_msr(idx, data)?; + vmx.set_uret_msr(msr_index, data); + } else { + vcpu.arch.set_msr_common(&msr); + }; + } + } + + if msr_index == 0x10a { + // MSR_IA32_ARCH_CAPABILITIES + todo!() + } + + Ok(()) + } + + fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool) { + if !init_event { + vmx_info_mut().vmx_reset_vcpu(vcpu, vm) + } + vcpu.kvm_set_cr8(0); + + let vmx = vcpu.vmx_mut(); + vmx.rmode.vm86_active = false; + vmx.spec_ctrl = 0; + vmx.msr_ia32_umwait_control = 0; + vmx.hv_deadline_tsc = u64::MAX; + + vmx.segment_cache_clear(); + + vcpu.arch.mark_register_available(KvmReg::VcpuExregSegments); + + self.seg_setup(VcpuSegment::CS); + VmxAsm::vmx_vmwrite(guest::CS_SELECTOR, 0xf000); + VmxAsm::vmx_vmwrite(guest::CS_BASE, 0xffff0000); + + self.seg_setup(VcpuSegment::DS); + self.seg_setup(VcpuSegment::ES); + self.seg_setup(VcpuSegment::FS); + self.seg_setup(VcpuSegment::GS); + self.seg_setup(VcpuSegment::SS); + + VmxAsm::vmx_vmwrite(guest::TR_SELECTOR, 0); + VmxAsm::vmx_vmwrite(guest::TR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::TR_LIMIT, 0xffff); + VmxAsm::vmx_vmwrite(guest::TR_ACCESS_RIGHTS, 0x008b); + + VmxAsm::vmx_vmwrite(guest::LDTR_SELECTOR, 0); + VmxAsm::vmx_vmwrite(guest::LDTR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::LDTR_LIMIT, 0xffff); + VmxAsm::vmx_vmwrite(guest::LDTR_ACCESS_RIGHTS, 0x00082); + + VmxAsm::vmx_vmwrite(guest::GDTR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::GDTR_LIMIT, 0xffff); + + VmxAsm::vmx_vmwrite(guest::IDTR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::IDTR_LIMIT, 0xffff); + + VmxAsm::vmx_vmwrite(guest::ACTIVITY_STATE, 0); + VmxAsm::vmx_vmwrite(guest::INTERRUPTIBILITY_STATE, 0); + VmxAsm::vmx_vmwrite(guest::PENDING_DBG_EXCEPTIONS, 0); + + if x86_kvm_manager().mpx_supported() { + VmxAsm::vmx_vmwrite(guest::IA32_BNDCFGS_FULL, 0); + } + + VmxAsm::vmx_vmwrite(control::VMENTRY_INTERRUPTION_INFO_FIELD, 0); + + vcpu.request(VirtCpuRequest::MAKE_KVM_REQ_APIC_PAGE_RELOAD); + + vmx_info().vpid_sync_context(vcpu.vmx().vpid); + + warn!("TODO: vmx_update_fb_clear_dis"); + } + + fn set_rflags(&self, vcpu: &mut VirtCpu, mut rflags: x86::bits64::rflags::RFlags) { + if vcpu.is_unrestricted_guest() { + vcpu.arch.mark_register_available(KvmReg::VcpuExregRflags); + vcpu.vmx_mut().rflags = rflags; + VmxAsm::vmx_vmwrite(guest::RFLAGS, rflags.bits()); + return; + } + + let old_rflags = self.get_rflags(vcpu); + + let vmx = vcpu.vmx_mut(); + + vmx.rflags = rflags; + if vmx.rmode.vm86_active { + vmx.rmode.save_rflags = rflags; + rflags.insert(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + } + + VmxAsm::vmx_vmwrite(guest::RFLAGS, rflags.bits()); + + if (old_rflags ^ vmx.rflags).contains(RFlags::FLAGS_VM) { + let emulation_required = vmx_info().emulation_required(vcpu); + 
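+            // Toggling RFLAGS.VM moves the vCPU into or out of the vm86
+            // real-mode emulation path, so whether instruction emulation is
+            // required has to be re-evaluated before the next entry.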
vcpu.vmx_mut().emulation_required = emulation_required; + } + } + + fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: x86::controlregs::Cr0) { + let old_cr0_pg = vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING); + let mut hw_cr0 = cr0 & (!(Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE)); + + if vmx_info().enable_unrestricted_guest { + hw_cr0.insert(Cr0::CR0_NUMERIC_ERROR); + } else { + hw_cr0 + .insert(Cr0::CR0_NUMERIC_ERROR | Cr0::CR0_ENABLE_PAGING | Cr0::CR0_PROTECTED_MODE); + + if !vmx_info().enable_ept { + hw_cr0.insert(Cr0::CR0_WRITE_PROTECT); + } + + if vcpu.vmx().rmode.vm86_active && cr0.contains(Cr0::CR0_PROTECTED_MODE) { + vmx_info().enter_pmode(vcpu); + } + + if !vcpu.vmx().rmode.vm86_active && !cr0.contains(Cr0::CR0_PROTECTED_MODE) { + vmx_info().enter_rmode(vcpu, vm); + } + } + + VmxAsm::vmx_vmwrite(control::CR0_READ_SHADOW, cr0.bits() as u64); + VmxAsm::vmx_vmwrite(guest::CR0, hw_cr0.bits() as u64); + + vcpu.arch.cr0 = cr0; + + vcpu.arch.mark_register_available(KvmReg::VcpuExregCr0); + + if vcpu.arch.efer.contains(EferFlags::LONG_MODE_ENABLE) { + if old_cr0_pg.is_empty() && cr0.contains(Cr0::CR0_ENABLE_PAGING) { + todo!("enter lmode todo"); + } else if !old_cr0_pg.is_empty() && !cr0.contains(Cr0::CR0_ENABLE_PAGING) { + todo!("exit lmode todo"); + } + } + + if vmx_info().enable_ept && !vmx_info().enable_unrestricted_guest { + todo!() + } + + vcpu.vmx_mut().emulation_required = vmx_info().emulation_required(vcpu); + } + + fn set_cr4(&self, vcpu: &mut VirtCpu, cr4_flags: x86::controlregs::Cr4) { + let old_cr4 = vcpu.arch.read_cr4_bits(Cr4::all()); + + let mut hw_cr4 = (unsafe { cr4() } & Cr4::CR4_ENABLE_MACHINE_CHECK) + | (cr4_flags & (!Cr4::CR4_ENABLE_MACHINE_CHECK)); + + if vmx_info().enable_unrestricted_guest { + hw_cr4.insert(Cr4::CR4_ENABLE_VMX); + } else if vcpu.vmx().rmode.vm86_active { + hw_cr4.insert(Cr4::CR4_ENABLE_PAE | Cr4::CR4_ENABLE_VMX | Cr4::CR4_ENABLE_VME); + } else { + hw_cr4.insert(Cr4::CR4_ENABLE_PAE | Cr4::CR4_ENABLE_VMX); + } + + if vmx_info().vmx_umip_emulated() { + if cr4_flags.contains(Cr4::CR4_ENABLE_UMIP) { + vcpu.vmx().loaded_vmcs().controls_set( + ControlsType::SecondaryExec, + SecondaryControls::DTABLE_EXITING.bits() as u64, + ); + hw_cr4.remove(Cr4::CR4_ENABLE_UMIP); + } else if !vcpu.arch.is_guest_mode() { + vcpu.vmx().loaded_vmcs().controls_clearbit( + ControlsType::SecondaryExec, + SecondaryControls::DTABLE_EXITING.bits() as u64, + ); + } + } + + vcpu.arch.cr4 = cr4_flags; + vcpu.arch.mark_register_available(KvmReg::VcpuExregCr4); + + if !vmx_info().enable_unrestricted_guest { + if vmx_info().enable_ept { + if vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty() { + hw_cr4.remove(Cr4::CR4_ENABLE_PAE); + hw_cr4.insert(Cr4::CR4_ENABLE_PSE); + } else if !cr4_flags.contains(Cr4::CR4_ENABLE_PAE) { + hw_cr4.remove(Cr4::CR4_ENABLE_PAE); + } + } + + if vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty() { + hw_cr4.remove( + Cr4::CR4_ENABLE_SMEP | Cr4::CR4_ENABLE_SMAP | Cr4::CR4_ENABLE_PROTECTION_KEY, + ); + } + } + + VmxAsm::vmx_vmwrite(control::CR4_READ_SHADOW, cr4_flags.bits() as u64); + VmxAsm::vmx_vmwrite(guest::CR4, hw_cr4.bits() as u64); + + if (cr4_flags ^ old_cr4).contains(Cr4::CR4_ENABLE_OS_XSAVE | Cr4::CR4_ENABLE_PROTECTION_KEY) + { + // TODO: update_cpuid_runtime + } + } + + fn set_efer(&self, vcpu: &mut VirtCpu, efer: x86_64::registers::control::EferFlags) { + if vcpu.vmx().find_uret_msr(msr::IA32_EFER).is_none() { + return; + } + + vcpu.arch.efer = efer; + if efer.contains(EferFlags::LONG_MODE_ACTIVE) { + 
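+            // Keep the VM-entry controls in sync with the guest's EFER.LMA:
+            // the "IA-32e mode guest" entry control is what actually puts the
+            // CPU into long mode on the next VM entry.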
vcpu.vmx().loaded_vmcs().controls_setbit( + ControlsType::VmEntry, + EntryControls::IA32E_MODE_GUEST.bits().into(), + ); + } else { + vcpu.vmx().loaded_vmcs().controls_clearbit( + ControlsType::VmEntry, + EntryControls::IA32E_MODE_GUEST.bits().into(), + ); + } + + vmx_info().setup_uret_msrs(vcpu); + } + + fn update_exception_bitmap(&self, vcpu: &mut VirtCpu) { + let mut eb = (1u32 << PF_VECTOR) + | (1 << UD_VECTOR) + | (1 << MC_VECTOR) + | (1 << DB_VECTOR) + | (1 << AC_VECTOR); + + if vmx_info().enable_vmware_backdoor { + eb |= 1 << GP_VECTOR; + } + + if vcpu.guest_debug & (GuestDebug::ENABLE | GuestDebug::USE_SW_BP) + == (GuestDebug::ENABLE | GuestDebug::USE_SW_BP) + { + eb |= 1 << BP_VECTOR; + } + + if vcpu.vmx().rmode.vm86_active { + eb = !0; + } + + if !vmx_info().vmx_need_pf_intercept(vcpu) { + eb &= !(1 << PF_VECTOR); + } + + if vcpu.arch.is_guest_mode() { + todo!() + } else { + let mut mask = PageFaultErr::empty(); + let mut match_code = PageFaultErr::empty(); + if vmx_info().enable_ept && (eb & (1 << PF_VECTOR) != 0) { + mask = PageFaultErr::PFERR_PRESENT | PageFaultErr::PFERR_RSVD; + match_code = PageFaultErr::PFERR_PRESENT; + } + + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MASK, mask.bits); + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MATCH, match_code.bits); + } + + if vcpu.arch.xfd_no_write_intercept { + eb |= 1 << NM_VECTOR; + } + + VmxAsm::vmx_vmwrite(control::EXCEPTION_BITMAP, eb as u64); + } + + fn has_emulated_msr(&self, msr: u32) -> bool { + match msr { + msr::IA32_SMBASE => { + return vmx_info().enable_unrestricted_guest + || vmx_info().emulate_invalid_guest_state; + } + + msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => { + return vmx_info().nested; + } + + 0xc001011f | 0xc0000104 => { + // MSR_AMD64_VIRT_SPEC_CTRL | MSR_AMD64_TSC_RATIO + return false; + } + + _ => { + return true; + } + } + } + + fn get_msr_feature(&self, msr: &mut super::asm::VmxMsrEntry) -> bool { + match msr.index { + msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => { + if !vmx_info().nested { + return false; + } + + match vmx_info().vmcs_config.nested.get_vmx_msr(msr.index) { + Some(data) => { + msr.data = data; + return true; + } + None => { + return false; + } + } + } + _ => { + return false; + } + } + } + + fn get_rflags(&self, vcpu: &mut VirtCpu) -> x86::bits64::rflags::RFlags { + if !vcpu.arch.is_register_available(KvmReg::VcpuExregRflags) { + vcpu.arch.mark_register_available(KvmReg::VcpuExregRflags); + let mut rflags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS)); + if vcpu.vmx_mut().rmode.vm86_active { + rflags.remove(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + let save_rflags = vcpu.vmx_mut().rmode.save_rflags; + rflags.insert(save_rflags & !(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM)); + } + + vcpu.vmx_mut().rflags = rflags; + } + + return vcpu.vmx_mut().rflags; + } + + fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError> { + if vm.arch.irqchip_mode != KvmIrqChipMode::None || !vmx_info().enable_ipiv { + return Ok(()); + } + + let kvm_vmx = vm.kvm_vmx_mut(); + + if kvm_vmx.pid_table.is_some() { + return Ok(()); + } + + kvm_vmx.pid_table = Some(unsafe { Box::new_zeroed().assume_init() }); + Ok(()) + } + + fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment) { + vcpu.vmx_mut().emulation_required = vmx_info().emulation_required(vcpu); + *var = vmx_info()._vmx_set_segment(vcpu, *var, seg); + } + + fn get_segment( + &self, + vcpu: &mut VirtCpu, + var: UapiKvmSegment, + seg: VcpuSegment, + ) -> UapiKvmSegment { + return 
vmx_info().vmx_get_segment(vcpu, var, seg); + } + + fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut x86::dtables::DescriptorTablePointer) { + dt.limit = VmxAsm::vmx_vmread(guest::IDTR_LIMIT) as u16; + dt.base = VmxAsm::vmx_vmread(guest::IDTR_BASE) as usize as *const _; + } + + fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &x86::dtables::DescriptorTablePointer) { + VmxAsm::vmx_vmwrite(guest::IDTR_LIMIT, dt.limit as u64); + VmxAsm::vmx_vmwrite(guest::IDTR_BASE, dt.base as usize as u64); + } + + fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut x86::dtables::DescriptorTablePointer) { + dt.limit = VmxAsm::vmx_vmread(guest::GDTR_LIMIT) as u16; + dt.base = VmxAsm::vmx_vmread(guest::GDTR_BASE) as usize as *const _; + } + + fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &x86::dtables::DescriptorTablePointer) { + VmxAsm::vmx_vmwrite(guest::GDTR_LIMIT, dt.limit as u64); + VmxAsm::vmx_vmwrite(guest::GDTR_BASE, dt.base as usize as u64); + } + + fn is_vaild_cr0(&self, vcpu: &VirtCpu, _cr0: Cr0) -> bool { + if vcpu.arch.is_guest_mode() { + todo!() + } + + // TODO: 判断vmx->nested->vmxon + + true + } + + fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool { + if cr4.contains(Cr4::CR4_ENABLE_VMX) && vcpu.arch.is_smm() { + return false; + } + + // TODO: 判断vmx->nested->vmxon + + return true; + } + + fn post_set_cr3(&self, _vcpu: &VirtCpu, _cr3: u64) { + // Do Nothing + } + + fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion { + if unlikely(vmx_info().enable_vnmi && vcpu.vmx().loaded_vmcs().soft_vnmi_blocked) { + todo!() + } + + if unlikely(vcpu.vmx().emulation_required) { + todo!() + } + + if vcpu.vmx().ple_window_dirty { + vcpu.vmx_mut().ple_window_dirty = false; + VmxAsm::vmx_vmwrite(control::PLE_WINDOW, vcpu.vmx().ple_window as u64); + } + + if vcpu.arch.is_register_dirty(KvmReg::VcpuRegsRsp) { + VmxAsm::vmx_vmwrite(guest::RSP, vcpu.arch.regs[KvmReg::VcpuRegsRsp as usize]); + } + if vcpu.arch.is_register_dirty(KvmReg::VcpuRegsRip) { + VmxAsm::vmx_vmwrite(guest::RIP, vcpu.arch.regs[KvmReg::VcpuRegsRip as usize]); + } + + vcpu.arch.clear_dirty(); + + let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); + if unlikely(cr3 != vcpu.vmx().loaded_vmcs().host_state.cr3) { + let cr3_combined: u64 = + (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); + vcpu.vmx().loaded_vmcs().host_state.cr3 = cr3; + } + + let cr4 = unsafe { cr4() }; + if unlikely(cr4 != vcpu.vmx().loaded_vmcs().host_state.cr4) { + VmxAsm::vmx_vmwrite(host::CR4, cr4.bits() as u64); + vcpu.vmx().loaded_vmcs().host_state.cr4 = cr4; + } + + // TODO: set_debugreg + + if vcpu.guest_debug.contains(GuestDebug::SINGLESTEP) { + todo!() + } + + vcpu.load_guest_xsave_state(); + + // TODO: pt_guest_enter + + // TODO: atomic_switch_perf_msrs + + if vmx_info().enable_preemption_timer { + // todo!() + warn!("vmx_update_hv_timer TODO"); + } + + Vmx::vmx_vcpu_enter_exit(vcpu, vcpu.vmx().vmx_vcpu_run_flags()); + + unsafe { + load_ds(USER_DS); + load_es(USER_DS); + }; + + // TODO: pt_guest_exit + + // TODO: kvm_load_host_xsave_state + + if vcpu.arch.is_guest_mode() { + todo!() + } + + if unlikely(vcpu.vmx().fail != 0) { + return ExitFastpathCompletion::None; + } + + if unlikely( + vcpu.vmx().exit_reason.basic() + == VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT as u16, + ) { + todo!() + } + + if unlikely(vcpu.vmx().exit_reason.failed_vmentry()) { + return ExitFastpathCompletion::None; + } + + vcpu.vmx().loaded_vmcs().launched = true; + + // TODO: 处理中断 + + if 
vcpu.arch.is_guest_mode() { + return ExitFastpathCompletion::None; + } + + return Vmx::vmx_exit_handlers_fastpath(vcpu); + } + + fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu) { + // let cpu = smp_get_processor_id(); + let vmx = vcpu.vmx_mut(); + vmx.req_immediate_exit = false; + + if !vmx.guest_uret_msrs_loaded { + vmx.guest_uret_msrs_loaded = true; + + for (idx, msr) in vmx.guest_uret_msrs.iter().enumerate() { + if msr.load_into_hardware { + x86_kvm_manager().kvm_set_user_return_msr(idx, msr.data, msr.mask); + } + } + } + + // TODO: nested + + if vmx.guest_state_loaded { + return; + } + + // fixme: 这里读的是当前cpu的gsbase,正确安全做法应该为将gsbase设置为percpu变量 + let gs_base = unsafe { rdmsr(msr::IA32_KERNEL_GSBASE) }; + + let current = ProcessManager::current_pcb(); + let mut pcb_arch = current.arch_info_irqsave(); + + let fs_sel = fs().bits(); + let gs_sel = gs().bits(); + + unsafe { + pcb_arch.save_fsbase(); + pcb_arch.save_gsbase(); + } + + let fs_base = pcb_arch.fsbase(); + vmx.msr_host_kernel_gs_base = pcb_arch.gsbase() as u64; + + unsafe { wrmsr(msr::IA32_KERNEL_GSBASE, vmx.msr_guest_kernel_gs_base) }; + + let mut loaded_vmcs = vmx.loaded_vmcs(); + let host_state = &mut loaded_vmcs.host_state; + host_state.ldt_sel = unsafe { ldtr() }.bits(); + + host_state.ds_sel = ds().bits(); + host_state.es_sel = es().bits(); + + host_state.set_host_fsgs(fs_sel, gs_sel, fs_base, gs_base as usize); + drop(loaded_vmcs); + + vmx.guest_state_loaded = true; + } + + fn flush_tlb_all(&self, vcpu: &mut VirtCpu) { + if vmx_info().enable_ept { + VmxAsm::ept_sync_global(); + } else if vmx_info().has_invvpid_global() { + VmxAsm::sync_vcpu_global(); + } else { + VmxAsm::sync_vcpu_single(vcpu.vmx().vpid); + // TODO: 嵌套:VmxAsm::sync_vcpu_single(vcpu.vmx().nested.vpid02); + } + } + + fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu) { + if vcpu.vmx().emulation_required { + return; + } + + let basic = VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()); + + if basic == VmxExitReasonBasic::EXTERNAL_INTERRUPT { + Vmx::handle_external_interrupt_irqoff(vcpu); + } else if basic == VmxExitReasonBasic::EXCEPTION_OR_NMI { + //todo!() + } + } + + fn handle_exit( + //vmx_handle_exit + &self, + vcpu: &mut VirtCpu, + vm: &Vm, + fastpath: ExitFastpathCompletion, + ) -> Result { + let r = vmx_info().vmx_handle_exit(vcpu, vm, fastpath); + + if vcpu.vmx().exit_reason.bus_lock_detected() { + todo!() + } + + r + } + + fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, _vm: &Vm, root_hpa: u64, root_level: u32) { + let guest_cr3; + let eptp; + + if vmx_info().enable_ept { + eptp = vmx_info().construct_eptp(vcpu, root_hpa, root_level); + + VmxAsm::vmx_vmwrite(control::EPTP_FULL, eptp); + + if !vmx_info().enable_unrestricted_guest + && !vcpu.arch.cr0.contains(Cr0::CR0_ENABLE_PAGING) + { + todo!() + } else if vcpu.arch.is_register_dirty(KvmReg::VcpuExregCr3) { + guest_cr3 = vcpu.arch.cr3; + debug!("load_mmu_pgd: guest_cr3 = {:#x}", guest_cr3); + } else { + return; + } + } else { + todo!(); + } + vcpu.load_pdptrs(); + VmxAsm::vmx_vmwrite(guest::CR3, guest_cr3); + } +} + +static mut VMX: Option = None; + +#[inline] +pub fn vmx_info() -> &'static Vmx { + unsafe { VMX.as_ref().unwrap() } +} + +#[inline] +pub fn vmx_info_mut() -> &'static mut Vmx { + unsafe { VMX.as_mut().unwrap() } +} + +#[inline(never)] +pub fn init_vmx(vmx: Box) { + static INIT_ONCE: AtomicBool = AtomicBool::new(false); + if INIT_ONCE + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .is_ok() + { + unsafe { VMX = Some(*vmx) }; + } else { + panic!("init_vmx can 
only be called once"); + } +} + +#[derive(Debug)] +pub struct Vmx { + pub host_idt_base: u64, + pub vmcs_config: VmcsConfig, + pub vmx_cap: VmxCapability, + pub vpid_bitmap: SpinLock, + pub enable_vpid: bool, + pub enable_ept: bool, + pub enable_ept_ad: bool, + pub enable_unrestricted_guest: bool, + pub emulate_invalid_guest_state: bool, + pub enable_flexpriority: bool, + pub enable_vnmi: bool, + pub enable_sgx: bool, + pub enable_apicv: bool, + pub enable_ipiv: bool, + pub enable_pml: bool, + pub enable_preemption_timer: bool, + + pub enable_vmware_backdoor: bool, + + pub nested: bool, + + pub ple_gap: u32, + pub ple_window: u32, + pub ple_window_grow: u32, + pub ple_window_max: u32, + pub ple_window_shrink: u32, + + pub pt_mode: ProcessorTraceMode, +} + +impl Vmx { + fn init(&mut self) { + let mut bitmap = AllocBitmap::new(1 << 16); + + // 0为vpid的非法值 + bitmap.set(0, true); + + self.host_idt_base = Default::default(); + self.vmcs_config = Default::default(); + self.vmx_cap = Default::default(); + self.vpid_bitmap = SpinLock::new(bitmap); + self.enable_vpid = true; + self.enable_ept = true; + self.enable_ept_ad = true; + self.enable_unrestricted_guest = true; + self.enable_flexpriority = true; + self.enable_vnmi = true; + self.enable_sgx = true; + self.ple_gap = 128; + self.ple_window = 4096; + self.ple_window_grow = 2; + self.ple_window_max = u32::MAX; + self.ple_window_shrink = 0; + self.enable_apicv = true; + self.enable_ipiv = true; + self.enable_pml = true; + self.enable_preemption_timer = true; + self.pt_mode = ProcessorTraceMode::System; + self.emulate_invalid_guest_state = true; + + // 目前先不管嵌套虚拟化,后续再实现 + self.nested = false; + self.enable_vmware_backdoor = false; + } + + /* + * Internal error codes that are used to indicate that MSR emulation encountered + * an error that should result in #GP in the guest, unless userspace + * handles it. + */ + #[allow(dead_code)] + pub const KVM_MSR_RET_INVALID: u32 = 2; /* in-kernel MSR emulation #GP condition */ + #[allow(dead_code)] + pub const KVM_MSR_RET_FILTERED: u32 = 3; /* #GP due to userspace MSR filter */ + + pub const MAX_POSSIBLE_PASSTHROUGH_MSRS: usize = 16; + + pub const VMX_POSSIBLE_PASSTHROUGH_MSRS: [u32; Self::MAX_POSSIBLE_PASSTHROUGH_MSRS] = [ + 0x48, // MSR_IA32_SPEC_CTRL + 0x49, // MSR_IA32_PRED_CMD + 0x10b, // MSR_IA32_FLUSH_CMD + msr::IA32_TIME_STAMP_COUNTER, + msr::IA32_FS_BASE, + msr::IA32_GS_BASE, + msr::IA32_KERNEL_GSBASE, + 0x1c4, // MSR_IA32_XFD + 0x1c5, // MSR_IA32_XFD_ERR + msr::IA32_SYSENTER_CS, + msr::IA32_SYSENTER_ESP, + msr::IA32_SYSENTER_EIP, + msr::MSR_CORE_C1_RESIDENCY, + msr::MSR_CORE_C3_RESIDENCY, + msr::MSR_CORE_C6_RESIDENCY, + msr::MSR_CORE_C7_RESIDENCY, + ]; + + /// ### 查看CPU是否支持虚拟化 + #[allow(dead_code)] + pub fn check_vmx_support() -> bool { + let cpuid = CpuId::new(); + // Check to see if CPU is Intel (“GenuineIntel”). 
+ if let Some(vi) = cpuid.get_vendor_info() { + if vi.as_str() != "GenuineIntel" { + return false; + } + } + // Check processor supports for Virtual Machine Extension (VMX) technology + // CPUID.1:ECX.VMX[bit 5] = 1 (Intel Manual: 24.6 Discovering Support for VMX) + if let Some(fi) = cpuid.get_feature_info() { + if !fi.has_vmx() { + return false; + } + } + return true; + } + + #[inline(never)] + pub fn set_up_user_return_msrs() { + const VMX_URET_MSRS_LIST: &[u32] = &[ + msr::IA32_FMASK, + msr::IA32_LSTAR, + msr::IA32_CSTAR, + msr::IA32_EFER, + msr::IA32_TSC_AUX, + msr::IA32_STAR, + // 这个寄存器会出错<,先注释掉 + // MSR_IA32_TSX_CTRL, + ]; + + let manager = x86_kvm_manager_mut(); + for msr in VMX_URET_MSRS_LIST { + manager.add_user_return_msr(*msr); + } + } + + /// 初始化设置vmcs的config + #[inline(never)] + pub fn setup_vmcs_config( + vmcs_config: &mut VmcsConfig, + vmx_cap: &mut VmxCapability, + ) -> Result<(), SystemError> { + const VMCS_ENTRY_EXIT_PAIRS: &[VmcsEntryExitPair] = &[ + VmcsEntryExitPair::new( + EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL, + ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL, + ), + VmcsEntryExitPair::new(EntryControls::LOAD_IA32_PAT, ExitControls::LOAD_IA32_PAT), + VmcsEntryExitPair::new(EntryControls::LOAD_IA32_EFER, ExitControls::LOAD_IA32_EFER), + VmcsEntryExitPair::new( + EntryControls::LOAD_IA32_BNDCFGS, + ExitControls::CLEAR_IA32_BNDCFGS, + ), + VmcsEntryExitPair::new( + EntryControls::LOAD_IA32_RTIT_CTL, + ExitControls::CLEAR_IA32_RTIT_CTL, + ), + ]; + + let mut cpu_based_exec_control = VmxFeat::adjust_primary_controls()?; + + let mut cpu_based_2nd_exec_control = + if cpu_based_exec_control.contains(PrimaryControls::SECONDARY_CONTROLS) { + VmxFeat::adjust_secondary_controls()? + } else { + SecondaryControls::empty() + }; + + if cpu_based_2nd_exec_control.contains(SecondaryControls::VIRTUALIZE_APIC) { + cpu_based_exec_control.remove(PrimaryControls::USE_TPR_SHADOW) + } + + if !cpu_based_exec_control.contains(PrimaryControls::USE_TPR_SHADOW) { + cpu_based_2nd_exec_control.remove( + SecondaryControls::VIRTUALIZE_APIC_REGISTER + | SecondaryControls::VIRTUALIZE_X2APIC + | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY, + ) + } + + let cap = unsafe { rdmsr(msr::IA32_VMX_EPT_VPID_CAP) }; + vmx_cap.set_val_from_msr_val(cap); + + // 不支持ept但是读取到了值 + if !cpu_based_2nd_exec_control.contains(SecondaryControls::ENABLE_EPT) + && !vmx_cap.ept.is_empty() + { + warn!("EPT CAP should not exist if not support. 1-setting enable EPT VM-execution control"); + return Err(SystemError::EIO); + } + + if !cpu_based_2nd_exec_control.contains(SecondaryControls::ENABLE_VPID) + && !vmx_cap.vpid.is_empty() + { + warn!("VPID CAP should not exist if not support. 1-setting enable VPID VM-execution control"); + return Err(SystemError::EIO); + } + + let cpuid = CpuId::new(); + let cpu_extend_feat = cpuid + .get_extended_feature_info() + .ok_or(SystemError::ENOSYS)?; + if !cpu_extend_feat.has_sgx() { + cpu_based_2nd_exec_control.remove(SecondaryControls::ENCLS_EXITING); + } + + let cpu_based_3rd_exec_control = 0; + // if cpu_based_exec_control.contains(SecondaryControls::TERTIARY_CONTROLS) { + // // Self::adjust_vmx_controls64(VmxFeature::IPI_VIRT, IA32_CTLS3) + // todo!() + // } else { + // 0 + // }; + + let vmxexit_control = VmxFeat::adjust_exit_controls()?; + + let pin_based_exec_control = VmxFeat::adjust_pin_based_controls()?; + + // TODO: broken timer? 
+ // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#2676 + + let vmentry_control = VmxFeat::adjust_entry_controls()?; + + for pair in VMCS_ENTRY_EXIT_PAIRS { + let n_ctrl = pair.entry; + let x_ctrl = pair.exit; + + // if !(vmentry_control.bits() & n_ctrl.bits) == !(vmxexit_control.bits() & x_ctrl.bits) { + // continue; + // } + if (vmentry_control.contains(n_ctrl)) == (vmxexit_control.contains(x_ctrl)) { + continue; + } + + warn!( + "Inconsistent VM-Entry/VM-Exit pair, entry = {:?}, exit = {:?}", + vmentry_control & n_ctrl, + vmxexit_control & x_ctrl, + ); + + return Err(SystemError::EIO); + } + + let basic = unsafe { rdmsr(msr::IA32_VMX_BASIC) }; + let vmx_msr_high = (basic >> 32) as u32; + let vmx_msr_low = basic as u32; + + // 64位cpu,VMX_BASIC[48] == 0 + if vmx_msr_high & (1 << 16) != 0 { + return Err(SystemError::EIO); + } + + // 判断是否为写回(WB) + if (vmx_msr_high >> 18) & 15 != 6 { + return Err(SystemError::EIO); + } + + let misc_msr = unsafe { rdmsr(msr::IA32_VMX_MISC) }; + + vmcs_config.size = vmx_msr_high & 0x1fff; + vmcs_config.basic_cap = vmx_msr_high & !0x1fff; + vmcs_config.revision_id = vmx_msr_low; + vmcs_config.pin_based_exec_ctrl = pin_based_exec_control; + vmcs_config.cpu_based_exec_ctrl = cpu_based_exec_control; + vmcs_config.cpu_based_2nd_exec_ctrl = cpu_based_2nd_exec_control; + vmcs_config.cpu_based_3rd_exec_ctrl = cpu_based_3rd_exec_control; + vmcs_config.vmentry_ctrl = vmentry_control; + vmcs_config.vmexit_ctrl = vmxexit_control; + vmcs_config.misc = misc_msr; + + Ok(()) + } + + fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32) -> Result { + let mut ctl = ctl_min | ctl_opt; + let val = unsafe { rdmsr(msr) }; + let low = val as u32; + let high = (val >> 32) as u32; + + ctl &= high; + ctl |= low; + + if ctl_min & !ctl != 0 { + return Err(SystemError::EIO); + } + + return Ok(ctl); + } + #[allow(dead_code)] + fn adjust_vmx_controls64(ctl_opt: u32, msr: u32) -> u32 { + let allow = unsafe { rdmsr(msr) } as u32; + ctl_opt & allow + } + + pub fn alloc_vpid(&self) -> Option { + if !self.enable_vpid { + return None; + } + + let mut bitmap_guard = self.vpid_bitmap.lock(); + + let idx = bitmap_guard.first_false_index(); + if let Some(idx) = idx { + bitmap_guard.set(idx, true); + } + + return idx; + } + #[allow(dead_code)] + pub fn free_vpid(&self, vpid: Option) { + if !self.enable_vpid || vpid.is_none() { + return; + } + + self.vpid_bitmap.lock().set(vpid.unwrap(), false); + } + + pub fn is_valid_passthrough_msr(msr: u32) -> bool { + match msr { + 0x800..0x8ff => { + // x2Apic msr寄存器 + return true; + } + msr::MSR_IA32_RTIT_STATUS + | msr::MSR_IA32_RTIT_OUTPUT_BASE + | msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS + | msr::MSR_IA32_CR3_MATCH + | msr::MSR_LBR_SELECT + | msr::MSR_LASTBRANCH_TOS => { + return true; + } + msr::MSR_IA32_ADDR0_START..msr::MSR_IA32_ADDR3_END => { + return true; + } + 0xdc0..0xddf => { + // MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31 + return true; + } + 0x680..0x69f => { + // MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31 + return true; + } + 0x6c0..0x6df => { + // MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31 + return true; + } + 0x40..0x48 => { + // MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8 + return true; + } + 0x60..0x68 => { + // MSR_LBR_CORE_TO ... 
MSR_LBR_CORE_TO + 8 + return true; + } + _ => { + return Self::possible_passthrough_msr_slot(msr).is_some(); + } + } + } + + pub fn vpid_sync_context(&self, vpid: u16) { + if self.has_invvpid_single() { + VmxAsm::sync_vcpu_single(vpid); + } else if vpid != 0 { + VmxAsm::sync_vcpu_global(); + } + } + + pub fn possible_passthrough_msr_slot(msr: u32) -> Option { + for (idx, val) in Self::VMX_POSSIBLE_PASSTHROUGH_MSRS.iter().enumerate() { + if *val == msr { + return Some(idx); + } + } + + return None; + } + + pub fn tdp_enabled(&self) -> bool { + self.enable_ept + } + + fn setup_l1d_flush(&self) { + // TODO:先这样写 + *L1TF_VMX_MITIGATION.write() = VmxL1dFlushState::NotRequired; + } + + pub fn construct_eptp(&self, vcpu: &mut VirtCpu, root_hpa: u64, root_level: u32) -> u64 { + let mut eptp = VMX_EPTP_MT_WB; + + eptp |= if root_level == 5 { + VMX_EPTP_PWL_5 + } else { + VMX_EPTP_PWL_4 + }; + + if self.enable_ept_ad && !vcpu.arch.is_guest_mode() { + eptp |= VMX_EPTP_AD_ENABLE_BIT; + } + + eptp |= root_hpa; + + return eptp; + } + + fn vmx_reset_vcpu(&mut self, vcpu: &mut VirtCpu, vm: &Vm) { + self.init_vmcs(vcpu, vm); + + if self.nested { + todo!() + } + + // TODO: vcpu_setup_sgx_lepubkeyhash + + // TODO: nested + + vcpu.arch.microcode_version = 0x100000000; + + let vmx = vcpu.vmx_mut(); + vmx.msr_ia32_feature_control_valid_bits = 1 << 0; + + vmx.post_intr_desc.control.set_nv(0xf2); + vmx.post_intr_desc.control.set_sn(true); + } + + fn init_vmcs(&mut self, vcpu: &mut VirtCpu, vm: &Vm) { + let kvm_vmx = vm.kvm_vmx(); + if vmx_info().nested { + todo!() + } + + if vmx_info().has_msr_bitmap() { + debug!( + "msr_bitmap addr 0x{:x}", + vcpu.vmx().vmcs01.lock().msr_bitmap.phys_addr() as u64 + ); + VmxAsm::vmx_vmwrite( + control::MSR_BITMAPS_ADDR_FULL, + vcpu.vmx().vmcs01.lock().msr_bitmap.phys_addr() as u64, + ) + } + + VmxAsm::vmx_vmwrite(guest::LINK_PTR_FULL, u64::MAX); + + let mut loaded_vmcs = vcpu.vmx().loaded_vmcs.lock(); + + loaded_vmcs.controls_set( + ControlsType::Pin, + self.get_pin_based_exec_controls(vcpu).bits() as u64, + ); + + loaded_vmcs.controls_set( + ControlsType::Exec, + self.get_exec_controls(vcpu, &vm.arch).bits() as u64, + ); + + if self.has_sceondary_exec_ctrls() { + loaded_vmcs.controls_set( + ControlsType::SecondaryExec, + self.get_secondary_exec_controls(vcpu, vm).bits() as u64, + ) + } + + if self.has_tertiary_exec_ctrls() { + todo!() + } + + drop(loaded_vmcs); + + if self.enable_apicv && vcpu.arch.lapic_in_kernel() { + VmxAsm::vmx_vmwrite(control::EOI_EXIT0_FULL, 0); + VmxAsm::vmx_vmwrite(control::EOI_EXIT1_FULL, 0); + VmxAsm::vmx_vmwrite(control::EOI_EXIT2_FULL, 0); + VmxAsm::vmx_vmwrite(control::EOI_EXIT3_FULL, 0); + + VmxAsm::vmx_vmwrite(guest::INTERRUPT_STATUS, 0); + + VmxAsm::vmx_vmwrite(control::POSTED_INTERRUPT_NOTIFICATION_VECTOR, 0xf2); + VmxAsm::vmx_vmwrite(control::POSTED_INTERRUPT_DESC_ADDR_FULL, unsafe { + MMArch::virt_2_phys(VirtAddr::new( + &vcpu.vmx().post_intr_desc as *const _ as usize, + )) + .unwrap() + .data() as u64 + }) + } + + if self.enable_apicv && vcpu.arch.lapic_in_kernel() { + // PID_POINTER_TABLE + VmxAsm::vmx_vmwrite(0x2042, unsafe { + MMArch::virt_2_phys(VirtAddr::new(kvm_vmx.pid_table().as_ptr() as usize)) + .unwrap() + .data() as u64 + }); + // LAST_PID_POINTER_INDEX + VmxAsm::vmx_vmwrite(0x08, vm.arch.max_vcpu_ids as u64 - 1); + } + + if !vm.arch.pause_in_guest { + VmxAsm::vmx_vmwrite(control::PLE_GAP, self.ple_gap as u64); + vcpu.vmx_mut().ple_window = self.ple_window; + vcpu.vmx_mut().ple_window_dirty = true; + } + + if vm + .arch + 
.notify_vmexit_flags + .contains(NotifyVmExitFlags::KVM_X86_NOTIFY_VMEXIT_ENABLED) + { + // NOTIFY_WINDOW + VmxAsm::vmx_vmwrite(0x4024, vm.arch.notify_window as u64); + } + + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MASK, 0); + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MATCH, 0); + VmxAsm::vmx_vmwrite(control::CR3_TARGET_COUNT, 0); + + VmxAsm::vmx_vmwrite(host::FS_SELECTOR, 0); + VmxAsm::vmx_vmwrite(host::GS_SELECTOR, 0); + self.set_constant_host_state(vcpu); + + VmxAsm::vmx_vmwrite(host::FS_BASE, 0); + VmxAsm::vmx_vmwrite(host::GS_BASE, 0); + + if self.has_vmfunc() { + VmxAsm::vmx_vmwrite(control::VM_FUNCTION_CONTROLS_FULL, 0); + } + + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_STORE_COUNT, 0); + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, 0); + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_ADDR_FULL, unsafe { + MMArch::virt_2_phys(VirtAddr::new( + vcpu.vmx().msr_autoload.host.val.as_ptr() as *const _ as usize, + )) + .unwrap() + .data() as u64 + }); + VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, 0); + VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_ADDR_FULL, unsafe { + MMArch::virt_2_phys(VirtAddr::new( + vcpu.vmx().msr_autoload.guest.val.as_ptr() as usize + )) + .unwrap() + .data() as u64 + }); + + if self + .vmcs_config + .vmentry_ctrl + .contains(EntryControls::LOAD_IA32_PAT) + { + VmxAsm::vmx_vmwrite(guest::IA32_PAT_FULL, vcpu.arch.pat) //todo + } + + let mut loaded_vmcs = vcpu.vmx().loaded_vmcs.lock(); + loaded_vmcs.controls_set( + ControlsType::VmExit, + self.get_vmexit_controls().bits() as u64, + ); + + loaded_vmcs.controls_set( + ControlsType::VmEntry, + self.get_vmentry_controls().bits() as u64, + ); + + drop(loaded_vmcs); + + vcpu.arch.cr0_guest_owned_bits = self.l1_guest_owned_cr0_bits(); + VmxAsm::vmx_vmwrite( + control::CR0_GUEST_HOST_MASK, + (!vcpu.arch.cr0_guest_owned_bits).bits() as u64, + ); + + self.set_cr4_guest_host_mask(&mut vcpu.arch); + + if vcpu.vmx().vpid != 0 { + VmxAsm::vmx_vmwrite(control::VPID, vcpu.vmx().vpid as u64); + } + + if self.has_xsaves() { + VmxAsm::vmx_vmwrite(control::XSS_EXITING_BITMAP_FULL, 0); + } + + if self.enable_pml { + VmxAsm::vmx_vmwrite(control::PML_ADDR_FULL, unsafe { + MMArch::virt_2_phys(VirtAddr::new(vcpu.vmx().pml_pg.as_ref().as_ptr() as usize)) + .unwrap() + .data() as u64 + }); + + VmxAsm::vmx_vmwrite(guest::PML_INDEX, VmxVCpuPriv::PML_ENTITY_NUM as u64 - 1); + } + + // TODO: vmx_write_encls_bitmap + + if self.pt_mode == ProcessorTraceMode::HostGuest { + todo!() + } + + VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_CS, 0); + VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_ESP, 0); + VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_EIP, 0); + VmxAsm::vmx_vmwrite(guest::IA32_DEBUGCTL_FULL, 0); + + if self.has_tpr_shadow() { + VmxAsm::vmx_vmwrite(control::VIRT_APIC_ADDR_FULL, 0); + if vcpu.arch.lapic_in_kernel() { + VmxAsm::vmx_vmwrite(control::VIRT_APIC_ADDR_FULL, unsafe { + MMArch::virt_2_phys(VirtAddr::new(vcpu.arch.lapic().regs.as_ptr() as usize)) + .unwrap() + .data() as u64 + }); + } + + VmxAsm::vmx_vmwrite(control::TPR_THRESHOLD, 0); + } + + self.setup_uret_msrs(vcpu); + } + + /// 打印VMCS信息用于debug + pub fn dump_vmcs(&self, vcpu: &VirtCpu) { + let vmentry_ctl = unsafe { + EntryControls::from_bits_unchecked(self.vmread(control::VMENTRY_CONTROLS) as u32) + }; + + let vmexit_ctl = unsafe { + ExitControls::from_bits_unchecked(self.vmread(control::VMEXIT_CONTROLS) as u32) + }; + + let cpu_based_exec_ctl = PrimaryControls::from_bits_truncate( + self.vmread(control::PRIMARY_PROCBASED_EXEC_CONTROLS) as u32, + ); + + let 
pin_based_exec_ctl = PinbasedControls::from_bits_truncate( + self.vmread(control::PINBASED_EXEC_CONTROLS) as u32, + ); + + // let cr4 = Cr4::from_bits_truncate(self.vmread(guest::CR4) as usize); + + let secondary_exec_control = if self.has_sceondary_exec_ctrls() { + unsafe { + SecondaryControls::from_bits_unchecked( + self.vmread(control::SECONDARY_PROCBASED_EXEC_CONTROLS) as u32, + ) + } + } else { + SecondaryControls::empty() + }; + + if self.has_tertiary_exec_ctrls() { + todo!() + } + + error!( + "VMCS addr: 0x{:x}, last attempted VM-entry on CPU {:?}", + vcpu.vmx().loaded_vmcs().vmcs.lock().as_ref() as *const _ as usize, + vcpu.arch.last_vmentry_cpu + ); + + error!("--- GUEST STATE ---"); + error!( + "CR0: actual = 0x{:x}, shadow = 0x{:x}, gh_mask = 0x{:x}", + self.vmread(guest::CR0), + self.vmread(control::CR0_READ_SHADOW), + self.vmread(control::CR0_GUEST_HOST_MASK) + ); + error!( + "CR4: actual = 0x{:x}, shadow = 0x{:x}, gh_mask = 0x{:x}", + self.vmread(guest::CR4), + self.vmread(control::CR4_READ_SHADOW), + self.vmread(control::CR4_GUEST_HOST_MASK) + ); + error!("CR3: actual = 0x{:x}", self.vmread(guest::CR3)); + + if self.has_ept() { + error!( + "PDPTR0 = 0x{:x}, PDPTR1 = 0x{:x}", + self.vmread(guest::PDPTE0_FULL), + self.vmread(guest::PDPTE1_FULL) + ); + error!( + "PDPTR2 = 0x{:x}, PDPTR3 = 0x{:x}", + self.vmread(guest::PDPTE2_FULL), + self.vmread(guest::PDPTE3_FULL) + ); + } + error!( + "RSP = 0x{:x}, RIP = 0x{:x}", + self.vmread(guest::RSP), + self.vmread(guest::RIP) + ); + error!( + "RFLAGS = 0x{:x}, DR7 = 0x{:x}", + self.vmread(guest::RFLAGS), + self.vmread(guest::DR7) + ); + error!( + "Sysenter RSP = 0x{:x}, CS:RIP = 0x{:x}:0x{:x}", + self.vmread(guest::IA32_SYSENTER_ESP), + self.vmread(guest::IA32_SYSENTER_CS), + self.vmread(guest::IA32_SYSENTER_EIP), + ); + + self.dump_sel("CS: ", guest::CS_SELECTOR); + self.dump_sel("DS: ", guest::DS_SELECTOR); + self.dump_sel("SS: ", guest::SS_SELECTOR); + self.dump_sel("ES: ", guest::ES_SELECTOR); + self.dump_sel("FS: ", guest::FS_SELECTOR); + self.dump_sel("GS: ", guest::GS_SELECTOR); + + self.dump_dtsel("GDTR: ", guest::GDTR_LIMIT); + self.dump_sel("LDTR: ", guest::LDTR_SELECTOR); + self.dump_dtsel("IDTR: ", guest::IDTR_LIMIT); + self.dump_sel("TR: ", guest::TR_SELECTOR); + + let efer_slot = vcpu + .vmx() + .msr_autoload + .guest + .find_loadstore_msr_slot(msr::IA32_EFER); + + if vmentry_ctl.contains(EntryControls::LOAD_IA32_EFER) { + error!("EFER = 0x{:x}", self.vmread(guest::IA32_EFER_FULL)); + } else if let Some(slot) = efer_slot { + error!( + "EFER = 0x{:x} (autoload)", + vcpu.vmx().msr_autoload.guest.val[slot].data + ); + } else if vmentry_ctl.contains(EntryControls::IA32E_MODE_GUEST) { + error!( + "EFER = 0x{:x} (effective)", + vcpu.arch.efer | (EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE) + ); + } else { + error!( + "EFER = 0x{:x} (effective)", + vcpu.arch.efer & !(EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE) + ); + } + + if vmentry_ctl.contains(EntryControls::LOAD_IA32_PAT) { + error!("PAT = 0x{:x}", self.vmread(guest::IA32_PAT_FULL)); + } + + error!( + "DebugCtl = 0x{:x}, DebugExceptions = 0x{:x}", + self.vmread(guest::IA32_DEBUGCTL_FULL), + self.vmread(guest::PENDING_DBG_EXCEPTIONS) + ); + + if self.has_load_perf_global_ctrl() + && vmentry_ctl.contains(EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL) + { + error!( + "PerfGlobCtl = 0x{:x}", + self.vmread(guest::IA32_PERF_GLOBAL_CTRL_FULL) + ); + } + + if vmentry_ctl.contains(EntryControls::LOAD_IA32_BNDCFGS) { + error!("BndCfgS = 0x{:x}", 
+ self.vmread(guest::IA32_BNDCFGS_FULL));
+        }
+
+        error!(
+            "Interruptibility = 0x{:x}, ActivityState = 0x{:x}",
+            self.vmread(guest::INTERRUPTIBILITY_STATE),
+            self.vmread(guest::ACTIVITY_STATE)
+        );
+
+        if secondary_exec_control.contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY) {
+            error!(
+                "InterruptStatus = 0x{:x}",
+                self.vmread(guest::INTERRUPT_STATUS)
+            );
+        }
+
+        if self.vmread(control::VMENTRY_MSR_LOAD_COUNT) > 0 {
+            self.dump_msrs("guest autoload", &vcpu.vmx().msr_autoload.guest);
+        }
+        if self.vmread(control::VMEXIT_MSR_STORE_COUNT) > 0 {
+            self.dump_msrs("guest autostore", &vcpu.vmx().msr_autostore);
+        }
+
+        error!("\n--- HOST STATE ---");
+        error!(
+            "RIP = 0x{:x}, RSP = 0x{:x}",
+            self.vmread(host::RIP),
+            self.vmread(host::RSP)
+        );
+        error!(
+            "CS = 0x{:x}, SS = 0x{:x}, DS = 0x{:x}, ES = 0x{:x}, FS = 0x{:x}, GS = 0x{:x}, TR = 0x{:x}",
+            self.vmread(host::CS_SELECTOR),
+            self.vmread(host::SS_SELECTOR),
+            self.vmread(host::DS_SELECTOR),
+            self.vmread(host::ES_SELECTOR),
+            self.vmread(host::FS_SELECTOR),
+            self.vmread(host::GS_SELECTOR),
+            self.vmread(host::TR_SELECTOR)
+        );
+        error!(
+            "FSBase = 0x{:x}, GSBase = 0x{:x}, TRBase = 0x{:x}",
+            self.vmread(host::FS_BASE),
+            self.vmread(host::GS_BASE),
+            self.vmread(host::TR_BASE),
+        );
+        error!(
+            "GDTBase = 0x{:x}, IDTBase = 0x{:x}",
+            self.vmread(host::GDTR_BASE),
+            self.vmread(host::IDTR_BASE),
+        );
+        error!(
+            "CR0 = 0x{:x}, CR3 = 0x{:x}, CR4 = 0x{:x}",
+            self.vmread(host::CR0),
+            self.vmread(host::CR3),
+            self.vmread(host::CR4),
+        );
+        error!(
+            "Sysenter RSP = 0x{:x}, CS:RIP = 0x{:x}:0x{:x}",
+            self.vmread(host::IA32_SYSENTER_ESP),
+            self.vmread(host::IA32_SYSENTER_CS),
+            self.vmread(host::IA32_SYSENTER_EIP),
+        );
+
+        if vmexit_ctl.contains(ExitControls::LOAD_IA32_EFER) {
+            error!("EFER = 0x{:x}", self.vmread(host::IA32_EFER_FULL));
+        }
+
+        if vmexit_ctl.contains(ExitControls::LOAD_IA32_PAT) {
+            error!("PAT = 0x{:x}", self.vmread(host::IA32_PAT_FULL));
+        }
+
+        if self.has_load_perf_global_ctrl()
+            && vmexit_ctl.contains(ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL)
+        {
+            error!(
+                "PerfGlobCtl = 0x{:x}",
+                self.vmread(host::IA32_PERF_GLOBAL_CTRL_FULL)
+            );
+        }
+
+        if self.vmread(control::VMEXIT_MSR_LOAD_COUNT) > 0 {
+            self.dump_msrs("host autoload", &vcpu.vmx().msr_autoload.host);
+        }
+
+        error!("\n--- CONTROL STATE ---");
+        error!(
+            "\nCPUBased = {:?},\nSecondaryExec = 0x{:x},\nTertiaryExec = 0(Unused)",
+            cpu_based_exec_ctl, secondary_exec_control,
+        );
+        error!(
+            "\nPinBased = {:?},\nEntryControls = {:?},\nExitControls = {:?}",
+            pin_based_exec_ctl, vmentry_ctl, vmexit_ctl,
+        );
+        error!(
+            "ExceptionBitmap = 0x{:x}, PFECmask = 0x{:x}, PFECmatch = 0x{:x}",
+            self.vmread(control::EXCEPTION_BITMAP),
+            self.vmread(control::PAGE_FAULT_ERR_CODE_MASK),
+            self.vmread(control::PAGE_FAULT_ERR_CODE_MATCH),
+        );
+        error!(
+            "VMEntry: intr_info = 0x{:x}, errcode = 0x{:x}, ilen = 0x{:x}",
+            self.vmread(control::VMENTRY_INTERRUPTION_INFO_FIELD),
+            self.vmread(control::VMENTRY_EXCEPTION_ERR_CODE),
+            self.vmread(control::VMENTRY_INSTRUCTION_LEN),
+        );
+        error!(
+            "VMExit: intr_info = 0x{:x}, errcode = 0x{:x}, ilen = 0x{:x}",
+            self.vmread(ro::VMEXIT_INTERRUPTION_INFO),
+            self.vmread(ro::VMEXIT_INTERRUPTION_ERR_CODE),
+            self.vmread(ro::VMEXIT_INSTRUCTION_LEN),
+        );
+        error!(
+            "        reason = 0x{:x}, qualification = 0x{:x}",
+            self.vmread(ro::EXIT_REASON),
+            self.vmread(ro::EXIT_QUALIFICATION),
+        );
+        error!(
+            "IDTVectoring: info = 0x{:x}, errcode = 0x{:x}",
+            self.vmread(ro::IDT_VECTORING_INFO),
+            self.vmread(ro::IDT_VECTORING_ERR_CODE),
+        );
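+        // NOTE (editor's sketch, not part of the original patch): the
+        // interruption/vectoring info fields dumped above share one encoding:
+        // bits 7:0 hold the vector, bits 10:8 the event type, bit 11 says an
+        // error code is present, and bit 31 marks the field as valid, e.g.:
+        //     let info = self.vmread(ro::IDT_VECTORING_INFO) as u32;
+        //     let valid = info & (1 << 31) != 0;
+        //     let vector = info & 0xff;
+        //     let has_err_code = info & (1 << 11) != 0;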
error!("TSC Offset = 0x{:x}", self.vmread(control::TSC_OFFSET_FULL)); + + if secondary_exec_control.contains(SecondaryControls::USE_TSC_SCALING) { + error!( + "TSC Multiplier = 0x{:x}", + self.vmread(control::TSC_MULTIPLIER_FULL) + ); + } + + if cpu_based_exec_ctl.contains(PrimaryControls::USE_TPR_SHADOW) { + if secondary_exec_control.contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY) { + let status = self.vmread(guest::INTERRUPT_STATUS); + error!("SVI|RVI = 0x{:x}|0x{:x}", status >> 8, status & 0xff); + } + + error!( + "TPR Threshold = 0x{:x}", + self.vmread(control::TPR_THRESHOLD) + ); + if secondary_exec_control.contains(SecondaryControls::VIRTUALIZE_APIC) { + error!( + "APIC-access addr = 0x{:x}", + self.vmread(control::APIC_ACCESS_ADDR_FULL) + ); + } + error!( + "virt-APIC addr = 0x{:x}", + self.vmread(control::VIRT_APIC_ADDR_FULL) + ); + } + + if pin_based_exec_ctl.contains(PinbasedControls::POSTED_INTERRUPTS) { + error!( + "PostedIntrVec = 0x{:x}", + self.vmread(control::POSTED_INTERRUPT_NOTIFICATION_VECTOR) + ); + } + + if secondary_exec_control.contains(SecondaryControls::ENABLE_EPT) { + error!("EPT pointer = 0x{:x}", self.vmread(control::EPTP_FULL)); + } + if secondary_exec_control.contains(SecondaryControls::PAUSE_LOOP_EXITING) { + error!( + "PLE Gap = 0x{:x}, Window = 0x{:x}", + self.vmread(control::PLE_GAP), + self.vmread(control::PLE_WINDOW) + ); + } + if secondary_exec_control.contains(SecondaryControls::ENABLE_VPID) { + error!("Virtual processor ID = 0x{:x}", self.vmread(control::VPID)); + } + } + + pub fn dump_sel(&self, name: &'static str, sel: u32) { + error!( + "{name} sel = 0x{:x}, attr = 0x{:x}, limit = 0x{:x}, base = 0x{:x}", + self.vmread(sel), + self.vmread(sel + guest::ES_ACCESS_RIGHTS - guest::ES_SELECTOR), + self.vmread(sel + guest::ES_LIMIT - guest::ES_SELECTOR), + self.vmread(sel + guest::ES_BASE - guest::ES_SELECTOR), + ); + } + + pub fn dump_dtsel(&self, name: &'static str, limit: u32) { + error!( + "{name} limit = 0x{:x}, base = 0x{:x}", + self.vmread(limit), + self.vmread(limit + guest::GDTR_BASE - guest::GDTR_LIMIT) + ); + } + + pub fn dump_msrs(&self, name: &'static str, msr: &VmxMsrs) { + error!("MSR {name}:"); + for (idx, msr) in msr.val.iter().enumerate() { + error!("{idx}: msr = 0x{:x}, value = 0x{:x}", msr.index, msr.data); + } + } + + #[inline] + pub fn vmread(&self, field: u32) -> u64 { + VmxAsm::vmx_vmread(field) + } + + fn setup_uret_msrs(&self, vcpu: &mut VirtCpu) { + // 是否加载syscall相关msr + let load_syscall_msrs = + vcpu.arch.is_long_mode() && vcpu.arch.efer.contains(EferFlags::SYSTEM_CALL_EXTENSIONS); + + self.setup_uret_msr(vcpu, msr::IA32_STAR, load_syscall_msrs); + self.setup_uret_msr(vcpu, msr::IA32_LSTAR, load_syscall_msrs); + self.setup_uret_msr(vcpu, msr::IA32_FMASK, load_syscall_msrs); + + let load_efer = self.update_transition_efer(vcpu); + self.setup_uret_msr(vcpu, msr::IA32_EFER, load_efer); + + // TODO: MSR_TSC_AUX + + self.setup_uret_msr( + vcpu, + msr::MSR_IA32_TSX_CTRL, + CpuId::default() + .get_extended_feature_info() + .unwrap() + .has_rtm(), + ); + + vcpu.vmx_mut().guest_uret_msrs_loaded = false; + } + + fn setup_uret_msr(&self, vcpu: &mut VirtCpu, msr: u32, load_into_hardware: bool) { + let uret_msr = vcpu.vmx_mut().find_uret_msr_mut(msr); + + if let Some((_idx, msr)) = uret_msr { + msr.load_into_hardware = load_into_hardware; + } + } + + fn update_transition_efer(&self, vcpu: &mut VirtCpu) -> bool { + let mut guest_efer = vcpu.arch.efer; + let mut ignore_efer = EferFlags::empty(); + if !self.enable_ept { + 
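+            // NOTE (editor's sketch, not part of the original patch): without
+            // EPT the guest runs on shadow page tables that depend on NX, so
+            // NO_EXECUTE_ENABLE is forced on in the EFER value the guest
+            // actually runs with. Whether the hardware EFER must differ from
+            // the host's can be seen by comparing the NX bits, as this
+            // function does further down:
+            //     let nx_differs = (vcpu.arch.efer ^ x86_kvm_manager().host_efer)
+            //         .contains(EferFlags::NO_EXECUTE_ENABLE);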
guest_efer.insert(EferFlags::NO_EXECUTE_ENABLE); + } + + ignore_efer.insert(EferFlags::SYSTEM_CALL_EXTENSIONS); + + ignore_efer.insert(EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE); + + if guest_efer.contains(EferFlags::LONG_MODE_ACTIVE) { + ignore_efer.remove(EferFlags::SYSTEM_CALL_EXTENSIONS); + } + + if self.has_load_ia32_efer() + || (self.enable_ept + && (vcpu.arch.efer ^ x86_kvm_manager().host_efer) + .contains(EferFlags::NO_EXECUTE_ENABLE)) + { + if !guest_efer.contains(EferFlags::LONG_MODE_ACTIVE) { + guest_efer.remove(EferFlags::LONG_MODE_ENABLE); + } + + if guest_efer != x86_kvm_manager().host_efer { + vcpu.vmx_mut().add_atomic_switch_msr( + msr::IA32_EFER, + guest_efer.bits(), + x86_kvm_manager().host_efer.bits(), + false, + ); + } else { + vcpu.vmx_mut().clear_atomic_switch_msr(msr::IA32_EFER); + } + + return false; + } + + let idx = x86_kvm_manager().find_user_return_msr_idx(msr::IA32_EFER); + if let Some(i) = idx { + vcpu.vmx_mut().clear_atomic_switch_msr(msr::IA32_EFER); + + guest_efer.remove(ignore_efer); + guest_efer.insert(x86_kvm_manager().host_efer & ignore_efer); + + vcpu.vmx_mut().guest_uret_msrs[i].data = guest_efer.bits(); + vcpu.vmx_mut().guest_uret_msrs[i].mask = (!ignore_efer).bits(); + return true; + } else { + return false; + } + } + + fn set_cr4_guest_host_mask(&self, arch: &mut VirtCpuArch) { + arch.cr4_guest_owned_bits = + x86_kvm_manager().possible_cr4_guest & (!arch.cr4_guest_rsvd_bits); + + if !self.enable_ept { + arch.cr4_guest_owned_bits + .remove(x86_kvm_manager().cr4_tlbflush_bits); + arch.cr4_guest_owned_bits + .remove(x86_kvm_manager().cr4_pdptr_bits); + } + + if arch.is_guest_mode() { + // 嵌套todo + todo!() + } + + VmxAsm::vmx_vmwrite( + control::CR4_GUEST_HOST_MASK, + (!arch.cr4_guest_owned_bits).bits() as u64, + ); + } + + fn l1_guest_owned_cr0_bits(&self) -> Cr0 { + let mut cr0 = x86_kvm_manager().possible_cr0_guest; + + if !self.enable_ept { + cr0.remove(Cr0::CR0_WRITE_PROTECT) + } + + return cr0; + } + + /// 设置在guest生命周期中host不变的部分 + fn set_constant_host_state(&self, vcpu: &mut VirtCpu) { + let loaded_vmcs_host_state = &mut vcpu.vmx().loaded_vmcs.lock().host_state; + + VmxAsm::vmx_vmwrite(host::CR0, unsafe { cr0() }.bits() as u64); + + let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); + let cr3_combined: u64 = + (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); + loaded_vmcs_host_state.cr3 = cr3; + + let cr4 = unsafe { cr4() }; + VmxAsm::vmx_vmwrite(host::CR4, cr4.bits() as u64); + loaded_vmcs_host_state.cr4 = cr4; + + VmxAsm::vmx_vmwrite( + host::CS_SELECTOR, + (segmentation::cs().bits() & (!0x07)).into(), + ); + + VmxAsm::vmx_vmwrite(host::DS_SELECTOR, 0); + VmxAsm::vmx_vmwrite(host::ES_SELECTOR, 0); + + VmxAsm::vmx_vmwrite( + host::SS_SELECTOR, + (segmentation::ds().bits() & (!0x07)).into(), + ); + VmxAsm::vmx_vmwrite( + host::TR_SELECTOR, + (unsafe { x86::task::tr().bits() } & (!0x07)).into(), + ); + + VmxAsm::vmx_vmwrite(host::IDTR_BASE, self.host_idt_base); + VmxAsm::vmx_vmwrite(host::RIP, vmx_vmexit as usize as u64); + + let val = unsafe { rdmsr(msr::IA32_SYSENTER_CS) }; + + // low32 + VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_CS, (val << 32) >> 32); + + // VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, 0); + + let tmp = unsafe { rdmsr(msr::IA32_SYSENTER_EIP) }; + VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_EIP, (tmp << 32) >> 32); + + if self + .vmcs_config + .vmexit_ctrl + .contains(ExitControls::LOAD_IA32_PAT) + { + VmxAsm::vmx_vmwrite(host::IA32_PAT_FULL, 
+ unsafe { rdmsr(msr::IA32_PAT) });
+        }
+
+        if self.has_load_ia32_efer() {
+            VmxAsm::vmx_vmwrite(
+                host::IA32_EFER_FULL,
+                x86_kvm_manager().host_efer.bits() as u64,
+            );
+        }
+    }
+
+    fn get_pin_based_exec_controls(&self, vcpu: &VirtCpu) -> PinbasedControls {
+        let mut ctrls = self.vmcs_config.pin_based_exec_ctrl;
+
+        if !vcpu.arch.vcpu_apicv_active() {
+            ctrls.remove(PinbasedControls::POSTED_INTERRUPTS);
+        }
+
+        if !self.enable_vnmi {
+            ctrls.remove(PinbasedControls::VIRTUAL_NMIS);
+        }
+
+        if !self.enable_preemption_timer {
+            ctrls.remove(PinbasedControls::VMX_PREEMPTION_TIMER);
+        }
+
+        return ctrls;
+    }
+
+    fn get_exec_controls(&self, vcpu: &VirtCpu, vmarch: &KvmArch) -> PrimaryControls {
+        let mut ctrls = self.vmcs_config.cpu_based_exec_ctrl;
+
+        ctrls.remove(
+            PrimaryControls::RDTSC_EXITING
+                | PrimaryControls::USE_IO_BITMAPS
+                | PrimaryControls::MONITOR_TRAP_FLAG
+                | PrimaryControls::PAUSE_EXITING,
+        );
+
+        ctrls.remove(
+            PrimaryControls::NMI_WINDOW_EXITING | PrimaryControls::INTERRUPT_WINDOW_EXITING,
+        );
+
+        ctrls.remove(PrimaryControls::MOV_DR_EXITING);
+
+        // TPR shadowing needs both hardware support and an in-kernel LAPIC;
+        // drop it when either is missing.
+        if !(vcpu.arch.lapic_in_kernel() && self.has_tpr_shadow()) {
+            ctrls.remove(PrimaryControls::USE_TPR_SHADOW);
+        }
+
+        if ctrls.contains(PrimaryControls::USE_TPR_SHADOW) {
+            ctrls.remove(PrimaryControls::CR8_LOAD_EXITING | PrimaryControls::CR8_STORE_EXITING);
+        } else {
+            ctrls.insert(PrimaryControls::CR8_LOAD_EXITING | PrimaryControls::CR8_STORE_EXITING);
+        }
+
+        if self.enable_ept {
+            ctrls.remove(
+                PrimaryControls::CR3_LOAD_EXITING
+                    | PrimaryControls::CR3_STORE_EXITING
+                    | PrimaryControls::INVLPG_EXITING,
+            );
+        }
+
+        if vmarch.mwait_in_guest {
+            ctrls.remove(PrimaryControls::MWAIT_EXITING | PrimaryControls::MONITOR_EXITING);
+        }
+
+        if vmarch.hlt_in_guest {
+            ctrls.remove(PrimaryControls::HLT_EXITING);
+        }
+
+        return ctrls;
+    }
+
+    fn get_secondary_exec_controls(&mut self, vcpu: &VirtCpu, vm: &Vm) -> SecondaryControls {
+        let mut ctrls = self.vmcs_config.cpu_based_2nd_exec_ctrl;
+
+        if self.pt_mode == ProcessorTraceMode::System {
+            ctrls.remove(
+                SecondaryControls::INTEL_PT_GUEST_PHYSICAL | SecondaryControls::CONCEAL_VMX_FROM_PT,
+            );
+        }
+
+        if !(self.enable_flexpriority && vcpu.arch.lapic_in_kernel()) {
+            ctrls.remove(SecondaryControls::VIRTUALIZE_APIC);
+        }
+
+        if vcpu.vmx().vpid == 0 {
+            ctrls.remove(SecondaryControls::ENABLE_VPID);
+        }
+
+        if !self.enable_ept {
+            ctrls.remove(SecondaryControls::ENABLE_EPT);
+            self.enable_unrestricted_guest = false;
+        }
+
+        if !self.enable_unrestricted_guest {
+            ctrls.remove(SecondaryControls::UNRESTRICTED_GUEST);
+        }
+
+        if vm.arch.pause_in_guest {
+            ctrls.remove(SecondaryControls::PAUSE_LOOP_EXITING);
+        }
+        if !vcpu.arch.vcpu_apicv_active() {
+            ctrls.remove(
+                SecondaryControls::VIRTUALIZE_APIC_REGISTER
+                    | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY,
+            );
+        }
+
+        ctrls.remove(SecondaryControls::VIRTUALIZE_X2APIC);
+
+        ctrls.remove(SecondaryControls::ENABLE_VM_FUNCTIONS);
+
+        ctrls.remove(SecondaryControls::DTABLE_EXITING);
+
+        ctrls.remove(SecondaryControls::VMCS_SHADOWING);
+
+        if !self.enable_pml || vm.nr_memslots_dirty_logging == 0 {
+            ctrls.remove(SecondaryControls::ENABLE_PML);
+        }
+
+        // TODO: vmx_adjust_sec_exec_feature
+
+        if self.has_rdtscp() {
+            warn!("adjust RDTSCP todo!");
+            // todo!()
+        }
+
+        return ctrls;
+    }
+
+    fn get_vmexit_controls(&self) -> ExitControls {
+        let mut ctrls = self.vmcs_config.vmexit_ctrl;
+
+        ctrls.remove(
+            ExitControls::SAVE_IA32_PAT
+                | ExitControls::SAVE_IA32_EFER
+                | ExitControls::SAVE_VMX_PREEMPTION_TIMER,
+        );
+
+        if self.pt_mode == 
ProcessorTraceMode::System { + ctrls.remove(ExitControls::CONCEAL_VMX_FROM_PT | ExitControls::CLEAR_IA32_RTIT_CTL); + } + + // todo: cpu_has_perf_global_ctrl_bug + + ctrls.remove(ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL | ExitControls::LOAD_IA32_EFER); + + ctrls + } + + fn get_vmentry_controls(&self) -> EntryControls { + let mut ctrls = self.vmcs_config.vmentry_ctrl; + + if self.pt_mode == ProcessorTraceMode::System { + ctrls.remove(EntryControls::CONCEAL_VMX_FROM_PT | EntryControls::LOAD_IA32_RTIT_CTL); + } + + ctrls.remove( + EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL + | EntryControls::LOAD_IA32_EFER + | EntryControls::IA32E_MODE_GUEST, + ); + + // todo: cpu_has_perf_global_ctrl_bug + + ctrls + } + + pub fn emulation_required(&self, vcpu: &mut VirtCpu) -> bool { + return self.emulate_invalid_guest_state && !self.guest_state_valid(vcpu); + } + + pub fn guest_state_valid(&self, vcpu: &mut VirtCpu) -> bool { + return vcpu.is_unrestricted_guest() || self.__guest_state_valid(vcpu); + } + + pub fn __guest_state_valid(&self, vcpu: &mut VirtCpu) -> bool { + if vcpu.arch.is_portected_mode() + || x86_kvm_ops().get_rflags(vcpu).contains(RFlags::FLAGS_VM) + { + if !self.rmode_segment_valid(vcpu, VcpuSegment::CS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::SS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::DS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::ES) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::FS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::GS) { + return false; + } + } else { + todo!("protected mode guest state checks todo"); + } + + return true; + } + + pub fn vmx_get_segment( + &self, + vcpu: &mut VirtCpu, + mut var: UapiKvmSegment, + seg: VcpuSegment, + ) -> UapiKvmSegment { + if vcpu.vmx().rmode.vm86_active && seg != VcpuSegment::LDTR { + var = vcpu.vmx().rmode.segs[seg as usize]; + if seg == VcpuSegment::TR || var.selector == Vmx::vmx_read_guest_seg_selector(vcpu, seg) + { + return var; + } + + var.base = Vmx::vmx_read_guest_seg_base(vcpu, seg); + var.selector = Vmx::vmx_read_guest_seg_selector(vcpu, seg); + return var; + } + + var.base = Vmx::vmx_read_guest_seg_base(vcpu, seg); + var.limit = Vmx::vmx_read_guest_seg_limit(vcpu, seg); + var.selector = Vmx::vmx_read_guest_seg_selector(vcpu, seg); + + let ar = Vmx::vmx_read_guest_seg_ar(vcpu, seg); + + var.unusable = ((ar >> 16) & 1) as u8; + var.type_ = (ar & 15) as u8; + var.s = ((ar >> 4) & 1) as u8; + var.dpl = ((ar >> 5) & 3) as u8; + + var.present = !var.unusable; + var.avl = ((ar >> 12) & 1) as u8; + var.l = ((ar >> 13) & 1) as u8; + var.db = ((ar >> 14) & 1) as u8; + var.g = ((ar >> 15) & 1) as u8; + + return var; + } + + pub fn _vmx_set_segment( + &self, + vcpu: &mut VirtCpu, + mut var: UapiKvmSegment, + seg: VcpuSegment, + ) -> UapiKvmSegment { + let sf = &KVM_VMX_SEGMENT_FIELDS[seg as usize]; + + vcpu.vmx_mut().segment_cache_clear(); + + if vcpu.vmx().rmode.vm86_active && seg != VcpuSegment::LDTR { + vcpu.vmx_mut().rmode.segs[seg as usize] = var; + if seg == VcpuSegment::TR { + VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64); + } else if var.s != 0 { + Vmx::fix_rmode_seg(seg, &vcpu.vmx().rmode.segs[seg as usize]); + } + return var; + } + + VmxAsm::vmx_vmwrite(sf.base, var.base); + VmxAsm::vmx_vmwrite(sf.limit, var.limit as u64); + VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64); + + if vcpu.is_unrestricted_guest() && seg != VcpuSegment::LDTR { + var.type_ |= 0x1; + } + + 
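+        // NOTE (editor's sketch, not part of the original patch): the VMX
+        // access-rights word written below mirrors the unpacking done in
+        // vmx_get_segment() above: bits 3:0 type, bit 4 S, bits 6:5 DPL,
+        // bit 7 P, bit 12 AVL, bit 13 L, bit 14 D/B, bit 15 G, bit 16
+        // "unusable". A packing sketch under those assumptions:
+        //     let ar = (var.type_ as u32 & 15)
+        //         | ((var.s as u32 & 1) << 4)
+        //         | ((var.dpl as u32 & 3) << 5)
+        //         | ((var.present as u32 & 1) << 7)
+        //         | ((var.avl as u32 & 1) << 12)
+        //         | ((var.l as u32 & 1) << 13)
+        //         | ((var.db as u32 & 1) << 14)
+        //         | ((var.g as u32 & 1) << 15)
+        //         | ((var.unusable as u32 & 1) << 16);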
VmxAsm::vmx_vmwrite(sf.ar_bytes, var.vmx_segment_access_rights() as u64); + return var; + } + + pub fn rmode_segment_valid(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) -> bool { + let mut var = UapiKvmSegment::default(); + var = self.vmx_get_segment(vcpu, var, seg); + + var.dpl = 0x3; + + if seg == VcpuSegment::CS { + var.type_ = 0x3; + } + + let ar = var.vmx_segment_access_rights(); + + if var.base != ((var.selector as u64) << 4) { + return false; + } + + if var.limit != 0xffff { + return false; + } + + if ar != 0xf3 { + return false; + } + + true + } + + pub fn fix_rmode_seg(seg: VcpuSegment, save: &UapiKvmSegment) { + let sf = &KVM_VMX_SEGMENT_FIELDS[seg as usize]; + + let mut var = *save; + var.dpl = 0x3; + if seg == VcpuSegment::CS { + var.type_ = 0x3; + } + + if !vmx_info().emulate_invalid_guest_state { + var.selector = (var.base >> 4) as u16; + var.base &= 0xffff0; + var.limit = 0xffff; + var.g = 0; + var.db = 0; + var.present = 1; + var.s = 1; + var.l = 0; + var.unusable = 0; + var.type_ = 0x3; + var.avl = 0; + if save.base & 0xf != 0 { + warn!("segment base is not paragraph aligned when entering protected mode (seg={seg:?})"); + } + } + + VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64); + VmxAsm::vmx_vmwrite(sf.base, var.base); + VmxAsm::vmx_vmwrite(sf.limit, var.limit as u64); + VmxAsm::vmx_vmwrite(sf.ar_bytes, var.vmx_segment_access_rights() as u64); + } + + pub fn fix_pmode_seg( + &self, + vcpu: &mut VirtCpu, + seg: VcpuSegment, + mut save: UapiKvmSegment, + ) -> UapiKvmSegment { + if self.emulate_invalid_guest_state { + if seg == VcpuSegment::CS || seg == VcpuSegment::SS { + save.selector &= !0x3; + } + + save.dpl = (save.selector & 0x3) as u8; + save.s = 1; + } + + self._vmx_set_segment(vcpu, save, seg); + + return save; + } + + pub fn enter_pmode(&self, vcpu: &mut VirtCpu) { + self.get_segment_with_rmode(vcpu, VcpuSegment::ES); + self.get_segment_with_rmode(vcpu, VcpuSegment::DS); + self.get_segment_with_rmode(vcpu, VcpuSegment::FS); + self.get_segment_with_rmode(vcpu, VcpuSegment::GS); + self.get_segment_with_rmode(vcpu, VcpuSegment::SS); + self.get_segment_with_rmode(vcpu, VcpuSegment::CS); + + vcpu.vmx_mut().rmode.vm86_active = false; + + self.set_segment_with_rmode(vcpu, VcpuSegment::TR); + + let mut flags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS)); + + flags.remove(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + + flags.insert(vcpu.vmx().rmode.save_rflags & (RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM)); + + VmxAsm::vmx_vmwrite(guest::RFLAGS, flags.bits()); + + let cr4 = (Cr4::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR4) as usize) + & (!Cr4::CR4_ENABLE_VME)) + | (Cr4::from_bits_truncate(VmxAsm::vmx_vmread(control::CR4_READ_SHADOW) as usize) + & Cr4::CR4_ENABLE_VME); + VmxAsm::vmx_vmwrite(guest::CR4, cr4.bits() as u64); + + VmxKvmFunc.update_exception_bitmap(vcpu); + + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::CS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::SS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::ES); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::DS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::FS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::GS); + } + + fn fix_pmode_seg_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) { + let segment = vcpu.vmx().rmode.segs[seg as usize]; + vcpu.vmx_mut().rmode.segs[seg as usize] = self.fix_pmode_seg(vcpu, seg, segment); + } + + fn get_segment_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) { + let segment = vcpu.vmx().rmode.segs[seg as usize]; + 
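+        // NOTE (editor's note, not part of the original patch):
+        // rmode_segment_valid() above encodes the vm86 invariants: a
+        // real-mode selector maps linearly to its base (base == selector << 4),
+        // the limit is always 0xffff, and the access rights collapse to 0xf3
+        // (data segment, DPL 3, present). E.g. selector 0xb800 must have
+        // base 0xb8000 to pass the check.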
vcpu.vmx_mut().rmode.segs[seg as usize] = self.vmx_get_segment(vcpu, segment, seg); + } + + fn set_segment_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) { + let segment = vcpu.vmx().rmode.segs[seg as usize]; + vcpu.vmx_mut().rmode.segs[seg as usize] = self._vmx_set_segment(vcpu, segment, seg); + } + + pub fn enter_rmode(&self, vcpu: &mut VirtCpu, vm: &Vm) { + let kvm_vmx = vm.kvm_vmx(); + + self.get_segment_with_rmode(vcpu, VcpuSegment::TR); + self.get_segment_with_rmode(vcpu, VcpuSegment::ES); + self.get_segment_with_rmode(vcpu, VcpuSegment::DS); + self.get_segment_with_rmode(vcpu, VcpuSegment::FS); + self.get_segment_with_rmode(vcpu, VcpuSegment::GS); + self.get_segment_with_rmode(vcpu, VcpuSegment::SS); + self.get_segment_with_rmode(vcpu, VcpuSegment::CS); + + vcpu.vmx_mut().rmode.vm86_active = true; + + vcpu.vmx_mut().segment_cache_clear(); + + VmxAsm::vmx_vmwrite(guest::TR_BASE, kvm_vmx.tss_addr as u64); + VmxAsm::vmx_vmwrite(guest::TR_LIMIT, RMODE_TSS_SIZE as u64 - 1); + VmxAsm::vmx_vmwrite(guest::TR_ACCESS_RIGHTS, 0x008b); + + let mut flags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS)); + vcpu.vmx_mut().rmode.save_rflags = flags; + + flags.insert(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + + VmxAsm::vmx_vmwrite(guest::RFLAGS, flags.bits()); + VmxAsm::vmx_vmwrite( + guest::CR4, + VmxAsm::vmx_vmread(guest::CR4) | Cr4::CR4_ENABLE_VME.bits() as u64, + ); + + VmxKvmFunc.update_exception_bitmap(vcpu); + + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::SS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::CS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::ES); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::DS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::GS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::FS); + } + + fn fix_rmode_seg_with_rmode(&self, vcpu: &VirtCpu, seg: VcpuSegment) { + Vmx::fix_rmode_seg(seg, &vcpu.vmx().rmode.segs[seg as usize]); + } + + pub fn vmx_read_guest_seg_ar(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u32 { + if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::AR) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].ar = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].ar_bytes) as u32; + } + + return vcpu.vmx().segment_cache.seg[seg as usize].ar; + } + + pub fn vmx_read_guest_seg_selector(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u16 { + if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::SEL) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].selector = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].selector) as u16; + } + + return vcpu.vmx().segment_cache.seg[seg as usize].selector; + } + + pub fn vmx_read_guest_seg_base(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u64 { + if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::BASE) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].base = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].base); + } + + return vcpu.vmx().segment_cache.seg[seg as usize].base; + } + + pub fn vmx_read_guest_seg_limit(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u32 { + if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::LIMIT) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].limit = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].limit) as u32; + } + + return vcpu.vmx().segment_cache.seg[seg as usize].limit; + } + + fn vmx_segment_cache_test_set( + vcpu: &mut VirtCpu, + seg: VcpuSegment, + field: SegmentCacheField, + ) -> bool { + let mask = 1u32 << (seg as usize * SegmentCacheField::NR as usize + 
+ field as usize);
+
+        if !vcpu.arch.is_register_available(KvmReg::VcpuExregSegments) {
+            vcpu.arch.mark_register_available(KvmReg::VcpuExregSegments);
+            vcpu.vmx_mut().segment_cache_clear();
+        }
+
+        let ret = vcpu.vmx().segment_cache.bitmask & mask;
+
+        vcpu.vmx_mut().segment_cache.bitmask |= mask;
+
+        return ret != 0;
+    }
+
+    pub fn vmx_vcpu_enter_exit(vcpu: &mut VirtCpu, flags: VmxRunFlag) {
+        // TODO: vmx_l1d_should_flush and mmio_stale_data_clear
+
+        // TODO: vmx_disable_fb_clear
+
+        if vcpu.arch.cr2 != unsafe { cr2() } as u64 {
+            unsafe { cr2_write(vcpu.arch.cr2) };
+        }
+
+        let fail =
+            unsafe { __vmx_vcpu_run(vcpu.vmx(), vcpu.arch.regs.as_ptr(), flags.bits as u32) };
+
+        vcpu.vmx_mut().fail = fail as u8;
+
+        vcpu.arch.cr2 = unsafe { cr2() } as u64;
+        vcpu.arch.regs_avail.set_all(true);
+
+        // These registers must be re-read from the VMCS, so invalidate their cached copies.
+        for reg_idx in Vmx::VMX_REGS_LAZY_LOAD_SET {
+            vcpu.arch.regs_avail.set(*reg_idx, false);
+        }
+
+        vcpu.vmx_mut().idt_vectoring_info = IntrInfo::empty();
+
+        // TODO: enable_fb_clear
+
+        if unlikely(vcpu.vmx().fail != 0) {
+            vcpu.vmx_mut().exit_reason = VmxExitReason::from(0xdead);
+            return;
+        }
+
+        vcpu.vmx_mut().exit_reason =
+            VmxExitReason::from(VmxAsm::vmx_vmread(ro::EXIT_REASON) as u32);
+
+        if likely(!vcpu.vmx().exit_reason.failed_vmentry()) {
+            vcpu.vmx_mut().idt_vectoring_info =
+                IntrInfo::from_bits_truncate(VmxAsm::vmx_vmread(ro::IDT_VECTORING_INFO) as u32);
+        }
+
+        if VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic())
+            == VmxExitReasonBasic::EXCEPTION_OR_NMI
+            && VmcsIntrHelper::is_nmi(&Vmx::vmx_get_intr_info(vcpu))
+        {
+            todo!()
+        }
+    }
+
+    fn vmx_get_intr_info(vcpu: &mut VirtCpu) -> IntrInfo {
+        if !vcpu
+            .arch
+            .test_and_mark_available(KvmReg::VcpuExregExitInfo2)
+        {
+            vcpu.vmx_mut().exit_intr_info = IntrInfo::from_bits_truncate(VmxAsm::vmx_vmread(
+                ro::VMEXIT_INTERRUPTION_INFO,
+            ) as u32);
+        }
+
+        return vcpu.vmx_mut().exit_intr_info;
+    }
+
+    pub fn vmx_exit_handlers_fastpath(vcpu: &mut VirtCpu) -> ExitFastpathCompletion {
+        match VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()) {
+            VmxExitReasonBasic::WRMSR => {
+                todo!()
+            }
+            VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED => {
+                todo!()
+            }
+            _ => ExitFastpathCompletion::None,
+        }
+    }
+
+    pub fn vmx_handle_exit(
+        &self,
+        vcpu: &mut VirtCpu,
+        vm: &Vm,
+        exit_fastpath: ExitFastpathCompletion,
+    ) -> Result<i32, SystemError> {
+        let exit_reason = vcpu.vmx().exit_reason;
+        // self.dump_vmcs(vcpu);
+        {
+            let reason = self.vmread(ro::EXIT_REASON);
+            debug!("vm_exit reason 0x{:x}\n", reason);
+        }
+        let unexpected_vmexit = |vcpu: &mut VirtCpu| -> Result<i32, SystemError> {
+            error!("vmx: unexpected exit reason {:?}\n", exit_reason);
+
+            self.dump_vmcs(vcpu);
+
+            let cpu = vcpu.arch.last_vmentry_cpu.into() as u64;
+            let run = vcpu.kvm_run_mut();
+            run.exit_reason = kvm_exit::KVM_EXIT_INTERNAL_ERROR;
+
+            unsafe {
+                run.__bindgen_anon_1.internal.ndata = 2;
+                run.__bindgen_anon_1.internal.data[0] = Into::<u32>::into(exit_reason) as u64;
+                run.__bindgen_anon_1.internal.data[1] = cpu;
+            }
+
+            return Ok(0);
+        };
+
+        let vectoring_info = vcpu.vmx().idt_vectoring_info;
+
+        if self.enable_pml && !vcpu.arch.is_guest_mode() {
+            todo!()
+        }
+
+        if vcpu.arch.is_guest_mode() {
+            if exit_reason.basic() == VmxExitReasonBasic::PML_FULL as u16 {
+                return unexpected_vmexit(vcpu);
+            }
+
+            todo!()
+        }
+
+        if vcpu.vmx().emulation_required {
+            todo!()
+        }
+
+        if exit_reason.failed_vmentry() {
+            self.dump_vmcs(vcpu);
+            todo!()
+        }
+
+        if unlikely(vcpu.vmx().fail != 0) {
+            self.dump_vmcs(vcpu);
+            todo!()
+        }
+
+        let basic = 
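+        // NOTE (editor's sketch, not part of the original patch): EXIT_REASON
+        // packs the basic reason into bits 15:0 and the entry-failure flag
+        // into bit 31, which is what .basic() and .failed_vmentry() separate:
+        //     let raw: u32 = exit_reason.into();
+        //     let basic_code = raw & 0xffff;
+        //     let failed_entry = raw & (1 << 31) != 0;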
VmxExitReasonBasic::from(exit_reason.basic()); + if vectoring_info.contains(IntrInfo::INTR_INFO_VALID_MASK) + && basic != VmxExitReasonBasic::EXCEPTION_OR_NMI + && basic != VmxExitReasonBasic::EPT_VIOLATION + && basic != VmxExitReasonBasic::PML_FULL + && basic != VmxExitReasonBasic::APIC_ACCESS + && basic != VmxExitReasonBasic::TASK_SWITCH + && basic != VmxExitReasonBasic::NOTIFY + { + todo!() + } + + if unlikely(!self.enable_pml && vcpu.vmx().loaded_vmcs().soft_vnmi_blocked) { + todo!() + } + + if exit_fastpath != ExitFastpathCompletion::None { + return Err(SystemError::EINVAL); + } + + match VmxExitHandlers::try_handle_exit( + vcpu, + vm, + VmxExitReasonBasic::from(exit_reason.basic()), + ) { + Some(Ok(r)) => { + debug!("vmx: handled exit return {:?}\n", r); + return Ok(r); + } + Some(Err(_)) | None => unexpected_vmexit(vcpu), + } + } + + #[allow(unreachable_code)] + pub fn handle_external_interrupt_irqoff(vcpu: &mut VirtCpu) { + let intr_info = Vmx::vmx_get_intr_info(vcpu); + let _vector = intr_info & IntrInfo::INTR_INFO_VECTOR_MASK; + // let desc = vmx_info().host_idt_base + vector.bits() as u64; + if !VmcsIntrHelper::is_external_intr(&intr_info) { + error!("unexpected VM-Exit interrupt info: {:?}", intr_info); + return; + } + + vcpu.arch.kvm_before_interrupt(KvmIntrType::Irq); + // TODO + warn!("handle_external_interrupt_irqoff TODO"); + vcpu.arch.kvm_after_interrupt(); + + vcpu.arch.at_instruction_boundary = true; + } + + /// 需要在缓存中更新的寄存器集。此处未列出的其他寄存器在 VM 退出后立即同步到缓存。 + pub const VMX_REGS_LAZY_LOAD_SET: &'static [usize] = &[ + KvmReg::VcpuRegsRip as usize, + KvmReg::VcpuRegsRsp as usize, + KvmReg::VcpuExregRflags as usize, + KvmReg::NrVcpuRegs as usize, + KvmReg::VcpuExregSegments as usize, + KvmReg::VcpuExregCr0 as usize, + KvmReg::VcpuExregCr3 as usize, + KvmReg::VcpuExregCr4 as usize, + KvmReg::VcpuExregExitInfo1 as usize, + KvmReg::VcpuExregExitInfo2 as usize, + ]; +} + +extern "C" { + /// #[allow(improper_ctypes)]因为只需要在内部调用而无需与C交互 + #[allow(improper_ctypes)] + fn __vmx_vcpu_run(vmx: &VmxVCpuPriv, regs: *const u64, flags: u32) -> i32; +} + +struct VmcsEntryExitPair { + entry: EntryControls, + exit: ExitControls, +} + +impl VmcsEntryExitPair { + pub const fn new(entry: EntryControls, exit: ExitControls) -> Self { + Self { entry, exit } + } +} + +#[derive(Debug, Default)] +#[repr(C, align(64))] +pub struct PostedIntrDesc { + pir: [u32; 8], + control: PostedIntrDescControl, + // 保留位 + rsvd: [u32; 6], +} + +#[bitfield(u64)] +pub struct PostedIntrDescControl { + #[bits(1)] + on: bool, + #[bits(1)] + sn: bool, + #[bits(14)] + rsvd_1: u16, + nv: u8, + rsvd_2: u8, + ndst: u32, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct VmxUretMsr { + load_into_hardware: bool, + data: u64, + mask: u64, +} + +#[derive(Debug, Default)] +pub struct VmxMsrs { + nr: usize, + val: [VmxMsrEntry; Self::MAX_NR_LOADSTORE_MSRS], +} + +impl VmxMsrs { + pub const MAX_NR_LOADSTORE_MSRS: usize = 8; + + pub fn find_loadstore_msr_slot(&self, msr: u32) -> Option { + return (0..self.nr).find(|&i| self.val[i].index == msr); + } +} + +#[derive(Debug, Default)] +pub struct VmxMsrAutoLoad { + guest: VmxMsrs, + host: VmxMsrs, +} + +#[derive(Debug)] +pub struct VmxRMode { + pub vm86_active: bool, + pub save_rflags: RFlags, + pub segs: [UapiKvmSegment; 8], +} + +impl Default for VmxRMode { + fn default() -> Self { + Self { + vm86_active: false, + save_rflags: RFlags::empty(), + segs: [UapiKvmSegment::default(); 8], + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +pub struct VmxSaveSegment { + selector: u16, + 
base: u64, + limit: u32, + ar: u32, +} + +#[derive(Debug, Default)] +pub struct VmxSegmentCache { + pub bitmask: u32, + pub seg: [VmxSaveSegment; 8], +} + +#[derive(Debug)] +#[allow(dead_code)] +pub struct VmxVCpuPriv { + vpid: u16, + + fail: u8, + + exit_reason: VmxExitReason, + + exit_intr_info: IntrInfo, + + idt_vectoring_info: IntrInfo, + + vmcs01: Arc, + loaded_vmcs: Arc, + guest_uret_msrs: [VmxUretMsr; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], + guest_uret_msrs_loaded: bool, + + post_intr_desc: PostedIntrDesc, + + shadow_msr_intercept_read: AllocBitmap, + shadow_msr_intercept_write: AllocBitmap, + + msr_ia32_feature_control: u64, + msr_ia32_feature_control_valid_bits: u64, + + msr_host_kernel_gs_base: u64, + msr_guest_kernel_gs_base: u64, + + emulation_required: bool, + + rflags: RFlags, + + ple_window: u32, + ple_window_dirty: bool, + + msr_autoload: VmxMsrAutoLoad, + msr_autostore: VmxMsrs, + + pml_pg: Box<[u8; MMArch::PAGE_SIZE]>, + + rmode: VmxRMode, + + spec_ctrl: u64, + msr_ia32_umwait_control: u32, + hv_deadline_tsc: u64, + + segment_cache: VmxSegmentCache, + + req_immediate_exit: bool, + guest_state_loaded: bool, + + exit_qualification: u64, //暂时不知道用处fztodo +} + +#[derive(Debug, Default)] +#[allow(dead_code)] +pub struct KvmVmx { + tss_addr: usize, + ept_identity_pagetable_done: bool, + ept_identity_map_addr: u64, + pid_table: Option>, +} + +impl KvmVmx { + pub fn pid_table(&self) -> &[u64; MMArch::PAGE_SIZE] { + self.pid_table.as_ref().unwrap().as_ref() + } +} + +impl VmxVCpuPriv { + pub const PML_ENTITY_NUM: usize = 512; + + pub fn loaded_vmcs(&self) -> SpinLockGuard { + self.loaded_vmcs.lock() + } + + /// 参考:https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#7452 + pub fn init(vcpu: &mut VirtCpu, vm: &Vm) { + let vmcs = LockedLoadedVmcs::new(); + + // TODO: 改堆分配 + let mut vmx = Self { + vpid: 0, + fail: 0, + vmcs01: vmcs.clone(), + loaded_vmcs: vmcs, + guest_uret_msrs: [VmxUretMsr::default(); KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], + shadow_msr_intercept_read: AllocBitmap::new(16), + shadow_msr_intercept_write: AllocBitmap::new(16), + post_intr_desc: PostedIntrDesc::default(), + ple_window: 0, + ple_window_dirty: false, + msr_autoload: VmxMsrAutoLoad::default(), + pml_pg: unsafe { Box::new_zeroed().assume_init() }, + guest_uret_msrs_loaded: false, + msr_ia32_feature_control: 0, + msr_ia32_feature_control_valid_bits: 0, + rmode: VmxRMode::default(), + spec_ctrl: 0, + msr_ia32_umwait_control: 0, + hv_deadline_tsc: u64::MAX, + segment_cache: VmxSegmentCache::default(), + emulation_required: false, + rflags: RFlags::empty(), + req_immediate_exit: false, + guest_state_loaded: false, + msr_host_kernel_gs_base: 0, + msr_guest_kernel_gs_base: 0, + idt_vectoring_info: IntrInfo::empty(), + exit_reason: VmxExitReason::new(), + exit_intr_info: IntrInfo::empty(), + msr_autostore: VmxMsrs::default(), + exit_qualification: 0, //fztodo + }; + + vmx.vpid = vmx_info().alloc_vpid().unwrap_or_default() as u16; + + for i in 0..x86_kvm_manager().kvm_uret_msrs_list.len() { + vmx.guest_uret_msrs[i].mask = u64::MAX; + } + + if CpuId::new().get_extended_feature_info().unwrap().has_rtm() { + let tsx_ctrl = vmx.find_uret_msr_mut(msr::MSR_IA32_TSX_CTRL); + if let Some((_idx, tsx_ctrl)) = tsx_ctrl { + // Disable TSX enumeration + tsx_ctrl.mask = !(1 << 1); + } + } + + vmx.shadow_msr_intercept_read.set_all(true); + vmx.shadow_msr_intercept_write.set_all(true); + + let arch = &vm.arch; + + vmx.disable_intercept_for_msr(arch, msr::IA32_TIME_STAMP_COUNTER, MsrType::READ); + 
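+        // NOTE (editor's sketch, not part of the original patch): the 4 KiB
+        // MSR bitmap these calls edit is split into four 1 KiB quadrants:
+        // read-low (byte 0x000), read-high (0x400), write-low (0x800),
+        // write-high (0xC00), one bit per MSR. MSRs 0x0-0x1fff use the low
+        // quadrants, 0xc0000000-0xc0001fff the high ones. For a write
+        // intercept of a low MSR the byte/bit position is:
+        //     let byte = 0x800 + (msr as usize) / 8;
+        //     let bit = (msr as usize) % 8;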
vmx.disable_intercept_for_msr(arch, msr::IA32_FS_BASE, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_GS_BASE, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_KERNEL_GSBASE, MsrType::RW); + + vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_CS, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_ESP, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_EIP, MsrType::RW); + + if arch.pause_in_guest { + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C1_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C3_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C6_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C7_RESIDENCY, MsrType::READ); + } + + if vmx_info().enable_flexpriority && vcpu.arch.lapic_in_kernel() { + todo!() + } + + if vmx_info().enable_ept && !vmx_info().enable_unrestricted_guest { + todo!() + } + + if vcpu.arch.lapic_in_kernel() && vmx_info().enable_ipiv { + todo!() + } + + // 初始化vmx私有信息 + vcpu.private = Some(vmx); + } + + pub fn find_uret_msr(&self, msr: u32) -> Option<(usize, &VmxUretMsr)> { + let idx = x86_kvm_manager().find_user_return_msr_idx(msr); + if let Some(index) = idx { + return Some((index, &self.guest_uret_msrs[index])); + } else { + return None; + } + } + + fn set_uret_msr(&mut self, msr: u32, data: u64) { + if let Some((_idx, msr)) = self.find_uret_msr_mut(msr) { + msr.data = data; + } + } + + pub fn find_uret_msr_mut(&mut self, msr: u32) -> Option<(usize, &mut VmxUretMsr)> { + let idx = x86_kvm_manager().find_user_return_msr_idx(msr); + if let Some(index) = idx { + return Some((index, &mut self.guest_uret_msrs[index])); + } else { + return None; + } + } + + fn set_guest_uret_msr(&mut self, slot: usize, data: u64) -> Result<(), SystemError> { + let msr = &mut self.guest_uret_msrs[slot]; + if msr.load_into_hardware { + x86_kvm_manager().kvm_set_user_return_msr(slot, data, msr.mask); + } + + msr.data = data; + + Ok(()) + } + + /// ## 禁用对特定的 MSR 的拦截 + fn disable_intercept_for_msr(&mut self, arch: &KvmArch, msr: u32, mut msr_type: MsrType) { + if !vmx_info().has_msr_bitmap() { + return; + } + + let msr_bitmap = &mut self.vmcs01.lock().msr_bitmap; + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#3974 + // 嵌套vmx处理 + + if Vmx::is_valid_passthrough_msr(msr) { + if let Some(idx) = Vmx::possible_passthrough_msr_slot(msr) { + if msr_type.contains(MsrType::READ) { + self.shadow_msr_intercept_read.set(idx, false); + } + if msr_type.contains(MsrType::WRITE) { + self.shadow_msr_intercept_write.set(idx, false); + } + } + } + + if msr_type.contains(MsrType::READ) + && !arch.msr_allowed(msr, MsrFilterType::KVM_MSR_FILTER_READ) + { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Set, VmxMsrBitmapAccess::Read); + msr_type.remove(MsrType::READ); + } + + if msr_type.contains(MsrType::WRITE) + && !arch.msr_allowed(msr, MsrFilterType::KVM_MSR_FILTER_WRITE) + { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Set, VmxMsrBitmapAccess::Write); + msr_type.remove(MsrType::WRITE); + } + + if msr_type.contains(MsrType::READ) { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Clear, VmxMsrBitmapAccess::Read); + } + + if msr_type.contains(MsrType::WRITE) { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Clear, VmxMsrBitmapAccess::Write); + } + } + + #[inline] + pub fn segment_cache_clear(&mut self) { + self.segment_cache.bitmask = 0; + } + + pub fn clear_atomic_switch_msr(&mut self, msr: u32) { + match msr { + msr::IA32_EFER => { 
+                if vmx_info().has_load_ia32_efer() {
+                    self.clear_atomic_switch_msr_special(
+                        EntryControls::LOAD_IA32_EFER.bits().into(),
+                        ExitControls::LOAD_IA32_EFER.bits().into(),
+                    );
+                    return;
+                }
+            }
+
+            msr::MSR_PERF_GLOBAL_CTRL => {
+                if vmx_info().has_load_perf_global_ctrl() {
+                    self.clear_atomic_switch_msr_special(
+                        EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(),
+                        ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(),
+                    );
+                    return;
+                }
+            }
+            _ => {}
+        }
+
+        let m = &mut self.msr_autoload;
+        let i = m.guest.find_loadstore_msr_slot(msr);
+
+        if let Some(i) = i {
+            m.guest.nr -= 1;
+            m.guest.val[i] = m.guest.val[m.guest.nr];
+            VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, m.guest.nr as u64);
+        }
+
+        let i = m.host.find_loadstore_msr_slot(msr);
+        if let Some(i) = i {
+            m.host.nr -= 1;
+            m.host.val[i] = m.host.val[m.host.nr];
+            VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, m.host.nr as u64);
+        }
+    }
+
+    fn clear_atomic_switch_msr_special(&self, entry: u64, exit: u64) {
+        let mut guard = self.loaded_vmcs.lock();
+        guard.controls_clearbit(ControlsType::VmEntry, entry);
+        guard.controls_clearbit(ControlsType::VmExit, exit);
+    }
+
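+    // NOTE (editor's note, not part of the original patch): MSRs with
+    // dedicated VM-entry/VM-exit load controls (EFER, PERF_GLOBAL_CTRL) are
+    // switched through those control bits and the paired guest/host VMCS
+    // fields; every other MSR goes through the bounded msr_autoload arrays
+    // programmed into VMENTRY/VMEXIT_MSR_LOAD_{ADDR,COUNT}. Typical use of
+    // the fallback path, with names as in this file:
+    //     vmx.add_atomic_switch_msr(msr::IA32_EFER, guest_efer, host_efer, false);
+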
Can't add msr 0x{:x}", msr); + return; + } + + let i = if let Some(i) = i { + i + } else { + m.guest.nr += 1; + VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, m.guest.nr as u64); + m.guest.nr + }; + + m.guest.val[i].index = msr; + m.guest.val[i].data = guest_val; + + if entry_only { + return; + } + + let j = if let Some(j) = j { + j + } else { + m.host.nr += 1; + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, m.host.nr as u64); + m.host.nr + }; + + m.host.val[j].index = msr; + m.host.val[j].data = host_val; + } + + fn add_atomic_switch_msr_special( + &self, + entry: u64, + exit: u64, + guest_val_vmcs: u32, + host_val_vmcs: u32, + guest_val: u64, + host_val: u64, + ) { + VmxAsm::vmx_vmwrite(guest_val_vmcs, guest_val); + if host_val_vmcs != host::IA32_EFER_FULL { + VmxAsm::vmx_vmwrite(host_val_vmcs, host_val); + } + + let mut guard = self.loaded_vmcs.lock(); + guard.controls_setbit(ControlsType::VmEntry, entry); + guard.controls_setbit(ControlsType::VmExit, exit); + } + + pub fn vmx_vcpu_run_flags(&self) -> VmxRunFlag { + let mut flags = VmxRunFlag::empty(); + + if self.loaded_vmcs().launched { + flags.insert(VmxRunFlag::VMRESUME); + } + + // MSR_IA32_SPEC_CTRL + if !self.loaded_vmcs().msr_write_intercepted(0x48) { + flags.insert(VmxRunFlag::SAVE_SPEC_CTRL); + } + + flags + } + pub fn get_exit_qual(&self) -> u64 { + self.exit_qualification + } + pub fn vmread_exit_qual(&mut self) { + self.exit_qualification = VmxAsm::vmx_vmread(ro::EXIT_QUALIFICATION); + } +} + +bitflags! { + pub struct MsrType: u8 { + const READ = 1; + const WRITE = 2; + const RW = 3; + } + + //https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/include/asm/kvm_host.h#249 + pub struct PageFaultErr: u64 { + const PFERR_PRESENT = 1 << 0; + const PFERR_WRITE = 1 << 1; + const PFERR_USER = 1 << 2; + const PFERR_RSVD = 1 << 3; + const PFERR_FETCH = 1 << 4; + const PFERR_PK = 1 << 5; + const PFERR_SGX = 1 << 15; + const PFERR_GUEST_FINAL = 1 << 32; + const PFERR_GUEST_PAGE = 1 << 33; + const PFERR_IMPLICIT_ACCESS = 1 << 48; + } + + pub struct VmxRunFlag: u8 { + const VMRESUME = 1 << 0; + const SAVE_SPEC_CTRL = 1 << 1; + } +} + +#[derive(Debug, PartialEq)] +#[allow(dead_code)] +pub enum VmxL1dFlushState { + Auto, + Never, + Cond, + Always, + EptDisabled, + NotRequired, +} + +#[derive(Debug, PartialEq)] +pub struct VmxSegmentField { + selector: u32, + base: u32, + limit: u32, + ar_bytes: u32, +} +//fix +pub const KVM_VMX_SEGMENT_FIELDS: &[VmxSegmentField] = &[ + // ES + VmxSegmentField { + selector: guest::ES_SELECTOR, + base: guest::ES_BASE, + limit: guest::ES_LIMIT, + ar_bytes: guest::ES_ACCESS_RIGHTS, + }, + // CS + VmxSegmentField { + selector: guest::CS_SELECTOR, + base: guest::CS_BASE, + limit: guest::CS_LIMIT, + ar_bytes: guest::CS_ACCESS_RIGHTS, + }, + // SS + VmxSegmentField { + selector: guest::SS_SELECTOR, + base: guest::SS_BASE, + limit: guest::SS_LIMIT, + ar_bytes: guest::SS_ACCESS_RIGHTS, + }, + // DS + VmxSegmentField { + selector: guest::DS_SELECTOR, + base: guest::DS_BASE, + limit: guest::DS_LIMIT, + ar_bytes: guest::DS_ACCESS_RIGHTS, + }, + // FS + VmxSegmentField { + selector: guest::FS_SELECTOR, + base: guest::FS_BASE, + limit: guest::FS_LIMIT, + ar_bytes: guest::FS_ACCESS_RIGHTS, + }, + // GS + VmxSegmentField { + selector: guest::GS_SELECTOR, + base: guest::GS_BASE, + limit: guest::GS_LIMIT, + ar_bytes: guest::GS_ACCESS_RIGHTS, + }, + // TR + VmxSegmentField { + selector: guest::TR_SELECTOR, + base: guest::TR_BASE, + limit: guest::TR_LIMIT, + ar_bytes: guest::TR_ACCESS_RIGHTS, + }, + // LDTR + 
VmxSegmentField { + selector: guest::LDTR_SELECTOR, + base: guest::LDTR_BASE, + limit: guest::LDTR_LIMIT, + ar_bytes: guest::LDTR_ACCESS_RIGHTS, + }, +]; + +pub static L1TF_VMX_MITIGATION: RwLock = RwLock::new(VmxL1dFlushState::Auto); + +pub fn vmx_init() -> Result<(), SystemError> { + let cpuid = CpuId::new(); + let cpu_feat = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?; + if !cpu_feat.has_vmx() { + return Err(SystemError::ENOSYS); + } + + init_kvm_arch(); + + x86_kvm_manager_mut().vendor_init(&VmxKvmInitFunc)?; + + vmx_info().setup_l1d_flush(); + + kvm_init()?; + Ok(()) +} + +#[no_mangle] +unsafe extern "C" fn vmx_update_host_rsp(vcpu_vmx: &VmxVCpuPriv, host_rsp: usize) { + warn!("vmx_update_host_rsp"); + let mut guard = vcpu_vmx.loaded_vmcs.lock(); + if unlikely(host_rsp != guard.host_state.rsp) { + guard.host_state.rsp = host_rsp; + VmxAsm::vmx_vmwrite(host::RSP, host_rsp as u64); + } +} + +#[no_mangle] +unsafe extern "C" fn vmx_spec_ctrl_restore_host(_vcpu_vmx: &VmxVCpuPriv, _flags: u32) { + // TODO + warn!("vmx_spec_ctrl_restore_host todo!"); +} diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs new file mode 100644 index 00000000..77aa91a8 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs @@ -0,0 +1,160 @@ +use system_error::SystemError; +use x86::{ + msr::{ + IA32_VMX_ENTRY_CTLS, IA32_VMX_EXIT_CTLS, IA32_VMX_PINBASED_CTLS, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_PROCBASED_CTLS2, + }, + vmx::vmcs::control::{ + EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, + }, +}; + +use crate::arch::vm::vmx::Vmx; + +pub struct VmxFeat; +#[allow(dead_code)] +impl VmxFeat { + pub const KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::HLT_EXITING.bits() + | PrimaryControls::CR3_LOAD_EXITING.bits() + | PrimaryControls::CR3_STORE_EXITING.bits() + | PrimaryControls::UNCOND_IO_EXITING.bits() + | PrimaryControls::MOV_DR_EXITING.bits() + | PrimaryControls::USE_TSC_OFFSETTING.bits() + | PrimaryControls::MWAIT_EXITING.bits() + | PrimaryControls::MONITOR_EXITING.bits() + | PrimaryControls::INVLPG_EXITING.bits() + | PrimaryControls::RDPMC_EXITING.bits() + | PrimaryControls::INTERRUPT_WINDOW_EXITING.bits() + | PrimaryControls::CR8_LOAD_EXITING.bits() + | PrimaryControls::CR8_STORE_EXITING.bits(); + + pub const KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::RDTSC_EXITING + .bits() + | PrimaryControls::USE_TPR_SHADOW.bits() + | PrimaryControls::USE_IO_BITMAPS.bits() + | PrimaryControls::MONITOR_TRAP_FLAG.bits() + | PrimaryControls::USE_MSR_BITMAPS.bits() + | PrimaryControls::NMI_WINDOW_EXITING.bits() + | PrimaryControls::PAUSE_EXITING.bits() + | PrimaryControls::SECONDARY_CONTROLS.bits(); + + pub const KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = 0; + + pub const KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = SecondaryControls::VIRTUALIZE_APIC + .bits() + | SecondaryControls::VIRTUALIZE_X2APIC.bits() + | SecondaryControls::WBINVD_EXITING.bits() + | SecondaryControls::ENABLE_VPID.bits() + | SecondaryControls::ENABLE_EPT.bits() + | SecondaryControls::UNRESTRICTED_GUEST.bits() + | SecondaryControls::PAUSE_LOOP_EXITING.bits() + | SecondaryControls::DTABLE_EXITING.bits() + | SecondaryControls::ENABLE_RDTSCP.bits() + | SecondaryControls::ENABLE_INVPCID.bits() + | SecondaryControls::VIRTUALIZE_APIC_REGISTER.bits() + | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY.bits() + | SecondaryControls::VMCS_SHADOWING.bits() + | SecondaryControls::ENABLE_XSAVES_XRSTORS.bits() + | 
+ SecondaryControls::RDSEED_EXITING.bits()
+        | SecondaryControls::RDRAND_EXITING.bits()
+        | SecondaryControls::ENABLE_PML.bits()
+        | SecondaryControls::USE_TSC_SCALING.bits()
+        | SecondaryControls::ENABLE_USER_WAIT_PAUSE.bits()
+        | SecondaryControls::INTEL_PT_GUEST_PHYSICAL.bits()
+        | SecondaryControls::CONCEAL_VMX_FROM_PT.bits()
+        | SecondaryControls::ENABLE_VM_FUNCTIONS.bits()
+        | SecondaryControls::ENCLS_EXITING.bits();
+    // | SecondaryControls::BUS_LOCK_DETECTION.bits()
+    // | SecondaryControls::NOTIFY_VM_EXITING.bits()
+
+    pub const KVM_REQUIRED_VMX_VM_EXIT_CONTROLS: u32 = ExitControls::SAVE_DEBUG_CONTROLS.bits()
+        | ExitControls::ACK_INTERRUPT_ON_EXIT.bits()
+        | ExitControls::HOST_ADDRESS_SPACE_SIZE.bits();
+
+    pub const KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS: u32 =
+        ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits()
+        | ExitControls::SAVE_IA32_PAT.bits()
+        | ExitControls::LOAD_IA32_PAT.bits()
+        | ExitControls::SAVE_IA32_EFER.bits()
+        | ExitControls::SAVE_VMX_PREEMPTION_TIMER.bits()
+        | ExitControls::LOAD_IA32_EFER.bits()
+        | ExitControls::CLEAR_IA32_BNDCFGS.bits()
+        | ExitControls::CONCEAL_VMX_FROM_PT.bits()
+        | ExitControls::CLEAR_IA32_RTIT_CTL.bits();
+
+    pub const KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 =
+        PinbasedControls::EXTERNAL_INTERRUPT_EXITING.bits() | PinbasedControls::NMI_EXITING.bits();
+
+    pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 =
+        PinbasedControls::VIRTUAL_NMIS.bits() | PinbasedControls::POSTED_INTERRUPTS.bits();
+
+    pub const KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS: u32 =
+        EntryControls::LOAD_DEBUG_CONTROLS.bits() | EntryControls::IA32E_MODE_GUEST.bits();
+
+    pub const KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS: u32 =
+        EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits()
+        | EntryControls::LOAD_IA32_PAT.bits()
+        | EntryControls::LOAD_IA32_EFER.bits()
+        | EntryControls::LOAD_IA32_BNDCFGS.bits()
+        | EntryControls::CONCEAL_VMX_FROM_PT.bits()
+        | EntryControls::LOAD_IA32_RTIT_CTL.bits();
+
+    /* VMX_BASIC bits and bitmasks */
+    pub const VMX_BASIC_VMCS_SIZE_SHIFT: u64 = 32;
+    pub const VMX_BASIC_TRUE_CTLS: u64 = 1 << 55;
+    pub const VMX_BASIC_64: u64 = 0x0001000000000000;
+    pub const VMX_BASIC_MEM_TYPE_SHIFT: u64 = 50;
+    pub const VMX_BASIC_MEM_TYPE_MASK: u64 = 0x003c000000000000;
+    pub const VMX_BASIC_MEM_TYPE_WB: u64 = 6;
+    pub const VMX_BASIC_INOUT: u64 = 0x0040000000000000;
+
+    pub fn adjust_primary_controls() -> Result<PrimaryControls, SystemError> {
+        Ok(unsafe {
+            PrimaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+                IA32_VMX_PROCBASED_CTLS,
+            )?)
+        })
+    }
+
+    pub fn adjust_secondary_controls() -> Result<SecondaryControls, SystemError> {
+        Ok(unsafe {
+            SecondaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
+                IA32_VMX_PROCBASED_CTLS2,
+            )?)
+        })
+    }
+
+    pub fn adjust_exit_controls() -> Result<ExitControls, SystemError> {
+        Ok(unsafe {
+            ExitControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+                Self::KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+                IA32_VMX_EXIT_CTLS,
+            )?)
+        })
+    }
+
+    pub fn adjust_entry_controls() -> Result<EntryControls, SystemError> {
+        Ok(unsafe {
+            EntryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+                Self::KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+                IA32_VMX_ENTRY_CTLS,
+            )?)
+        })
+    }
+
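+    // NOTE (editor's sketch, not part of the original patch):
+    // Vmx::adjust_vmx_controls() is assumed to consult the IA32_VMX_*_CTLS
+    // capability MSRs, whose low 32 bits are the allowed-0 settings (bits
+    // that must be 1) and whose high 32 bits are the allowed-1 settings
+    // (bits that may be 1). The reconciliation then looks like:
+    //     let cap = unsafe { rdmsr(ctl_msr) };   // ctl_msr: one of the MSRs above
+    //     let allowed0 = cap as u32;
+    //     let allowed1 = (cap >> 32) as u32;
+    //     let ctl = ((required | optional) & allowed1) | allowed0;
+    //     // fail if `required & !ctl != 0`
+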
+    pub fn adjust_pin_based_controls() -> Result<PinbasedControls, SystemError> {
+        Ok(unsafe {
+            PinbasedControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+                IA32_VMX_PINBASED_CTLS,
+            )?)
+        })
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs
new file mode 100644
index 00000000..de53a2f2
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs
@@ -0,0 +1,451 @@
+use core::intrinsics::unlikely;
+
+use alloc::{boxed::Box, collections::LinkedList, sync::Arc};
+use bitmap::{traits::BitMapOps, AllocBitmap};
+use x86::{
+    controlregs::Cr4,
+    vmx::vmcs::{
+        control::{self, PrimaryControls},
+        host,
+    },
+};
+use x86_64::{registers::control::Cr3Flags, structures::paging::PhysFrame};
+
+use crate::{
+    arch::{
+        vm::asm::{IntrInfo, IntrType, VmxAsm},
+        MMArch,
+    },
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    mm::{percpu::PerCpuVar, MemoryManagementArch, PhysAddr, VirtAddr},
+    smp::cpu::ProcessorId,
+};
+
+use super::vmx_info;
+
+pub mod feat;
+
+pub static mut PERCPU_VMCS: Option<PerCpuVar<Option<Arc<LockedVMControlStructure>>>> = None;
+pub static mut PERCPU_LOADED_VMCS_LIST: Option<PerCpuVar<LinkedList<Arc<LockedLoadedVmcs>>>> = None;
+pub static mut VMXAREA: Option<PerCpuVar<Box<VMControlStructure>>> = None;
+
+pub fn current_vmcs() -> &'static Option<Arc<LockedVMControlStructure>> {
+    unsafe { PERCPU_VMCS.as_ref().unwrap().get() }
+}
+
+pub fn current_vmcs_mut() -> &'static mut Option<Arc<LockedVMControlStructure>> {
+    unsafe { PERCPU_VMCS.as_ref().unwrap().get_mut() }
+}
+
+pub fn current_loaded_vmcs_list_mut() -> &'static mut LinkedList<Arc<LockedLoadedVmcs>> {
+    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get_mut() }
+}
+
+#[allow(dead_code)]
+pub fn current_loaded_vmcs_list() -> &'static LinkedList<Arc<LockedLoadedVmcs>> {
+    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get() }
+}
+
+pub fn vmx_area() -> &'static PerCpuVar<Box<VMControlStructure>> {
+    unsafe { VMXAREA.as_ref().unwrap() }
+}
+
+#[repr(C, align(4096))]
+#[derive(Debug, Clone)]
+pub struct VMControlStructure {
+    pub header: u32,
+    pub abort: u32,
+    pub data: [u8; MMArch::PAGE_SIZE
+        - core::mem::size_of::<u32>()
+        - core::mem::size_of::<u32>()],
+}
+
+impl VMControlStructure {
+    pub fn new() -> Box<Self> {
+        let mut vmcs: Box<VMControlStructure> = unsafe {
+            Box::try_new_zeroed()
+                .expect("alloc vmcs failed")
+                .assume_init()
+        };
+
+        vmcs.set_revision_id(vmx_info().vmcs_config.revision_id);
+        vmcs
+    }
+
+    pub fn revision_id(&self) -> u32 {
+        self.header & 0x7FFF_FFFF
+    }
+
+    #[allow(dead_code)]
+    pub fn is_shadow_vmcs(&self) -> bool {
+        // Bit 31 of the header marks a shadow VMCS.
+        (self.header & 0x8000_0000) != 0
+    }
+
+    pub fn set_shadow_vmcs(&mut self, shadow: bool) {
+        self.header |= (shadow as u32) << 31;
+    }
+
+    pub fn set_revision_id(&mut self, id: u32) {
+        self.header = (self.header & 0x8000_0000) | (id & 0x7FFF_FFFF);
+    }
+}
+
+#[derive(Debug)]
+pub struct LockedVMControlStructure {
+    /// Physical address of the wrapped VMCS
+    phys_addr: PhysAddr,
+    inner: SpinLock<Box<VMControlStructure>>,
+}
+
+impl LockedVMControlStructure {
+    #[inline(never)]
+    pub fn new(shadow: bool) -> Arc<Self> {
+        let mut vmcs = VMControlStructure::new();
+
+        let phys_addr = unsafe {
+            MMArch::virt_2_phys(VirtAddr::new(vmcs.as_ref() as *const _ as usize)).unwrap()
+        };
+
+        vmcs.set_shadow_vmcs(shadow);
+
+        Arc::new(Self {
+            phys_addr,
+            inner: SpinLock::new(vmcs),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<Box<VMControlStructure>> {
+        self.inner.lock()
+    }
+
+    pub fn phys_addr(&self) -> PhysAddr {
+        self.phys_addr
+    }
+}
+
+#[derive(Debug)]
+pub struct VmcsHostState {
+    pub cr3: (PhysFrame, Cr3Flags),
+    pub cr4: Cr4,
+    pub gs_base: usize,
+    pub fs_base: usize,
+    pub rsp: usize,
+    pub fs_sel: u16,
+    pub gs_sel: u16,
+    pub ldt_sel: u16,
+    pub ds_sel: u16,
+    pub es_sel: u16,
+}
+
+impl 
VmcsHostState { + pub fn set_host_fsgs(&mut self, fs_sel: u16, gs_sel: u16, fs_base: usize, gs_base: usize) { + if unlikely(self.fs_sel != fs_sel) { + if (fs_sel & 7) == 0 { + VmxAsm::vmx_vmwrite(host::FS_SELECTOR, fs_sel as u64); + } else { + VmxAsm::vmx_vmwrite(host::FS_SELECTOR, 0); + } + + self.fs_sel = fs_sel; + } + + if unlikely(self.gs_sel != gs_sel) { + if (gs_sel & 7) == 0 { + VmxAsm::vmx_vmwrite(host::GS_SELECTOR, gs_sel as u64); + } else { + VmxAsm::vmx_vmwrite(host::GS_SELECTOR, 0); + } + + self.gs_sel = gs_sel; + } + + if unlikely(fs_base != self.fs_base) { + VmxAsm::vmx_vmwrite(host::FS_BASE, fs_base as u64); + self.fs_base = fs_base; + } + + if unlikely(self.gs_base != gs_base) { + VmxAsm::vmx_vmwrite(host::GS_BASE, gs_base as u64); + self.gs_base = gs_base; + } + } +} + +impl Default for VmcsHostState { + fn default() -> Self { + Self { + cr3: ( + PhysFrame::containing_address(x86_64::PhysAddr::new(0)), + Cr3Flags::empty(), + ), + cr4: Cr4::empty(), + gs_base: 0, + fs_base: 0, + rsp: 0, + fs_sel: 0, + gs_sel: 0, + ldt_sel: 0, + ds_sel: 0, + es_sel: 0, + } + } +} + +#[derive(Debug, Default)] +pub struct VmcsControlsShadow { + vm_entry: u32, + vm_exit: u32, + pin: u32, + exec: u32, + secondary_exec: u32, + tertiary_exec: u64, +} + +#[derive(Debug)] +#[allow(dead_code)] +pub struct LoadedVmcs { + pub vmcs: Arc, + pub shadow_vmcs: Option>, + pub cpu: ProcessorId, + /// 是否已经执行了 VMLAUNCH 指令 + pub launched: bool, + /// NMI 是否已知未被屏蔽 + nmi_known_unmasked: bool, + /// Hypervisor 定时器是否被软禁用 + hv_timer_soft_disabled: bool, + /// 支持 vnmi-less CPU 的字段,指示 VNMI 是否被软阻止 + pub soft_vnmi_blocked: bool, + /// 记录 VM 进入时间 + entry_time: u64, + /// 记录 VNMI 被阻止的时间 + vnmi_blocked_time: u64, + /// msr位图 + pub msr_bitmap: VmxMsrBitmap, + /// 保存 VMCS 主机状态的结构体 + pub host_state: VmcsHostState, + /// 保存 VMCS 控制字段的shadow状态的结构体。 + controls_shadow: VmcsControlsShadow, +} + +impl LoadedVmcs { + pub fn controls_set(&mut self, ctl_type: ControlsType, value: u64) { + match ctl_type { + ControlsType::VmEntry => { + if self.controls_shadow.vm_entry != value as u32 { + VmxAsm::vmx_vmwrite(control::VMENTRY_CONTROLS, value); + self.controls_shadow.vm_entry = value as u32; + } + } + ControlsType::VmExit => { + if self.controls_shadow.vm_exit != value as u32 { + VmxAsm::vmx_vmwrite(control::VMEXIT_CONTROLS, value); + self.controls_shadow.vm_exit = value as u32; + } + } + ControlsType::Pin => { + if self.controls_shadow.pin != value as u32 { + VmxAsm::vmx_vmwrite(control::PINBASED_EXEC_CONTROLS, value); + self.controls_shadow.pin = value as u32; + } + } + ControlsType::Exec => { + if self.controls_shadow.exec != value as u32 { + VmxAsm::vmx_vmwrite(control::PRIMARY_PROCBASED_EXEC_CONTROLS, value); + self.controls_shadow.exec = value as u32; + } + } + ControlsType::SecondaryExec => { + if self.controls_shadow.secondary_exec != value as u32 { + VmxAsm::vmx_vmwrite(control::SECONDARY_PROCBASED_EXEC_CONTROLS, value); + self.controls_shadow.secondary_exec = value as u32; + } + } + ControlsType::TertiaryExec => { + if self.controls_shadow.tertiary_exec != value { + VmxAsm::vmx_vmwrite(0x2034, value); + self.controls_shadow.tertiary_exec = value; + } + } + } + } + + pub fn controls_get(&self, ctl_type: ControlsType) -> u64 { + match ctl_type { + ControlsType::VmEntry => self.controls_shadow.vm_entry as u64, + ControlsType::VmExit => self.controls_shadow.vm_exit as u64, + ControlsType::Pin => self.controls_shadow.pin as u64, + ControlsType::Exec => self.controls_shadow.exec as u64, + ControlsType::SecondaryExec => 
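+            // NOTE (editor's note, not part of the original patch): reads are
+            // served from the shadow so hot paths never pay for a VMREAD;
+            // controls_set() above only issues the VMWRITE when the cached
+            // value actually changes.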
self.controls_shadow.secondary_exec as u64,
+            ControlsType::TertiaryExec => self.controls_shadow.tertiary_exec,
+        }
+    }
+
+    pub fn controls_setbit(&mut self, ctl_type: ControlsType, value: u64) {
+        let val = self.controls_get(ctl_type) | value;
+        self.controls_set(ctl_type, val)
+    }
+
+    pub fn controls_clearbit(&mut self, ctl_type: ControlsType, value: u64) {
+        let val = self.controls_get(ctl_type) & (!value);
+        self.controls_set(ctl_type, val)
+    }
+
+    pub fn msr_write_intercepted(&mut self, msr: u32) -> bool {
+        // Without an active MSR bitmap every MSR write causes a VM exit,
+        // so only consult the bitmap when USE_MSR_BITMAPS is enabled.
+        if unsafe {
+            !PrimaryControls::from_bits_unchecked(self.controls_get(ControlsType::Exec) as u32)
+                .contains(PrimaryControls::USE_MSR_BITMAPS)
+        } {
+            return true;
+        }
+
+        return self
+            .msr_bitmap
+            .ctl(msr, VmxMsrBitmapAction::Test, VmxMsrBitmapAccess::Write);
+    }
+}
+
+#[derive(Debug)]
+pub struct LockedLoadedVmcs {
+    inner: SpinLock<LoadedVmcs>,
+}
+
+#[derive(Debug, Clone, Copy)]
+#[allow(dead_code)]
+pub enum ControlsType {
+    VmEntry,
+    VmExit,
+    Pin,
+    Exec,
+    SecondaryExec,
+    TertiaryExec,
+}
+
+impl LockedLoadedVmcs {
+    pub fn new() -> Arc<Self> {
+        let bitmap = if vmx_info().has_msr_bitmap() {
+            VmxMsrBitmap::new(true, MMArch::PAGE_SIZE * u8::BITS as usize)
+        } else {
+            VmxMsrBitmap::new(true, 0)
+        };
+        let vmcs = LockedVMControlStructure::new(false);
+
+        VmxAsm::vmclear(vmcs.phys_addr);
+
+        Arc::new(Self {
+            inner: SpinLock::new(LoadedVmcs {
+                vmcs,
+                shadow_vmcs: None,
+                cpu: ProcessorId::INVALID,
+                launched: false,
+                hv_timer_soft_disabled: false,
+                msr_bitmap: bitmap,
+                host_state: VmcsHostState::default(),
+                controls_shadow: VmcsControlsShadow::default(),
+                nmi_known_unmasked: false,
+                soft_vnmi_blocked: false,
+                entry_time: 0,
+                vnmi_blocked_time: 0,
+            }),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<LoadedVmcs> {
+        self.inner.lock()
+    }
+}
+
+#[derive(Debug)]
+pub struct VmxMsrBitmap {
+    data: AllocBitmap,
+    phys_addr: usize,
+}
+
+pub enum VmxMsrBitmapAction {
+    Test,
+    Set,
+    Clear,
+}
+
+pub enum VmxMsrBitmapAccess {
+    Write,
+    Read,
+}
+
+impl VmxMsrBitmapAccess {
+    pub const fn base(&self) -> usize {
+        // Bit index where each half of the 4 KiB MSR bitmap starts: the
+        // write half begins at byte 0x800, i.e. bit 0x4000 (the factor is
+        // bits per byte, which equals size_of::<usize>() on x86_64).
+        match self {
+            VmxMsrBitmapAccess::Write => 0x800 * core::mem::size_of::<usize>(),
+            VmxMsrBitmapAccess::Read => 0,
+        }
+    }
+}
+
+impl VmxMsrBitmap {
+    pub fn new(init_val: bool, size: usize) -> Self {
+        let mut data = AllocBitmap::new(size);
+        data.set_all(init_val);
+
+        let addr = data.data() as *const [usize] as *const usize as usize;
+        Self {
+            data,
+            phys_addr: unsafe { MMArch::virt_2_phys(VirtAddr::new(addr)).unwrap().data() },
+        }
+    }
+
+    pub fn phys_addr(&self) -> usize {
+        self.phys_addr
+    }
+
+    pub fn ctl(
+        &mut self,
+        msr: u32,
+        action: VmxMsrBitmapAction,
+        access: VmxMsrBitmapAccess,
+    ) -> bool {
+        if msr <= 0x1fff {
+            return self.bit_op(msr as usize, access.base(), action);
+        } else if (0xc0000000..=0xc0001fff).contains(&msr) {
+            // This offset handling is suspect and needs a follow-up check:
+            // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h#450
+            return self.bit_op(msr as usize & 0x1fff, access.base() + 0x400, action);
+        } else {
+            return true;
+        }
+    }
+
+    fn bit_op(&mut self, msr: usize, base: usize, action: VmxMsrBitmapAction) -> bool {
+        match action {
+            VmxMsrBitmapAction::Test => {
+                let ret = self.data.get(msr + base);
+                ret.unwrap_or(false)
+            }
+            VmxMsrBitmapAction::Set => {
+                self.data.set(msr + base, true);
+                true
+            }
+            VmxMsrBitmapAction::Clear => {
+                self.data.set(msr + base, false);
+                true
+            }
+        }
+    }
+}
+
+/// Helpers for decoding VM-exit interrupt-information fields
+pub struct VmcsIntrHelper;
+
+impl VmcsIntrHelper {
+    pub fn is_nmi(intr_info: &IntrInfo) -> bool {
+        return Self::is_intr_type(intr_info, 
IntrType::INTR_TYPE_NMI_INTR); + } + + pub fn is_intr_type(intr_info: &IntrInfo, intr_type: IntrType) -> bool { + return (*intr_info + & (IntrInfo::INTR_INFO_VALID_MASK | IntrInfo::INTR_INFO_INTR_TYPE_MASK)) + .bits() + == IntrInfo::INTR_INFO_VALID_MASK.bits() | intr_type.bits(); + } + + pub fn is_external_intr(intr_info: &IntrInfo) -> bool { + return Self::is_intr_type(intr_info, IntrType::INTR_TYPE_EXT_INTR); + } +} diff --git a/kernel/src/arch/x86_64/vm/vmx/vmenter.S b/kernel/src/arch/x86_64/vm/vmx/vmenter.S new file mode 100644 index 00000000..10f3fca2 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/vmenter.S @@ -0,0 +1,179 @@ +#include "common/asm.h" + +#define __VCPU_REGS_RAX 0 +#define __VCPU_REGS_RCX 1 +#define __VCPU_REGS_RDX 2 +#define __VCPU_REGS_RBX 3 +#define __VCPU_REGS_RSP 4 +#define __VCPU_REGS_RBP 5 +#define __VCPU_REGS_RSI 6 +#define __VCPU_REGS_RDI 7 + +#define __VCPU_REGS_R8 8 +#define __VCPU_REGS_R9 9 +#define __VCPU_REGS_R10 10 +#define __VCPU_REGS_R11 11 +#define __VCPU_REGS_R12 12 +#define __VCPU_REGS_R13 13 +#define __VCPU_REGS_R14 14 +#define __VCPU_REGS_R15 15 + +#define VCPU_RAX __VCPU_REGS_RAX * 8 +#define VCPU_RCX __VCPU_REGS_RCX * 8 +#define VCPU_RDX __VCPU_REGS_RDX * 8 +#define VCPU_RBX __VCPU_REGS_RBX * 8 +#define VCPU_RBP __VCPU_REGS_RBP * 8 +#define VCPU_RSI __VCPU_REGS_RSI * 8 +#define VCPU_RDI __VCPU_REGS_RDI * 8 + +#define VCPU_R8 __VCPU_REGS_R8 * 8 +#define VCPU_R9 __VCPU_REGS_R9 * 8 +#define VCPU_R10 __VCPU_REGS_R10 * 8 +#define VCPU_R11 __VCPU_REGS_R11 * 8 +#define VCPU_R12 __VCPU_REGS_R12 * 8 +#define VCPU_R13 __VCPU_REGS_R13 * 8 +#define VCPU_R14 __VCPU_REGS_R14 * 8 +#define VCPU_R15 __VCPU_REGS_R15 * 8 + +#define VMX_RUN_VMRESUME_SHIFT 0 +#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 + +#define VMX_RUN_VMRESUME 1 << VMX_RUN_VMRESUME_SHIFT +#define VMX_RUN_SAVE_SPEC_CTRL 1 << VMX_RUN_SAVE_SPEC_CTRL_SHIFT + +// 将VCPU运行在guest模式 +ENTRY(__vmx_vcpu_run) + pushq %rbp + movq %rsp, %rbp + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + + push %rbx + + // 参数一 + push %rdi + // 参数三 + push %rdx + // 参数二 + push %rsi + + mov %edx, %ebx + + lea (%rsp), %rsi + + call vmx_update_host_rsp + + // TODO: spec_ctrl + +.Lspec_ctrl_done: + mov %rsp, %rax + + bt $VMX_RUN_VMRESUME_SHIFT, %ebx + + mov VCPU_RCX(%rax), %rcx + mov VCPU_RDX(%rax), %rdx + mov VCPU_RBX(%rax), %rbx + mov VCPU_RBP(%rax), %rbp + mov VCPU_RSI(%rax), %rsi + mov VCPU_RDI(%rax), %rdi + + mov VCPU_R8(%rax), %R8 + mov VCPU_R9(%rax), %r9 + mov VCPU_R10(%rax), %r10 + mov VCPU_R11(%rax), %r11 + mov VCPU_R12(%rax), %r12 + mov VCPU_R13(%rax), %r13 + mov VCPU_R14(%rax), %r14 + mov VCPU_R15(%rax), %r15 + + mov VCPU_RAX(%rax), %rax + + // TODO: clear cpu buffer + + jnc .Lvmlaunch + +.Lvmresume: + vmresume + jmp .Lvmfail + +.Lvmlaunch: + call vmx_vmlaunch + jmp .Lvmfail + +// 从guest模式退出 +ENTRY(vmx_vmexit) + // TODO: unwind hint restore + // 临时保存guest RAX + push %rax + + // 拿到regs头指针,存入rax + mov 8(%rsp), %rax + + // 保存所有guest寄存器 + pop VCPU_RAX(%rax) + mov %rcx, VCPU_RCX(%rax) + mov %rdx, VCPU_RDX(%rax) + mov %rbx, VCPU_RBX(%rax) + mov %rbp, VCPU_RBP(%rax) + mov %rsi, VCPU_RSI(%rax) + mov %rdi, VCPU_RDI(%rax) + + mov %r8, VCPU_R8(%rax) + mov %r9, VCPU_R9(%rax) + mov %r10, VCPU_R10(%rax) + mov %r11, VCPU_R11(%rax) + mov %r12, VCPU_R12(%rax) + mov %r13, VCPU_R13(%rax) + mov %r14, VCPU_R14(%rax) + mov %r15, VCPU_R15(%rax) + + xor %ebx, %ebx + +.Lclear_regs: + pop %rax + + xor %eax, %eax + xor %ecx, %ecx + xor %edx, %edx + xor %ebp, %ebp + xor %esi, %esi + xor %edi, %edi + + xor %r8d, %r8d + xor %r9d, %r9d + xor %r10d, 
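The `VCPU_*` offsets above only work if the Rust side hands the assembly a register area with exactly this layout. A sketch of what such a structure has to look like (the actual struct name and location in this PR are not shown here, so treat this as an assumption):

```rust
/// Guest GPR save area indexed by the __VCPU_REGS_* constants in
/// vmenter.S: slot i lives at byte offset i * 8, so the asm's
/// `mov VCPU_R12(%rax), %r12` reads the `r12` field below.
#[repr(C)]
#[derive(Debug, Default, Clone, Copy)]
pub struct VcpuGuestRegs {
    pub rax: u64, pub rcx: u64, pub rdx: u64, pub rbx: u64,
    pub rsp: u64, pub rbp: u64, pub rsi: u64, pub rdi: u64,
    pub r8: u64,  pub r9: u64,  pub r10: u64, pub r11: u64,
    pub r12: u64, pub r13: u64, pub r14: u64, pub r15: u64,
}

// 16 slots of 8 bytes; a mismatch here silently corrupts guest state.
const _: () = assert!(core::mem::size_of::<VcpuGuestRegs>() == 16 * 8);
```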
%r10d + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + + // todo: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmenter.S#270 + + pop %rsi + pop %rdi + + call vmx_spec_ctrl_restore_host + + mov %rbx, %rax + + pop %rbx + + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + pop %rbp + ret + +.Lvmfail: + // 失败,设置返回值为1 + mov $1, %rbx + jmp .Lclear_regs + + + diff --git a/kernel/src/driver/tty/tty_port.rs b/kernel/src/driver/tty/tty_port.rs index d0488132..bab9ea7f 100644 --- a/kernel/src/driver/tty/tty_port.rs +++ b/kernel/src/driver/tty/tty_port.rs @@ -84,7 +84,8 @@ pub trait TtyPort: Sync + Send + Debug { let ld = tty.ldisc(); let ret = ld.receive_buf2(tty.clone(), buf, None, count); - if ret.is_err() && ret.clone().unwrap_err() == SystemError::ENOSYS { + + if let Err(SystemError::ENOSYS) = ret { return ld.receive_buf(tty, buf, None, count); } diff --git a/kernel/src/filesystem/mod.rs b/kernel/src/filesystem/mod.rs index 59bda2a1..8bd94989 100644 --- a/kernel/src/filesystem/mod.rs +++ b/kernel/src/filesystem/mod.rs @@ -6,6 +6,7 @@ pub mod kernfs; pub mod mbr; pub mod overlayfs; pub mod page_cache; +pub mod poll; pub mod procfs; pub mod ramfs; pub mod sysfs; diff --git a/kernel/src/filesystem/poll.rs b/kernel/src/filesystem/poll.rs new file mode 100644 index 00000000..e5a96d9a --- /dev/null +++ b/kernel/src/filesystem/poll.rs @@ -0,0 +1,209 @@ +use core::ffi::c_int; + +use crate::{ + ipc::signal::{RestartBlock, RestartBlockData, RestartFn}, + mm::VirtAddr, + net::event_poll::{EPollCtlOption, EPollEvent, EPollEventType, EventPoll}, + process::ProcessManager, + syscall::{user_access::UserBufferWriter, Syscall}, + time::{Duration, Instant}, +}; + +use super::vfs::file::{File, FileMode}; +use alloc::sync::Arc; +use system_error::SystemError; + +#[repr(C)] +#[derive(Debug)] +pub struct PollFd { + pub fd: c_int, + pub events: u16, + pub revents: u16, +} + +struct PollAdapter<'a> { + ep_file: Arc, + poll_fds: &'a mut [PollFd], +} + +impl<'a> PollAdapter<'a> { + pub fn new(ep_file: Arc, poll_fds: &'a mut [PollFd]) -> Self { + Self { ep_file, poll_fds } + } + + fn add_pollfds(&self) -> Result<(), SystemError> { + for pollfd in self.poll_fds.iter() { + let mut epoll_event = EPollEvent::default(); + let poll_flags = PollFlags::from_bits_truncate(pollfd.events); + let ep_events: EPollEventType = poll_flags.into(); + epoll_event.set_events(ep_events.bits()); + + EventPoll::epoll_ctl_with_epfile( + self.ep_file.clone(), + EPollCtlOption::Add, + pollfd.fd, + epoll_event, + false, + ) + .map(|_| ())?; + } + + Ok(()) + } + + fn poll_all_fds(&mut self, timeout: Option) -> Result { + let mut epoll_events = vec![EPollEvent::default(); self.poll_fds.len()]; + let len = epoll_events.len() as i32; + let remain_timeout = timeout + .and_then(|t| t.duration_since(Instant::now())) + .map(|t| t.into()); + let events = EventPoll::epoll_wait_with_file( + self.ep_file.clone(), + &mut epoll_events, + len, + remain_timeout, + )?; + + for (i, event) in epoll_events.iter().enumerate() { + self.poll_fds[i].revents = (event.events() & 0xffff) as u16; + } + + Ok(events) + } +} + +impl Syscall { + /// https://code.dragonos.org.cn/xref/linux-6.6.21/fs/select.c#1068 + pub fn poll(pollfd_ptr: usize, nfds: u32, timeout_ms: i32) -> Result { + let pollfd_ptr = VirtAddr::new(pollfd_ptr); + let len = nfds as usize * core::mem::size_of::(); + + let mut timeout: Option = None; + if timeout_ms >= 0 { + timeout = poll_select_set_timeout(timeout_ms); + } + let mut poll_fds_writer = 
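`poll_all_fds` can hand epoll's ready mask straight back as `revents` because the classic `POLL*` bits and the low 16 bits of the `EPOLL*` constants are defined to be identical (compare the `PollFlags` table further down with `EPollEventType`). A quick self-contained check of that assumption:

```rust
// POLL* and EPOLL* share their low 16 bits, so the truncation in
// `(event.events() & 0xffff) as u16` is lossless for poll-reportable events.
const POLLIN: u16 = 0x0001;
const POLLOUT: u16 = 0x0004;
const EPOLLIN: u32 = 0x0000_0001;
const EPOLLOUT: u32 = 0x0000_0004;

fn revents_from_epoll(events: u32) -> u16 {
    (events & 0xffff) as u16
}

fn main() {
    assert_eq!(revents_from_epoll(EPOLLIN | EPOLLOUT), POLLIN | POLLOUT);
}
```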
UserBufferWriter::new(pollfd_ptr.as_ptr::(), len, true)?; + let mut r = do_sys_poll(poll_fds_writer.buffer(0)?, timeout); + if let Err(SystemError::ERESTARTNOHAND) = r { + let restart_block_data = RestartBlockData::new_poll(pollfd_ptr, nfds, timeout); + let restart_block = RestartBlock::new(&RestartFnPoll, restart_block_data); + r = ProcessManager::current_pcb().set_restart_fn(Some(restart_block)); + } + + return r; + } +} + +/// 计算超时的时刻 +fn poll_select_set_timeout(timeout_ms: i32) -> Option { + if timeout_ms == 0 { + return None; + } + + Some(Instant::now() + Duration::from_millis(timeout_ms as u64)) +} + +fn do_sys_poll(poll_fds: &mut [PollFd], timeout: Option) -> Result { + let ep_file = EventPoll::create_epoll_file(FileMode::empty())?; + + let ep_file = Arc::new(ep_file); + + let mut adapter = PollAdapter::new(ep_file, poll_fds); + adapter.add_pollfds()?; + let nevents = adapter.poll_all_fds(timeout)?; + + Ok(nevents) +} + +bitflags! { + pub struct PollFlags: u16 { + const POLLIN = 0x0001; + const POLLPRI = 0x0002; + const POLLOUT = 0x0004; + const POLLERR = 0x0008; + const POLLHUP = 0x0010; + const POLLNVAL = 0x0020; + const POLLRDNORM = 0x0040; + const POLLRDBAND = 0x0080; + const POLLWRNORM = 0x0100; + const POLLWRBAND = 0x0200; + const POLLMSG = 0x0400; + const POLLREMOVE = 0x1000; + const POLLRDHUP = 0x2000; + const POLLFREE = 0x4000; + const POLL_BUSY_LOOP = 0x8000; + } +} + +impl From for EPollEventType { + fn from(val: PollFlags) -> Self { + let mut epoll_flags = EPollEventType::empty(); + + if val.contains(PollFlags::POLLIN) { + epoll_flags |= EPollEventType::EPOLLIN; + } + if val.contains(PollFlags::POLLPRI) { + epoll_flags |= EPollEventType::EPOLLPRI; + } + if val.contains(PollFlags::POLLOUT) { + epoll_flags |= EPollEventType::EPOLLOUT; + } + if val.contains(PollFlags::POLLERR) { + epoll_flags |= EPollEventType::EPOLLERR; + } + if val.contains(PollFlags::POLLHUP) { + epoll_flags |= EPollEventType::EPOLLHUP; + } + if val.contains(PollFlags::POLLNVAL) { + epoll_flags |= EPollEventType::EPOLLNVAL; + } + if val.contains(PollFlags::POLLRDNORM) { + epoll_flags |= EPollEventType::EPOLLRDNORM; + } + if val.contains(PollFlags::POLLRDBAND) { + epoll_flags |= EPollEventType::EPOLLRDBAND; + } + if val.contains(PollFlags::POLLWRNORM) { + epoll_flags |= EPollEventType::EPOLLWRNORM; + } + if val.contains(PollFlags::POLLWRBAND) { + epoll_flags |= EPollEventType::EPOLLWRBAND; + } + if val.contains(PollFlags::POLLMSG) { + epoll_flags |= EPollEventType::EPOLLMSG; + } + if val.contains(PollFlags::POLLRDHUP) { + epoll_flags |= EPollEventType::EPOLLRDHUP; + } + if val.contains(PollFlags::POLLFREE) { + epoll_flags |= EPollEventType::POLLFREE; + } + + epoll_flags + } +} + +/// sys_poll的restart fn +#[derive(Debug)] +struct RestartFnPoll; + +impl RestartFn for RestartFnPoll { + // 参考 https://code.dragonos.org.cn/xref/linux-6.6.21/fs/select.c#1047 + fn call(&self, data: &mut RestartBlockData) -> Result { + if let RestartBlockData::Poll(d) = data { + let len = d.nfds as usize * core::mem::size_of::(); + + let mut poll_fds_writer = + UserBufferWriter::new(d.pollfd_ptr.as_ptr::(), len, true)?; + let mut r = do_sys_poll(poll_fds_writer.buffer(0)?, d.timeout_instant); + if let Err(SystemError::ERESTARTNOHAND) = r { + let restart_block = RestartBlock::new(&RestartFnPoll, data.clone()); + r = ProcessManager::current_pcb().set_restart_fn(Some(restart_block)); + } + + return r; + } else { + panic!("RestartFnPoll called with wrong data type: {:?}", data); + } + } +} diff --git 
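Why `PollRestartBlockData` stores an absolute `timeout_instant` rather than the caller's millisecond count: if `poll(..., 300)` is interrupted by a signal after 200 ms, the restarted call must wait only the remaining 100 ms, which `poll_all_fds` recovers via `Instant::duration_since`. The arithmetic in miniature (plain integers standing in for `Instant`/`Duration`):

```rust
/// Remaining wait given an absolute deadline and the current time, both
/// in milliseconds; `None` means the deadline has already passed.
fn remaining_ms(deadline: u64, now: u64) -> Option<u64> {
    deadline.checked_sub(now)
}

fn main() {
    let deadline = 1_000 + 300; // poll() entered at t = 1000 ms with a 300 ms timeout
    assert_eq!(remaining_ms(deadline, 1_200), Some(100)); // restarted at t = 1200
    assert_eq!(remaining_ms(deadline, 1_400), None);      // expired during signal handling
}
```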
a/kernel/src/filesystem/vfs/file.rs b/kernel/src/filesystem/vfs/file.rs index 0833ab00..0f3a301a 100644 --- a/kernel/src/filesystem/vfs/file.rs +++ b/kernel/src/filesystem/vfs/file.rs @@ -506,7 +506,7 @@ impl File { } FileType::Pipe => { let inode = self.inode.downcast_ref::().unwrap(); - return inode.inner().lock().add_epoll(epitem); + return inode.add_epoll(epitem); } _ => { let r = self.inode.kernel_ioctl(epitem, &self.private_data.lock()); @@ -530,13 +530,14 @@ impl File { .remove(epoll), FileType::Pipe => { let inode = self.inode.downcast_ref::().unwrap(); - inode.inner().lock().remove_epoll(epoll) + inode.remove_epoll(epoll) } _ => { let inode = self.inode.downcast_ref::(); if let Some(inode) = inode { return inode.remove_epoll(epoll); } + let inode = self .inode .downcast_ref::() diff --git a/kernel/src/init/init.rs b/kernel/src/init/init.rs index fa77cba7..0be5bce2 100644 --- a/kernel/src/init/init.rs +++ b/kernel/src/init/init.rs @@ -92,8 +92,11 @@ fn do_start_kernel() { Futex::init(); crate::bpf::init_bpf_system(); crate::debug::jump_label::static_keys_init(); + + // #[cfg(all(target_arch = "x86_64", feature = "kvm"))] + // crate::virt::kvm::kvm_init(); #[cfg(all(target_arch = "x86_64", feature = "kvm"))] - crate::virt::kvm::kvm_init(); + crate::arch::vm::vmx::vmx_init().unwrap(); } /// 在内存管理初始化之前,执行的初始化 diff --git a/kernel/src/init/initial_kthread.rs b/kernel/src/init/initial_kthread.rs index 25fb2191..a79c59da 100644 --- a/kernel/src/init/initial_kthread.rs +++ b/kernel/src/init/initial_kthread.rs @@ -22,7 +22,12 @@ use crate::{ use super::{cmdline::kenrel_cmdline_param_manager, initcall::do_initcalls}; -const INIT_PROC_TRYLIST: [&str; 3] = ["/bin/dragonreach", "/bin/init", "/bin/sh"]; +const INIT_PROC_TRYLIST: [(&str, Option<&str>); 4] = [ + ("/bin/dragonreach", None), + ("/bin/init", None), + ("/bin/sh", None), + ("/bin/busybox", Some("init")), +]; pub fn initial_kernel_thread() -> i32 { kernel_init().unwrap_or_else(|err| { @@ -88,6 +93,7 @@ fn switch_to_user() -> ! { try_to_run_init_process( path.as_c_str().to_str().unwrap(), &mut proc_init_info, + &None, &mut trap_frame, ) .unwrap_or_else(|e| { @@ -98,8 +104,9 @@ fn switch_to_user() -> ! { }); } else { let mut ok = false; - for path in INIT_PROC_TRYLIST.iter() { - if try_to_run_init_process(path, &mut proc_init_info, &mut trap_frame).is_ok() { + for (path, ext_args) in INIT_PROC_TRYLIST.iter() { + if try_to_run_init_process(path, &mut proc_init_info, ext_args, &mut trap_frame).is_ok() + { ok = true; break; } @@ -118,10 +125,22 @@ fn switch_to_user() -> ! 
{ fn try_to_run_init_process( path: &str, proc_init_info: &mut ProcInitInfo, + ext_args: &Option<&str>, trap_frame: &mut TrapFrame, ) -> Result<(), SystemError> { + let mut args_to_insert = alloc::vec::Vec::new(); + args_to_insert.push(CString::new(path).unwrap()); + + if let Some(ext_args) = ext_args { + // Split ext_args by whitespace and trim each part + for arg in ext_args.split_whitespace() { + args_to_insert.push(CString::new(arg.trim()).unwrap()); + } + } proc_init_info.proc_name = CString::new(path).unwrap(); - proc_init_info.args.insert(0, CString::new(path).unwrap()); + let elements_to_remove = args_to_insert.len(); + let old_args = core::mem::replace(&mut proc_init_info.args, args_to_insert); + proc_init_info.args.extend(old_args); if let Err(e) = run_init_process(proc_init_info, trap_frame) { if e != SystemError::ENOENT { error!( @@ -130,12 +149,12 @@ fn try_to_run_init_process( ); } - proc_init_info.args.remove(0); + proc_init_info.args.drain(0..elements_to_remove); return Err(e); } + Ok(()) } - fn run_init_process( proc_init_info: &ProcInitInfo, trap_frame: &mut TrapFrame, diff --git a/kernel/src/ipc/pipe.rs b/kernel/src/ipc/pipe.rs index 32d57494..b4a5bb79 100644 --- a/kernel/src/ipc/pipe.rs +++ b/kernel/src/ipc/pipe.rs @@ -49,6 +49,7 @@ pub struct LockedPipeInode { inner: SpinLock, read_wait_queue: WaitQueue, write_wait_queue: WaitQueue, + epitems: SpinLock>>, } /// @brief 管道文件i节点(无锁) @@ -65,7 +66,6 @@ pub struct InnerPipeInode { reader: u32, writer: u32, had_reader: bool, - epitems: SpinLock>>, } impl InnerPipeInode { @@ -81,7 +81,7 @@ impl InnerPipeInode { if mode.contains(FileMode::O_RDONLY) { if self.valid_cnt != 0 { // 有数据可读 - events.insert(EPollEventType::EPOLLIN & EPollEventType::EPOLLRDNORM); + events.insert(EPollEventType::EPOLLIN | EPollEventType::EPOLLRDNORM); } // 没有写者 @@ -93,7 +93,7 @@ impl InnerPipeInode { if mode.contains(FileMode::O_WRONLY) { // 管道内数据未满 if self.valid_cnt as usize != PIPE_BUFF_SIZE { - events.insert(EPollEventType::EPOLLIN & EPollEventType::EPOLLWRNORM); + events.insert(EPollEventType::EPOLLOUT | EPollEventType::EPOLLWRNORM); } // 没有读者 @@ -105,29 +105,9 @@ impl InnerPipeInode { Ok(events.bits() as usize) } - pub fn add_epoll(&mut self, epitem: Arc) -> Result<(), SystemError> { - self.epitems.lock().push_back(epitem); - Ok(()) - } - fn buf_full(&self) -> bool { return self.valid_cnt as usize == PIPE_BUFF_SIZE; } - - pub fn remove_epoll(&self, epoll: &Weak>) -> Result<(), SystemError> { - let is_remove = !self - .epitems - .lock_irqsave() - .extract_if(|x| x.epoll().ptr_eq(epoll)) - .collect::>() - .is_empty(); - - if is_remove { - return Ok(()); - } - - Err(SystemError::ENOENT) - } } impl LockedPipeInode { @@ -158,12 +138,12 @@ impl LockedPipeInode { }, reader: 0, writer: 0, - epitems: SpinLock::new(LinkedList::new()), }; let result = Arc::new(Self { inner: SpinLock::new(inner), read_wait_queue: WaitQueue::default(), write_wait_queue: WaitQueue::default(), + epitems: SpinLock::new(LinkedList::new()), }); let mut guard = result.inner.lock(); guard.self_ref = Arc::downgrade(&result); @@ -185,6 +165,26 @@ impl LockedPipeInode { let inode = self.inner.lock(); return !inode.buf_full() || inode.reader == 0; } + + pub fn add_epoll(&self, epitem: Arc) -> Result<(), SystemError> { + self.epitems.lock().push_back(epitem); + Ok(()) + } + + pub fn remove_epoll(&self, epoll: &Weak>) -> Result<(), SystemError> { + let is_remove = !self + .epitems + .lock_irqsave() + .extract_if(|x| x.epoll().ptr_eq(epoll)) + .collect::>() + .is_empty(); + + if is_remove { + 
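The argv handling in `try_to_run_init_process` above splices `[path, ext_args...]` in front of whatever arguments were already present, and on failure removes exactly the inserted prefix so the next candidate starts clean. The same pattern in isolation (a userspace-style sketch with `std` types, not the kernel's `alloc` code):

```rust
/// Insert `path` plus optional extra args at the front of `args`,
/// returning how many elements were inserted (for rollback on error).
fn splice_init_args(args: &mut Vec<String>, path: &str, ext_args: Option<&str>) -> usize {
    let mut prefix: Vec<String> = vec![path.to_string()];
    if let Some(ext) = ext_args {
        prefix.extend(ext.split_whitespace().map(str::to_string));
    }
    let inserted = prefix.len();
    let old = std::mem::replace(args, prefix);
    args.extend(old);
    inserted
}

fn main() {
    let mut args = vec!["console=ttyS0".to_string()];
    let n = splice_init_args(&mut args, "/bin/busybox", Some("init"));
    assert_eq!(args, ["/bin/busybox", "init", "console=ttyS0"]);
    args.drain(0..n); // the ENOENT path rolls back exactly like this
    assert_eq!(args, ["console=ttyS0"]);
}
```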
return Ok(()); + } + + Err(SystemError::ENOENT) + } } impl IndexNode for LockedPipeInode { @@ -210,12 +210,12 @@ impl IndexNode for LockedPipeInode { } // log::debug!("pipe mode: {:?}", mode); // 加锁 - let mut inode = self.inner.lock(); + let mut inner_guard = self.inner.lock(); // 如果管道里面没有数据,则唤醒写端, - while inode.valid_cnt == 0 { + while inner_guard.valid_cnt == 0 { // 如果当前管道写者数为0,则返回EOF - if inode.writer == 0 { + if inner_guard.writer == 0 { return Ok(0); } @@ -224,12 +224,12 @@ impl IndexNode for LockedPipeInode { // 如果为非阻塞管道,直接返回错误 if mode.contains(FileMode::O_NONBLOCK) { - drop(inode); + drop(inner_guard); return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); } // 否则在读等待队列中睡眠,并释放锁 - drop(inode); + drop(inner_guard); let r = wq_wait_event_interruptible!(self.read_wait_queue, self.readable(), {}); if r.is_err() { ProcessManager::current_pcb() @@ -238,35 +238,37 @@ impl IndexNode for LockedPipeInode { return Err(SystemError::ERESTARTSYS); } - inode = self.inner.lock(); + inner_guard = self.inner.lock(); } - let mut num = inode.valid_cnt as usize; + let mut num = inner_guard.valid_cnt as usize; //决定要输出的字节 - let start = inode.read_pos as usize; + let start = inner_guard.read_pos as usize; //如果读端希望读取的字节数大于有效字节数,则输出有效字节 - let mut end = (inode.valid_cnt as usize + inode.read_pos as usize) % PIPE_BUFF_SIZE; + let mut end = + (inner_guard.valid_cnt as usize + inner_guard.read_pos as usize) % PIPE_BUFF_SIZE; //如果读端希望读取的字节数少于有效字节数,则输出希望读取的字节 - if len < inode.valid_cnt as usize { - end = (len + inode.read_pos as usize) % PIPE_BUFF_SIZE; + if len < inner_guard.valid_cnt as usize { + end = (len + inner_guard.read_pos as usize) % PIPE_BUFF_SIZE; num = len; } // 从管道拷贝数据到用户的缓冲区 if end < start { - buf[0..(PIPE_BUFF_SIZE - start)].copy_from_slice(&inode.data[start..PIPE_BUFF_SIZE]); - buf[(PIPE_BUFF_SIZE - start)..num].copy_from_slice(&inode.data[0..end]); + buf[0..(PIPE_BUFF_SIZE - start)] + .copy_from_slice(&inner_guard.data[start..PIPE_BUFF_SIZE]); + buf[(PIPE_BUFF_SIZE - start)..num].copy_from_slice(&inner_guard.data[0..end]); } else { - buf[0..num].copy_from_slice(&inode.data[start..end]); + buf[0..num].copy_from_slice(&inner_guard.data[start..end]); } //更新读位置以及valid_cnt - inode.read_pos = (inode.read_pos + num as i32) % PIPE_BUFF_SIZE as i32; - inode.valid_cnt -= num as i32; + inner_guard.read_pos = (inner_guard.read_pos + num as i32) % PIPE_BUFF_SIZE as i32; + inner_guard.valid_cnt -= num as i32; // 读完以后如果未读完,则唤醒下一个读者 - if inode.valid_cnt > 0 { + if inner_guard.valid_cnt > 0 { self.read_wait_queue .wakeup(Some(ProcessState::Blocked(true))); } @@ -274,10 +276,10 @@ impl IndexNode for LockedPipeInode { //读完后解锁并唤醒等待在写等待队列中的进程 self.write_wait_queue .wakeup(Some(ProcessState::Blocked(true))); - - let pollflag = EPollEventType::from_bits_truncate(inode.poll(&data)? as u32); + let pollflag = EPollEventType::from_bits_truncate(inner_guard.poll(&data)? 
as u32); + drop(inner_guard); // 唤醒epoll中等待的进程 - EventPoll::wakeup_epoll(&inode.epitems, Some(pollflag))?; + EventPoll::wakeup_epoll(&self.epitems, Some(pollflag))?; //返回读取的字节数 return Ok(num); @@ -380,11 +382,10 @@ impl IndexNode for LockedPipeInode { return Err(SystemError::EINVAL); } // 加锁 + let mut inner_guard = self.inner.lock(); - let mut inode = self.inner.lock(); - - if inode.reader == 0 { - if !inode.had_reader { + if inner_guard.reader == 0 { + if !inner_guard.had_reader { // 如果从未有读端,直接返回 ENXIO,无论是否阻塞模式 return Err(SystemError::ENXIO); } else { @@ -417,43 +418,44 @@ impl IndexNode for LockedPipeInode { // 如果管道空间不够 - while len + inode.valid_cnt as usize > PIPE_BUFF_SIZE { + while len + inner_guard.valid_cnt as usize > PIPE_BUFF_SIZE { // 唤醒读端 self.read_wait_queue .wakeup(Some(ProcessState::Blocked(true))); // 如果为非阻塞管道,直接返回错误 if mode.contains(FileMode::O_NONBLOCK) { - drop(inode); + drop(inner_guard); return Err(SystemError::ENOMEM); } // 解锁并睡眠 - drop(inode); + drop(inner_guard); let r = wq_wait_event_interruptible!(self.write_wait_queue, self.writeable(), {}); if r.is_err() { return Err(SystemError::ERESTARTSYS); } - inode = self.inner.lock(); + inner_guard = self.inner.lock(); } // 决定要输入的字节 - let start = inode.write_pos as usize; - let end = (inode.write_pos as usize + len) % PIPE_BUFF_SIZE; + let start = inner_guard.write_pos as usize; + let end = (inner_guard.write_pos as usize + len) % PIPE_BUFF_SIZE; // 从用户的缓冲区拷贝数据到管道 if end < start { - inode.data[start..PIPE_BUFF_SIZE].copy_from_slice(&buf[0..(PIPE_BUFF_SIZE - start)]); - inode.data[0..end].copy_from_slice(&buf[(PIPE_BUFF_SIZE - start)..len]); + inner_guard.data[start..PIPE_BUFF_SIZE] + .copy_from_slice(&buf[0..(PIPE_BUFF_SIZE - start)]); + inner_guard.data[0..end].copy_from_slice(&buf[(PIPE_BUFF_SIZE - start)..len]); } else { - inode.data[start..end].copy_from_slice(&buf[0..len]); + inner_guard.data[start..end].copy_from_slice(&buf[0..len]); } // 更新写位置以及valid_cnt - inode.write_pos = (inode.write_pos + len as i32) % PIPE_BUFF_SIZE as i32; - inode.valid_cnt += len as i32; + inner_guard.write_pos = (inner_guard.write_pos + len as i32) % PIPE_BUFF_SIZE as i32; + inner_guard.valid_cnt += len as i32; // 写完后还有位置,则唤醒下一个写者 - if (inode.valid_cnt as usize) < PIPE_BUFF_SIZE { + if (inner_guard.valid_cnt as usize) < PIPE_BUFF_SIZE { self.write_wait_queue .wakeup(Some(ProcessState::Blocked(true))); } @@ -462,9 +464,11 @@ impl IndexNode for LockedPipeInode { self.read_wait_queue .wakeup(Some(ProcessState::Blocked(true))); - let pollflag = EPollEventType::from_bits_truncate(inode.poll(&data)? as u32); + let pollflag = EPollEventType::from_bits_truncate(inner_guard.poll(&data)? 
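Both the read and write paths here treat `data` as a ring buffer: `end = (start + n) % PIPE_BUFF_SIZE`, and when `end < start` the copy splits into a tail piece and a wrapped head piece. A worked example with an 8-byte ring (one caveat worth noting: a completely full ring also has `end == start`, so strictly the wrapping branch would additionally need to fire when `end == start` with `n > 0`):

```rust
const RING: usize = 8; // stand-in for PIPE_BUFF_SIZE

/// Copy `n` valid bytes starting at ring index `start` into `out`,
/// mirroring the two-piece copy in LockedPipeInode::read_at.
fn copy_out(data: &[u8; RING], start: usize, n: usize, out: &mut [u8]) {
    let end = (start + n) % RING;
    if end < start {
        out[..RING - start].copy_from_slice(&data[start..]); // tail of the ring
        out[RING - start..n].copy_from_slice(&data[..end]); // wrapped head
    } else {
        out[..n].copy_from_slice(&data[start..end]);
    }
}

fn main() {
    // read_pos = 6 with 4 valid bytes: "AB" at the end, "CD" wrapped to the front
    let data = *b"CD....AB";
    let mut out = [0u8; 4];
    copy_out(&data, 6, 4, &mut out);
    assert_eq!(&out, b"ABCD");
}
```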
as u32); + + drop(inner_guard); // 唤醒epoll中等待的进程 - EventPoll::wakeup_epoll(&inode.epitems, Some(pollflag))?; + EventPoll::wakeup_epoll(&self.epitems, Some(pollflag))?; // 返回写入的字节数 return Ok(len); diff --git a/kernel/src/ipc/signal.rs b/kernel/src/ipc/signal.rs index 8c85182d..1f5ee768 100644 --- a/kernel/src/ipc/signal.rs +++ b/kernel/src/ipc/signal.rs @@ -1,4 +1,4 @@ -use core::sync::atomic::compiler_fence; +use core::{fmt::Debug, sync::atomic::compiler_fence}; use alloc::sync::Arc; use log::warn; @@ -8,9 +8,11 @@ use crate::{ arch::ipc::signal::{SigCode, SigFlags, SigSet, Signal}, ipc::signal_types::SigactionType, libs::spinlock::SpinLockGuard, + mm::VirtAddr, process::{ pid::PidType, Pid, ProcessControlBlock, ProcessFlags, ProcessManager, ProcessSignalInfo, }, + time::Instant, }; use super::signal_types::{ @@ -69,6 +71,11 @@ impl Signal { warn!("Kill operation not support: pid={:?}", pid); return Err(SystemError::ENOSYS); } + + // 暂时不支持发送信号给进程组 + if pid.data() == 0 { + return Err(SystemError::ENOSYS); + } compiler_fence(core::sync::atomic::Ordering::SeqCst); // 检查sig是否符合要求,如果不符合要求,则退出。 if !self.is_valid() { @@ -78,7 +85,7 @@ impl Signal { let pcb = ProcessManager::find(pid); if pcb.is_none() { - warn!("No such process."); + warn!("No such process: pid={:?}", pid); return retval; } @@ -610,6 +617,24 @@ pub fn set_current_blocked(new_set: &mut SigSet) { __set_current_blocked(new_set); } +/// 参考 https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/signal.c?fi=set_user_sigmask#set_user_sigmask +/// 功能与set_current_blocked相同,多一步保存当前的sig_blocked到saved_sigmask +/// 由于这之中设置了saved_sigmask,因此从系统调用返回之前需要恢复saved_sigmask +pub fn set_user_sigmask(new_set: &mut SigSet) { + let pcb = ProcessManager::current_pcb(); + let mut guard = pcb.sig_info_mut(); + let oset = *guard.sig_blocked(); + + let flags = pcb.flags(); + flags.set(ProcessFlags::RESTORE_SIG_MASK, true); + + let saved_sigmask = guard.saved_sigmask_mut(); + *saved_sigmask = oset; + drop(guard); + + set_current_blocked(new_set); +} + /// 设置当前进程的屏蔽信号 (sig_block) /// /// ## 参数 @@ -641,3 +666,45 @@ pub fn set_sigprocmask(how: SigHow, set: SigSet) -> Result __set_current_blocked(&res_set); Ok(oset) } + +#[derive(Debug)] +pub struct RestartBlock { + pub data: RestartBlockData, + pub restart_fn: &'static dyn RestartFn, +} + +impl RestartBlock { + pub fn new(restart_fn: &'static dyn RestartFn, data: RestartBlockData) -> Self { + Self { data, restart_fn } + } +} + +pub trait RestartFn: Debug + Sync + Send + 'static { + fn call(&self, data: &mut RestartBlockData) -> Result; +} + +#[derive(Debug, Clone)] +pub enum RestartBlockData { + Poll(PollRestartBlockData), + // todo: nanosleep + Nanosleep(), + // todo: futex_wait + FutexWait(), +} + +impl RestartBlockData { + pub fn new_poll(pollfd_ptr: VirtAddr, nfds: u32, timeout_instant: Option) -> Self { + Self::Poll(PollRestartBlockData { + pollfd_ptr, + nfds, + timeout_instant, + }) + } +} + +#[derive(Debug, Clone)] +pub struct PollRestartBlockData { + pub pollfd_ptr: VirtAddr, + pub nfds: u32, + pub timeout_instant: Option, +} diff --git a/kernel/src/ipc/syscall.rs b/kernel/src/ipc/syscall.rs index c0d7305c..90d4ddc4 100644 --- a/kernel/src/ipc/syscall.rs +++ b/kernel/src/ipc/syscall.rs @@ -548,8 +548,18 @@ impl Syscall { } pub fn restart_syscall() -> Result { - // todo: https://code.dragonos.org.cn/xref/linux-6.1.9/kernel/signal.c#2998 - unimplemented!("restart_syscall with restart block"); - // Err(SystemError::ENOSYS) + let restart_block = ProcessManager::current_pcb().restart_block().take(); + if let 
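How the restart pieces fit together: an interruptible syscall that wants transparent restart stashes a `RestartBlock` in its PCB and fails with `ERESTART_RESTARTBLOCK`; the signal code then arranges for userspace to re-enter via `restart_syscall()`, which dispatches to `RestartFn::call` with the saved data. A toy restart function for the still-stubbed `Nanosleep` variant (illustrative only; the `Result<usize, SystemError>` payload is inferred from `RestartFnPoll`):

```rust
#[derive(Debug)]
struct RestartFnNanosleep;

impl RestartFn for RestartFnNanosleep {
    fn call(&self, data: &mut RestartBlockData) -> Result<usize, SystemError> {
        if let RestartBlockData::Nanosleep() = data {
            // A real implementation would re-issue the sleep with the
            // remaining time carried in the block data.
            return Ok(0);
        }
        panic!("RestartFnNanosleep called with wrong data type: {:?}", data);
    }
}

// Registered the same way RestartFnPoll is in filesystem/poll.rs:
// ProcessManager::current_pcb()
//     .set_restart_fn(Some(RestartBlock::new(&RestartFnNanosleep, data)))
```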
Some(mut restart_block) = restart_block { + return restart_block.restart_fn.call(&mut restart_block.data); + } else { + // 不应该走到这里,因此kill掉当前进程及同组的进程 + let pid = Pid::new(0); + let sig = Signal::SIGKILL; + let mut info = SigInfo::new(sig, 0, SigCode::Kernel, SigType::Kill(pid)); + + sig.send_signal_info(Some(&mut info), pid) + .expect("Failed to kill "); + return Ok(0); + } } } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index c6aa5b49..48237caf 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -1,5 +1,6 @@ #![no_main] // <1> #![feature(alloc_error_handler)] +#![feature(new_zeroed_alloc)] #![feature(allocator_api)] #![feature(arbitrary_self_types)] #![feature(concat_idents)] diff --git a/kernel/src/libs/rbtree.rs b/kernel/src/libs/rbtree.rs index 6aedb791..7d7299ac 100644 --- a/kernel/src/libs/rbtree.rs +++ b/kernel/src/libs/rbtree.rs @@ -829,6 +829,15 @@ impl IntoIterator for RBTree { } } +impl Default for RBTree { + fn default() -> Self { + RBTree { + root: NodePtr::null(), + len: 0, + } + } +} + impl RBTree { /// Creates an empty `RBTree`. pub fn new() -> RBTree { diff --git a/kernel/src/misc/mod.rs b/kernel/src/misc/mod.rs index 541e1db7..62c7a1a4 100644 --- a/kernel/src/misc/mod.rs +++ b/kernel/src/misc/mod.rs @@ -1,2 +1,4 @@ pub mod events; pub mod ksysfs; +pub mod reboot; +pub mod syscall; diff --git a/kernel/src/misc/reboot.rs b/kernel/src/misc/reboot.rs new file mode 100644 index 00000000..a65b1db0 --- /dev/null +++ b/kernel/src/misc/reboot.rs @@ -0,0 +1,154 @@ +use core::hint::spin_loop; + +use system_error::SystemError; + +use crate::{arch::cpu::cpu_reset, libs::mutex::Mutex, syscall::user_access::check_and_clone_cstr}; + +static SYSTEM_TRANSITION_MUTEX: Mutex<()> = Mutex::new(()); + +const LINUX_REBOOT_MAGIC1: u32 = 0xfee1dead; +const LINUX_REBOOT_MAGIC2: u32 = 672274793; +const LINUX_REBOOT_MAGIC2A: u32 = 85072278; +const LINUX_REBOOT_MAGIC2B: u32 = 369367448; +const LINUX_REBOOT_MAGIC2C: u32 = 537993216; + +#[derive(Debug)] +pub enum RebootCommand { + /// 重启系统,使用默认命令和模式 + Restart, + /// 停止操作系统,并将系统控制权交给ROM监视器(如果有) + Halt, + /// Ctrl-Alt-Del序列导致执行RESTART命令 + CadOn, + /// Ctrl-Alt-Del序列向init任务发送SIGINT信号 + CadOff, + /// 停止操作系统,如果可能的话从系统中移除所有电源 + PowerOff, + /// 使用给定的命令字符串重启系统 + Restart2, + /// 使用软件挂起(如果编译在内)挂起系统 + SoftwareSuspend, + /// 使用预先加载的Linux内核重启系统 + Kexec, +} + +impl TryFrom for RebootCommand { + type Error = SystemError; + + fn try_from(value: u32) -> Result { + match value { + 0x01234567 => Ok(RebootCommand::Restart), + 0xCDEF0123 => Ok(RebootCommand::Halt), + 0x89ABCDEF => Ok(RebootCommand::CadOn), + 0x00000000 => Ok(RebootCommand::CadOff), + 0x4321FEDC => Ok(RebootCommand::PowerOff), + 0xA1B2C3D4 => Ok(RebootCommand::Restart2), + 0xD000FCE2 => Ok(RebootCommand::SoftwareSuspend), + 0x45584543 => Ok(RebootCommand::Kexec), + _ => Err(SystemError::EINVAL), + } + } +} + +impl From for u32 { + fn from(val: RebootCommand) -> Self { + match val { + RebootCommand::Restart => 0x01234567, + RebootCommand::Halt => 0xCDEF0123, + RebootCommand::CadOn => 0x89ABCDEF, + RebootCommand::CadOff => 0x00000000, + RebootCommand::PowerOff => 0x4321FEDC, + RebootCommand::Restart2 => 0xA1B2C3D4, + RebootCommand::SoftwareSuspend => 0xD000FCE2, + RebootCommand::Kexec => 0x45584543, + } + } +} + +/// 系统调用reboot的实现 +/// +/// 参考:https://code.dragonos.org.cn/xref/linux-6.1.9/kernel/reboot.c#700 +pub(super) fn do_sys_reboot( + magic1: u32, + magic2: u32, + cmd: u32, + arg: usize, +) -> Result<(), SystemError> { + if magic1 != LINUX_REBOOT_MAGIC1 + || (magic2 != LINUX_REBOOT_MAGIC2 + && 
magic2 != LINUX_REBOOT_MAGIC2A
+            && magic2 != LINUX_REBOOT_MAGIC2B
+            && magic2 != LINUX_REBOOT_MAGIC2C)
+    {
+        return Err(SystemError::EINVAL);
+    }
+
+    // As in Linux, the MAGIC2* values are dates written in hex:
+    // 672274793 = 0x28121969, 85072278 = 0x05121996,
+    // 369367448 = 0x16041998, 537993216 = 0x20112000.
+    let command = RebootCommand::try_from(cmd)?;
+    let _guard = SYSTEM_TRANSITION_MUTEX.lock();
+    log::debug!(
+        "do_sys_reboot: magic1={}, magic2={}, cmd={:?}",
+        magic1,
+        magic2,
+        command
+    );
+    match command {
+        RebootCommand::Restart => kernel_restart(None),
+        RebootCommand::Halt => kernel_halt(),
+        RebootCommand::CadOn => {
+            // todo: support the Ctrl-Alt-Del sequence
+            return Ok(());
+        }
+        RebootCommand::CadOff => {
+            // todo: support the Ctrl-Alt-Del sequence
+            return Ok(());
+        }
+        RebootCommand::PowerOff => kernel_power_off(),
+        RebootCommand::Restart2 => {
+            let s = check_and_clone_cstr(arg as *const u8, Some(256))?;
+            let cmd_str = s.to_str().map_err(|_| SystemError::EINVAL)?;
+            // kernel_restart diverges, so this arm has type `!` like the
+            // others and the match satisfies the Result return type.
+            kernel_restart(Some(cmd_str))
+        }
+        RebootCommand::SoftwareSuspend => {
+            log::warn!("do_sys_reboot: SoftwareSuspend not implemented");
+            return Err(SystemError::ENOSYS);
+        }
+        RebootCommand::Kexec => {
+            log::warn!("do_sys_reboot: Kexec not implemented");
+            return Err(SystemError::ENOSYS);
+        }
+    }
+}
+
+/// kernel_restart - reboot the system
+///
+/// ## Arguments
+/// - cmd: buffer containing the restart command, or None
+///
+/// Shut everything down and perform a clean reboot.
+/// This is not safe to call from interrupt context.
+///
+/// todo: see https://code.dragonos.org.cn/xref/linux-6.1.9/kernel/reboot.c#265
+pub fn kernel_restart(cmd: Option<&str>) -> ! {
+    if let Some(cmd) = cmd {
+        log::warn!("Restarting system with command: '{}'", cmd);
+    } else {
+        log::warn!("Restarting system...");
+    }
+    unsafe { cpu_reset() }
+}
+
+/// todo: see https://code.dragonos.org.cn/xref/linux-6.1.9/kernel/reboot.c#678
+pub fn kernel_power_off() -> ! {
+    log::warn!("Power down");
+    log::warn!("Currently, the system cannot be powered off, so we halt here.");
+    loop {
+        spin_loop();
+    }
+}
+
+/// todo: see https://code.dragonos.org.cn/xref/linux-6.1.9/kernel/reboot.c#293
+pub fn kernel_halt() -> ! 
{ + log::warn!("System halted."); + loop { + spin_loop(); + } +} diff --git a/kernel/src/misc/syscall.rs b/kernel/src/misc/syscall.rs new file mode 100644 index 00000000..578bcb05 --- /dev/null +++ b/kernel/src/misc/syscall.rs @@ -0,0 +1,11 @@ +use system_error::SystemError; + +use crate::syscall::Syscall; + +use super::reboot::do_sys_reboot; + +impl Syscall { + pub fn reboot(magic1: u32, magic2: u32, cmd: u32, arg: usize) -> Result { + do_sys_reboot(magic1, magic2, cmd, arg).map(|_| 0) + } +} diff --git a/kernel/src/mm/mod.rs b/kernel/src/mm/mod.rs index e95e9019..f5fac396 100644 --- a/kernel/src/mm/mod.rs +++ b/kernel/src/mm/mod.rs @@ -155,7 +155,7 @@ pub enum PageTableKind { } /// 物理内存地址 -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)] #[repr(transparent)] pub struct PhysAddr(usize); @@ -277,7 +277,7 @@ impl core::ops::SubAssign for PhysAddr { } /// 虚拟内存地址 -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)] #[repr(transparent)] pub struct VirtAddr(usize); diff --git a/kernel/src/mm/page.rs b/kernel/src/mm/page.rs index 3ec68547..0b84aaba 100644 --- a/kernel/src/mm/page.rs +++ b/kernel/src/mm/page.rs @@ -874,6 +874,7 @@ impl PageTable { } /// 页表项 +#[repr(C, align(8))] #[derive(Copy, Clone)] pub struct PageEntry { data: usize, diff --git a/kernel/src/namespaces/syscall.rs b/kernel/src/namespaces/syscall.rs index 9dfe5c12..c04d2a3d 100644 --- a/kernel/src/namespaces/syscall.rs +++ b/kernel/src/namespaces/syscall.rs @@ -37,7 +37,7 @@ impl Syscall { Ok(check) } - #[allow(unused)] + #[allow(dead_code)] pub fn sys_setns(_fd: i32, flags: u64) -> Result { let check = check_unshare_flags(flags)?; diff --git a/kernel/src/net/event_poll/mod.rs b/kernel/src/net/event_poll/mod.rs index 2c80cc72..2536d11b 100644 --- a/kernel/src/net/event_poll/mod.rs +++ b/kernel/src/net/event_poll/mod.rs @@ -51,27 +51,6 @@ pub struct EventPoll { self_ref: Option>>, } -impl EventPoll { - pub const EP_MAX_EVENTS: u32 = u32::MAX / (core::mem::size_of::() as u32); - /// 用于获取inode中的epitem队列 - pub const ADD_EPOLLITEM: u32 = 0x7965; - pub fn new() -> Self { - Self { - epoll_wq: WaitQueue::default(), - ep_items: RBTree::new(), - ready_list: LinkedList::new(), - shutdown: AtomicBool::new(false), - self_ref: None, - } - } -} - -impl Default for EventPoll { - fn default() -> Self { - Self::new() - } -} - /// EpollItem表示的是Epoll所真正管理的对象 /// 每当用户向Epoll添加描述符时都会注册一个新的EpollItem,EpollItem携带了一些被监听的描述符的必要信息 #[derive(Debug)] @@ -199,25 +178,7 @@ impl IndexNode for EPollInode { // 释放资源 let mut epoll = self.epoll.0.lock_irqsave(); - // 唤醒epoll上面等待的所有进程 - epoll.shutdown.store(true, Ordering::SeqCst); - epoll.ep_wake_all(); - - let fds = epoll.ep_items.keys().cloned().collect::>(); - - // 清理红黑树里面的epitems - for fd in fds { - let file = ProcessManager::current_pcb() - .fd_table() - .read() - .get_file_by_fd(fd); - - if file.is_some() { - file.unwrap().remove_epoll(&Arc::downgrade(&self.epoll.0))?; - } - - epoll.ep_items.remove(&fd); - } + epoll.close()?; Ok(()) } @@ -232,21 +193,72 @@ impl IndexNode for EPollInode { } impl EventPoll { - /// ## 创建epoll对象 + pub const EP_MAX_EVENTS: u32 = u32::MAX / (core::mem::size_of::() as u32); + /// 用于获取inode中的epitem队列 + pub const ADD_EPOLLITEM: u32 = 0x7965; + fn new() -> Self { + Self { + epoll_wq: WaitQueue::default(), + ep_items: RBTree::new(), + ready_list: LinkedList::new(), + shutdown: AtomicBool::new(false), + self_ref: None, + } + } + + 
/// 关闭epoll时,执行的逻辑 + fn close(&mut self) -> Result<(), SystemError> { + // 唤醒epoll上面等待的所有进程 + self.shutdown.store(true, Ordering::SeqCst); + self.ep_wake_all(); + + let fds: Vec = self.ep_items.keys().cloned().collect::>(); + // 清理红黑树里面的epitems + for fd in fds { + let file = ProcessManager::current_pcb() + .fd_table() + .read() + .get_file_by_fd(fd); + + if let Some(file) = file { + if let Some(self_ref) = self.self_ref.as_ref() { + file.remove_epoll(self_ref)?; + } + } + + self.ep_items.remove(&fd); + } + + Ok(()) + } + + /// ## 创建epoll对象, 并将其加入到当前进程的fd_table中 /// /// ### 参数 /// - flags: 创建的epoll文件的FileMode /// /// ### 返回值 /// - 成功则返回Ok(fd),否则返回Err - pub fn do_create_epoll(flags: FileMode) -> Result { + pub fn create_epoll(flags: FileMode) -> Result { + let ep_file = Self::create_epoll_file(flags)?; + + let current_pcb = ProcessManager::current_pcb(); + let fd_table = current_pcb.fd_table(); + let mut fd_table_guard = fd_table.write(); + + let fd = fd_table_guard.alloc_fd(ep_file, None)?; + + Ok(fd as usize) + } + + /// ## 创建epoll文件 + pub fn create_epoll_file(flags: FileMode) -> Result { if !flags.difference(FileMode::O_CLOEXEC).is_empty() { return Err(SystemError::EINVAL); } // 创建epoll - let epoll = LockedEventPoll(Arc::new(SpinLock::new(EventPoll::new()))); - epoll.0.lock_irqsave().self_ref = Some(Arc::downgrade(&epoll.0)); + let epoll = Self::do_create_epoll(); // 创建epoll的inode对象 let epoll_inode = EPollInode::new(epoll.clone()); @@ -258,14 +270,13 @@ impl EventPoll { // 设置ep_file的FilePrivateData ep_file.private_data = SpinLock::new(FilePrivateData::EPoll(EPollPrivateData { epoll })); + Ok(ep_file) + } - let current_pcb = ProcessManager::current_pcb(); - let fd_table = current_pcb.fd_table(); - let mut fd_table_guard = fd_table.write(); - - let fd = fd_table_guard.alloc_fd(ep_file, None)?; - - Ok(fd as usize) + fn do_create_epoll() -> LockedEventPoll { + let epoll = LockedEventPoll(Arc::new(SpinLock::new(EventPoll::new()))); + epoll.0.lock().self_ref = Some(Arc::downgrade(&epoll.0)); + epoll } /// ## epoll_ctl的具体实现 @@ -273,30 +284,20 @@ impl EventPoll { /// 根据不同的op对epoll文件进行增删改 /// /// ### 参数 - /// - epfd: 操作的epoll文件描述符 + /// - ep_file: epoll文件 /// - op: 对应的操作 - /// - fd: 操作对应的文件描述符 + /// - dstfd: 操作对应的文件描述符 + /// - dst_file: 操作对应的文件(与dstfd对应) /// - epds: 从用户态传入的event,若op为EpollCtlAdd,则对应注册的监听事件,若op为EPollCtlMod,则对应更新的事件,删除操作不涉及此字段 /// - nonblock: 定义这次操作是否为非阻塞(有可能其他地方占有EPoll的锁) - pub fn do_epoll_ctl( - epfd: i32, + fn do_epoll_ctl( + ep_file: Arc, op: EPollCtlOption, - fd: i32, - epds: &mut EPollEvent, + dstfd: i32, + dst_file: Arc, + mut epds: EPollEvent, nonblock: bool, ) -> Result { - let current_pcb = ProcessManager::current_pcb(); - let fd_table = current_pcb.fd_table(); - let fd_table_guard = fd_table.read(); - - // 获取epoll和对应fd指向的文件 - let ep_file = fd_table_guard - .get_file_by_fd(epfd) - .ok_or(SystemError::EBADF)?; - let dst_file = fd_table_guard - .get_file_by_fd(fd) - .ok_or(SystemError::EBADF)?; - // 检查是否允许 EPOLLWAKEUP if op != EPollCtlOption::Del { epds.events &= !EPollEventType::EPOLLWAKEUP.bits(); @@ -351,7 +352,7 @@ impl EventPoll { } } - let ep_item = epoll_guard.ep_items.get(&fd); + let ep_item = epoll_guard.ep_items.get(&dstfd); match op { EPollCtlOption::Add => { // 如果已经存在,则返回错误 @@ -361,8 +362,8 @@ impl EventPoll { // 设置epoll let epitem = Arc::new(EPollItem::new( Arc::downgrade(&epoll_data.epoll.0), - *epds, - fd, + epds, + dstfd, Arc::downgrade(&dst_file), )); Self::ep_insert(&mut epoll_guard, dst_file, epitem)?; @@ -373,7 +374,7 @@ impl EventPoll { return 
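The split above is what lets `poll(2)` reuse epoll without a file descriptor: `create_epoll_file` builds the `File` object, and only the `epoll_create(2)` path installs it into the fd table. A condensed sketch of the kernel-internal path (essentially what `do_sys_poll` in `filesystem/poll.rs` does; `poll_one` is a hypothetical helper, not part of this PR):

```rust
/// Wait once on a single fd using a kernel-private epoll instance;
/// no user-visible fd is ever allocated.
fn poll_one(fd: i32, event: EPollEvent) -> Result<usize, SystemError> {
    let ep_file = Arc::new(EventPoll::create_epoll_file(FileMode::empty())?);
    EventPoll::epoll_ctl_with_epfile(ep_file.clone(), EPollCtlOption::Add, fd, event, false)?;
    let mut ready = vec![EPollEvent::default(); 1];
    // timespec = None now means "don't block"; see epoll_wait_with_file below
    EventPoll::epoll_wait_with_file(ep_file, &mut ready, 1, None)
}
```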
Err(SystemError::ENOENT); } // 删除 - Self::ep_remove(&mut epoll_guard, fd, Some(dst_file))?; + Self::ep_remove(&mut epoll_guard, dstfd, Some(dst_file))?; } EPollCtlOption::Mod => { // 不存在则返回错误 @@ -385,7 +386,7 @@ impl EventPoll { epds.events |= EPollEventType::EPOLLERR.bits() | EPollEventType::EPOLLHUP.bits(); - Self::ep_modify(&mut epoll_guard, ep_item, epds)?; + Self::ep_modify(&mut epoll_guard, ep_item, &epds)?; } } } @@ -394,8 +395,50 @@ impl EventPoll { Ok(0) } - /// ## epoll_wait的具体实现 - pub fn do_epoll_wait( + pub fn epoll_ctl_with_epfd( + epfd: i32, + op: EPollCtlOption, + dstfd: i32, + epds: EPollEvent, + nonblock: bool, + ) -> Result { + let current_pcb = ProcessManager::current_pcb(); + let fd_table = current_pcb.fd_table(); + let fd_table_guard = fd_table.read(); + + // 获取epoll和对应fd指向的文件 + let ep_file = fd_table_guard + .get_file_by_fd(epfd) + .ok_or(SystemError::EBADF)?; + let dst_file = fd_table_guard + .get_file_by_fd(dstfd) + .ok_or(SystemError::EBADF)?; + + drop(fd_table_guard); + + Self::do_epoll_ctl(ep_file, op, dstfd, dst_file, epds, nonblock) + } + + pub fn epoll_ctl_with_epfile( + ep_file: Arc, + op: EPollCtlOption, + dstfd: i32, + epds: EPollEvent, + nonblock: bool, + ) -> Result { + let current_pcb = ProcessManager::current_pcb(); + let fd_table = current_pcb.fd_table(); + let fd_table_guard = fd_table.read(); + let dst_file = fd_table_guard + .get_file_by_fd(dstfd) + .ok_or(SystemError::EBADF)?; + + drop(fd_table_guard); + + Self::do_epoll_ctl(ep_file, op, dstfd, dst_file, epds, nonblock) + } + + pub fn epoll_wait( epfd: i32, epoll_event: &mut [EPollEvent], max_events: i32, @@ -411,6 +454,16 @@ impl EventPoll { .ok_or(SystemError::EBADF)?; drop(fd_table_guard); + Self::epoll_wait_with_file(ep_file, epoll_event, max_events, timespec) + } + /// ## epoll_wait的具体实现 + pub fn epoll_wait_with_file( + ep_file: Arc, + epoll_event: &mut [EPollEvent], + max_events: i32, + timespec: Option, + ) -> Result { + let current_pcb = ProcessManager::current_pcb(); // 确保是epoll file if !Self::is_epoll_file(&ep_file) { @@ -432,6 +485,9 @@ impl EventPoll { // 非阻塞情况 timeout = true; } + } else if timespec.is_none() { + // 非阻塞情况 + timeout = true; } // 判断epoll上有没有就绪事件 let mut available = epoll_guard.ep_events_available(); @@ -502,6 +558,7 @@ impl EventPoll { })?; drop(guard); schedule(SchedMode::SM_NONE); + // 被唤醒后,检查是否有事件可读 available = epoll.0.lock_irqsave().ep_events_available(); if let Some(timer) = timer { @@ -530,6 +587,9 @@ impl EventPoll { user_event: &mut [EPollEvent], max_events: i32, ) -> Result { + if user_event.len() < max_events as usize { + return Err(SystemError::EINVAL); + } let mut ep_guard = epoll.0.lock_irqsave(); let mut res: usize = 0; @@ -651,11 +711,9 @@ impl EventPoll { dst_file.remove_epoll(epoll.self_ref.as_ref().unwrap())?; } - let epitem = epoll.ep_items.remove(&fd).unwrap(); - - let _ = epoll - .ready_list - .extract_if(|item| Arc::ptr_eq(item, &epitem)); + if let Some(epitem) = epoll.ep_items.remove(&fd) { + epoll.ready_list.retain(|item| !Arc::ptr_eq(item, &epitem)); + } Ok(()) } @@ -740,7 +798,6 @@ impl EventPoll { let binding = epitem.clone(); let event_guard = binding.event().read(); let ep_events = EPollEventType::from_bits_truncate(event_guard.events()); - // 检查事件合理性以及是否有感兴趣的事件 if !(ep_events .difference(EPollEventType::EP_PRIVATE_BITS) diff --git a/kernel/src/net/event_poll/syscall.rs b/kernel/src/net/event_poll/syscall.rs index 6fd0dc94..a984bbd2 100644 --- a/kernel/src/net/event_poll/syscall.rs +++ b/kernel/src/net/event_poll/syscall.rs @@ -3,7 +3,7 @@ use 
system_error::SystemError; use crate::{ arch::ipc::signal::SigSet, filesystem::vfs::file::FileMode, - ipc::signal::set_current_blocked, + ipc::signal::{restore_saved_sigmask, set_user_sigmask}, mm::VirtAddr, syscall::{ user_access::{UserBufferReader, UserBufferWriter}, @@ -20,13 +20,13 @@ impl Syscall { return Err(SystemError::EINVAL); } - return EventPoll::do_create_epoll(FileMode::empty()); + return EventPoll::create_epoll(FileMode::empty()); } pub fn epoll_create1(flag: usize) -> Result { let flags = FileMode::from_bits_truncate(flag as u32); - let ret = EventPoll::do_create_epoll(flags); + let ret = EventPoll::create_epoll(flags); ret } @@ -60,7 +60,7 @@ impl Syscall { )?; let epoll_events = epds_writer.buffer::(0)?; - return EventPoll::do_epoll_wait(epfd, epoll_events, max_events, timespec); + return EventPoll::epoll_wait(epfd, epoll_events, max_events, timespec); } pub fn epoll_ctl(epfd: i32, op: usize, fd: i32, event: VirtAddr) -> Result { @@ -84,7 +84,7 @@ impl Syscall { epds_reader.copy_one_from_user(&mut epds, 0)?; } - return EventPoll::do_epoll_ctl(epfd, op, fd, &mut epds, false); + return EventPoll::epoll_ctl_with_epfd(epfd, op, fd, epds, false); } /// ## 在epoll_wait时屏蔽某些信号 @@ -96,13 +96,12 @@ impl Syscall { sigmask: &mut SigSet, ) -> Result { // 设置屏蔽的信号 - set_current_blocked(sigmask); + set_user_sigmask(sigmask); let wait_ret = Self::epoll_wait(epfd, epoll_event, max_events, timespec); if wait_ret.is_err() && *wait_ret.as_ref().unwrap_err() != SystemError::EINTR { - // TODO: 恢复信号? - // link:https://code.dragonos.org.cn/xref/linux-6.1.9/fs/eventpoll.c#2294 + restore_saved_sigmask(); } wait_ret } diff --git a/kernel/src/process/exit.rs b/kernel/src/process/exit.rs index 66d5a80d..9cc1620d 100644 --- a/kernel/src/process/exit.rs +++ b/kernel/src/process/exit.rs @@ -63,7 +63,6 @@ pub fn kernel_wait4( // 判断pid类型 let pidtype: PidType; - if pid == -1 { pidtype = PidType::MAX; } else if pid < 0 { @@ -176,6 +175,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { break 'outer; } } + drop(rd_childen); nanosleep(Duration::from_millis(100).into())?; } } else { diff --git a/kernel/src/process/fork.rs b/kernel/src/process/fork.rs index 41265d13..ca4e6015 100644 --- a/kernel/src/process/fork.rs +++ b/kernel/src/process/fork.rs @@ -75,8 +75,6 @@ bitflags! 
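The `epoll_pwait` change above follows the usual kernel-side sigmask protocol: swap the caller's mask in for the duration of the wait, but on `EINTR` leave the temporary mask installed so the pending signal is delivered under it; the `RESTORE_SIG_MASK` flag set by `set_user_sigmask` is presumably honored on the way out of the syscall for the success path, which this hunk does not restore explicitly. The control flow in miniature, with `wait_for_events` standing in for the actual `epoll_wait` call:

```rust
fn wait_for_events() -> Result<usize, SystemError> {
    // stand-in for Self::epoll_wait(epfd, events, max_events, timespec)
    Ok(0)
}

fn pwait_shape(mask: &mut SigSet) -> Result<usize, SystemError> {
    set_user_sigmask(mask); // saves the old mask and sets RESTORE_SIG_MASK

    let ret = wait_for_events();

    match &ret {
        // EINTR: keep the temporary mask; signal delivery restores it.
        Err(SystemError::EINTR) => {}
        // Any other error: put the caller's original mask back now.
        Err(_) => restore_saved_sigmask(),
        Ok(_) => {}
    }
    ret
}
```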
{
     const CLONE_NEWNET = 0x40000000;
     /// 在新的 I/O 上下文中运行它
     const CLONE_IO = 0x80000000;
-    /// 克隆时,与父进程共享信号结构体
-    const CLONE_SIGNAL = 0x00010000 | 0x00000800;
     /// 克隆时,将原本被设置为SIG_IGNORE的信号,设置回SIG_DEFAULT
     const CLONE_CLEAR_SIGHAND = 0x100000000;
 }
@@ -347,8 +345,11 @@ impl ProcessManager {
     ) -> Result<(), SystemError> {
         let clone_flags = clone_args.flags;
         // 不允许与不同namespace的进程共享根目录
-        if (clone_flags == (CloneFlags::CLONE_NEWNS | CloneFlags::CLONE_FS))
-            || clone_flags == (CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_FS)
+
+        if (clone_flags & (CloneFlags::CLONE_NEWNS | CloneFlags::CLONE_FS)
+            == (CloneFlags::CLONE_NEWNS | CloneFlags::CLONE_FS))
+            || (clone_flags & (CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_FS))
+                == (CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_FS)
         {
             return Err(SystemError::EINVAL);
         }
@@ -373,7 +374,7 @@ impl ProcessManager {
         // 如果新进程使用不同的 pid 或 namespace,
         // 则不允许它与分叉任务共享线程组。
         if clone_flags.contains(CloneFlags::CLONE_THREAD)
-            && clone_flags.contains(CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_NEWPID)
+            && !((clone_flags & (CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_NEWPID)).is_empty())
         {
             return Err(SystemError::EINVAL);
             // TODO: 判断新进程与当前进程namespace是否相同,不同则返回错误
@@ -381,12 +382,12 @@
 
         // 如果新进程将处于不同的time namespace,
         // 则不能让它共享vm或线程组。
-        if clone_flags.contains(CloneFlags::CLONE_THREAD | CloneFlags::CLONE_VM) {
+        if !((clone_flags & (CloneFlags::CLONE_THREAD | CloneFlags::CLONE_VM)).is_empty()) {
             // TODO: 判断time namespace,不同则返回错误
         }
 
         if clone_flags.contains(CloneFlags::CLONE_PIDFD)
-            && clone_flags.contains(CloneFlags::CLONE_DETACHED | CloneFlags::CLONE_THREAD)
+            && !((clone_flags & (CloneFlags::CLONE_DETACHED | CloneFlags::CLONE_THREAD)).is_empty())
         {
             return Err(SystemError::EINVAL);
         }
@@ -483,6 +484,8 @@ impl ProcessManager {
             )?;
             *pcb.thread_pid.write() = new_pid;
         }
+
+        // log::debug!("fork: clone_flags: {:?}", clone_flags);
         // 设置线程组id、组长
         if clone_flags.contains(CloneFlags::CLONE_THREAD) {
             pcb.thread.write_irqsave().group_leader =
@@ -493,14 +496,15 @@
             }
         } else {
             pcb.thread.write_irqsave().group_leader = Arc::downgrade(pcb);
+
+            let ptr = pcb.as_ref() as *const ProcessControlBlock as *mut ProcessControlBlock;
             unsafe {
-                let ptr = pcb.as_ref() as *const ProcessControlBlock as *mut ProcessControlBlock;
-                (*ptr).tgid = pcb.tgid;
+                (*ptr).tgid = pcb.pid;
             }
         }
 
         // CLONE_PARENT re-uses the old parent
-        if clone_flags.contains(CloneFlags::CLONE_PARENT | CloneFlags::CLONE_THREAD) {
+        if !((clone_flags & (CloneFlags::CLONE_PARENT | CloneFlags::CLONE_THREAD)).is_empty()) {
             *pcb.real_parent_pcb.write_irqsave() =
                 current_pcb.real_parent_pcb.read_irqsave().clone();
diff --git a/kernel/src/process/kthread.rs b/kernel/src/process/kthread.rs
index 6f4d113d..6ee7d13a 100644
--- a/kernel/src/process/kthread.rs
+++ b/kernel/src/process/kthread.rs
@@ -286,7 +286,7 @@ impl KernelThreadMechanism {
 
         KernelThreadMechanism::__inner_create(
             &create_info,
-            CloneFlags::CLONE_VM | CloneFlags::CLONE_SIGNAL,
+            CloneFlags::CLONE_VM | CloneFlags::CLONE_SIGHAND,
         )
         .unwrap_or_else(|e| panic!("Failed to create initial kernel thread, error: {:?}", e));
 
@@ -313,7 +313,7 @@
             .expect("kthreadadd should be run first");
         let kthreadd_pid: Pid = Self::__inner_create(
             &info,
-            CloneFlags::CLONE_VM | CloneFlags::CLONE_FS | CloneFlags::CLONE_SIGNAL,
+            CloneFlags::CLONE_VM | CloneFlags::CLONE_FS | CloneFlags::CLONE_SIGHAND,
         )
         .expect("Failed to create kthread daemon");
         let pcb = ProcessManager::find(kthreadd_pid).unwrap();
@@ -466,7 +466,7 @@ impl 
KernelThreadMechanism { // create a new kernel thread let result: Result = Self::__inner_create( &info, - CloneFlags::CLONE_VM | CloneFlags::CLONE_FS | CloneFlags::CLONE_SIGNAL, + CloneFlags::CLONE_VM | CloneFlags::CLONE_FS | CloneFlags::CLONE_SIGHAND, ); if result.is_err() { // 创建失败 diff --git a/kernel/src/process/mod.rs b/kernel/src/process/mod.rs index 013aceb4..cab08196 100644 --- a/kernel/src/process/mod.rs +++ b/kernel/src/process/mod.rs @@ -31,7 +31,10 @@ use crate::{ procfs::procfs_unregister_pid, vfs::{file::FileDescriptorVec, FileType}, }, - ipc::signal_types::{SigInfo, SigPending, SignalStruct}, + ipc::{ + signal::RestartBlock, + signal_types::{SigInfo, SigPending, SignalStruct}, + }, libs::{ align::AlignedBox, casting::DowncastArc, @@ -712,6 +715,8 @@ pub struct ProcessControlBlock { /// 进程作为主体的凭证集 cred: SpinLock, self_ref: Weak, + + restart_block: SpinLock>, } impl ProcessControlBlock { @@ -799,6 +804,7 @@ impl ProcessControlBlock { nsproxy: Arc::new(RwLock::new(NsProxy::new())), cred: SpinLock::new(cred), self_ref: Weak::new(), + restart_block: SpinLock::new(None), }; pcb.sig_info.write().set_tty(tty); @@ -1117,6 +1123,18 @@ impl ProcessControlBlock { pub fn threads_read_irqsave(&self) -> RwLockReadGuard { self.thread.read_irqsave() } + + pub fn restart_block(&self) -> SpinLockGuard> { + self.restart_block.lock() + } + + pub fn set_restart_fn( + &self, + restart_block: Option, + ) -> Result { + *self.restart_block.lock() = restart_block; + return Err(SystemError::ERESTART_RESTARTBLOCK); + } } impl Drop for ProcessControlBlock { @@ -1596,6 +1614,7 @@ pub fn process_init() { pub struct ProcessSignalInfo { // 当前进程被屏蔽的信号 sig_blocked: SigSet, + // 暂存旧信号,用于恢复 saved_sigmask: SigSet, // sig_pending 中存储当前线程要处理的信号 sig_pending: SigPending, diff --git a/kernel/src/syscall/mod.rs b/kernel/src/syscall/mod.rs index 03087e15..d755abca 100644 --- a/kernel/src/syscall/mod.rs +++ b/kernel/src/syscall/mod.rs @@ -24,7 +24,7 @@ use num_traits::FromPrimitive; use system_error::SystemError; use crate::{ - arch::{cpu::cpu_reset, interrupt::TrapFrame, MMArch}, + arch::{interrupt::TrapFrame, MMArch}, filesystem::vfs::{ fcntl::{AtFlags, FcntlCommand}, file::FileMode, @@ -232,7 +232,13 @@ impl Syscall { Self::sbrk(increment).map(|vaddr: VirtAddr| vaddr.data()) } - SYS_REBOOT => Self::reboot(), + SYS_REBOOT => { + let magic1 = args[0] as u32; + let magic2 = args[1] as u32; + let cmd = args[2] as u32; + let arg = args[3]; + Self::reboot(magic1, magic2, cmd, arg) + } SYS_CHDIR => { let r = args[0] as *const u8; @@ -873,8 +879,10 @@ impl Syscall { #[cfg(target_arch = "x86_64")] SYS_POLL => { - warn!("SYS_POLL has not yet been implemented"); - Ok(0) + let fds = args[0]; + let nfds = args[1] as u32; + let timeout = args[2] as i32; + Self::poll(fds, nfds, timeout) } SYS_SETPGID => { @@ -1253,8 +1261,4 @@ impl Syscall { print!("\x1B[38;2;{fr};{fg};{fb};48;2;{br};{bg};{bb}m{s}\x1B[0m"); return Ok(s.len()); } - - pub fn reboot() -> Result { - unsafe { cpu_reset() }; - } } diff --git a/kernel/src/time/mod.rs b/kernel/src/time/mod.rs index 9a0c829f..12947310 100644 --- a/kernel/src/time/mod.rs +++ b/kernel/src/time/mod.rs @@ -270,6 +270,24 @@ impl Instant { pub const fn total_micros(&self) -> i64 { self.micros } + + /// Returns the duration between this instant and another one. + /// + /// # Arguments + /// + /// * `earlier` - The earlier instant to calculate the duration since. + /// + /// # Returns + /// + /// An `Option` representing the duration between this instant and the earlier one. 
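+    ///
+    /// Illustrative example (an assumption: this uses a smoltcp-style
+    /// `from_micros` constructor, so it is not compiled as a doctest):
+    ///
+    /// ```ignore
+    /// let t0 = Instant::from_micros(1_000);
+    /// let t1 = Instant::from_micros(1_250);
+    /// assert_eq!(t1.duration_since(t0), Some(Duration::from_micros(250)));
+    /// assert_eq!(t0.duration_since(t1), None);
+    /// ```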
+ /// If the earlier instant is later than this one, it returns `None`. + pub fn duration_since(&self, earlier: Instant) -> Option { + if earlier.micros > self.micros { + return None; + } + let micros_diff = self.micros - earlier.micros; + Some(Duration::from_micros(micros_diff as u64)) + } } impl fmt::Display for Instant { diff --git a/kernel/src/time/syscall.rs b/kernel/src/time/syscall.rs index ec8c0a15..19b16775 100644 --- a/kernel/src/time/syscall.rs +++ b/kernel/src/time/syscall.rs @@ -79,6 +79,7 @@ impl Syscall { if sleep_time.is_null() { return Err(SystemError::EFAULT); } + let slt_spec = PosixTimeSpec { tv_sec: unsafe { *sleep_time }.tv_sec, tv_nsec: unsafe { *sleep_time }.tv_nsec, diff --git a/kernel/src/time/timer.rs b/kernel/src/time/timer.rs index 917502ca..8276db2e 100644 --- a/kernel/src/time/timer.rs +++ b/kernel/src/time/timer.rs @@ -157,7 +157,7 @@ impl Timer { let expire_jiffies = inner_guard.expire_jiffies; let self_arc = inner_guard.self_ref.upgrade().unwrap(); drop(inner_guard); - let mut split_pos: usize = 0; + let mut split_pos: usize = timer_list.len(); for (pos, elt) in timer_list.iter().enumerate() { if Arc::ptr_eq(&self_arc, &elt.1) { warn!("Timer already in list"); diff --git a/kernel/src/virt/mod.rs b/kernel/src/virt/mod.rs index 937d3d51..0f2205dd 100644 --- a/kernel/src/virt/mod.rs +++ b/kernel/src/virt/mod.rs @@ -1 +1,2 @@ pub mod kvm; +pub mod vm; diff --git a/kernel/src/virt/vm/kvm_dev.rs b/kernel/src/virt/vm/kvm_dev.rs new file mode 100644 index 00000000..cab74f37 --- /dev/null +++ b/kernel/src/virt/vm/kvm_dev.rs @@ -0,0 +1,491 @@ +use core::intrinsics::unlikely; + +use alloc::sync::{Arc, Weak}; +use log::{debug, warn}; +use system_error::SystemError; + +use crate::{ + arch::{ + vm::{kvm_host::KvmCommonRegs, uapi::UapiKvmSegmentRegs}, + MMArch, + }, + driver::base::device::device_number::DeviceNumber, + filesystem::{ + devfs::{devfs_register, DevFS, DeviceINode}, + vfs::{ + core::generate_inode_id, + file::{File, FileMode}, + syscall::ModeType, + FileType, IndexNode, Metadata, + }, + }, + libs::spinlock::SpinLock, + mm::MemoryManagementArch, + process::ProcessManager, + syscall::user_access::{UserBufferReader, UserBufferWriter}, + time::PosixTimeSpec, + virt::vm::user_api::{KvmUserspaceMemoryRegion, PosixKvmUserspaceMemoryRegion}, +}; + +use super::kvm_host::{vcpu::LockedVirtCpu, LockedVm}; + +#[derive(Debug)] +pub struct KvmInode { + /// 指向自身的弱引用 + self_ref: Weak, + /// 指向inode所在的文件系统对象的指针 + fs: Weak, + /// INode 元数据 + metadata: Metadata, +} + +#[derive(Debug)] +pub struct LockedKvmInode { + inner: SpinLock, +} + +impl LockedKvmInode { + const KVM_CREATE_VM: u32 = 0xAE01; + const KVM_GET_VCPU_MMAP_SIZE: u32 = 0xAE04; + + pub fn new() -> Arc { + let inode = KvmInode { + self_ref: Weak::default(), + fs: Weak::default(), + metadata: Metadata { + dev_id: 1, + inode_id: generate_inode_id(), + size: 0, + blk_size: 0, + blocks: 0, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + file_type: FileType::KvmDevice, // 文件夹,block设备,char设备 + mode: ModeType::S_IALLUGO, + nlinks: 1, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), // 这里用来作为device number + }, + }; + + let result = Arc::new(LockedKvmInode { + inner: SpinLock::new(inode), + }); + result.inner.lock().self_ref = Arc::downgrade(&result); + + return result; + } + + fn create_vm(&self, vm_type: usize) -> Result { + let kvm = LockedVm::create(vm_type)?; + + let instance = KvmInstance::new(kvm); + + let current = ProcessManager::current_pcb(); + + 
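The raw ioctl numbers used by these KVM inodes follow Linux's `_IO`/`_IOR`/`_IOW` encoding: bits 0-7 are the command number, bits 8-15 the type byte (`KVMIO` = 0xAE), bits 16-29 the argument size, and bits 30-31 the direction (1 = userspace writes, 2 = userspace reads). Decoding two of the constants defined in this file as a cross-check:

```rust
const fn ioctl_size(cmd: u32) -> u32 {
    (cmd >> 16) & 0x3fff // argument size in bytes
}

const fn ioctl_dir(cmd: u32) -> u32 {
    cmd >> 30 // 0 = none, 1 = write, 2 = read
}

// KVM_SET_USER_MEMORY_REGION = 0x4020AE46: _IOW with a 32-byte payload
// (matching a 2x u32 + 3x u64 userspace memory-region struct).
const _: () = assert!(ioctl_dir(0x4020_AE46) == 1 && ioctl_size(0x4020_AE46) == 32);

// KVM_GET_REGS = 0x8090AE81: _IOR with a 144-byte payload
// (matching 18 u64 registers: rax..r15 plus rip and rflags).
const _: () = assert!(ioctl_dir(0x8090_AE81) == 2 && ioctl_size(0x8090_AE81) == 144);
```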
let file = File::new(instance, FileMode::O_RDWR)?;
+        let fd = current.fd_table().write().alloc_fd(file, None)?;
+        return Ok(fd as usize);
+    }
+}
+
+impl DeviceINode for LockedKvmInode {
+    fn set_fs(&self, fs: Weak<DevFS>) {
+        self.inner.lock().fs = fs;
+    }
+}
+
+impl IndexNode for LockedKvmInode {
+    fn open(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<FilePrivateData>,
+        _mode: &FileMode,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: crate::libs::spinlock::SpinLockGuard<FilePrivateData>,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::ENOSYS)
+    }
+
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &[u8],
+        _data: crate::libs::spinlock::SpinLockGuard<FilePrivateData>,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::ENOSYS)
+    }
+
+    fn fs(&self) -> Arc<dyn FileSystem> {
+        self.inner.lock().fs.upgrade().unwrap()
+    }
+
+    fn as_any_ref(&self) -> &dyn core::any::Any {
+        self
+    }
+
+    fn list(&self) -> Result<Vec<String>, system_error::SystemError> {
+        Err(SystemError::ENOSYS)
+    }
+
+    fn metadata(&self) -> Result<Metadata, SystemError> {
+        Ok(self.inner.lock().metadata.clone())
+    }
+
+    fn ioctl(
+        &self,
+        cmd: u32,
+        arg: usize,
+        _private_data: &crate::filesystem::vfs::FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        match cmd {
+            Self::KVM_CREATE_VM => {
+                let ret = self.create_vm(arg);
+                warn!("[KVM]: KVM_CREATE_VM {ret:?}");
+
+                return ret;
+            }
+
+            Self::KVM_GET_VCPU_MMAP_SIZE => {
+                if arg != 0 {
+                    return Err(SystemError::EINVAL);
+                }
+                debug!("[KVM] KVM_GET_VCPU_MMAP_SIZE");
+                return Ok(MMArch::PAGE_SIZE);
+            }
+
+            _ => {
+                // TODO: arch_ioctl
+                warn!("[KVM]: unknown ioctl cmd {cmd:x}");
+            }
+        }
+
+        Ok(0)
+    }
+
+    fn close(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<FilePrivateData>,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+}
+
+#[derive(Debug)]
+pub struct KvmInstance {
+    kvm: Arc<LockedVm>,
+    metadata: Metadata,
+}
+
+impl KvmInstance {
+    const KVM_CREATE_VCPU: u32 = 0xAE41;
+    const KVM_SET_USER_MEMORY_REGION: u32 = 0x4020AE46;
+
+    pub fn new(vm: Arc<LockedVm>) -> Arc<Self> {
+        Arc::new(Self {
+            kvm: vm,
+            metadata: Metadata {
+                dev_id: 1,
+                inode_id: generate_inode_id(),
+                size: 0,
+                blk_size: 0,
+                blocks: 0,
+                atime: PosixTimeSpec::default(),
+                mtime: PosixTimeSpec::default(),
+                ctime: PosixTimeSpec::default(),
+                file_type: FileType::KvmDevice,
+                mode: ModeType::S_IALLUGO,
+                nlinks: 1,
+                uid: 0,
+                gid: 0,
+                raw_dev: DeviceNumber::default(), // used here as the device number
+            },
+        })
+    }
+}
+
+impl IndexNode for KvmInstance {
+    fn open(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<FilePrivateData>,
+        _mode: &crate::filesystem::vfs::file::FileMode,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+
+    #[inline(never)]
+    fn ioctl(
+        &self,
+        cmd: u32,
+        arg: usize,
+        _private_data: &crate::filesystem::vfs::FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        debug!("kvm instance ioctl cmd {cmd:x}");
+        match cmd {
+            Self::KVM_CREATE_VCPU => {
+                let ret = self.kvm.lock().create_vcpu(arg);
+                debug!("[KVM] create vcpu fd {ret:?}");
+                return ret;
+            }
+
+            Self::KVM_SET_USER_MEMORY_REGION => {
+                debug!("[KVM-INSTANCE] KVM_SET_USER_MEMORY_REGION");
+                let user_reader = UserBufferReader::new(
+                    arg as *const PosixKvmUserspaceMemoryRegion,
+                    core::mem::size_of::<PosixKvmUserspaceMemoryRegion>(),
+                    true,
+                )?;
+
+                let region = user_reader.read_one_from_user::<PosixKvmUserspaceMemoryRegion>(0)?;
+
+                self.kvm
+                    .lock()
+                    .set_memory_region(KvmUserspaceMemoryRegion::from_posix(region)?)?;
+
+                return Ok(0);
+            }
+
+            _ => {
+                // arch_ioctl
+            }
+        }
+
+        todo!()
+    }
+
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: crate::libs::spinlock::SpinLockGuard<FilePrivateData>,
+    ) -> Result<usize, SystemError> {
+        todo!()
+    }
+
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len:
usize, + _buf: &[u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + todo!() + } + + fn fs(&self) -> Arc { + todo!() + } + + fn as_any_ref(&self) -> &dyn core::any::Any { + todo!() + } + + fn list(&self) -> Result, SystemError> { + todo!() + } + + fn metadata(&self) -> Result { + Ok(self.metadata.clone()) + } + + fn close( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result<(), SystemError> { + Ok(()) + } +} + +#[derive(Debug)] +pub struct KvmVcpuDev { + vcpu: Arc, + /// INode 元数据 + metadata: Metadata, +} + +impl KvmVcpuDev { + const KVM_RUN: u32 = 0xAE80; + const KVM_GET_REGS: u32 = 0x8090AE81; + const KVM_SET_REGS: u32 = 0x4090AE82; + const KVM_GET_SREGS: u32 = 0x8138AE83; + const KVM_SET_SREGS: u32 = 0x4138AE84; + + pub fn new(vcpu: Arc) -> Arc { + Arc::new(Self { + vcpu, + metadata: Metadata { + dev_id: 1, + inode_id: generate_inode_id(), + size: 0, + blk_size: 0, + blocks: 0, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + file_type: FileType::KvmDevice, // 文件夹,block设备,char设备 + mode: ModeType::S_IALLUGO, + nlinks: 1, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), // 这里用来作为device number + }, + }) + } +} + +impl IndexNode for KvmVcpuDev { + fn open( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + _mode: &FileMode, + ) -> Result<(), SystemError> { + Ok(()) + } + + fn close( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result<(), SystemError> { + Ok(()) + } + + fn ioctl( + &self, + cmd: u32, + arg: usize, + _private_data: &crate::filesystem::vfs::FilePrivateData, + ) -> Result { + match cmd { + Self::KVM_RUN => { + if arg != 0 { + return Err(SystemError::EINVAL); + } + let mut vcpu = self.vcpu.lock(); + let oldpid = vcpu.pid; + if unlikely(oldpid != Some(ProcessManager::current_pid())) { + vcpu.pid = Some(ProcessManager::current_pid()); + } + + return vcpu.run(); + } + Self::KVM_GET_REGS => { + let kvm_regs = self.vcpu.lock().get_regs(); + let mut user_writer = UserBufferWriter::new( + arg as *const KvmCommonRegs as *mut KvmCommonRegs, + core::mem::size_of::(), + true, + )?; + + user_writer.copy_one_to_user(&kvm_regs, 0)?; + return Ok(0); + } + + Self::KVM_SET_REGS => { + let user_reader = UserBufferReader::new( + arg as *const KvmCommonRegs, + core::mem::size_of::(), + true, + )?; + + let regs = user_reader.read_one_from_user::(0)?; + + self.vcpu.lock().set_regs(regs)?; + + return Ok(0); + } + + Self::KVM_GET_SREGS => { + let sregs = self.vcpu.lock().get_segment_regs(); + + let mut writer = UserBufferWriter::new( + arg as *const UapiKvmSegmentRegs as *mut UapiKvmSegmentRegs, + core::mem::size_of::(), + true, + )?; + + writer.copy_one_to_user(&sregs, 0)?; + + return Ok(0); + } + + Self::KVM_SET_SREGS => { + let user_reader = UserBufferReader::new( + arg as *const UapiKvmSegmentRegs, + core::mem::size_of::(), + true, + )?; + + let mut sreg = UapiKvmSegmentRegs::default(); + user_reader.copy_one_from_user(&mut sreg, 0)?; + + if let Ok(_res) = self.vcpu.lock().set_segment_regs(&mut sreg) { + return Ok(0); + } else { + debug!("set segment regs failed"); + return Err(SystemError::EINVAL); + } + } + + _ => { + // arch ioctl + warn!("[KVM-VCPU] unknown ioctl cmd {cmd:x}"); + } + } + + Ok(0) + } + + fn metadata(&self) -> Result { + Ok(self.metadata.clone()) + } + + fn read_at( + &self, + _offset: usize, + _len: usize, + _buf: &mut [u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + todo!() + } + + fn write_at( + &self, + _offset: usize, + _len: 
usize, + _buf: &[u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + todo!() + } + + fn fs(&self) -> Arc { + todo!() + } + + fn as_any_ref(&self) -> &dyn core::any::Any { + todo!() + } + + fn list(&self) -> Result, SystemError> { + todo!() + } +} + +pub fn kvm_init() -> Result<(), SystemError> { + let kvm_inode = LockedKvmInode::new(); + + devfs_register("kvm", kvm_inode)?; + + Ok(()) +} diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs new file mode 100644 index 00000000..304d8c4b --- /dev/null +++ b/kernel/src/virt/vm/kvm_host/mem.rs @@ -0,0 +1,714 @@ +use alloc::{ + sync::{Arc, Weak}, + vec::Vec, +}; +use bitmap::AllocBitmap; +use hashbrown::HashMap; +use log::debug; +use system_error::SystemError; + +use crate::{ + arch::{vm::mmu::kvm_mmu::PAGE_SIZE, MMArch}, + libs::{ + rbtree::RBTree, + rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard}, + spinlock::{SpinLock, SpinLockGuard}, + }, + mm::{kernel_mapper::KernelMapper, page::EntryFlags, MemoryManagementArch, VirtAddr}, + virt::{ + kvm::host_mem::PAGE_SHIFT, + vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion}, + }, +}; + +use super::{LockedVm, Vm}; + +pub const KVM_USER_MEM_SLOTS: u16 = u16::MAX; +pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3; +pub const KVM_MEM_SLOTS_NUM: u16 = KVM_USER_MEM_SLOTS - KVM_INTERNAL_MEM_SLOTS; +pub const KVM_MEM_MAX_NR_PAGES: usize = (1 << 31) - 1; +// pub const APIC_ACCESS_PAGE_PRIVATE_MEMSLOT: u16 = KVM_MEM_SLOTS_NUM + 1; + +/// 对于普通的页帧号(PFN),最高的12位应该为零, +/// 因此我们可以mask位62到位52来表示错误的PFN, +/// mask位63来表示无槽的PFN。 +// const KVM_PFN_ERR_MASK: u64 = 0x7ff << 52; //0x7FF0000000000000 +// const KVM_PFN_ERR_NOSLOT_MASK: u64 = 0xfff << 52; //0xFFF0000000000000 +// const KVM_PFN_NOSLOT: u64 = 1 << 63; //0x8000000000000000 + +// const KVM_PFN_ERR_FAULT: u64 = KVM_PFN_ERR_MASK; +// const KVM_PFN_ERR_HWPOISON: u64 = KVM_PFN_ERR_MASK + 1; +// const KVM_PFN_ERR_RO_FAULT: u64 = KVM_PFN_ERR_MASK + 2; +// const KVM_PFN_ERR_SIGPENDING: u64 = KVM_PFN_ERR_MASK + 3; + +#[derive(Debug, Default)] +#[allow(dead_code)] +pub struct KvmMmuMemoryCache { + gfp_zero: u32, + gfp_custom: u32, + capacity: usize, + nobjs: usize, + objects: Option>, +} +impl KvmMmuMemoryCache { + #[allow(dead_code)] + pub fn kvm_mmu_totup_memory_cache( + &mut self, + _capacity: usize, + _min: usize, + ) -> Result<(), SystemError> { + // let gfp = if self.gfp_custom != 0 { + // self.gfp_custom + // } else { + // todo!(); + // }; + + // if self.nobjs >= min { + // return Ok(()); + // } + + // if unlikely(self.objects.is_none()) { + // if self.capacity == 0 { + // return Err(SystemError::EIO); + // } + + // // self.objects = Some(Box::new) + // } + + Ok(()) + } +} + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default)] +pub struct AddrRange { + pub start: VirtAddr, + pub last: VirtAddr, +} + +#[derive(Debug, Default)] +pub struct KvmMemSlotSet { + /// 最后一次使用到的内存插槽 + pub last_use: Option>, + /// 存储虚拟地址(hva)和内存插槽之间的映射关系 + hva_tree: RBTree>, + /// 用于存储全局页帧号(gfn)和内存插槽之间的映射关系 + pub gfn_tree: RBTree>, + /// 将内存插槽的ID映射到对应的内存插槽。 + slots: HashMap>, + + pub node_idx: usize, + pub generation: u64, +} + +impl KvmMemSlotSet { + pub fn get_slot(&self, id: u16) -> Option> { + self.slots.get(&id).cloned() + } +} + +#[derive(Debug)] +pub struct LockedKvmMemSlot { + inner: RwLock, +} + +impl LockedKvmMemSlot { + pub fn new() -> Arc { + Arc::new(Self { + inner: RwLock::new(KvmMemSlot::default()), + }) + } + + #[inline] + pub fn read(&self) -> RwLockReadGuard { + self.inner.read() + } + + 
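KvmMemSlotSet keeps three indexes over the same slots: an id map (`slots`), an `hva_tree` for host-virtual-address lookups, and a `gfn_tree` keyed by `base_gfn` for guest-frame lookups. `search_memslots` at the bottom of this file currently walks the `gfn_tree` linearly and carries a note that it should be a binary search; a self-contained toy model (with `std::collections::BTreeMap` standing in for the kernel's RBTree) of the O(log n) query it is aiming for:

```rust
use std::collections::BTreeMap;

// Toy model of KvmMemSlotSet::gfn_tree: key = base_gfn, value = npages.
// The real tree stores Arc<LockedKvmMemSlot>, but the query shape is the
// same: find the slot with the greatest base_gfn that is <= gfn, then check
// that gfn actually falls inside its page span.
fn find_slot(gfn_tree: &BTreeMap<u64, usize>, gfn: u64) -> Option<(u64, usize)> {
    let (&base, &npages) = gfn_tree.range(..=gfn).next_back()?;
    (gfn < base + npages as u64).then_some((base, npages))
}

fn main() {
    let mut tree = BTreeMap::new();
    tree.insert(0x0u64, 16);   // guest frames 0x00..0x10
    tree.insert(0x100u64, 32); // guest frames 0x100..0x120
    assert_eq!(find_slot(&tree, 0x105), Some((0x100, 32)));
    assert_eq!(find_slot(&tree, 0x50), None); // hole between the two slots
}
```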
#[inline] + pub fn write(&self) -> RwLockWriteGuard { + self.inner.write() + } + + #[inline] + pub fn copy_from(&self, other: &Arc) { + let mut guard = self.write(); + let other = other.read(); + + guard.base_gfn = other.base_gfn; + guard.npages = other.npages; + + guard.dirty_bitmap = other.dirty_bitmap.clone(); + guard.arch = other.arch; + guard.userspace_addr = other.userspace_addr; + guard.flags = other.flags; + guard.id = other.id; + guard.as_id = other.as_id; + } +} + +#[derive(Debug, Default)] +pub struct KvmMemSlot { + /// 首个gfn + pub base_gfn: u64, + /// 页数量 + pub npages: usize, + /// 脏页位图 + dirty_bitmap: Option, + /// 架构相关 + arch: (), + userspace_addr: VirtAddr, + flags: UserMemRegionFlag, + id: u16, + as_id: u16, + + hva_node_key: [AddrRange; 2], +} +#[allow(dead_code)] +impl KvmMemSlot { + pub fn check_aligned_addr(&self, align: usize) -> bool { + self.userspace_addr.data() % align == 0 + } + pub fn get_flags(&self) -> UserMemRegionFlag { + self.flags + } + pub fn get_id(&self) -> u16 { + self.id + } + // 检查内存槽是否可见 + pub fn is_visible(&self) -> bool { + self.id < KVM_USER_MEM_SLOTS + && (self.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()) == 0 + } +} + +#[derive(Debug)] +pub struct LockedVmMemSlotSet { + inner: SpinLock, +} + +impl LockedVmMemSlotSet { + pub fn new(slots: KvmMemSlotSet) -> Arc { + Arc::new(Self { + inner: SpinLock::new(slots), + }) + } + + pub fn lock(&self) -> SpinLockGuard { + self.inner.lock() + } +} + +#[derive(Debug, Default)] +#[allow(dead_code)] +pub struct GfnToHvaCache { + generation: u64, + /// 客户机对应物理地址(Guest Physical Address) + gpa: u64, + /// 主机用户空间虚拟地址(User Host Virtual Address) + uhva: Option, + /// 主机内核空间虚拟地址(Kernel Host Virtual Address) + khva: u64, + /// 对应内存插槽 + memslot: Option>, + /// 对应物理页帧号(Page Frame Number) + pfn: Option, + /// 缓存项的使用情况 + usage: PfnCacheUsage, + /// 是否处于活动状态 + active: bool, + /// 是否有效 + valid: bool, + vm: Option>, +} + +impl GfnToHvaCache { + pub fn init(vm: Weak, usage: PfnCacheUsage) -> Self { + // check_stack_usage(); + // let mut ret: Box = unsafe { Box::new_zeroed().assume_init() }; + // ret.usage = usage; + // ret.vm = Some(vm); + // *ret + Self { + usage, + vm: Some(vm), + ..Default::default() + } + } +} + +bitflags! 
{
+    #[derive(Default)]
+    pub struct PfnCacheUsage: u8 {
+        const GUEST_USES_PFN = 1 << 0;
+        const HOST_USES_PFN = 1 << 1;
+        const GUEST_AND_HOST_USES_PFN = Self::GUEST_USES_PFN.bits | Self::HOST_USES_PFN.bits;
+    }
+
+    pub struct UserMemRegionFlag: u32 {
+        /// Enables dirty-page logging for this region
+        const LOG_DIRTY_PAGES = 1 << 0;
+        /// Makes the region read-only
+        const READONLY = 1 << 1;
+        /// Marks the slot as invalid (internal use)
+        const KVM_MEMSLOT_INVALID = 1 << 16;
+    }
+}
+
+impl Default for UserMemRegionFlag {
+    fn default() -> Self {
+        Self::empty()
+    }
+}
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy)]
+pub enum KvmMemoryChangeMode {
+    Create,
+    Delete,
+    Move,
+    FlagsOnly,
+}
+
+impl Vm {
+    #[inline(never)]
+    pub fn set_memory_region(&mut self, mem: KvmUserspaceMemoryRegion) -> Result<(), SystemError> {
+        if mem.slot >= u16::MAX as u32 {
+            return Err(SystemError::EINVAL);
+        }
+
+        let as_id = mem.slot >> 16;
+        let id = mem.slot as u16;
+
+        // Check that the size is page-aligned (mask against PAGE_SIZE - 1),
+        // plus a 32-bit truncation check (of little use for now)
+        if (mem.memory_size as usize & (MMArch::PAGE_SIZE - 1) != 0)
+            || mem.memory_size != mem.memory_size as usize as u64
+        {
+            return Err(SystemError::EINVAL);
+        }
+
+        if !mem.guest_phys_addr.check_aligned(MMArch::PAGE_SIZE) {
+            return Err(SystemError::EINVAL);
+        }
+
+        if !mem.userspace_addr.check_aligned(MMArch::PAGE_SIZE) {
+            // TODO: should also verify that the whole range
+            // userspace_addr..userspace_addr + memory_size is valid
+            return Err(SystemError::EINVAL);
+        }
+
+        if as_id >= KVM_ADDRESS_SPACE_NUM as u32 || id >= KVM_MEM_SLOTS_NUM {
+            return Err(SystemError::EINVAL);
+        }
+
+        if (mem.memory_size >> MMArch::PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES as u64 {
+            return Err(SystemError::EINVAL);
+        }
+
+        let slots = self.memslot_set(as_id as usize).clone();
+
+        let slots_guard = slots.lock();
+        let old = slots_guard.get_slot(id);
+        if mem.memory_size == 0 {
+            if let Some(old) = &old {
+                let old_npages = old.read().npages;
+                if old_npages == 0 {
+                    return Err(SystemError::EINVAL);
+                }
+
+                if self.nr_memslot_pages < old_npages {
+                    return Err(SystemError::EIO);
+                }
+                drop(slots_guard);
+                return self.set_memslot(Some(old), None, KvmMemoryChangeMode::Delete);
+            } else {
+                return Err(SystemError::EINVAL);
+            }
+        }
+
+        let base_gfn = (mem.guest_phys_addr.data() >> MMArch::PAGE_SHIFT) as u64;
+        let npages = mem.memory_size >> MMArch::PAGE_SHIFT;
+
+        let change;
+        if let Some(old) = &old {
+            let old_guard = old.read();
+            if old_guard.npages == 0 {
+                change = KvmMemoryChangeMode::Create;
+                // guard against overflow
+                if let Some(new_pages) = self.nr_memslot_pages.checked_add(npages as usize) {
+                    if new_pages < self.nr_memslot_pages {
+                        return Err(SystemError::EINVAL);
+                    }
+                } else {
+                    return Err(SystemError::EINVAL);
+                }
+            } else {
+                if mem.userspace_addr != old_guard.userspace_addr
+                    || npages != old_guard.npages as u64
+                    || (mem.flags ^ old_guard.flags).contains(UserMemRegionFlag::READONLY)
+                {
+                    return Err(SystemError::EINVAL);
+                }
+
+                if base_gfn != old_guard.base_gfn {
+                    change = KvmMemoryChangeMode::Move;
+                } else if mem.flags != old_guard.flags {
+                    change = KvmMemoryChangeMode::FlagsOnly;
+                } else {
+                    return Ok(());
+                }
+            }
+        } else {
+            change = KvmMemoryChangeMode::Create;
+            // guard against overflow
+            if let Some(new_pages) = self.nr_memslot_pages.checked_add(npages as usize) {
+                if new_pages < self.nr_memslot_pages {
+                    return Err(SystemError::EINVAL);
+                }
+            } else {
+                return Err(SystemError::EINVAL);
+            }
+        };
+
+        if (change == KvmMemoryChangeMode::Create || change == KvmMemoryChangeMode::Move)
+            && slots_guard.gfn_tree.contains_key(&base_gfn)
+        {
+            return Err(SystemError::EEXIST);
+        }
+
+        let new = LockedKvmMemSlot::new();
+        let mut new_guard = new.write();
+
+        new_guard.as_id = as_id
as u16; + new_guard.id = id; + new_guard.base_gfn = base_gfn; + new_guard.npages = npages as usize; + new_guard.flags = mem.flags; + new_guard.userspace_addr = mem.userspace_addr; + + drop(new_guard); + drop(slots_guard); + return self.set_memslot(old.as_ref(), Some(&new), change); + } + + #[allow(clippy::modulo_one)] + #[inline] + /// 获取活动内存插槽 + fn memslot_set(&self, id: usize) -> &Arc { + // 避免越界 + let id = id % KVM_ADDRESS_SPACE_NUM; + &self.memslots[id] + } + + #[inline(never)] + fn set_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + change: KvmMemoryChangeMode, + ) -> Result<(), SystemError> { + let invalid_slot = LockedKvmMemSlot::new(); + if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move { + self.invalidate_memslot(old.unwrap(), &invalid_slot) + } + + match self.prepare_memory_region(old, new, change) { + Ok(_) => {} + Err(e) => { + if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move { + self.active_memslot(Some(&invalid_slot), old) + } + return Err(e); + } + } + + match change { + KvmMemoryChangeMode::Create => self.create_memslot(new), + KvmMemoryChangeMode::Delete => self.delete_memslot(old, &invalid_slot), + KvmMemoryChangeMode::Move => self.move_memslot(old, new, &invalid_slot), + KvmMemoryChangeMode::FlagsOnly => self.update_flags_memslot(old, new), + } + + // TODO:kvm_commit_memory_region(kvm, old, new, change); + Ok(()) + } + + fn create_memslot(&mut self, new: Option<&Arc>) { + self.replace_memslot(None, new); + self.active_memslot(None, new); + } + + fn delete_memslot( + &mut self, + old: Option<&Arc>, + invalid_slot: &Arc, + ) { + self.replace_memslot(old, None); + self.active_memslot(Some(invalid_slot), None); + } + + fn move_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + invalid_slot: &Arc, + ) { + self.replace_memslot(old, new); + self.active_memslot(Some(invalid_slot), new); + } + + fn update_flags_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + ) { + self.replace_memslot(old, new); + self.active_memslot(old, new); + } + + fn prepare_memory_region( + &self, + old: Option<&Arc>, + new: Option<&Arc>, + change: KvmMemoryChangeMode, + ) -> Result<(), SystemError> { + if change != KvmMemoryChangeMode::Delete { + let new = new.unwrap(); + let mut new_guard = new.write(); + if !new_guard.flags.contains(UserMemRegionFlag::LOG_DIRTY_PAGES) { + new_guard.dirty_bitmap = None; + } else if old.is_some() { + let old_guard = old.unwrap().read(); + if old_guard.dirty_bitmap.is_some() { + new_guard.dirty_bitmap = old_guard.dirty_bitmap.clone(); + } else { + new_guard.dirty_bitmap = Some(AllocBitmap::new(new_guard.npages * 2)); + } + } + } + + return self.arch_prepare_memory_region(old, new, change); + } + + fn invalidate_memslot( + &mut self, + old: &Arc, + invalid_slot: &Arc, + ) { + invalid_slot.copy_from(old); + + let mut old_guard = old.write(); + let mut invalid_slot_guard = invalid_slot.write(); + invalid_slot_guard + .flags + .insert(UserMemRegionFlag::KVM_MEMSLOT_INVALID); + + self.swap_active_memslots(old_guard.as_id as usize); + + old_guard.arch = invalid_slot_guard.arch; + } + + #[inline(never)] + fn active_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + ) { + let as_id = if let Some(slot) = old.or(new) { + slot.read().as_id + } else { + 0 + }; + + self.swap_active_memslots(as_id as usize); + + self.replace_memslot(old, new); + } + + #[inline(never)] + fn replace_memslot( + &self, + old: Option<&Arc>, + new: Option<&Arc>, + ) { + let as_id = if 
let Some(slot) = old.or(new) { + slot.read().as_id + } else { + 0 + }; + + let slot_set = self.get_inactive_memslot_set(as_id as usize); + + let mut slots_guard = slot_set.lock(); + let idx = slots_guard.node_idx; + + if let Some(old) = old { + slots_guard.hva_tree.remove(&old.read().hva_node_key[idx]); + + if let Some(last) = &slots_guard.last_use { + if Arc::ptr_eq(last, old) { + slots_guard.last_use = new.cloned(); + } + } + + if new.is_none() { + slots_guard.gfn_tree.remove(&old.read().base_gfn); + return; + } + } + + let new = new.unwrap(); + let mut new_guard = new.write(); + new_guard.hva_node_key[idx].start = new_guard.userspace_addr; + new_guard.hva_node_key[idx].last = + new_guard.userspace_addr + VirtAddr::new((new_guard.npages << MMArch::PAGE_SHIFT) - 1); + + slots_guard + .hva_tree + .insert(new_guard.hva_node_key[idx], new.clone()); + + if let Some(old) = old { + slots_guard.gfn_tree.remove(&old.read().base_gfn); + } + + slots_guard.gfn_tree.insert(new_guard.base_gfn, new.clone()); + } + + fn get_inactive_memslot_set(&self, as_id: usize) -> Arc { + let active = self.memslot_set(as_id); + + let inactive_idx = active.lock().node_idx ^ 1; + return self.memslots_set[as_id][inactive_idx].clone(); + } + + fn swap_active_memslots(&mut self, as_id: usize) { + self.memslots[as_id] = self.get_inactive_memslot_set(as_id); + } +} +/// 将给定的客户机帧号(GFN)转换为用户空间虚拟地址(HVA),并根据内存槽的状态和标志进行相应的检查。 +/// +/// # 参数 +/// - `slot`: 可选的 `KvmMemSlot`,表示内存槽。 +/// - `gfn`: 客户机帧号(GFN),表示要转换的帧号。 +/// - `nr_pages`: 可选的可变引用,用于存储计算出的页数。 +/// - `write`: 布尔值,表示是否为写操作。 +/// +/// # 返回 +/// 如果成功,返回转换后的用户空间虚拟地址(HVA);如果失败,返回相应的错误。 +/// +/// # 错误 +/// 如果内存槽为空或无效,或者尝试对只读内存槽进行写操作,则返回 `SystemError::KVM_HVA_ERR_BAD`。 +pub fn __gfn_to_hva_many( + slot: &Option<&KvmMemSlot>, + gfn: u64, + nr_pages: Option<&mut u64>, + write: bool, +) -> Result { + debug!("__gfn_to_hva_many"); + + // 检查内存槽是否为空 + if slot.is_none() { + return Err(SystemError::KVM_HVA_ERR_BAD); + } + let slot = slot.as_ref().unwrap(); + + // 检查内存槽是否无效或尝试对只读内存槽进行写操作 + if slot.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 + || (slot.flags.bits() & UserMemRegionFlag::READONLY.bits() != 0) && write + { + return Err(SystemError::KVM_HVA_ERR_BAD); + } + + // 如果 `nr_pages` 不为空,计算并更新页数 + if let Some(nr_pages) = nr_pages { + *nr_pages = slot.npages as u64 - (gfn - slot.base_gfn); + } + + // 调用辅助函数将 GFN 转换为 HVA + return Ok(__gfn_to_hva_memslot(slot, gfn)); +} + +/// 将给定的全局帧号(GFN)转换为用户空间虚拟地址(HVA)。 +/// +/// # 参数 +/// - `slot`: `KvmMemSlot`,表示内存槽。 +/// - `gfn`: 全局帧号(GFN),表示要转换的帧号。 +/// +/// # 返回 +/// 转换后的用户空间虚拟地址(HVA)。 +fn __gfn_to_hva_memslot(slot: &KvmMemSlot, gfn: u64) -> u64 { + return slot.userspace_addr.data() as u64 + (gfn - slot.base_gfn) * PAGE_SIZE; +} +/// 将给定的全局帧号(GFN)转换为页帧号(PFN),并根据内存槽的状态和标志进行相应的检查。 +/// +/// # 参数 +/// - `slot`: 内存槽的引用。 +/// - `gfn`: 全局帧号(GFN),表示要转换的帧号。 +/// - `atomic`: 布尔值,表示是否为原子操作。 +/// - `interruptible`: 布尔值,表示操作是否可中断。 +/// - `async`: 可变引用,表示操作是否为异步。 +/// - `write_fault`: 布尔值,表示是否为写操作。 +/// - `writable`: 可变引用,表示是否可写。 +/// - `hva`: 可变引用,表示用户空间虚拟地址(HVA)。 +/// +/// # 返回 +/// 如果成功,返回转换后的页帧号(PFN);如果失败,返回相应的错误。 +pub fn __gfn_to_pfn_memslot( + slot: Option<&KvmMemSlot>, + gfn: u64, + atomic_or_async: (bool, &mut bool), + interruptible: bool, + write: bool, + writable: &mut bool, + hva: &mut u64, +) -> Result { + let addr = __gfn_to_hva_many(&slot, gfn, None, write)?; + *hva = addr; + + //todo:检查地址是否为错误 + + // 如果内存槽为只读,且 writable 不为空,则更新 writable 的值 + if slot.unwrap().flags.bits() & UserMemRegionFlag::READONLY.bits() != 0 { + 
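The arithmetic behind `__gfn_to_hva_memslot` and `hva_to_pfn` reduces to two lines; a self-contained restatement with concrete numbers (PAGE_SHIFT = 12 for the 4 KiB pages used here):

```rust
// Model of the translation chain in this file:
//   gfn --(memslot)--> hva --(page tables)--> pfn
const PAGE_SHIFT: u64 = 12;
const PAGE_SIZE: u64 = 1 << PAGE_SHIFT;

// __gfn_to_hva_memslot: the gfn's offset within the slot, scaled to bytes.
fn gfn_to_hva(userspace_addr: u64, base_gfn: u64, gfn: u64) -> u64 {
    userspace_addr + (gfn - base_gfn) * PAGE_SIZE
}

fn main() {
    // A slot mapping guest frames 0x100.. onto host VA 0x7f00_0000_0000:
    let hva = gfn_to_hva(0x7f00_0000_0000, 0x100, 0x105);
    assert_eq!(hva, 0x7f00_0000_0000 + 5 * PAGE_SIZE);

    // hva_to_pfn then asks the kernel mapper for the physical address and
    // shifts it down: e.g. hpa 0x1234_5000 yields pfn 0x12345.
    let hpa: u64 = 0x1234_5000;
    assert_eq!(hpa >> PAGE_SHIFT, 0x12345);
}
```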
*writable = false; + } + + let pfn = hva_to_pfn(addr, atomic_or_async, interruptible, write, writable)?; + return Ok(pfn); +} +/// 将用户空间虚拟地址(HVA)转换为页帧号(PFN)。 +/// +/// # 参数 +/// - `addr`: 用户空间虚拟地址(HVA)。 +/// - `atomic`: 布尔值,表示是否为原子操作。 +/// - `interruptible`: 布尔值,表示操作是否可中断。 +/// - `is_async`: 可变引用,表示操作是否为异步。 +/// - `write_fault`: 布尔值,表示是否为写操作。 +/// - `writable`: 可变引用,表示是否可写。 +/// +/// # 返回 +/// 如果成功,返回转换后的页帧号(PFN);如果失败,返回相应的错误。 +// 正确性待验证 +pub fn hva_to_pfn( + addr: u64, + atomic_or_async: (bool, &mut bool), + _interruptible: bool, + _write_fault: bool, + _writable: &mut bool, +) -> Result { + // 我们可以原子地或异步地执行,但不能同时执行 + assert!( + !(atomic_or_async.0 && *atomic_or_async.1), + "Cannot be both atomic and async" + ); + + debug!("hva_to_pfn"); + // let hpa = MMArch::virt_2_phys(VirtAddr::new(addr)).unwrap().data() as u64; + let hva = VirtAddr::new(addr as usize); + let mut mapper = KernelMapper::lock(); + let mapper = mapper.as_mut().unwrap(); + if let Some((hpa, _)) = mapper.translate(hva) { + return Ok(hpa.data() as u64 >> PAGE_SHIFT); + } + debug!("hva_to_pfn NOT FOUND,try map a new pfn"); + unsafe { + mapper.map(hva, EntryFlags::mmio_flags()); + } + let (hpa, _) = mapper.translate(hva).unwrap(); + return Ok(hpa.data() as u64 >> PAGE_SHIFT); +} diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs new file mode 100644 index 00000000..0493f7c9 --- /dev/null +++ b/kernel/src/virt/vm/kvm_host/mod.rs @@ -0,0 +1,268 @@ +use core::{ + fmt::Debug, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use alloc::{ + boxed::Box, + sync::{Arc, Weak}, + vec::Vec, +}; +use hashbrown::HashMap; +use log::debug; +use mem::LockedKvmMemSlot; +use system_error::SystemError; + +use crate::{ + arch::{ + vm::{kvm_host::vcpu::VirtCpuRequest, vmx::KvmVmx, x86_kvm_manager}, + CurrentKvmManager, KvmArch, VirtCpuArch, + }, + filesystem::vfs::file::{File, FileMode}, + libs::spinlock::{SpinLock, SpinLockGuard}, + mm::ucontext::AddressSpace, + process::ProcessManager, + smp::cpu::ProcessorId, + virt::vm::{ + kvm_dev::KvmVcpuDev, + kvm_host::vcpu::{LockedVirtCpu, VirtCpu}, + }, +}; + +use self::{ + mem::{GfnToHvaCache, KvmMemSlotSet, LockedVmMemSlotSet, PfnCacheUsage}, + vcpu::{GuestDebug, VcpuMode}, +}; + +pub mod mem; +pub mod vcpu; + +const KVM_ADDRESS_SPACE_NUM: usize = 1; +pub const KVM_USERSAPCE_IRQ_SOURCE_ID: usize = 0; +pub const KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID: usize = 1; + +#[derive(Debug)] +pub struct LockedVm { + inner: SpinLock, +} + +static KVM_USAGE_COUNT: AtomicUsize = AtomicUsize::new(0); + +impl LockedVm { + pub fn lock(&self) -> SpinLockGuard { + self.inner.lock() + } + + pub fn create(vm_type: usize) -> Result, SystemError> { + let mut memslots_set = vec![]; + let mut memslots = vec![]; + for i in 0..KVM_ADDRESS_SPACE_NUM { + let mut tmp = vec![]; + for j in 0..2 { + let mut slots = KvmMemSlotSet::default(); + slots.last_use = None; + slots.node_idx = j; + slots.generation = i as u64; + tmp.push(LockedVmMemSlotSet::new(slots)); + } + memslots_set.push(tmp); + memslots.push(memslots_set[i][0].clone()); + } + + let kvm = Vm { + mm: ProcessManager::current_pcb() + .basic() + .user_vm() + .unwrap() + .write() + .try_clone()?, + max_vcpus: CurrentKvmManager::KVM_MAX_VCPUS, + memslots_set, + memslots, + arch: KvmArch::init(vm_type)?, + created_vcpus: 0, + lock_vm_ref: Weak::new(), + nr_memslot_pages: 0, + online_vcpus: 0, + dirty_ring_size: 0, + dirty_ring_with_bitmap: false, + vcpus: HashMap::new(), + #[cfg(target_arch = "x86_64")] + kvm_vmx: KvmVmx::default(), + 
nr_memslots_dirty_logging: 0, + mmu_invalidate_seq: 0, + }; + + let ret = Arc::new(Self { + inner: SpinLock::new(kvm), + }); + + Self::hardware_enable_all()?; + + ret.lock().lock_vm_ref = Arc::downgrade(&ret); + return Ok(ret); + } + + fn hardware_enable_all() -> Result<(), SystemError> { + KVM_USAGE_COUNT.fetch_add(1, Ordering::SeqCst); + + // 如果是第一个启动的,则需要对所有cpu都初始化硬件 + if KVM_USAGE_COUNT.load(Ordering::SeqCst) == 1 { + // FIXME!!!! + // 这里是要对每个cpu都进行初始化,目前这里只对当前cpu调用了初始化流程 + x86_kvm_manager().arch_hardware_enable()?; + } + + Ok(()) + } +} + +#[derive(Debug)] +#[allow(dead_code)] +pub struct Vm { + lock_vm_ref: Weak, + mm: Arc, + max_vcpus: usize, + created_vcpus: usize, + online_vcpus: usize, + /// vcpu集合 + vcpus: HashMap>, + // name: String, + /// 对应活动和非活动内存槽,实际为:[[Arc; 2]; KVM_ADDRESS_SPACE_NUM],这里暂时写Vec + memslots_set: Vec>>, + /// 当前活动内存槽,实际为:[Arc; KVM_ADDRESS_SPACE_NUM],这里暂时写Vec + pub memslots: Vec>, + /// 内存槽对应的页数 + nr_memslot_pages: usize, + + pub arch: KvmArch, + + pub dirty_ring_size: u32, + pub nr_memslots_dirty_logging: u32, + dirty_ring_with_bitmap: bool, + + #[cfg(target_arch = "x86_64")] + pub kvm_vmx: KvmVmx, + + pub mmu_invalidate_seq: u64, //用于表示内存管理单元(MMU)无效化序列号 +} + +impl Vm { + #[inline(never)] + pub fn create_vcpu(&mut self, id: usize) -> Result { + if id >= self.max_vcpus { + return Err(SystemError::EINVAL); + } + + if self.created_vcpus >= self.max_vcpus { + return Err(SystemError::EINVAL); + } + + self.created_vcpus += 1; + + let vcpu = self._create_vcpu(id)?; + if self.dirty_ring_size != 0 { + todo!() + } + + vcpu.lock().vcpu_id = self.online_vcpus; + + self.vcpus.insert(self.online_vcpus, vcpu.clone()); + + self.online_vcpus += 1; + + let vcpu_inode = KvmVcpuDev::new(vcpu); + + let file = File::new(vcpu_inode, FileMode::from_bits_truncate(0x777))?; + + let fd = ProcessManager::current_pcb() + .fd_table() + .write() + .alloc_fd(file, None)?; + + Ok(fd as usize) + } + + /// ### 创建一个vcpu,并且初始化部分数据 + #[inline(never)] + pub fn _create_vcpu(&mut self, id: usize) -> Result, SystemError> { + let mut vcpu = self.new_vcpu(id); + + vcpu.init_arch(self, id)?; + + Ok(Arc::new(LockedVirtCpu::new(vcpu))) + } + + #[inline(never)] + pub fn new_vcpu(&self, id: usize) -> VirtCpu { + return VirtCpu { + cpu: ProcessorId::INVALID, + kvm: Some(self.lock_vm_ref.clone()), + vcpu_id: id, + pid: None, + preempted: false, + ready: false, + last_used_slot: None, + stats_id: format!("kvm-{}/vcpu-{}", ProcessManager::current_pid().data(), id), + pv_time: GfnToHvaCache::init(self.lock_vm_ref.clone(), PfnCacheUsage::HOST_USES_PFN), + arch: VirtCpuArch::new(), + private: None, + request: VirtCpuRequest::empty(), + guest_debug: GuestDebug::empty(), + run: unsafe { Some(Box::new_zeroed().assume_init()) }, + vcpu_idx: 0, + mode: VcpuMode::OutsideGuestMode, + stat: Default::default(), + }; + } + + #[cfg(target_arch = "x86_64")] + pub fn kvm_vmx_mut(&mut self) -> &mut KvmVmx { + &mut self.kvm_vmx + } + + #[cfg(target_arch = "x86_64")] + pub fn kvm_vmx(&self) -> &KvmVmx { + &self.kvm_vmx + } +} + +/// ## 多处理器状态(有些状态在某些架构并不合法) +#[derive(Debug, Clone, Copy, PartialEq)] +#[allow(dead_code)] +pub enum MutilProcessorState { + Runnable, + Uninitialized, + InitReceived, + Halted, + SipiReceived, + Stopped, + CheckStop, + Operating, + Load, + ApResetHold, + Suspended, +} +///返回包含 gfn 的 memslot 的指针。如果没有找到,则返回 NULL。 +///当 "approx" 设置为 true 时,即使地址落在空洞中,也会返回 memslot。 +///在这种情况下,将返回空洞边界的其中一个 memslot。 +/// 先简陋完成,原本是二分,现在先遍历 +pub fn search_memslots( + slot_set: Arc, + gfn: u64, /*_approx:bool*/ +) -> Option> { + 
let slots = slot_set.lock(); + let node = &slots.gfn_tree; + //let(start,end)=(0,node.len()-1); + for (_gfn_num, slot) in node.iter() { + let slot_guard = slot.read(); + debug!( + "gfn:{gfn},slot base_gfn: {},slot npages: {}", + slot_guard.base_gfn, slot_guard.npages + ); + if gfn >= slot_guard.base_gfn && gfn < slot_guard.base_gfn + slot_guard.npages as u64 { + return Some(slot.clone()); + } + } + return None; +} diff --git a/kernel/src/virt/vm/kvm_host/vcpu.rs b/kernel/src/virt/vm/kvm_host/vcpu.rs new file mode 100644 index 00000000..ab0c075a --- /dev/null +++ b/kernel/src/virt/vm/kvm_host/vcpu.rs @@ -0,0 +1,117 @@ +use alloc::{ + boxed::Box, + string::String, + sync::{Arc, Weak}, +}; + +use crate::{ + arch::{ + vm::{ + kvm_host::{vcpu::VirtCpuRequest, KvmReg}, + vmx::VmxVCpuPriv, + }, + VirtCpuArch, VirtCpuStat, + }, + libs::spinlock::{SpinLock, SpinLockGuard}, + process::Pid, + smp::cpu::ProcessorId, + virt::vm::user_api::UapiKvmRun, +}; + +use super::{ + mem::{GfnToHvaCache, KvmMemSlot}, + LockedVm, +}; + +#[derive(Debug)] +pub struct LockedVirtCpu { + inner: SpinLock, +} + +impl LockedVirtCpu { + pub fn new(vcpu: VirtCpu) -> Self { + Self { + inner: SpinLock::new(vcpu), + } + } + + pub fn lock(&self) -> SpinLockGuard { + self.inner.lock() + } +} + +#[derive(Debug, PartialEq)] +#[allow(dead_code)] +pub enum VcpuMode { + OutsideGuestMode, + InGuestMode, + ExitingGuestMode, + ReadingShadowPageTables, +} + +#[derive(Debug)] +pub struct VirtCpu { + pub cpu: ProcessorId, + pub kvm: Option>, + /// 从用户层获取 + pub vcpu_id: usize, + /// id alloctor获取 + pub vcpu_idx: usize, + pub pid: Option, + pub preempted: bool, + pub ready: bool, + pub last_used_slot: Option>, + pub stats_id: String, + pub pv_time: GfnToHvaCache, + pub arch: VirtCpuArch, + pub stat: VirtCpuStat, + + pub mode: VcpuMode, + + pub guest_debug: GuestDebug, + + #[cfg(target_arch = "x86_64")] + pub private: Option, + + /// 记录请求 + pub request: VirtCpuRequest, + pub run: Option>, +} + +impl VirtCpu { + #[inline] + pub fn kvm(&self) -> Arc { + self.kvm.as_ref().unwrap().upgrade().unwrap() + } + + #[cfg(target_arch = "x86_64")] + pub fn vmx(&self) -> &VmxVCpuPriv { + self.private.as_ref().unwrap() + } + + #[cfg(target_arch = "x86_64")] + pub fn vmx_mut(&mut self) -> &mut VmxVCpuPriv { + self.private.as_mut().unwrap() + } + //https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h?fi=vmx_get_exit_qual#677 + #[inline] + pub fn get_exit_qual(&mut self) -> u64 { + if !self + .arch + .test_and_mark_available(KvmReg::VcpuExregExitInfo1) + { + self.vmx_mut().vmread_exit_qual(); + } + let vmx = self.vmx(); + vmx.get_exit_qual() + //vmx. + } +} + +bitflags! 
{ + pub struct GuestDebug: usize { + const ENABLE = 0x00000001; + const SINGLESTEP = 0x00000002; + const USE_SW_BP = 0x00010000; + } +} diff --git a/kernel/src/virt/vm/mod.rs b/kernel/src/virt/vm/mod.rs new file mode 100644 index 00000000..048b943e --- /dev/null +++ b/kernel/src/virt/vm/mod.rs @@ -0,0 +1,3 @@ +pub mod kvm_dev; +pub mod kvm_host; +pub mod user_api; diff --git a/kernel/src/virt/vm/user_api.rs b/kernel/src/virt/vm/user_api.rs new file mode 100644 index 00000000..e7d078c2 --- /dev/null +++ b/kernel/src/virt/vm/user_api.rs @@ -0,0 +1,466 @@ +/// +/// 该文件定义了暴露给用户空间的结构体 +/// +use core::fmt::Debug; + +use system_error::SystemError; + +use crate::mm::{PhysAddr, VirtAddr}; + +use super::kvm_host::mem::UserMemRegionFlag; + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmSegment { + pub base: u64, + pub limit: u32, + pub selector: u16, + pub type_: u8, + pub present: u8, + pub dpl: u8, + pub db: u8, + pub s: u8, + pub l: u8, + pub g: u8, + pub avl: u8, + pub unusable: u8, + pub padding: u8, +} + +impl UapiKvmSegment { + pub fn vmx_segment_access_rights(&self) -> u32 { + let mut ar = self.type_ as u32 & 15; + ar |= (self.s as u32 & 1) << 4; + ar |= (self.dpl as u32 & 3) << 5; + ar |= (self.present as u32 & 1) << 7; + ar |= (self.avl as u32 & 1) << 12; + ar |= (self.l as u32 & 1) << 13; + ar |= (self.db as u32 & 1) << 14; + ar |= (self.g as u32 & 1) << 15; + + let b = self.unusable != 0 || self.present == 0; + ar |= (b as u32) << 16; + + return ar; + } +} + +/// 通过这个结构可以将虚拟机的物理地址对应到用户进程的虚拟地址 +/// 用来表示虚拟机的一段物理内存 +#[repr(C)] +#[derive(Default)] +pub struct PosixKvmUserspaceMemoryRegion { + /// 在哪个slot上注册内存区间 + pub slot: u32, + /// flags有两个取值,KVM_MEM_LOG_DIRTY_PAGES和KVM_MEM_READONLY,用来指示kvm针对这段内存应该做的事情。 + /// KVM_MEM_LOG_DIRTY_PAGES用来开启内存脏页,KVM_MEM_READONLY用来开启内存只读。 + pub flags: u32, + /// 虚机内存区间起始物理地址 + pub guest_phys_addr: u64, + /// 虚机内存区间大小 + pub memory_size: u64, + /// 虚机内存区间对应的主机虚拟地址 + pub userspace_addr: u64, +} + +/// PosixKvmUserspaceMemoryRegion对应内核表示 +pub struct KvmUserspaceMemoryRegion { + /// 在哪个slot上注册内存区间 + pub slot: u32, + /// 用来指示kvm针对这段内存应该做的事情。 + /// KVM_MEM_LOG_DIRTY_PAGES用来开启内存脏页,KVM_MEM_READONLY用来开启内存只读。 + pub flags: UserMemRegionFlag, + /// 虚机内存区间起始物理地址 + pub guest_phys_addr: PhysAddr, + /// 虚机内存区间大小 + pub memory_size: u64, + /// 虚机内存区间对应的主机虚拟地址 + pub userspace_addr: VirtAddr, +} + +impl KvmUserspaceMemoryRegion { + pub fn from_posix(posix: &PosixKvmUserspaceMemoryRegion) -> Result { + let flags = UserMemRegionFlag::from_bits(posix.flags).ok_or(SystemError::EINVAL)?; + Ok(Self { + slot: posix.slot, + flags, + guest_phys_addr: PhysAddr::new(posix.guest_phys_addr as usize), + memory_size: posix.memory_size, + userspace_addr: VirtAddr::new(posix.userspace_addr as usize), + }) + } +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct UapiKvmRun { + pub request_interrupt_window: u8, + pub immediate_exit: u8, + pub padding1: [u8; 6usize], + pub exit_reason: u32, + pub ready_for_interrupt_injection: u8, + pub if_flag: u8, + pub flags: u16, + pub cr8: u64, + pub apic_base: u64, + pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1, + pub kvm_valid_regs: u64, + pub kvm_dirty_regs: u64, + pub s: uapi_kvm_run__bindgen_ty_2, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_2 { + pub regs: UapiKvmSyncRegs, + pub padding: [u8; 2048usize], +} + +impl Debug for uapi_kvm_run__bindgen_ty_2 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("uapi_kvm_run__bindgen_ty_2").finish() + 
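`UapiKvmSegment::vmx_segment_access_rights` above packs a segment descriptor into the VMX access-rights (AR) format for guest segment registers: bits 0-3 type, bit 4 S, bits 5-6 DPL, bit 7 P, bit 12 AVL, bit 13 L, bit 14 D/B, bit 15 G, bit 16 unusable. A self-contained recomputation for the standard flat 64-bit ring-0 code segment (note the real helper also forces bit 16 whenever `present == 0`):

```rust
// Same packing as vmx_segment_access_rights, written out for checking.
fn access_rights(type_: u32, s: u32, dpl: u32, present: u32,
                 avl: u32, l: u32, db: u32, g: u32, unusable: u32) -> u32 {
    (type_ & 15)
        | ((s & 1) << 4)
        | ((dpl & 3) << 5)
        | ((present & 1) << 7)
        | ((avl & 1) << 12)
        | ((l & 1) << 13)
        | ((db & 1) << 14)
        | ((g & 1) << 15)
        | (unusable << 16)
}

fn main() {
    // Flat 64-bit code segment: type = 0xB (execute/read, accessed), s = 1,
    // dpl = 0, present = 1, l = 1, db = 0, g = 1.
    // 0x000B | 0x0010 | 0x0080 | 0x2000 | 0x8000 = 0xA09B, the canonical AR
    // value for a 64-bit guest CS.
    assert_eq!(access_rights(0xB, 1, 0, 1, 0, 1, 0, 1, 0), 0xA09B);
}
```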
} +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmSyncRegs { + pub device_irq_level: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy1 { + pub hardware_exit_reason: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy2 { + pub hardware_entry_failure_reason: u64, + pub cpu: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy3 { + pub exception: u32, + pub error_code: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy4 { + pub direction: u8, + pub size: u8, + pub port: u16, + pub count: u32, + pub data_offset: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmDebugExitArch { + pub hsr: u32, + pub hsr_high: u32, + pub far: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy5 { + pub arch: UapiKvmDebugExitArch, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy6 { + pub phys_addr: u64, + pub data: [u8; 8usize], + pub len: u32, + pub is_write: u8, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy7 { + pub nr: u64, + pub args: [u64; 6usize], + pub ret: u64, + pub longmode: u32, + pub pad: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy8 { + pub rip: u64, + pub is_write: u32, + pub pad: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy9 { + pub icptcode: u8, + pub ipa: u16, + pub ipb: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy10 { + pub trans_exc_code: u64, + pub pgm_code: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy11 { + pub dcrn: u32, + pub data: u32, + pub is_write: u8, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy12 { + pub suberror: u32, + pub ndata: u32, + pub data: [u64; 16usize], +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmRunBindgenTy1BindgenTy13 { + pub suberror: u32, + pub ndata: u32, + pub flags: u64, + pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1 { + pub __bindgen_anon_1: UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1 { + pub insn_size: u8, + pub insn_bytes: [u8; 15usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy14 { + pub gprs: [u64; 32usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy15 { + pub nr: u64, + pub ret: u64, + pub args: [u64; 9usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy16 { + pub subchannel_id: u16, + pub subchannel_nr: u16, + pub io_int_parm: u32, + pub io_int_word: u32, + pub ipb: u32, + pub dequeued: u8, +} + +#[repr(C)] 
+#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy17 { + pub epr: u32, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmRunBindgenTy1BindgenTy18 { + pub type_: u32, + pub ndata: u32, + pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1 { + pub flags: u64, + pub data: [u64; 16usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy19 { + pub addr: u64, + pub ar: u8, + pub reserved: u8, + pub fc: u8, + pub sel1: u8, + pub sel2: u16, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy20 { + pub vector: u8, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy21 { + pub esr_iss: u64, + pub fault_ipa: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy22 { + pub error: u8, + pub pad: [u8; 7usize], + pub reason: u32, + pub index: u32, + pub data: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy23 { + pub extension_id: usize, + pub function_id: usize, + pub args: [usize; 6usize], + pub ret: [usize; 2usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy24 { + pub csr_num: usize, + pub new_value: usize, + pub write_mask: usize, + pub ret_value: usize, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy25 { + pub flags: u32, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_1 { + pub hw: UapiKvmRunBindgenTy1BindgenTy1, + pub fail_entry: UapiKvmRunBindgenTy1BindgenTy2, + pub ex: UapiKvmRunBindgenTy1BindgenTy3, + pub io: UapiKvmRunBindgenTy1BindgenTy4, + pub debug: UapiKvmRunBindgenTy1BindgenTy5, + pub mmio: UapiKvmRunBindgenTy1BindgenTy6, + pub hypercall: UapiKvmRunBindgenTy1BindgenTy7, + pub tpr_access: UapiKvmRunBindgenTy1BindgenTy8, + pub s390_sieic: UapiKvmRunBindgenTy1BindgenTy9, + pub s390_reset_flags: u64, + pub s390_ucontrol: UapiKvmRunBindgenTy1BindgenTy10, + pub dcr: UapiKvmRunBindgenTy1BindgenTy11, + pub internal: UapiKvmRunBindgenTy1BindgenTy12, + pub emulation_failure: UapiKvmRunBindgenTy1BindgenTy13, + pub osi: UapiKvmRunBindgenTy1BindgenTy14, + pub papr_hcall: UapiKvmRunBindgenTy1BindgenTy15, + pub s390_tsch: UapiKvmRunBindgenTy1BindgenTy16, + pub epr: UapiKvmRunBindgenTy1BindgenTy17, + pub system_event: UapiKvmRunBindgenTy1BindgenTy18, + pub s390_stsi: UapiKvmRunBindgenTy1BindgenTy19, + pub eoi: UapiKvmRunBindgenTy1BindgenTy20, + pub hyperv: UapiKvmHypervExit, + pub arm_nisv: UapiKvmRunBindgenTy1BindgenTy21, + pub msr: UapiKvmRunBindgenTy1BindgenTy22, + pub xen: UapiKvmXenExit, + pub riscv_sbi: UapiKvmRunBindgenTy1BindgenTy23, + pub riscv_csr: UapiKvmRunBindgenTy1BindgenTy24, + pub notify: UapiKvmRunBindgenTy1BindgenTy25, + pub padding: [u8; 256usize], +} + +impl Debug for uapi_kvm_run__bindgen_ty_1 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("uapi_kvm_run__bindgen_ty_1").finish() + } +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmHypervExit { + pub type_: u32, + pub pad1: u32, + pub u: uapi_kvm_hyperv_exit__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union 
uapi_kvm_hyperv_exit__bindgen_ty_1 { + pub synic: UapiKvmHypervExitBindgenTy1BindgenTy1, + pub hcall: UapiKvmHypervExitBindgenTy1BindgenTy2, + pub syndbg: UapiKvmHypervExitBindgenTy1BindgenTy3, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmHypervExitBindgenTy1BindgenTy1 { + pub msr: u32, + pub pad2: u32, + pub control: u64, + pub evt_page: u64, + pub msg_page: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmHypervExitBindgenTy1BindgenTy2 { + pub input: u64, + pub result: u64, + pub params: [u64; 2usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmHypervExitBindgenTy1BindgenTy3 { + pub msr: u32, + pub pad2: u32, + pub control: u64, + pub status: u64, + pub send_page: u64, + pub recv_page: u64, + pub pending_page: u64, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmXenExit { + pub type_: u32, + pub u: uapi_kvm_xen_exit__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_xen_exit__bindgen_ty_1 { + pub hcall: UapiKvmXenExitBindgenTy1BindgenTy1, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmXenExitBindgenTy1BindgenTy1 { + pub longmode: u32, + pub cpl: u32, + pub input: u64, + pub result: u64, + pub params: [u64; 6usize], +} diff --git a/user/apps/test_kvm/main.c b/user/apps/test_kvm/main.c index fd60ccb6..60b062f2 100644 --- a/user/apps/test_kvm/main.c +++ b/user/apps/test_kvm/main.c @@ -1,115 +1,540 @@ -/** - * @file main.c - * @author xiaoyez (xiaoyez@zju.edu.cn) - * @brief 测试kvm的程序 - * @version 0.1 - * @date 2023-07-13 - * - * @copyright Copyright (c) 2023 - * - */ -/** - * 测试kvm命令的方法: - * 1.在DragonOS的控制台输入 exec bin/test_kvm.elf - * - */ -#include #include #include +#include #include -#include +#include +#include +//#include -#define KVM_CREATE_VCPU 0x00 -#define KVM_SET_USER_MEMORY_REGION 0x01 +typedef __signed__ char __s8; +typedef unsigned char __u8; -#define KVM_RUN 0x00 -#define KVM_GET_REGS 0x01 -#define KVM_SET_REGS 0x02 +typedef __signed__ short __s16; +typedef unsigned short __u16; -struct kvm_userspace_memory_region { - uint32_t slot; // 要在哪个slot上注册内存区间 - // flags有两个取值,KVM_MEM_LOG_DIRTY_PAGES和KVM_MEM_READONLY,用来指示kvm针对这段内存应该做的事情。 - // KVM_MEM_LOG_DIRTY_PAGES用来开启内存脏页,KVM_MEM_READONLY用来开启内存只读。 - uint32_t flags; - uint64_t guest_phys_addr; // 虚机内存区间起始物理地址 - uint64_t memory_size; // 虚机内存区间大小 - uint64_t userspace_addr; // 虚机内存区间对应的主机虚拟地址 +typedef __signed__ int __s32; +typedef unsigned int __u32; + +#ifdef __GNUC__ +__extension__ typedef __signed__ long long __s64; +__extension__ typedef unsigned long long __u64; +#else +typedef __signed__ long long __s64; +typedef unsigned long long __u64; +#endif + +//from linux/kvm.h +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ + +#define KVM_RUN _IO(KVMIO, 0x80) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) + +#define KVMIO 0xAE +#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ + struct kvm_userspace_memory_region) +/* Architectural interrupt line count. 
*/ +#define KVM_NR_INTERRUPTS 256 +struct kvm_hyperv_exit { +#define KVM_EXIT_HYPERV_SYNIC 1 +#define KVM_EXIT_HYPERV_HCALL 2 +#define KVM_EXIT_HYPERV_SYNDBG 3 + __u32 type; + __u32 pad1; + union { + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 evt_page; + __u64 msg_page; + } synic; + struct { + __u64 input; + __u64 result; + __u64 params[2]; + } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; + } u; }; - +struct kvm_debug_exit_arch { + __u32 exception; + __u32 pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; +/* for KVM_SET_USER_MEMORY_REGION */ +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; +struct kvm_xen_exit { +#define KVM_EXIT_XEN_HCALL 1 + __u32 type; + union { + struct { + __u32 longmode; + __u32 cpl; + __u64 input; + __u64 result; + __u64 params[6]; + } hcall; + } u; +}; +/* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - uint64_t rax, rbx, rcx, rdx; - uint64_t rsi, rdi, rsp, rbp; - uint64_t r8, r9, r10, r11; - uint64_t r12, r13, r14, r15; - uint64_t rip, rflags; + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; +struct my_kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct my_kvm_segment cs, ds, es, fs, gs, ss; + struct my_kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; -int guest_code(){ - while (1) - { - // printf("guest code\n"); - __asm__ __volatile__ ( - "mov %rax, 0\n\t" - "mov %rcx, 0\n\t" - "cpuid\n\t" - ); - } +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pending; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; +}; +/* kvm_sync_regs struct included by kvm_run struct */ +struct kvm_sync_regs { + /* Members of this structure are potentially malicious. + * Care must be taken by code reading, esp. interpreting, + * data fields from them inside KVM to prevent TOCTOU and + * double-fetch types of vulnerabilities. 
+ */ + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu_events events; +}; + +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + __u8 immediate_exit; + __u8 padding1[6]; + + /* out */ + __u32 exit_reason; + __u8 ready_for_interrupt_injection; + __u8 if_flag; + __u16 flags; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + __u64 apic_base; + +#ifdef __KVM_S390 + /* the processor status word for s390 */ + __u64 psw_mask; /* psw upper half */ + __u64 psw_addr; /* psw lower half */ +#endif + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + __u32 cpu; + } fail_entry; + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + /* KVM_EXIT_DEBUG */ + struct { + struct kvm_debug_exit_arch arch; + } debug; + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u16 ipa; + __u32 ipb; + } s390_sieic; + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + /* KVM_EXIT_S390_UCONTROL */ + struct { + __u64 trans_exc_code; + __u32 pgm_code; + } s390_ucontrol; + /* KVM_EXIT_DCR (deprecated) */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + /* KVM_EXIT_INTERNAL_ERROR */ + struct { + __u32 suberror; + /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ + __u32 ndata; + __u64 data[16]; + } internal; + /* + * KVM_INTERNAL_ERROR_EMULATION + * + * "struct emulation_failure" is an overlay of "struct internal" + * that is used for the KVM_INTERNAL_ERROR_EMULATION sub-type of + * KVM_EXIT_INTERNAL_ERROR. Note, unlike other internal error + * sub-types, this struct is ABI! It also needs to be backwards + * compatible with "struct internal". Take special care that + * "ndata" is correct, that new fields are enumerated in "flags", + * and that each flag enumerates fields that are 64-bit aligned + * and sized (so that ndata+internal.data[] is valid/accurate). 
+ */ + struct { + __u32 suberror; + __u32 ndata; + __u64 flags; + __u8 insn_size; + __u8 insn_bytes[15]; + } emulation_failure; + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + /* KVM_EXIT_PAPR_HCALL */ + struct { + __u64 nr; + __u64 ret; + __u64 args[9]; + } papr_hcall; + /* KVM_EXIT_S390_TSCH */ + struct { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; + __u32 ipb; + __u8 dequeued; + } s390_tsch; + /* KVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + /* KVM_EXIT_SYSTEM_EVENT */ + struct { +#define KVM_SYSTEM_EVENT_SHUTDOWN 1 +#define KVM_SYSTEM_EVENT_RESET 2 +#define KVM_SYSTEM_EVENT_CRASH 3 + __u32 type; + __u64 flags; + } system_event; + /* KVM_EXIT_S390_STSI */ + struct { + __u64 addr; + __u8 ar; + __u8 reserved; + __u8 fc; + __u8 sel1; + __u16 sel2; + } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + /* KVM_EXIT_HYPERV */ + struct kvm_hyperv_exit hyperv; + /* KVM_EXIT_ARM_NISV */ + struct { + __u64 esr_iss; + __u64 fault_ipa; + } arm_nisv; + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; +#define KVM_MSR_EXIT_REASON_INVAL (1 << 0) +#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) +#define KVM_MSR_EXIT_REASON_FILTER (1 << 2) + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; + /* KVM_EXIT_XEN */ + struct kvm_xen_exit xen; + /* Fix the size of the union. */ + char padding[256]; + }; + + /* 2048 is the size of the char array used to bound/pad the size + * of the union that holds sync regs. + */ + #define SYNC_REGS_SIZE_BYTES 2048 + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[SYNC_REGS_SIZE_BYTES]; + } s; +}; + + +int kvm(uint8_t code[], size_t code_len) +{ + // step 1, open /dev/kvm + int kvmfd = open("/dev/kvm", O_RDWR | O_CLOEXEC); + if (kvmfd == -1) + { + printf("failed to open /dev/kvm\n"); return 0; + } + + // step 2, create VM + int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); + printf("vmfd %d\n", vmfd); + // step 3, set up user memory region + size_t mem_size = 0x100000; // size of user memory you want to assign + void *mem = mmap(0, mem_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + printf("map mem %p\n", mem); + int user_entry = 0x0; + memcpy((void *)((size_t)mem + user_entry), code, code_len); + struct kvm_userspace_memory_region region = { + .slot = 0, + .flags = 0, + .guest_phys_addr = 0, + .memory_size = mem_size, + .userspace_addr = (size_t)mem}; + ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ®ion); + /* end of step 3 */ + + // step 4, create vCPU + int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); + printf("create vcpu,fd: %p\n", vcpufd); + // step 5, set up memory for vCPU + size_t vcpu_mmap_size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, NULL); + struct kvm_run *run = (struct kvm_run *)mmap(0, vcpu_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0); + + // step 6, set up vCPU's registers + /* standard registers include general-purpose registers and flags */ + struct kvm_regs regs; + ioctl(vcpufd, KVM_GET_REGS, ®s); + regs.rip = user_entry; + regs.rsp = 0x200000; // stack address + regs.rflags = 
0x2; // in x86 the 0x2 bit should always be set + ioctl(vcpufd, KVM_SET_REGS, ®s); // set registers + + /* special registers include segment registers */ + struct kvm_sregs sregs; + ioctl(vcpufd, KVM_GET_SREGS, &sregs); + sregs.cs.base = sregs.cs.selector = 0; // let base of code segment equal to zero + ioctl(vcpufd, KVM_SET_SREGS, &sregs); + ioctl(vcpufd, KVM_GET_SREGS, &sregs); + // step 7, execute vm and handle exit reason + #define KVM_EXIT_UNKNOWN 0 +#define KVM_EXIT_EXCEPTION 1 +#define KVM_EXIT_IO 2 +#define KVM_EXIT_HYPERCALL 3 +#define KVM_EXIT_DEBUG 4 +#define KVM_EXIT_HLT 5 +#define KVM_EXIT_MMIO 6 +#define KVM_EXIT_IRQ_WINDOW_OPEN 7 +#define KVM_EXIT_SHUTDOWN 8 +#define KVM_EXIT_FAIL_ENTRY 9 +#define KVM_EXIT_INTR 10 +#define KVM_EXIT_SET_TPR 11 +#define KVM_EXIT_TPR_ACCESS 12 +#define KVM_EXIT_S390_SIEIC 13 +#define KVM_EXIT_S390_RESET 14 +#define KVM_EXIT_DCR 15 /* deprecated */ +#define KVM_EXIT_NMI 16 +#define KVM_EXIT_INTERNAL_ERROR 17 +#define KVM_EXIT_OSI 18 +#define KVM_EXIT_PAPR_HCALL 19 +#define KVM_EXIT_S390_UCONTROL 20 +#define KVM_EXIT_WATCHDOG 21 +#define KVM_EXIT_S390_TSCH 22 +#define KVM_EXIT_EPR 23 +#define KVM_EXIT_SYSTEM_EVENT 24 +#define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 +#define KVM_EXIT_HYPERV 27 +#define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_X86_RDMSR 29 +#define KVM_EXIT_X86_WRMSR 30 +#define KVM_EXIT_DIRTY_RING_FULL 31 +#define KVM_EXIT_AP_RESET_HOLD 32 +#define KVM_EXIT_X86_BUS_LOCK 33 +#define KVM_EXIT_XEN 34 + while (1) + { + ioctl(vcpufd, KVM_RUN, NULL); + ioctl(vcpufd, KVM_GET_SREGS, &sregs); + printf("Guest CR3: 0x%llx\n", sregs.cr3); + switch (run->exit_reason) + { + case KVM_EXIT_HLT: + fputs("KVM_EXIT_HLT \n", stderr); + return 0; + case KVM_EXIT_IO: + /* TODO: check port and direction here */ + putchar(*(((char *)run) + run->io.data_offset)); + printf("KVM_EXIT_IO: run->io.port = %lx \n", + run->io.port); + break; + case KVM_EXIT_FAIL_ENTRY: + printf("KVM_EXIT_FAIL_ENTRY: hardware_entry_failure_reason = 0x%lx", + run->fail_entry.hardware_entry_failure_reason); + return 0; + case KVM_EXIT_INTERNAL_ERROR: + printf("KVM_EXIT_INTERNAL_ERROR: suberror = 0x%x", + run->internal.suberror); + return 0; + case KVM_EXIT_SHUTDOWN: + printf("KVM_EXIT_SHUTDOWN"); + return 0; + default: + printf("Unhandled reason: %d", run->exit_reason); + return 0; + } + } } + /*汇编指令解释 +0xB0 0x61 (mov al, 0x61) +解释:将立即数 0x61(ASCII 字符 'a')加载到 AL 寄存器中。 + +0xBA 0x17 0x02 (mov dx, 0x0217) +Linux: ilen = 3 外中断和EPT_VIOLATION +解释:将立即数 0x0217 加载到 DX 寄存器中。 + +0xEE (out dx, al) +解释:将 AL 寄存器的值输出到 DX 寄存器指定的端口。 + +0xB0 0x0A (mov al, 0x0A) +解释:将立即数 0x0A(换行符)加载到 AL 寄存器中。 + +0xEE (out dx, al) +解释:将 AL 寄存器的值输出到 DX 寄存器指定的端口。 + +0xF4 (hlt) +解释:执行 hlt 指令,使处理器进入休眠状态,直到下一个外部中断到来。*/ + int main() { - printf("Test kvm running...\n"); - printf("Open /dev/kvm\n"); - int kvm_fd = open("/dev/kvm", O_RDWR|O_CLOEXEC); - int vmfd = ioctl(kvm_fd, 0x01, 0); - printf("vmfd=%d\n", vmfd); - - /* - __asm__ __volatile__ ( - "mov %rax, 0\n\t" - "mov %rcx, 0\n\t" - "cpuid\n\t" - ); - */ - const uint8_t code[] = { - 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ - 0x00, 0xd8, /* add %bl, %al */ - 0x04, '0', /* add $'0', %al */ - 0xee, /* out %al, (%dx) */ - 0xb0, '\n', /* mov $'\n', %al */ - 0xee, /* out %al, (%dx) */ - 0xf4, /* hlt */ - }; - - size_t mem_size = 0x4000; // size of user memory you want to assign - printf("code=%p\n", code); - // void *mem = mmap(0, mem_size, 0x7, -1, 0); - // memcpy(mem, code, sizeof(code)); - struct kvm_userspace_memory_region region = { - .slot = 0, - .flags = 0, - 
 int main()
 {
-    printf("Test kvm running...\n");
-    printf("Open /dev/kvm\n");
-    int kvm_fd = open("/dev/kvm", O_RDWR|O_CLOEXEC);
-    int vmfd = ioctl(kvm_fd, 0x01, 0);
-    printf("vmfd=%d\n", vmfd);
-
-    /*
-    __asm__ __volatile__ (
-        "mov %rax, 0\n\t"
-        "mov %rcx, 0\n\t"
-        "cpuid\n\t"
-    );
-    */
-    const uint8_t code[] = {
-        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
-        0x00, 0xd8,       /* add %bl, %al */
-        0x04, '0',        /* add $'0', %al */
-        0xee,             /* out %al, (%dx) */
-        0xb0, '\n',       /* mov $'\n', %al */
-        0xee,             /* out %al, (%dx) */
-        0xf4,             /* hlt */
-    };
-
-    size_t mem_size = 0x4000; // size of user memory you want to assign
-    printf("code=%p\n", code);
-    // void *mem = mmap(0, mem_size, 0x7, -1, 0);
-    // memcpy(mem, code, sizeof(code));
-    struct kvm_userspace_memory_region region = {
-        .slot = 0,
-        .flags = 0,
-        .guest_phys_addr = 0,
-        .memory_size = mem_size,
-        .userspace_addr = (size_t)code
-    };
-    ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
-
-    int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
-    printf("vcpufd=%d\n", vcpufd);
-    int user_entry = 0x0;
-
-    struct kvm_regs regs = {0};
-    regs.rip = user_entry;
-    regs.rsp = 0x3000; // stack address
-    regs.rflags = 0x2; // in x86 the 0x2 bit should always be set
-    ioctl(vcpufd, KVM_SET_REGS, &regs); // set registers
-
-    ioctl(vcpufd, KVM_RUN, 0);
-
-    return 0;
+    //uint8_t code[] = "\xB0\x61\xBA\x17\x02\xEE\xB0\n\xEE\xF4";
+    //uint8_t code[] = "\xB0\x61\xBA\x17\x02\xEE\xF4";
+    uint8_t code[] = "\xB0\x61\xF4";
+    kvm(code, sizeof(code));
+    return 0;
 }
-
-
diff --git a/user/apps/test_poll/.gitignore b/user/apps/test_poll/.gitignore
new file mode 100644
index 00000000..96903813
--- /dev/null
+++ b/user/apps/test_poll/.gitignore
@@ -0,0 +1 @@
+test_poll
diff --git a/user/apps/test_poll/Makefile b/user/apps/test_poll/Makefile
new file mode 100644
index 00000000..6604e069
--- /dev/null
+++ b/user/apps/test_poll/Makefile
@@ -0,0 +1,21 @@
+ifeq ($(ARCH), x86_64)
+	CROSS_COMPILE=x86_64-linux-musl-
+else ifeq ($(ARCH), riscv64)
+	CROSS_COMPILE=riscv64-linux-musl-
+endif
+
+BIN_NAME=test_poll
+CC=$(CROSS_COMPILE)gcc
+
+.PHONY: all
+all: main.c
+	$(CC) -static -o $(BIN_NAME) main.c
+
+.PHONY: install clean
+install: all
+	mv $(BIN_NAME) $(DADK_CURRENT_BUILD_DIR)/$(BIN_NAME)
+
+clean:
+	rm -f $(BIN_NAME) *.o
+
+fmt:
diff --git a/user/apps/test_poll/main.c b/user/apps/test_poll/main.c
new file mode 100644
index 00000000..9864bd79
--- /dev/null
+++ b/user/apps/test_poll/main.c
@@ -0,0 +1,151 @@
+#include <errno.h>
+#include <poll.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+int pipe_fd[2];         // pipe file descriptors
+int child_can_exit = 0; // whether the writer thread may exit
+int signal_pid = 0;
+int poll_errno;         // errno captured right after poll returns
+
+#define WRITE_WAIT_SEC 3
+#define POLL_TIMEOUT_SEC 5
+#define EXPECTED_MESSAGE "Data is ready!\n"
+#define POLL_DELTA_MS 1000
+#define min(a, b) ((a) < (b) ? (a) : (b))
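+
+// Test design: the writer thread sends SIGUSR1 to this process once per second
+// for WRITE_WAIT_SEC seconds, then writes EXPECTED_MESSAGE into the pipe.
+// main() polls the read end with a POLL_TIMEOUT_SEC timeout and, besides
+// checking the data, verifies that poll() blocked for roughly
+// min(WRITE_WAIT_SEC, POLL_TIMEOUT_SEC) seconds (within POLL_DELTA_MS),
+// i.e. that the handled signals did not cut the wait short.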
+
+// signal handler
+void signal_handler(int signo) {
+  printf("[PID: %d, TID: %lu] Signal %d received.\n", getpid(), pthread_self(),
+         signo);
+}
+
+// thread function: writes data into the pipe after n seconds
+void *writer_thread(void *arg) {
+  int seconds = WRITE_WAIT_SEC;
+  for (int i = 0; i < seconds; i++) {
+    printf("[PID: %d, TID: %lu] Waiting for %d seconds...\n", getpid(),
+           pthread_self(), seconds - i);
+    sleep(1);
+    kill(signal_pid, SIGUSR1); // send a signal
+  }
+  const char *message = EXPECTED_MESSAGE;
+  write(pipe_fd[1], message, strlen(message)); // write into the pipe
+  printf("[PID: %d, TID: %lu] Data written to pipe.\n", getpid(),
+         pthread_self());
+  close(pipe_fd[1]); // close the write end
+  printf("[PID: %d, TID: %lu] Pipe write end closed.\n", getpid(),
+         pthread_self());
+
+  while (child_can_exit == 0) {
+    printf("[PID: %d, TID: %lu] Waiting for main to finish...\n", getpid(),
+           pthread_self());
+    sleep(1);
+  }
+  return NULL;
+}
+
+int main() {
+  pthread_t tid;
+  struct pollfd fds[1];
+  int ret;
+  int test_passed = 1; // assume the test passes
+
+  // create the pipe
+  if (pipe(pipe_fd) == -1) {
+    perror("pipe");
+    exit(EXIT_FAILURE);
+  }
+
+  // install the signal handler
+  struct sigaction sa;
+  sa.sa_handler = signal_handler;
+  sigemptyset(&sa.sa_mask);
+  sa.sa_flags = SA_RESTART;
+  if (sigaction(SIGUSR1, &sa, NULL) == -1) {
+    perror("sigaction");
+    exit(EXIT_FAILURE);
+  }
+
+  signal_pid = getpid(); // PID of the process that receives the signals
+
+  // create the writer thread
+  if (pthread_create(&tid, NULL, writer_thread, NULL) != 0) {
+    perror("pthread_create");
+    exit(EXIT_FAILURE);
+  }
+
+  // set up the file descriptor for poll to watch
+  fds[0].fd = pipe_fd[0]; // watch the read end of the pipe
+  fds[0].events = POLLIN; // watch for readable data
+
+  printf("[PID: %d, TID: %lu] Waiting for data...\n", getpid(), pthread_self());
+
+  // measure the wall-clock time spent in the poll call
+  struct timeval start_time, end_time;
+  gettimeofday(&start_time, NULL); // record when poll starts
+
+  ret = poll(fds, 1, POLL_TIMEOUT_SEC * 1000); // call poll
+  poll_errno = errno;
+  gettimeofday(&end_time, NULL); // record when poll returns
+
+  // total time spent in poll, in milliseconds
+  long poll_duration_ms = (end_time.tv_sec - start_time.tv_sec) * 1000 +
+                          (end_time.tv_usec - start_time.tv_usec) / 1000;
+
+  if (abs((int)poll_duration_ms -
+          min(POLL_TIMEOUT_SEC, WRITE_WAIT_SEC) * 1000) >= POLL_DELTA_MS) {
+    printf("Poll duration: %ld ms, expected: %d ms, errno: %s\n",
+           poll_duration_ms, min(POLL_TIMEOUT_SEC, WRITE_WAIT_SEC) * 1000,
+           strerror(poll_errno));
+    test_passed = 0; // fail if poll's duration deviates too much from the expectation
+  }
+
+  if (test_passed == 0) {
+    // the duration check already failed; skip the remaining checks
+  } else if (ret == -1) {
+    printf("poll errno: %s\n", strerror(poll_errno));
+    test_passed = 0; // test failed
+  } else if (ret == 0) {
+    printf("Timeout! No data available.\n");
+    test_passed = 0; // test failed
+  } else {
+    if (fds[0].revents & POLLIN) {
+      char buffer[1024];
+      ssize_t count = read(pipe_fd[0], buffer, sizeof(buffer) - 1); // read the data
+      if (count > 0) {
+        buffer[count] = '\0'; // NUL-terminate before using it as a string
+        printf("Data received: %s", buffer);
+        // check that the data matches the expected message
+        if (strcmp(buffer, EXPECTED_MESSAGE) != 0) {
+          printf("Unexpected data received.\n");
+          test_passed = 0; // test failed
+        }
+      } else {
+        printf("No data read from pipe.\n");
+        test_passed = 0; // test failed
+      }
+    } else {
+      printf("Unexpected event on pipe.\n");
+      test_passed = 0; // test failed
+    }
+  }
+
+  child_can_exit = 1; // allow the writer thread to exit
+  // wait for the writer thread to finish
+  pthread_join(tid, NULL);
+  close(pipe_fd[0]); // close the read end
+
+  if (test_passed) {
+    printf("Test passed!\n");
+  } else {
+    printf("Test failed!\n");
+  }
+
+  printf("Program finished.\n");
+
+  return test_passed ? 0 : 1; // 0 means the test passed, 1 means it failed
+}
\ No newline at end of file
diff --git a/user/dadk/config/nova_shell-0.1.0.toml b/user/dadk/config/nova_shell-0.1.0.toml
index e5778ddf..c3c2d467 100644
--- a/user/dadk/config/nova_shell-0.1.0.toml
+++ b/user/dadk/config/nova_shell-0.1.0.toml
@@ -24,7 +24,7 @@ source = "git"
 source-path = "https://git.mirrors.dragonos.org.cn/DragonOS-Community/NovaShell.git"
 # git tag or branch
 # note: branch and revision are mutually exclusive, and source must be set to "git"
-revision = "feaebefaef"
+revision = "d7d2136c5a"
 # build information
 [build]
 # (optional) build command
diff --git a/user/dadk/config/test_poll.toml b/user/dadk/config/test_poll.toml
new file mode 100644
index 00000000..5f1a4ceb
--- /dev/null
+++ b/user/dadk/config/test_poll.toml
@@ -0,0 +1,46 @@
+# user program name
+name = "test_poll"
+# version number
+version = "0.1.0"
+# user program description
+description = "test_poll"
+# (optional) default: false. Build only once: if true, DADK caches a successful build and reuses it for later builds
+build-once = false
+# (optional) default: false. Install only once: if true, DADK does not reinstall after a successful install
+install-once = false
+# target architectures
+# allowed values: "x86_64", "aarch64", "riscv64"
+target-arch = ["x86_64"]
+# task source
+[task-source]
+# build type
+# allowed values: "build-from-source", "install-from-prebuilt"
+type = "build-from-source"
+# source of the task
+# for "build-from-source": "git", "local", "archive"
+# for "install-from-prebuilt": "local", "archive"
+source = "local"
+# path or URL
+source-path = "user/apps/test_poll"
+# build information
+[build]
+# (optional) build command
+build-command = "make install"
+# install information
+[install]
+# (optional) installation path inside DragonOS
+in-dragonos-path = "/bin"
+# clean information
+[clean]
+# (optional) clean command
+clean-command = "make clean"
+# (optional) dependencies
+# note: if there are no dependencies, omit this section; a lone [[depends]] is not allowed
+# [[depends]]
+# name = "depend1"
+# version = "0.1.1"
+# (optional) environment variables
+# note: if there are no environment variables, omit this section; a lone [[envs]] is not allowed
+# [[envs]]
+# key = "PATH"
+# value = "/usr/bin"