feat(virtualization): 内核虚拟化支持 (#1073)

* 几个结构体

* 通过vmx_init以及create_vm,create_vcpu部分TODO

* kvm_run完成一半

* 能够成功vmlaunch,但是在vmexit时候还有些问题未排查出来

* 解决了vmlaunch导致的cpu_reset的问题

* 整理代码

* 暂时性push到hyc仓库

* 修改内存虚拟化部分参数传入,解决死锁问题

* 初步完成ept映射.但不停EPT_VIOLATION

* 初步完成了EPT映射,但是读写内存还是有点问题

* fixme

* 更新了一些truncate到from_bits_unchecked的实现

* 完成内存虚拟化EPT_VIOLATION的映射

* fmt

* Remove /fixme from .gitignore

* Remove /fixme file

* Update kernel/src/init/init.rs

Co-authored-by: Samuel Dai <samuka007@dragonos.org>

* Update kernel/src/init/init.rs

Co-authored-by: Samuel Dai <samuka007@dragonos.org>

* 修改了注释格式,删除了附带的一些文件操作

* feat(syscall): 实现syscall restart (#1075)

能够在系统调用返回ERESTARTSYS时,信号处理结束后,自动重启系统调用.

TODO: 实现wait等需要restart_block的系统调用的重启

Signed-off-by: longjin <longjin@DragonOS.org>

* chore: update docker image version in script && update doc (#1076)

* chore: update docker image version in script

* chore: replace lots of spaces with newline in doc

* fix: 修复wait4系统调用部分语义与Linux不一致的问题 (#1080)

* fix: 修复wait4系统调用部分语义与Linux不一致的问题

解决wait不住/wait之后卡死的bug

---------

Signed-off-by: longjin <longjin@DragonOS.org>

* feat(fs/syscall): 实现fchdir系统调用 (#1081)

Signed-off-by: longjin <longjin@DragonOS.org>

* fix(mm): 修复fat文件系统的PageCache同步问题 (#1005)


---------

Co-authored-by: longjin <longjin@DragonOS.org>

* fix: 修正nographic启动时,控制台日志未能输出到文件的问题 (#1082)

Signed-off-by: longjin <longjin@DragonOS.org>

* fix(process): 修复copy_process的一些bug & 支持默认init进程传参 (#1083)

- 修复`copy_process`函数对标志位处理不正确的bug
- init进程搜索列表中,支持为默认init程序传入参数

Signed-off-by: longjin <longjin@DragonOS.org>

* feat: 完善sys_reboot (#1084)

* fix(process): 修复copy_process的一些bug & 支持默认init进程传参

- 修复`copy_process`函数对标志位处理不正确的bug
- init进程搜索列表中,支持为默认init程序传入参数

Signed-off-by: longjin <longjin@DragonOS.org>

* feat: 完善sys_reboot

- 校验magic number
- 支持多个cmd (具体内容未实现)

Signed-off-by: longjin <longjin@DragonOS.org>

---------

Signed-off-by: longjin <longjin@DragonOS.org>

* fix: 修复do_wait函数在wait所有子进程时,忘了释放锁就sleep的bug (#1089)

Signed-off-by: longjin <longjin@DragonOS.org>

* pull主线并且fmt

---------

Signed-off-by: longjin <longjin@DragonOS.org>
Co-authored-by: GnoCiYeH <heyicong@dragonos.org>
Co-authored-by: Samuel Dai <samuka007@dragonos.org>
Co-authored-by: LoGin <longjin@DragonOS.org>
Co-authored-by: LIU Yuwei <22045841+Marsman1996@users.noreply.github.com>
Co-authored-by: MemoryShore <1353318529@qq.com>
This commit is contained in:
Z Fan 2025-03-04 10:56:20 +08:00 committed by GitHub
parent 01dcb5d7a8
commit 597315b04d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
50 changed files with 13675 additions and 126 deletions

3
.gitignore vendored
View File

@ -18,5 +18,4 @@ cppcheck.xml
/target/ /target/
Cargo.lock Cargo.lock
.cache .cache
compile_commands.json compile_commands.json
/logs/

View File

@ -144,7 +144,7 @@
"rust-analyzer.checkOnSave.allTargets": false, "rust-analyzer.checkOnSave.allTargets": false,
"rust-analyzer.linkedProjects": [ "rust-analyzer.linkedProjects": [
"./kernel/Cargo.toml", "./kernel/Cargo.toml",
"./tools/Cargo.toml", //"./tools/Cargo.toml",
], ],
// "rust-analyzer.cargo.target": "riscv64gc-unknown-none-elf", // "rust-analyzer.cargo.target": "riscv64gc-unknown-none-elf",
@ -154,4 +154,5 @@
"check", "check",
], ],
"makefile.configureOnOpen": false,
} }

View File

@ -31,6 +31,7 @@ impl CFilesArch for X86_64CFilesArch {
files.insert(PathBuf::from("src/arch/x86_64/asm/head.S")); files.insert(PathBuf::from("src/arch/x86_64/asm/head.S"));
files.insert(PathBuf::from("src/arch/x86_64/asm/entry.S")); files.insert(PathBuf::from("src/arch/x86_64/asm/entry.S"));
files.insert(PathBuf::from("src/arch/x86_64/asm/apu_boot.S")); files.insert(PathBuf::from("src/arch/x86_64/asm/apu_boot.S"));
files.insert(PathBuf::from("src/arch/x86_64/vm/vmx/vmenter.S"));
} }
fn setup_global_flags(&self, c: &mut Build) { fn setup_global_flags(&self, c: &mut Build) {

View File

@ -4,7 +4,7 @@ use alloc::vec::Vec;
use crate::{bitmap_core::BitMapCore, traits::BitMapOps}; use crate::{bitmap_core::BitMapCore, traits::BitMapOps};
#[derive(Clone)] #[derive(Debug, Clone)]
pub struct AllocBitmap { pub struct AllocBitmap {
elements: usize, elements: usize,
data: Vec<usize>, data: Vec<usize>,
@ -26,6 +26,10 @@ impl AllocBitmap {
self.data[i] &= rhs.data[i]; self.data[i] &= rhs.data[i];
} }
} }
pub fn data(&self) -> &[usize] {
&self.data
}
} }
impl BitMapOps<usize> for AllocBitmap { impl BitMapOps<usize> for AllocBitmap {

View File

@ -3,7 +3,7 @@ use core::{intrinsics::unlikely, marker::PhantomData};
use crate::traits::BitOps; use crate::traits::BitOps;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub(crate) struct BitMapCore<T: BitOps> { pub struct BitMapCore<T: BitOps> {
phantom: PhantomData<T>, phantom: PhantomData<T>,
} }
@ -15,7 +15,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 获取位图中的某一位 /// 获取位图中的某一位
pub(crate) fn get(&self, n: usize, data: &[T], index: usize) -> Option<bool> { pub fn get(&self, n: usize, data: &[T], index: usize) -> Option<bool> {
if unlikely(index >= n) { if unlikely(index >= n) {
return None; return None;
} }
@ -30,7 +30,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 设置位图中的某一位 /// 设置位图中的某一位
pub(crate) fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option<bool> { pub fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option<bool> {
if unlikely(index >= n) { if unlikely(index >= n) {
return None; return None;
} }
@ -43,7 +43,7 @@ impl<T: BitOps> BitMapCore<T> {
Some(bit) Some(bit)
} }
pub(crate) fn set_all(&self, n: usize, data: &mut [T], value: bool) { pub fn set_all(&self, n: usize, data: &mut [T], value: bool) {
let val = if value { T::max() } else { T::zero() }; let val = if value { T::max() } else { T::zero() };
for element in data.iter_mut() { for element in data.iter_mut() {
*element = val; *element = val;
@ -58,7 +58,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 获取位图中第一个为1的位 /// 获取位图中第一个为1的位
pub(crate) fn first_index(&self, data: &[T]) -> Option<usize> { pub fn first_index(&self, data: &[T]) -> Option<usize> {
for (i, element) in data.iter().enumerate() { for (i, element) in data.iter().enumerate() {
let bit = <T as BitOps>::first_index(element); let bit = <T as BitOps>::first_index(element);
if let Some(b) = bit { if let Some(b) = bit {
@ -70,7 +70,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 获取位图中第一个为0的位 /// 获取位图中第一个为0的位
pub(crate) fn first_false_index(&self, n: usize, data: &[T]) -> Option<usize> { pub fn first_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
for (i, element) in data.iter().enumerate() { for (i, element) in data.iter().enumerate() {
if let Some(bit) = <T as BitOps>::first_false_index(element) { if let Some(bit) = <T as BitOps>::first_false_index(element) {
return self.make_index(n, i * T::bit_size() + bit); return self.make_index(n, i * T::bit_size() + bit);
@ -81,7 +81,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 获取位图中最后一个为1的位 /// 获取位图中最后一个为1的位
pub(crate) fn last_index(&self, n: usize, data: &[T]) -> Option<usize> { pub fn last_index(&self, n: usize, data: &[T]) -> Option<usize> {
for (i, element) in data.iter().enumerate().rev() { for (i, element) in data.iter().enumerate().rev() {
if let Some(bit) = <T as BitOps>::last_index(element) { if let Some(bit) = <T as BitOps>::last_index(element) {
return self.make_index(n, i * T::bit_size() + bit); return self.make_index(n, i * T::bit_size() + bit);
@ -97,7 +97,7 @@ impl<T: BitOps> BitMapCore<T> {
/// ///
/// - `data`:位图数据 /// - `data`:位图数据
/// - `n`:位图有效位数 /// - `n`:位图有效位数
pub(crate) fn last_false_index(&self, n: usize, data: &[T]) -> Option<usize> { pub fn last_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
let mut iter = data.iter().rev(); let mut iter = data.iter().rev();
let mut last_element = *iter.next()?; let mut last_element = *iter.next()?;
@ -123,7 +123,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 获取位图中下一个为1的位 /// 获取位图中下一个为1的位
pub(crate) fn next_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> { pub fn next_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
if unlikely(index >= n) { if unlikely(index >= n) {
return None; return None;
} }
@ -146,7 +146,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 获取位图中下一个为0的位 /// 获取位图中下一个为0的位
pub(crate) fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> { pub fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
if unlikely(index >= n) { if unlikely(index >= n) {
return None; return None;
} }
@ -169,7 +169,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
/// 获取位图中上一个为1的位 /// 获取位图中上一个为1的位
pub(crate) fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> { pub fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
if unlikely(index >= n) { if unlikely(index >= n) {
return None; return None;
} }
@ -190,7 +190,7 @@ impl<T: BitOps> BitMapCore<T> {
None None
} }
pub(crate) fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> { pub fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
let element_index = index / T::bit_size(); let element_index = index / T::bit_size();
let bit_index = index % T::bit_size(); let bit_index = index % T::bit_size();
@ -208,7 +208,7 @@ impl<T: BitOps> BitMapCore<T> {
None None
} }
pub(crate) fn invert(&self, n: usize, data: &mut [T]) { pub fn invert(&self, n: usize, data: &mut [T]) {
for element in data.iter_mut() { for element in data.iter_mut() {
<T as BitOps>::invert(element); <T as BitOps>::invert(element);
} }
@ -222,7 +222,7 @@ impl<T: BitOps> BitMapCore<T> {
} }
} }
pub(crate) fn is_full(&self, n: usize, data: &[T]) -> bool { pub fn is_full(&self, n: usize, data: &[T]) -> bool {
let mut iter = data.iter().peekable(); let mut iter = data.iter().peekable();
while let Some(element) = iter.next() { while let Some(element) = iter.next() {
if iter.peek().is_none() { if iter.peek().is_none() {
@ -245,7 +245,7 @@ impl<T: BitOps> BitMapCore<T> {
return false; return false;
} }
pub(crate) fn is_empty(&self, data: &[T]) -> bool { pub fn is_empty(&self, data: &[T]) -> bool {
for element in data.iter() { for element in data.iter() {
if element != &T::zero() { if element != &T::zero() {
return false; return false;

View File

@ -13,4 +13,5 @@ mod bitmap_core;
mod static_bitmap; mod static_bitmap;
pub mod traits; pub mod traits;
pub use alloc_bitmap::AllocBitmap; pub use alloc_bitmap::AllocBitmap;
pub use bitmap_core::BitMapCore;
pub use static_bitmap::StaticBitmap; pub use static_bitmap::StaticBitmap;

View File

@ -88,7 +88,7 @@ fn tdp_get_cr3(_vcpu: &VmxVcpu) -> u64 {
return guest_cr3; return guest_cr3;
} }
fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> { pub fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> {
// 设置权限位,目前是写死的,可读可写可执行 // 设置权限位,目前是写死的,可读可写可执行
// EPT paging-structure memory type: Uncacheable // EPT paging-structure memory type: Uncacheable
let mut eptp = 0x0_u64; let mut eptp = 0x0_u64;

View File

@ -501,7 +501,7 @@ pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u
// } // }
pub fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32, result: &mut u32) { pub fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32, result: &mut u32) {
let vmx_msr_low: u32 = unsafe { (msr::rdmsr(msr) & 0x0000_0000_FFFF_FFFF) as u32 }; let vmx_msr_low: u32 = unsafe { (msr::rdmsr(msr) & 0x0000_0000_FFFF_FFFF) as u32 };
let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) << 32) as u32 }; let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) >> 32) as u32 };
let mut ctl: u32 = ctl_min | ctl_opt; let mut ctl: u32 = ctl_min | ctl_opt;
ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */

View File

@ -264,7 +264,7 @@ extern "C" fn vmexit_handler() {
} }
#[no_mangle] #[no_mangle]
fn adjust_rip(rip: u64) -> Result<(), SystemError> { pub fn adjust_rip(rip: u64) -> Result<(), SystemError> {
let instruction_length = vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32)?; let instruction_length = vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32)?;
vmx_vmwrite(VmcsFields::GUEST_RIP as u32, rip + instruction_length)?; vmx_vmwrite(VmcsFields::GUEST_RIP as u32, rip + instruction_length)?;
Ok(()) Ok(())

View File

@ -439,6 +439,15 @@ impl X86_64MMArch {
// 不支持的原因是目前好像没有能正确的设置page-level的xd位会触发page fault // 不支持的原因是目前好像没有能正确的设置page-level的xd位会触发page fault
return true; return true;
} }
pub unsafe fn read_array<T>(addr: VirtAddr, count: usize) -> Vec<T> {
// 实现读取数组逻辑
let mut vec = Vec::with_capacity(count);
for i in 0..count {
vec.push(Self::read(addr + i * core::mem::size_of::<T>()));
}
vec
}
} }
impl VirtAddr { impl VirtAddr {

View File

@ -20,6 +20,7 @@ pub mod sched;
pub mod smp; pub mod smp;
pub mod syscall; pub mod syscall;
pub mod time; pub mod time;
pub mod vm;
pub use self::pci::pci::X86_64PciArch as PciArch; pub use self::pci::pci::X86_64PciArch as PciArch;
@ -40,3 +41,12 @@ pub use crate::arch::elf::X86_64ElfArch as CurrentElfArch;
pub use crate::arch::smp::X86_64SMPArch as CurrentSMPArch; pub use crate::arch::smp::X86_64SMPArch as CurrentSMPArch;
pub use crate::arch::sched::X86_64SchedArch as CurrentSchedArch; pub use crate::arch::sched::X86_64SchedArch as CurrentSchedArch;
pub use crate::arch::vm::KvmArchManager as CurrentKvmManager;
pub use crate::arch::vm::kvm_host::X86KvmArch as KvmArch;
pub use crate::arch::vm::x86_kvm_ops as kvm_arch_ops;
pub use crate::arch::vm::kvm_host::vcpu::X86VcpuArch as VirtCpuArch;
pub use crate::arch::vm::kvm_host::KvmVcpuStat as VirtCpuStat;

View File

@ -0,0 +1,592 @@
use core::arch::asm;
use alloc::slice;
use log::{debug, error};
use raw_cpuid::CpuId;
use system_error::SystemError;
use x86::{
bits64::vmx::vmxon,
controlregs::{cr0, cr0_write, cr4, cr4_write, Cr0, Cr4},
msr::{
rdmsr, wrmsr, IA32_FEATURE_CONTROL, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1,
IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1,
},
vmx::vmcs::ro,
};
use crate::{
arch::{mm::barrier, MMArch},
mm::{MemoryManagementArch, PhysAddr},
};
use super::vmx::vmx_info;
/// Assorted x86 assembly/CPUID helpers used by the KVM implementation.
pub struct KvmX86Asm;

impl KvmX86Asm {
    /// Read the PKRU register if the CPU advertises OS protection keys
    /// (OSPKE); returns 0 when the feature is absent.
    pub fn read_pkru() -> u32 {
        let cpuid = CpuId::new();
        if let Some(feat) = cpuid.get_extended_feature_info() {
            if feat.has_ospke() {
                return Self::rdpkru();
            }
        }

        return 0;
    }

    /// Write the PKRU register.
    ///
    /// NOTE(review): the WRPKRU path is not implemented yet — on CPUs with
    /// OSPKE this panics via `todo!()`; on CPUs without it the call is a no-op.
    pub fn write_pkru(_val: u32) {
        let cpuid = CpuId::new();
        if let Some(feat) = cpuid.get_extended_feature_info() {
            if feat.has_ospke() {
                todo!();
            }
        }
    }

    /// Execute RDPKRU (requires ECX = 0) and return the PKRU value from EAX.
    /// EDX is clobbered by the instruction and discarded.
    fn rdpkru() -> u32 {
        let ecx: u32 = 0;
        let pkru: u32;
        let _edx: u32;

        unsafe {
            asm!(
                "rdpkru",
                out("eax") pkru,
                out("edx") _edx,
                in("ecx") ecx,
            );
        }

        pkru
    }

    /// Look up `segment_selector` in the descriptor table at `gdt_base` and
    /// return the segment base, translated to a kernel virtual address.
    ///
    /// Returns 0 for the null selector (GDT, index 0).
    ///
    /// NOTE(review): `gdt_size` is used directly as the element count of
    /// `u64` descriptors for `slice::from_raw_parts`; if callers pass the
    /// GDT limit (bytes - 1) this over-counts — confirm the unit at the
    /// call sites.
    pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u16) -> u64 {
        let table = segment_selector & 0x0004; // get table indicator (TI bit) in selector
        let index = (segment_selector >> 3) as usize; // get descriptor index in selector
        if table == 0 && index == 0 {
            return 0;
        }
        let descriptor_table = unsafe { slice::from_raw_parts(gdt_base, gdt_size.into()) };
        let descriptor = descriptor_table[index];

        // The segment base is scattered across the descriptor:
        // bits 63:56 -> base 31:24, bits 39:32 -> base 23:16, bits 31:16 -> base 15:0.
        let base_high = (descriptor & 0xFF00_0000_0000_0000) >> 32;
        let base_mid = (descriptor & 0x0000_00FF_0000_0000) >> 16;
        let base_low = (descriptor & 0x0000_0000_FFFF_0000) >> 16;
        let segment_base = (base_high | base_mid | base_low) & 0xFFFFFFFF;

        // NOTE(review): the base is treated as a physical address and mapped
        // to a virtual address here — confirm this matches what callers expect.
        let virtaddr = unsafe {
            MMArch::phys_2_virt(PhysAddr::new(segment_base as usize))
                .unwrap()
                .data() as u64
        };

        return virtaddr;
    }
}
/// Thin wrappers around the VMX instruction set
/// (VMCLEAR / VMPTRLD / VMREAD / VMWRITE / VMXON / INVEPT / INVVPID).
pub struct VmxAsm;

impl VmxAsm {
    /// Clear the VMCS at `phys_addr`, making it inactive/not-current and
    /// flushing its cached state to memory.
    ///
    /// Panics if VMCLEAR reports failure.
    pub fn vmclear(phys_addr: PhysAddr) {
        debug!("vmclear addr {phys_addr:?}");
        match unsafe { x86::bits64::vmx::vmclear(phys_addr.data() as u64) } {
            Ok(_) => {}
            Err(e) => {
                panic!("[VMX] vmclear failed! reason: {e:?}");
            }
        }
    }

    /// Make the VMCS at `phys_addr` current and active (VMPTRLD).
    ///
    /// Panics if VMPTRLD reports failure.
    pub fn vmcs_load(phys_addr: PhysAddr) {
        match unsafe { x86::bits64::vmx::vmptrld(phys_addr.data() as u64) } {
            Ok(_) => {}
            Err(e) => {
                panic!("[VMX] vmptrld failed! reason: {e:?}");
            }
        }
    }

    /// vmwrite the current VMCS.
    ///
    /// Panics if the field cannot be written.
    pub fn vmx_vmwrite(vmcs_field: u32, value: u64) {
        unsafe {
            x86::bits64::vmx::vmwrite(vmcs_field, value)
                .unwrap_or_else(|_| panic!("vmcs_field: {:x} vmx_write fail", vmcs_field))
        }
    }

    /// vmread the current VMCS.
    ///
    /// Panics if the field cannot be read.
    pub fn vmx_vmread(vmcs_field: u32) -> u64 {
        unsafe { x86::bits64::vmx::vmread(vmcs_field).expect("vmx_read fail: ") }
    }

    /// Enter VMX root operation on this CPU.
    ///
    /// Sets CR4.VMXE, ensures the IA32_FEATURE_CONTROL lock bit permits
    /// VMXON outside SMX, forces the CR0/CR4 fixed bits, then executes
    /// VMXON with the given VMXON-region physical address.
    ///
    /// # Errors
    /// Returns `SystemError::EPERM` when the firmware locked the feature
    /// control MSR without allowing VMXON outside SMX.
    pub fn kvm_cpu_vmxon(phys_addr: PhysAddr) -> Result<(), SystemError> {
        unsafe {
            // CR4.VMXE must be set before VMXON is a legal instruction.
            let mut cr4 = cr4();
            cr4.insert(Cr4::CR4_ENABLE_VMX);
            cr4_write(cr4);

            Self::vmx_set_lock_bit()?;
            Self::vmx_set_cr0_bits();
            Self::vmx_set_cr4_bits();
            debug!("vmxon addr {phys_addr:?}");

            vmxon(phys_addr.data() as u64).expect("[VMX] vmxon failed! reason");
            barrier::mfence();
            Ok(())
        }
    }

    // INVVPID extent types.
    #[allow(dead_code)]
    const VMX_VPID_EXTENT_INDIVIDUAL_ADDR: u64 = 0;
    const VMX_VPID_EXTENT_SINGLE_CONTEXT: u64 = 1;
    #[allow(dead_code)]
    const VMX_VPID_EXTENT_ALL_CONTEXT: u64 = 2;
    #[allow(dead_code)]
    const VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: u64 = 3;

    // INVEPT extent types.
    #[allow(dead_code)]
    const VMX_EPT_EXTENT_CONTEXT: u64 = 1;
    const VMX_EPT_EXTENT_GLOBAL: u64 = 2;
    #[allow(dead_code)]
    const VMX_EPT_EXTENT_SHIFT: u64 = 24;

    /// Invalidate EPT-derived translations for every EPTP (global INVEPT).
    pub fn ept_sync_global() {
        Self::invept(Self::VMX_EPT_EXTENT_GLOBAL, 0, 0);
    }

    /// Invalidate EPT-derived translations for a single EPTP, falling back
    /// to a global invalidation when single-context INVEPT is unsupported.
    #[allow(dead_code)]
    pub fn ept_sync_context(eptp: u64) {
        if vmx_info().has_vmx_invept_context() {
            Self::invept(Self::VMX_EPT_EXTENT_CONTEXT, eptp, 0);
        } else {
            Self::ept_sync_global();
        }
    }

    /// Invalidate VPID-tagged translations for one VPID.
    /// VPID 0 is reserved for the host, so it is skipped.
    pub fn sync_vcpu_single(vpid: u16) {
        if vpid == 0 {
            return;
        }

        Self::invvpid(Self::VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0)
    }

    /// Invalidate VPID-tagged translations for all VPIDs.
    pub fn sync_vcpu_global() {
        Self::invvpid(Self::VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
    }

    /// Execute INVEPT with the given extent type.
    ///
    /// The descriptor is 128 bits: EPTP in bits 63:0, reserved (zero) in
    /// bits 127:64; `gpa` is kept for interface symmetry.
    #[inline(always)]
    fn invept(ext: u64, eptp: u64, gpa: u64) {
        #[repr(C)]
        struct InveptDescriptor {
            eptp: u64,
            gpa: u64,
        }

        let descriptor = InveptDescriptor { eptp, gpa };

        unsafe {
            asm!(
                "invept {0}, [{1}]",
                in(reg) ext,
                in(reg) &descriptor,
                options(nostack)
            );
        }
    }

    /// Execute INVVPID with the given extent type.
    ///
    /// The INVVPID descriptor must be exactly 128 bits: VPID in bits 15:0,
    /// bits 63:16 reserved and zero, and the guest-linear address in
    /// bits 127:64 (Intel SDM Vol. 3C, "INVVPID").
    #[inline(always)]
    fn invvpid(ext: u64, vpid: u16, gva: u64) {
        // BUGFIX: the previous `{ vpid: u16, rsvd: u64, gva: u64 }` layout
        // was 24 bytes under #[repr(C)]: 6 padding bytes with undefined
        // content sat inside the "must be zero" reserved region, and `gva`
        // landed at offset 16 instead of 8. Spell the reserved bytes out so
        // the struct is exactly 16 bytes with no padding and all reserved
        // bits are explicitly zero.
        #[repr(C)]
        struct InvvpidDescriptor {
            vpid: u16,
            rsvd: [u16; 3],
            gva: u64,
        }

        let descriptor = InvvpidDescriptor {
            vpid,
            rsvd: [0; 3],
            gva,
        };

        unsafe {
            asm!(
                "invvpid {0}, [{1}]",
                in(reg) ext,
                in(reg) &descriptor,
                options(nostack)
            );
        }
    }

    /// Set the mandatory bits in CR4 and clear bits that are mandatory zero
    /// (Intel Manual: 24.8 Restrictions on VMX Operation)
    fn vmx_set_cr4_bits() {
        let ia32_vmx_cr4_fixed0 = unsafe { rdmsr(IA32_VMX_CR4_FIXED0) };
        let ia32_vmx_cr4_fixed1 = unsafe { rdmsr(IA32_VMX_CR4_FIXED1) };

        let mut cr4 = unsafe { cr4() };

        cr4 |= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed0 as usize);
        cr4 &= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed1 as usize);

        unsafe { cr4_write(cr4) };
    }

    /// Check if we need to set bits in IA32_FEATURE_CONTROL
    // (Intel Manual: 24.7 Enabling and Entering VMX Operation)
    fn vmx_set_lock_bit() -> Result<(), SystemError> {
        const VMX_LOCK_BIT: u64 = 1 << 0;
        const VMXON_OUTSIDE_SMX: u64 = 1 << 2;

        let ia32_feature_control = unsafe { rdmsr(IA32_FEATURE_CONTROL) };
        if (ia32_feature_control & VMX_LOCK_BIT) == 0 {
            // MSR not locked yet: lock it ourselves while enabling VMXON
            // outside SMX.
            unsafe {
                wrmsr(
                    IA32_FEATURE_CONTROL,
                    VMXON_OUTSIDE_SMX | VMX_LOCK_BIT | ia32_feature_control,
                )
            };
        } else if (ia32_feature_control & VMXON_OUTSIDE_SMX) == 0 {
            // Locked by firmware without permitting VMXON: nothing we can do.
            return Err(SystemError::EPERM);
        }

        Ok(())
    }

    /// Set the mandatory bits in CR0 and clear bits that are mandatory zero
    /// (Intel Manual: 24.8 Restrictions on VMX Operation)
    fn vmx_set_cr0_bits() {
        let ia32_vmx_cr0_fixed0 = unsafe { rdmsr(IA32_VMX_CR0_FIXED0) };
        let ia32_vmx_cr0_fixed1 = unsafe { rdmsr(IA32_VMX_CR0_FIXED1) };

        let mut cr0 = unsafe { cr0() };

        cr0 |= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed0 as usize);
        cr0 &= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed1 as usize);

        unsafe { cr0_write(cr0) };
    }
}
/// Entry point called from the VM-enter assembly path: launches the
/// current VMCS. A failed VMLAUNCH is logged together with the
/// VM-instruction error field read back from the VMCS; this function
/// itself never panics.
#[no_mangle]
unsafe extern "C" fn vmx_vmlaunch() {
    match x86::bits64::vmx::vmlaunch() {
        Ok(()) => {}
        Err(e) => {
            error!(
                "vmx_launch fail: {:?}, err code {}",
                e,
                VmxAsm::vmx_vmread(ro::VM_INSTRUCTION_ERROR)
            );
        }
    }
}
bitflags! {
    /// Bits of the VM-exit interruption-information VMCS field.
    pub struct IntrInfo: u32 {
        /// Vector number of the interrupt/exception.
        const INTR_INFO_VECTOR_MASK = 0xff;
        /// Interruption type field (see `IntrType`).
        const INTR_INFO_INTR_TYPE_MASK = 0x700;
        /// An error code is delivered together with the event.
        const INTR_INFO_DELIVER_CODE_MASK = 0x800;
        /// NMI unblocking due to IRET.
        const INTR_INFO_UNBLOCK_NMI = 0x1000;
        /// The interruption-information field is valid.
        const INTR_INFO_VALID_MASK = 0x80000000;
        /// Reserved bits of the field.
        const INTR_INFO_RESVD_BITS_MASK = 0x7ffff000;
    }

    /// Interruption types, pre-shifted into the type field of the
    /// interruption-information format.
    pub struct IntrType: u32 {
        /// external interrupt
        const INTR_TYPE_EXT_INTR = (0 << 8);
        /// reserved
        const INTR_TYPE_RESERVED = (1 << 8);
        /// NMI
        const INTR_TYPE_NMI_INTR = (2 << 8);
        /// processor exception
        const INTR_TYPE_HARD_EXCEPTION = (3 << 8);
        /// software interrupt
        const INTR_TYPE_SOFT_INTR = (4 << 8);
        /// ICE breakpoint - undocumented
        const INTR_TYPE_PRIV_SW_EXCEPTION = (5 << 8);
        /// software exception
        const INTR_TYPE_SOFT_EXCEPTION = (6 << 8);
        /// other event
        const INTR_TYPE_OTHER_EVENT = (7 << 8);
    }

    /// Bits of the IA32_MISC_ENABLE MSR.
    pub struct MiscEnable: u64 {
        const MSR_IA32_MISC_ENABLE_FAST_STRING = 1 << 0;
        const MSR_IA32_MISC_ENABLE_TCC = 1 << 1;
        const MSR_IA32_MISC_ENABLE_EMON = 1 << 7;
        const MSR_IA32_MISC_ENABLE_BTS_UNAVAIL = 1 << 11;
        const MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL = 1 << 12;
        const MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP = 1 << 16;
        const MSR_IA32_MISC_ENABLE_MWAIT = 1 << 18;
        const MSR_IA32_MISC_ENABLE_LIMIT_CPUID = 1 << 22;
        const MSR_IA32_MISC_ENABLE_XTPR_DISABLE = 1 << 23;
        const MSR_IA32_MISC_ENABLE_XD_DISABLE = 1 << 34;
    }

    /// Bits of the IA32_ARCH_CAPABILITIES MSR.
    pub struct ArchCapabilities: u64 {
        /// Not susceptible to Meltdown
        const ARCH_CAP_RDCL_NO = 1 << 0;
        /// Enhanced IBRS support
        const ARCH_CAP_IBRS_ALL = 1 << 1;
        /// RET may use alternative branch predictors
        const ARCH_CAP_RSBA = 1 << 2;
        /// Skip L1D flush on vmentry
        const ARCH_CAP_SKIP_VMENTRY_L1DFLUSH = 1 << 3;
        /// Not susceptible to Speculative Store Bypass
        /// attack, so no Speculative Store Bypass
        /// control required.
        const ARCH_CAP_SSB_NO = 1 << 4;
        /// Not susceptible to
        /// Microarchitectural Data
        /// Sampling (MDS) vulnerabilities.
        const ARCH_CAP_MDS_NO = 1 << 5;
        /// The processor is not susceptible to a
        /// machine check error due to modifying the
        /// code page size along with either the
        /// physical address or cache type
        /// without TLB invalidation.
        const ARCH_CAP_PSCHANGE_MC_NO = 1 << 6;
        /// MSR for TSX control is available.
        const ARCH_CAP_TSX_CTRL_MSR = 1 << 7;
        /// Not susceptible to
        /// TSX Async Abort (TAA) vulnerabilities.
        const ARCH_CAP_TAA_NO = 1 << 8;
        /// Not susceptible to SBDR and SSDP
        /// variants of Processor MMIO stale data
        /// vulnerabilities.
        const ARCH_CAP_SBDR_SSDP_NO = 1 << 13;
        /// Not susceptible to FBSDP variant of
        /// Processor MMIO stale data
        /// vulnerabilities.
        const ARCH_CAP_FBSDP_NO = 1 << 14;
        /// Not susceptible to PSDP variant of
        /// Processor MMIO stale data
        /// vulnerabilities.
        const ARCH_CAP_PSDP_NO = 1 << 15;
        /// VERW clears CPU fill buffer
        /// even on MDS_NO CPUs.
        const ARCH_CAP_FB_CLEAR = 1 << 17;
        /// MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
        /// bit available to control VERW
        /// behavior.
        const ARCH_CAP_FB_CLEAR_CTRL = 1 << 18;
        /// Indicates RET may use predictors
        /// other than the RSB. With eIBRS
        /// enabled predictions in kernel mode
        /// are restricted to targets in
        /// kernel.
        const ARCH_CAP_RRSBA = 1 << 19;
        /// Not susceptible to Post-Barrier
        /// Return Stack Buffer Predictions.
        const ARCH_CAP_PBRSB_NO = 1 << 24;
        /// CPU is vulnerable to Gather
        /// Data Sampling (GDS) and
        /// has controls for mitigation.
        const ARCH_CAP_GDS_CTRL = 1 << 25;
        /// CPU is not vulnerable to Gather
        /// Data Sampling (GDS).
        const ARCH_CAP_GDS_NO = 1 << 26;
        /// IA32_XAPIC_DISABLE_STATUS MSR
        /// supported
        const ARCH_CAP_XAPIC_DISABLE = 1 << 21;

        /// Union of the capability bits that KVM passes through to guests.
        const KVM_SUPPORTED_ARCH_CAP = ArchCapabilities::ARCH_CAP_RDCL_NO.bits
            | ArchCapabilities::ARCH_CAP_IBRS_ALL.bits
            | ArchCapabilities::ARCH_CAP_RSBA.bits
            | ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH.bits
            | ArchCapabilities::ARCH_CAP_SSB_NO.bits
            | ArchCapabilities::ARCH_CAP_MDS_NO.bits
            | ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO.bits
            | ArchCapabilities::ARCH_CAP_TSX_CTRL_MSR.bits
            | ArchCapabilities::ARCH_CAP_TAA_NO.bits
            | ArchCapabilities::ARCH_CAP_SBDR_SSDP_NO.bits
            | ArchCapabilities::ARCH_CAP_FBSDP_NO.bits
            | ArchCapabilities::ARCH_CAP_PSDP_NO.bits
            | ArchCapabilities::ARCH_CAP_FB_CLEAR.bits
            | ArchCapabilities::ARCH_CAP_RRSBA.bits
            | ArchCapabilities::ARCH_CAP_PBRSB_NO.bits
            | ArchCapabilities::ARCH_CAP_GDS_NO.bits;
    }
}
/// Parameters for a single MSR read/write on a vCPU.
#[derive(Debug, Default, Clone)]
pub struct MsrData {
    // True when the access is initiated by the host side rather than the guest.
    pub host_initiated: bool,
    // MSR index.
    pub index: u32,
    // MSR value.
    pub data: u64,
}
/// One MSR entry: index, a reserved word, and the 64-bit value.
///
/// NOTE(review): the `repr(C, align(16))` layout (index, reserved, data)
/// looks like the VMX MSR-load/store list entry format — confirm against
/// the Intel SDM before relying on it.
#[repr(C, align(16))]
#[derive(Debug, Default, Copy, Clone)]
pub struct VmxMsrEntry {
    pub index: u32,
    pub reserved: u32,
    pub data: u64,
}
/// Hyper-V synthetic MSR and CPUID leaf numbers exposed to guests.
#[allow(dead_code)]
pub mod hyperv {
    /* Hyper-V specific model specific registers (MSRs) */

    /* MSR used to identify the guest OS. */
    pub const HV_X64_MSR_GUEST_OS_ID: u32 = 0x40000000;

    /* MSR used to setup pages used to communicate with the hypervisor. */
    pub const HV_X64_MSR_HYPERCALL: u32 = 0x40000001;

    /* MSR used to provide vcpu index */
    pub const HV_REGISTER_VP_INDEX: u32 = 0x40000002;

    /* MSR used to reset the guest OS. */
    pub const HV_X64_MSR_RESET: u32 = 0x40000003;

    /* MSR used to provide vcpu runtime in 100ns units */
    pub const HV_X64_MSR_VP_RUNTIME: u32 = 0x40000010;

    /* MSR used to read the per-partition time reference counter */
    pub const HV_REGISTER_TIME_REF_COUNT: u32 = 0x40000020;

    /* A partition's reference time stamp counter (TSC) page */
    pub const HV_REGISTER_REFERENCE_TSC: u32 = 0x40000021;

    /* MSR used to retrieve the TSC frequency */
    pub const HV_X64_MSR_TSC_FREQUENCY: u32 = 0x40000022;

    /* MSR used to retrieve the local APIC timer frequency */
    pub const HV_X64_MSR_APIC_FREQUENCY: u32 = 0x40000023;

    /* Define the virtual APIC registers */
    pub const HV_X64_MSR_EOI: u32 = 0x40000070;
    pub const HV_X64_MSR_ICR: u32 = 0x40000071;
    pub const HV_X64_MSR_TPR: u32 = 0x40000072;
    pub const HV_X64_MSR_VP_ASSIST_PAGE: u32 = 0x40000073;

    /* Define synthetic interrupt controller model specific registers. */
    pub const HV_REGISTER_SCONTROL: u32 = 0x40000080;
    pub const HV_REGISTER_SVERSION: u32 = 0x40000081;
    pub const HV_REGISTER_SIEFP: u32 = 0x40000082;
    pub const HV_REGISTER_SIMP: u32 = 0x40000083;
    pub const HV_REGISTER_EOM: u32 = 0x40000084;
    pub const HV_REGISTER_SINT0: u32 = 0x40000090;
    pub const HV_REGISTER_SINT1: u32 = 0x40000091;
    pub const HV_REGISTER_SINT2: u32 = 0x40000092;
    pub const HV_REGISTER_SINT3: u32 = 0x40000093;
    pub const HV_REGISTER_SINT4: u32 = 0x40000094;
    pub const HV_REGISTER_SINT5: u32 = 0x40000095;
    pub const HV_REGISTER_SINT6: u32 = 0x40000096;
    pub const HV_REGISTER_SINT7: u32 = 0x40000097;
    pub const HV_REGISTER_SINT8: u32 = 0x40000098;
    pub const HV_REGISTER_SINT9: u32 = 0x40000099;
    pub const HV_REGISTER_SINT10: u32 = 0x4000009A;
    pub const HV_REGISTER_SINT11: u32 = 0x4000009B;
    pub const HV_REGISTER_SINT12: u32 = 0x4000009C;
    pub const HV_REGISTER_SINT13: u32 = 0x4000009D;
    pub const HV_REGISTER_SINT14: u32 = 0x4000009E;
    pub const HV_REGISTER_SINT15: u32 = 0x4000009F;

    /*
     * Define synthetic interrupt controller model specific registers for
     * nested hypervisor.
     */
    pub const HV_REGISTER_NESTED_SCONTROL: u32 = 0x40001080;
    pub const HV_REGISTER_NESTED_SVERSION: u32 = 0x40001081;
    pub const HV_REGISTER_NESTED_SIEFP: u32 = 0x40001082;
    pub const HV_REGISTER_NESTED_SIMP: u32 = 0x40001083;
    pub const HV_REGISTER_NESTED_EOM: u32 = 0x40001084;
    pub const HV_REGISTER_NESTED_SINT0: u32 = 0x40001090;

    /*
     * Synthetic Timer MSRs. Four timers per vcpu.
     */
    pub const HV_REGISTER_STIMER0_CONFIG: u32 = 0x400000B0;
    pub const HV_REGISTER_STIMER0_COUNT: u32 = 0x400000B1;
    pub const HV_REGISTER_STIMER1_CONFIG: u32 = 0x400000B2;
    pub const HV_REGISTER_STIMER1_COUNT: u32 = 0x400000B3;
    pub const HV_REGISTER_STIMER2_CONFIG: u32 = 0x400000B4;
    pub const HV_REGISTER_STIMER2_COUNT: u32 = 0x400000B5;
    pub const HV_REGISTER_STIMER3_CONFIG: u32 = 0x400000B6;
    pub const HV_REGISTER_STIMER3_COUNT: u32 = 0x400000B7;

    /* Hyper-V guest idle MSR */
    pub const HV_X64_MSR_GUEST_IDLE: u32 = 0x400000F0;

    /* Hyper-V guest crash notification MSRs */
    pub const HV_REGISTER_CRASH_P0: u32 = 0x40000100;
    pub const HV_REGISTER_CRASH_P1: u32 = 0x40000101;
    pub const HV_REGISTER_CRASH_P2: u32 = 0x40000102;
    pub const HV_REGISTER_CRASH_P3: u32 = 0x40000103;
    pub const HV_REGISTER_CRASH_P4: u32 = 0x40000104;
    pub const HV_REGISTER_CRASH_CTL: u32 = 0x40000105;

    /* TSC emulation after migration */
    pub const HV_X64_MSR_REENLIGHTENMENT_CONTROL: u32 = 0x40000106;
    pub const HV_X64_MSR_TSC_EMULATION_CONTROL: u32 = 0x40000107;
    pub const HV_X64_MSR_TSC_EMULATION_STATUS: u32 = 0x40000108;

    /* TSC invariant control */
    pub const HV_X64_MSR_TSC_INVARIANT_CONTROL: u32 = 0x40000118;

    /*
     * The defines related to the synthetic debugger are required by KDNet, but
     * they are not documented in the Hyper-V TLFS because the synthetic debugger
     * functionality has been deprecated and is subject to removal in future
     * versions of Windows.
     */
    pub const HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: u32 = 0x40000080;
    pub const HYPERV_CPUID_SYNDBG_INTERFACE: u32 = 0x40000081;
    pub const HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: u32 = 0x40000082;

    /*
     * Hyper-V synthetic debugger platform capabilities
     * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits.
     */
    pub const HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING: u32 = 1 << 1;

    /* Hyper-V Synthetic debug options MSR */
    pub const HV_X64_MSR_SYNDBG_CONTROL: u32 = 0x400000F1;
    pub const HV_X64_MSR_SYNDBG_STATUS: u32 = 0x400000F2;
    pub const HV_X64_MSR_SYNDBG_SEND_BUFFER: u32 = 0x400000F3;
    pub const HV_X64_MSR_SYNDBG_RECV_BUFFER: u32 = 0x400000F4;
    pub const HV_X64_MSR_SYNDBG_PENDING_BUFFER: u32 = 0x400000F5;
    pub const HV_X64_MSR_SYNDBG_OPTIONS: u32 = 0x400000FF;
}
/// KVM paravirtual MSR numbers plus VMX control "always-on" default masks.
#[allow(dead_code)]
pub mod kvm_msr {
    // Legacy KVM clock MSRs.
    pub const MSR_KVM_WALL_CLOCK: u32 = 0x11;
    pub const MSR_KVM_SYSTEM_TIME: u32 = 0x12;

    /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
    pub const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b564d00;
    pub const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01;
    pub const MSR_KVM_ASYNC_PF_EN: u32 = 0x4b564d02;
    pub const MSR_KVM_STEAL_TIME: u32 = 0x4b564d03;
    pub const MSR_KVM_PV_EOI_EN: u32 = 0x4b564d04;
    pub const MSR_KVM_POLL_CONTROL: u32 = 0x4b564d05;
    pub const MSR_KVM_ASYNC_PF_INT: u32 = 0x4b564d06;
    pub const MSR_KVM_ASYNC_PF_ACK: u32 = 0x4b564d07;
    pub const MSR_KVM_MIGRATION_CONTROL: u32 = 0x4b564d08;

    // Default values for the VMX pin-based/CPU-based/exit/entry controls
    // when the IA32_VMX_TRUE_* capability MSRs are unavailable.
    pub const PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00000016;
    pub const CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x0401e172;
    pub const VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00036dff;
    pub const VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x000011ff;
}
/// The segment registers of a vCPU that can be read or written as a unit.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum VcpuSegment {
    ES,
    CS,
    SS,
    DS,
    FS,
    GS,
    TR,
    LDTR,
}
/// Per-segment fields tracked in the segment cache; the discriminants
/// double as array indices.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum SegmentCacheField {
    /// Segment selector.
    SEL = 0,
    /// Segment base address.
    BASE = 1,
    /// Segment limit.
    LIMIT = 2,
    /// Access rights.
    AR = 3,
    /// Number of cached fields — not a real field, used for sizing.
    NR = 4,
}

View File

@ -0,0 +1,59 @@
use alloc::vec::Vec;
/// A single cached CPUID leaf.
///
/// NOTE(review): the name and layout appear to mirror the Linux KVM UAPI
/// `struct kvm_cpuid_entry2` — confirm before using it across an ABI boundary.
#[derive(Debug, Default, Clone, Copy)]
#[allow(dead_code)]
pub struct KvmCpuidEntry2 {
    // CPUID function number (EAX input).
    pub function: u32,
    // CPUID sub-leaf (ECX input); only meaningful when
    // KVM_CPUID_FLAG_SIGNIFCANT_INDEX is set in `flags`.
    pub index: u32,
    pub flags: KvmCpuidFlag,
    // Output register values for this leaf.
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
    // Padding to keep the C-compatible layout.
    padding: [u32; 3],
}
impl KvmCpuidEntry2 {
    /// Find the cached CPUID entry matching `function` / `index`.
    ///
    /// An entry matches when its function number equals `function` and one of:
    /// - the entry does not declare its index as significant, or
    /// - `index` is `None` (the caller does not care about the sub-leaf), or
    /// - `index` is `Some(i)` and equals the entry's index.
    ///
    /// Returns a copy of the first matching entry, or `None`.
    ///
    /// The parameter was relaxed from `&Vec<KvmCpuidEntry2>` to
    /// `&[KvmCpuidEntry2]` — existing `&vec` call sites still work via
    /// deref coercion.
    pub fn find(
        entries: &[KvmCpuidEntry2],
        function: u32,
        index: Option<u32>,
    ) -> Option<KvmCpuidEntry2> {
        entries
            .iter()
            .find(|e| {
                e.function == function
                    && (!e
                        .flags
                        .contains(KvmCpuidFlag::KVM_CPUID_FLAG_SIGNIFCANT_INDEX)
                        || index.map_or(true, |i| e.index == i))
            })
            .copied()
    }
}
bitflags! {
    /// Per-entry flags for a cached CPUID leaf.
    pub struct KvmCpuidFlag: u32 {
        /// The input index (ECX) of this CPUID function is significant:
        /// it affects the function's behavior or return value.
        const KVM_CPUID_FLAG_SIGNIFCANT_INDEX = 1 << 0;
        /// The CPUID function is stateful: its behavior may depend on
        /// previous invocations.
        const KVM_CPUID_FLAG_STATEFUL_FUNC = 1 << 1;
        /// The function's state should be read on the next CPUID invocation.
        const KVM_CPUID_FLAG_STATE_READ_NEXT = 1 << 2;
    }
}
impl Default for KvmCpuidFlag {
fn default() -> Self {
Self::empty()
}
}

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,62 @@
use alloc::boxed::Box;
use crate::{
arch::kvm_arch_ops,
virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
};
/// Default physical base address of the local APIC MMIO window.
const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000;
/// IA32_APICBASE MSR number.
#[allow(dead_code)]
const MSR_IA32_APICBASE: u64 = 0x0000001b;
/// IA32_APICBASE flag: this processor is the bootstrap processor (BSP).
const MSR_IA32_APICBASE_BSP: u64 = 1 << 8;
/// IA32_APICBASE flag: APIC global enable.
const MSR_IA32_APICBASE_ENABLE: u64 = 1 << 11;
/// IA32_APICBASE: mask of the APIC base physical-address field.
#[allow(dead_code)]
const MSR_IA32_APICBASE_BASE: u64 = 0xfffff << 12;
/// In-kernel local APIC state for one vCPU.
#[derive(Debug)]
pub struct KvmLapic {
    // Whether APIC virtualization (APICv) is active for this vCPU.
    pub apicv_active: bool,
    // Backing storage for the APIC register page.
    pub regs: Box<[u8]>,
}
impl VirtCpu {
    /// Reset the vCPU's local APIC state.
    ///
    /// On a full (non-INIT) reset the APIC base MSR is restored to its
    /// power-on value, with the BSP bit set for the bootstrap vCPU.
    /// Register initialization past that point is still TODO and is only
    /// reached when an in-kernel APIC exists.
    pub fn lapic_reset(&mut self, vm: &Vm, init_event: bool) {
        kvm_arch_ops().apicv_pre_state_restore(self);

        if !init_event {
            // Power-on value: default MMIO base with the enable bit set.
            let mut msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
            if vm.arch.bsp_vcpu_id == self.vcpu_id {
                msr_val |= MSR_IA32_APICBASE_BSP;
            }
            self.lapic_set_base(msr_val);
        }

        if self.arch.apic.is_none() {
            return;
        }

        todo!()
    }

    /// Update the cached IA32_APICBASE value for this vCPU.
    ///
    /// NOTE(review): the tail of this function is `todo!()`, so it currently
    /// panics whenever an in-kernel APIC is present — confirm callers only
    /// reach it with `arch.apic == None`.
    fn lapic_set_base(&mut self, value: u64) {
        let old_val = self.arch.apic_base;
        let apic = self.arch.apic.as_ref();

        self.arch.apic_base = value;

        // Toggling the enable bit changes APIC-dependent CPUID leaves.
        if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 {
            // TODO: kvm_update_cpuid_runtime(vcpu);
        }

        if apic.is_none() {
            return;
        }

        if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 {
            // if value & MSR_IA32_APICBASE_ENABLE != 0 {}
        }

        todo!()
    }
}

View File

@ -0,0 +1,463 @@
use core::{fmt::Debug, sync::atomic::AtomicU32};
use alloc::{boxed::Box, vec::Vec};
use bit_field::BitField;
use bitmap::{traits::BitMapOps, AllocBitmap};
use system_error::SystemError;
use x86::{
bits64::rflags::RFlags,
controlregs::{Cr0, Cr4},
dtables::DescriptorTablePointer,
};
use x86_64::registers::control::EferFlags;
use crate::{
smp::cpu::ProcessorId,
virt::vm::{
kvm_host::{
vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID,
},
user_api::UapiKvmSegment,
},
};
use crate::arch::VirtCpuArch;
use super::{
asm::{MsrData, VcpuSegment, VmxMsrEntry},
vmx::{exit::ExitFastpathCompletion, vmx_info},
x86_kvm_manager, x86_kvm_ops,
};
pub mod lapic;
pub mod page;
pub mod vcpu;
#[allow(dead_code)]
pub const TSS_IOPB_BASE_OFFSET: usize = 0x66;
pub const TSS_BASE_SIZE: usize = 0x68;
pub const TSS_IOPB_SIZE: usize = 65536 / 8;
pub const TSS_REDIRECTION_SIZE: usize = 256 / 8;
pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1;
pub const KVM_PFN_NOSLOT: u64 = 0x1 << 63;
/// x86-specific, per-VM KVM state.
#[allow(dead_code)]
#[derive(Debug, Default)]
pub struct X86KvmArch {
    /// Interrupt-chip mode (none / fully in-kernel / split with userspace).
    pub irqchip_mode: KvmIrqChipMode,
    /// vCPU id of the bootstrap processor of this VM.
    bsp_vcpu_id: usize,
    pub pause_in_guest: bool,
    pub cstate_in_guest: bool,
    pub mwait_in_guest: bool,
    pub hlt_in_guest: bool,
    pub bus_lock_detection_enabled: bool,
    // Bitmap of registered IRQ source ids (see `X86KvmArch::init`).
    irq_sources_bitmap: u64,
    default_tsc_khz: u64,
    guest_can_read_msr_platform_info: bool,
    // Bit set of `KvmApicvInhibit` reasons; non-zero means APICv is inhibited.
    apicv_inhibit_reasons: usize,
    /// Upper bound (exclusive) for valid vCPU ids of this VM.
    pub max_vcpu_ids: usize,
    pub notify_vmexit_flags: NotifyVmExitFlags,
    pub notify_window: u32,
    // Optional userspace-installed MSR access filter.
    // NOTE(review): "fliter" is a typo for "filter", kept for compatibility.
    msr_fliter: Option<Box<KvmX86MsrFilter>>,
    pub noncoherent_dma_count: AtomicU32,
    pub active_mmu_pages: Vec<u64>,
    /// Maximum number of MMU shadow pages this VM may use.
    pub n_max_mmu_pages: usize,
    /// Number of MMU shadow pages currently in use.
    pub n_used_mmu_pages: usize,
}
impl X86KvmArch {
    /// Create the arch state for a new VM.
    ///
    /// Only `kvm_type == 0` is supported; anything else yields `EINVAL`.
    pub fn init(kvm_type: usize) -> Result<Self, SystemError> {
        if kvm_type != 0 {
            return Err(SystemError::EINVAL);
        }
        let mut arch = x86_kvm_ops().vm_init();

        // Register the built-in IRQ sources in the source bitmap.
        arch.irq_sources_bitmap
            .set_bit(KVM_USERSAPCE_IRQ_SOURCE_ID, true)
            .set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, true);

        arch.default_tsc_khz = x86_kvm_manager().max_tsc_khz;
        arch.guest_can_read_msr_platform_info = true;

        arch.apicv_init();
        Ok(arch)
    }

    /// Seed the APICv inhibit reasons: inhibited until an irqchip exists,
    /// and permanently disabled when VMX was set up without APICv support.
    fn apicv_init(&mut self) {
        self.apicv_inhibit_reasons
            .set_bit(KvmApicvInhibit::ABSENT, true);

        if !vmx_info().enable_apicv {
            self.apicv_inhibit_reasons
                .set_bit(KvmApicvInhibit::DISABLE, true);
        }
    }

    /// Check whether access of kind `ftype` to `msr` passes the userspace MSR
    /// filter. With no filter installed, everything is allowed.
    pub fn msr_allowed(&self, msr: u32, ftype: MsrFilterType) -> bool {
        // x2APIC MSRs are never filtered.
        if (0x800..=0x8ff).contains(&msr) {
            return true;
        }

        if let Some(msr_filter) = &self.msr_fliter {
            let mut allowed = msr_filter.default_allow;

            // First matching range wins; its bitmap decides per-MSR.
            for i in 0..msr_filter.count as usize {
                let range = &msr_filter.ranges[i];
                let start = range.base;
                let end = start + range.nmsrs;
                let flags = range.flags;
                let bitmap = &range.bitmap;

                if msr >= start && msr < end && flags.contains(ftype) {
                    allowed = bitmap.get((msr - start) as usize).unwrap_or(false);
                    break;
                }
            }

            return allowed;
        } else {
            return true;
        }
    }
}
/// Where the virtual interrupt chip is emulated.
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(dead_code)]
pub enum KvmIrqChipMode {
    /// No in-kernel interrupt chip.
    None,
    /// Fully in-kernel interrupt chip.
    Kernel,
    /// Split between kernel and userspace.
    Split,
}
impl Default for KvmIrqChipMode {
fn default() -> Self {
Self::None
}
}
/// One-time hardware bring-up hooks for a virtualization backend (e.g. VMX).
#[allow(dead_code)]
pub trait KvmInitFunc {
    /// Probe and configure hardware virtualization support.
    fn hardware_setup(&self) -> Result<(), SystemError>;
    /// Handler for Intel PT interrupts.
    fn handle_intel_pt_intr(&self) -> u32;
    /// The runtime operation table to install after setup succeeds.
    fn runtime_funcs(&self) -> &'static dyn KvmFunc;
}
/// Runtime operation table implemented by each virtualization backend.
///
/// NOTE(review): several method names spell "valid" as "vaild"
/// (`is_vaild_cr0`, `is_vaild_cr4`); the names are part of the trait's
/// interface and are kept as-is.
pub trait KvmFunc: Send + Sync + Debug {
    /// Name of the supported backend, e.g. "Vmx".
    fn name(&self) -> &'static str;

    /// Enable hardware virtualization support on the current CPU.
    fn hardware_enable(&self) -> Result<(), SystemError>;

    /// Build the arch state for a freshly created VM.
    fn vm_init(&self) -> X86KvmArch;

    fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError>;

    fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm);

    fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId);

    /// Load the root of the guest page tables (e.g. EPT pointer).
    fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, vm: &Vm, root_hpa: u64, root_level: u32);

    /// Refresh the cached value of `reg` from hardware state.
    fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg);

    fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu);

    fn set_msr(&self, vcpu: &mut VirtCpu, msr: MsrData) -> Result<(), SystemError>;

    fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: RFlags);

    fn get_rflags(&self, vcpu: &mut VirtCpu) -> RFlags;

    fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: Cr0);

    fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool;

    fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: Cr4);

    fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64);

    fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool;

    fn set_efer(&self, vcpu: &mut VirtCpu, efer: EferFlags);

    fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment);

    fn get_segment(
        &self,
        vcpu: &mut VirtCpu,
        var: UapiKvmSegment,
        seg: VcpuSegment,
    ) -> UapiKvmSegment;

    /// The vCPU is not used here; it is taken only so the caller proves it
    /// holds the vCPU lock.
    fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);

    fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);

    fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);

    fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);

    fn update_exception_bitmap(&self, vcpu: &mut VirtCpu);

    fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool);

    fn has_emulated_msr(&self, msr: u32) -> bool;

    fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool;

    fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu);

    fn flush_tlb_all(&self, vcpu: &mut VirtCpu);

    /// Run the guest until the next VM exit.
    fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion;

    /// Exit handling that must run with interrupts still disabled.
    fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu);

    /// Full VM-exit handling after `handle_exit_irqoff`.
    fn handle_exit(
        &self,
        vcpu: &mut VirtCpu,
        vm: &Vm,
        fastpath: ExitFastpathCompletion,
    ) -> Result<i32, SystemError>;
}
/// ## Bit positions of the reasons why APIC virtualization may be inhibited.
#[derive(Debug)]
pub struct KvmApicvInhibit;

#[allow(dead_code)]
impl KvmApicvInhibit {
    // Shared between Intel and AMD.

    /// APIC acceleration is disabled by a module parameter or is not
    /// supported by the hardware.
    pub const DISABLE: usize = 0;

    /// A Hyper-V guest is using the AutoEOI feature, so APIC acceleration is
    /// disabled.
    pub const HYPERV: usize = 1;

    /// Userspace has not yet enabled an in-kernel or split irqchip, so APIC
    /// acceleration is disabled.
    pub const ABSENT: usize = 2;

    /// KVM_GUESTDBG_BLOCKIRQ (a debug measure blocking all interrupts on the
    /// vCPU) is enabled, so AVIC/APICv must not bypass it.
    pub const BLOCKIRQ: usize = 3;

    /// The 1:1 mapping between APIC ids and vCPU ids was broken and KVM has
    /// not applied its x2APIC hotplug hack, so APIC acceleration is disabled.
    pub const PHYSICAL_ID_ALIASED: usize = 4;

    /// The vCPU's APIC id diverged from its reset value, so APIC acceleration
    /// was disabled.
    pub const APIC_ID_MODIFIED: usize = 5;

    /// The vCPU's APIC base diverged from its reset value, so APIC
    /// acceleration was disabled.
    pub const APIC_BASE_MODIFIED: usize = 6;

    // AMD-only reasons.

    /// AVIC is disabled while the vCPU runs a nested guest: unlike APICv, a
    /// sibling vCPU cannot use the doorbell to signal interrupts via AVIC in
    /// that case.
    pub const NESTED: usize = 7;

    /// On SVM, waiting for an IRQ window uses a pending virtual interrupt
    /// that cannot be injected during the wait, so AVIC is disabled while
    /// waiting for the IRQ window.
    pub const IRQWIN: usize = 8;

    /// The PIT (i8254) "reinject" mode relies on EOI interception, which AVIC
    /// does not support for edge-triggered interrupts.
    pub const PIT_REINJ: usize = 9;

    /// SEV does not support AVIC, so AVIC is disabled.
    pub const SEV: usize = 10;

    /// The 1:1 mapping between logical ids and vCPUs with a valid LDR was
    /// broken, so AVIC is disabled.
    pub const LOGICAL_ID_ALIASED: usize = 11;
}
/// Kernel-side representation of a userspace-installed MSR filter.
#[derive(Debug)]
pub struct KvmX86MsrFilter {
    /// Number of valid entries in `ranges`.
    count: u8,
    /// Verdict for MSRs not covered by any range.
    default_allow: bool,
    ranges: Vec<KernelMsrRange>,
}
/// One filtered MSR range: `nmsrs` MSRs starting at `base`, with a per-MSR
/// allow bitmap and the access kinds (`flags`) it applies to.
#[derive(Debug)]
pub struct KernelMsrRange {
    pub flags: MsrFilterType,
    pub nmsrs: u32,
    pub base: u32,
    pub bitmap: AllocBitmap,
}
/// C-layout MSR filter range as passed in from userspace; `bitmap` is a raw
/// pointer into the caller's address space.
#[repr(C)]
#[allow(dead_code)]
pub struct PosixMsrFilterRange {
    pub flags: u32,
    pub nmsrs: u32,
    pub base: u32,
    pub bitmap: *const u8,
}
bitflags! {
    /// Which MSR access kinds an MSR filter range applies to.
    pub struct MsrFilterType: u8 {
        const KVM_MSR_FILTER_READ  = 1 << 0;
        const KVM_MSR_FILTER_WRITE = 1 << 1;
    }

    /// Configuration of the notify-VM-exit feature.
    pub struct NotifyVmExitFlags: u8 {
        const KVM_X86_NOTIFY_VMEXIT_ENABLED = 1 << 0;
        const KVM_X86_NOTIFY_VMEXIT_USER = 1 << 1;
    }
}
impl Default for NotifyVmExitFlags {
fn default() -> Self {
NotifyVmExitFlags::empty()
}
}
/// Registers tracked by the per-vCPU register cache. Values 0-16 are the
/// general-purpose registers plus RIP; the `VcpuExreg*` variants are extra
/// cached state beyond the GPR array.
#[derive(Debug, Clone, Copy)]
pub enum KvmReg {
    VcpuRegsRax = 0,
    VcpuRegsRcx = 1,
    VcpuRegsRdx = 2,
    VcpuRegsRbx = 3,
    VcpuRegsRsp = 4,
    VcpuRegsRbp = 5,
    VcpuRegsRsi = 6,
    VcpuRegsRdi = 7,

    VcpuRegsR8 = 8,
    VcpuRegsR9 = 9,
    VcpuRegsR10 = 10,
    VcpuRegsR11 = 11,
    VcpuRegsR12 = 12,
    VcpuRegsR13 = 13,
    VcpuRegsR14 = 14,
    VcpuRegsR15 = 15,

    VcpuRegsRip = 16,
    /// Size of the GPR cache array, not a real register.
    NrVcpuRegs = 17,

    //VcpuExregPdptr = NrVcpuRegs,
    VcpuExregCr0,
    VcpuExregCr3,
    VcpuExregCr4,
    VcpuExregRflags,
    VcpuExregSegments,
    VcpuExregExitInfo1, //EXITINFO1 provides the linear address of the memory operand.
    VcpuExregExitInfo2, //EXITINFO2 provides the contents of the register operand.
}
bitflags! {
    /// Hidden vCPU state flags.
    pub struct HFlags: u8 {
        const HF_GUEST_MASK = 1 << 0; /* VCPU is in guest-mode */
        /// vCPU is in System Management Mode.
        const HF_SMM_MASK = 1 << 1;
        /// vCPU is inside an NMI handler while in SMM.
        const HF_SMM_INSIDE_NMI_MASK = 1 << 2;
    }
}
/// ### General-purpose register state of the guest, in a fixed C layout so it
/// can be exchanged with userspace as-is.
#[derive(Debug, Default, Clone, Copy)]
#[repr(C)]
pub struct KvmCommonRegs {
    rax: u64,
    rbx: u64,
    rcx: u64,
    rdx: u64,
    rsi: u64,
    rdi: u64,
    rsp: u64,
    rbp: u64,
    r8: u64,
    r9: u64,
    r10: u64,
    r11: u64,
    r12: u64,
    r13: u64,
    r14: u64,
    r15: u64,
    rip: u64,
    rflags: u64,
}
impl Vm {
    /// Validate `id` against this VM's vCPU-id limit (initializing the limit
    /// to its default on first use), then run the arch-specific
    /// pre-creation hook.
    pub fn vcpu_precreate(&mut self, id: usize) -> Result<(), SystemError> {
        if self.arch.max_vcpu_ids == 0 {
            // First vCPU of this VM: fall back to the default ceiling (4096).
            self.arch.max_vcpu_ids = 1024 * 4;
        }

        if id >= self.arch.max_vcpu_ids {
            return Err(SystemError::EINVAL);
        }

        x86_kvm_ops().vcpu_precreate(self)
    }
}
bitflags! {
    /// Flags controlling how an instruction emulation request is handled.
    pub struct EmulType: u32 {
        const NO_DECODE = 1 << 0;
        const TRAP_UD = 1 << 1;
        const SKIP = 1 << 2;
        const ALLOW_RETRY_PF = 1 << 3;
        const TRAP_UD_FORCED = 1 << 4;
        const VMWARE_GP = 1 << 5;
        /// Emulation was triggered by a page fault.
        const PF = 1 << 6;
        const COMPLETE_USER_EXIT = 1 << 7;
        /// The write fault targeted a shadow page table.
        const WRITE_PF_TO_SP = 1 << 8;
    }
}
#[allow(dead_code)]
#[derive(Default, Debug)]
/// Per-vCPU event counters (page faults, exits, injections, ...).
pub struct KvmVcpuStat {
    //pub generic: KvmVcpuStatGeneric,
    pub pf_taken: u64,
    pub pf_fixed: u64,
    pub pf_emulate: u64,
    pub pf_spurious: u64,
    pub pf_fast: u64,
    pub pf_mmio_spte_created: u64,
    pub pf_guest: u64,
    pub tlb_flush: u64,
    pub invlpg: u64,
    pub exits: u64,
    pub io_exits: u64,
    pub mmio_exits: u64,
    pub signal_exits: u64,
    pub irq_window_exits: u64,
    pub nmi_window_exits: u64,
    pub l1d_flush: u64,
    pub halt_exits: u64,
    pub request_irq_exits: u64,
    pub irq_exits: u64,
    pub host_state_reload: u64,
    pub fpu_reload: u64,
    pub insn_emulation: u64,
    pub insn_emulation_fail: u64,
    pub hypercalls: u64,
    pub irq_injections: u64,
    pub nmi_injections: u64,
    pub req_event: u64,
    pub nested_run: u64,
    pub directed_yield_attempted: u64,
    pub directed_yield_successful: u64,
    pub preemption_reported: u64,
    pub preemption_other: u64,
    pub guest_mode: u64,
    pub notify_window_exits: u64,
}
#[inline]
/// Convert a guest frame number (GFN) into a guest physical address (GPA).
/// A frame is 4 KiB, so this is a left shift by 12 bits.
pub fn gfn_to_gpa(gfn: u64) -> u64 {
    const GFN_SHIFT: u32 = 12;
    gfn << GFN_SHIFT
}
#[allow(dead_code)]
#[inline]
/// Convert a guest physical address (GPA) into a guest frame number (GFN).
///
/// The parameter was previously named `gfn`, which was misleading: the value
/// passed in is an address, not a frame number. Renaming it is
/// source-compatible for all callers.
pub fn gpa_to_gfn(gpa: u64) -> u64 {
    gpa >> 12
}

View File

@ -0,0 +1 @@
/// Minimum number of MMU shadow pages that must stay available; below this
/// threshold pages must be reclaimed before a vCPU can make progress.
pub const KVM_MIN_FREE_MMU_PAGES: usize = 5;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
use alloc::sync::Arc;
use log::warn;
use system_error::SystemError;
use crate::virt::vm::kvm_host::{
mem::{KvmMemoryChangeMode, LockedKvmMemSlot},
Vm,
};
/// x86-specific per-memslot state (currently empty; placeholder for rmaps,
/// lpage info, etc.).
#[allow(dead_code)]
pub struct KvmArchMemorySlot {}
impl Vm {
    /// Arch hook run before a memslot change is committed.
    ///
    /// Currently a stub: it only logs and accepts every change.
    pub fn arch_prepare_memory_region(
        &self,
        _old: Option<&Arc<LockedKvmMemSlot>>,
        _new: Option<&Arc<LockedKvmMemSlot>>,
        _change: KvmMemoryChangeMode,
    ) -> Result<(), SystemError> {
        // todo
        warn!("arch_prepare_memory_region TODO");
        Ok(())
    }
}

View File

@ -0,0 +1,648 @@
use crate::arch::mm::X86_64MMArch;
use crate::arch::vm::asm::VmxAsm;
use crate::arch::vm::kvm_host::page::KVM_MIN_FREE_MMU_PAGES;
use crate::mm::PhysAddr;
use crate::virt::kvm::host_mem::PAGE_SHIFT;
use crate::{
arch::{mm::LockedFrameAllocator, MMArch, VirtCpuArch},
libs::spinlock::{SpinLock, SpinLockGuard},
mm::{page::PageMapper, MemoryManagementArch, PageTableKind},
virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
};
use alloc::{sync::Arc, vec::Vec};
use bitfield_struct::bitfield;
use core::intrinsics::likely;
use core::ops::{Add, Sub};
use log::{debug, error, warn};
use raw_cpuid::CpuId;
use system_error::SystemError;
use x86::controlregs::{Cr0, Cr4};
use x86::vmx::vmcs::guest;
use x86_64::registers::control::EferFlags;
use super::super::{vmx::vmx_info, x86_kvm_ops};
use super::mmu_internal::KvmPageFault;
// Page-table root levels for the supported paging modes.
const PT64_ROOT_5LEVEL: usize = 5;
const PT64_ROOT_4LEVEL: usize = 4;
const PT32_ROOT_LEVEL: usize = 2;
const PT32E_ROOT_LEVEL: usize = 3;

// Global MMU configuration, written from `KvmMmu::kvm_configure_mmu` during
// VMX init and read without synchronization afterwards.
// NOTE(review): these are `static mut` with no locking — this is only sound
// if they are never written after readers start; confirm the init ordering.
static mut TDP_ENABLED: bool = false;
static mut TDP_MMU_ENABLED: bool = true;
static mut TDP_MMU_ALLOWED: bool = unsafe { TDP_MMU_ENABLED };

static mut TDP_ROOT_LEVEL: usize = 0;
static mut MAX_TDP_LEVEL: usize = 0;
static mut SHADOW_ACCESSED_MASK: usize = 0;

static mut MAX_HUGE_PAGE_LEVEL: PageLevel = PageLevel::None;

/// Guest page size (4 KiB).
pub const PAGE_SIZE: u64 = 1 << PAGE_SHIFT;

/// Whether the TDP (two-dimensional paging, i.e. EPT-backed) MMU is in use.
pub fn is_tdp_mmu_enabled() -> bool {
    unsafe { TDP_MMU_ENABLED }
}
/// Guest page-mapping levels, ordered from smallest to largest page size.
#[allow(dead_code)]
#[repr(u8)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum PageLevel {
    /// Not a real level; used as a sentinel.
    None,
    Level4K,
    Level2M,
    Level1G,
    Level512G,
    /// Number of levels; also serves as an upper sentinel.
    LevelNum,
}
// Raising a level: any result beyond Level512G collapses to LevelNum.
impl Add<usize> for PageLevel {
    type Output = Self;

    fn add(self, rhs: usize) -> Self {
        match self as usize + rhs {
            0 => PageLevel::None,
            1 => PageLevel::Level4K,
            2 => PageLevel::Level2M,
            3 => PageLevel::Level1G,
            4 => PageLevel::Level512G,
            _ => PageLevel::LevelNum,
        }
    }
}
// Lowering a level: any result at or below zero collapses to None.
impl Sub<usize> for PageLevel {
    type Output = Self;

    fn sub(self, rhs: usize) -> Self {
        match self as isize - rhs as isize {
            1 => PageLevel::Level4K,
            2 => PageLevel::Level2M,
            3 => PageLevel::Level1G,
            4 => PageLevel::Level512G,
            5 => PageLevel::LevelNum,
            _ => PageLevel::None,
        }
    }
}
impl PageLevel {
    /// Number of GFN bits spanned below `level` (9 bits per level).
    /// NOTE(review): `level == 0` underflows `level - 1` (panic in debug
    /// builds) — confirm callers always pass `level >= 1`.
    fn kvm_hpage_gfn_shift(level: u8) -> u32 {
        ((level - 1) * 9) as u32
    }

    /// Shift from a byte address to the start of a `level`-sized huge page.
    fn kvm_hpage_shift(level: u8) -> u32 {
        PAGE_SHIFT + Self::kvm_hpage_gfn_shift(level)
    }

    /// Size in bytes of a huge page at `level`.
    fn kvm_hpage_size(level: u8) -> u64 {
        1 << Self::kvm_hpage_shift(level)
    }

    /// Compute how many 4 KiB pages one huge page contains.
    ///
    /// # Arguments
    /// - `level`: the page level
    ///
    /// # Returns
    /// The number of base pages per huge page at that level.
    pub fn kvm_pages_per_hpage(level: u8) -> u64 {
        Self::kvm_hpage_size(level) / PAGE_SIZE
    }
}
/// Round `gfn` down to the first frame of the huge page containing it at the
/// given `level`.
pub fn gfn_round_for_level(gfn: u64, level: u8) -> u64 {
    gfn & !(PageLevel::kvm_pages_per_hpage(level) - 1)
}
/// A [`KvmMmu`] protected by a spinlock and shared via `Arc`.
#[derive(Debug)]
pub struct LockedKvmMmu {
    inner: SpinLock<KvmMmu>,
}
impl LockedKvmMmu {
    /// Wrap `mmu` in a spinlock and return a shared handle to it.
    pub fn new(mmu: KvmMmu) -> Arc<Self> {
        let inner = SpinLock::new(mmu);
        Arc::new(Self { inner })
    }

    /// Acquire the spinlock guarding the wrapped [`KvmMmu`].
    pub fn lock(&self) -> SpinLockGuard<KvmMmu> {
        self.inner.lock()
    }
}
/// Handler invoked to resolve a guest page fault; returns a `PFRet`-style
/// `i32` on success.
pub type KvmMmuPageFaultHandler =
    fn(vcpu: &mut VirtCpu, page_fault: &KvmPageFault) -> Result<i32, SystemError>;
/// Per-vCPU MMU context: current root, cached roles, and PAE/PDPTR state.
#[derive(Debug, Default)]
#[allow(dead_code)]
pub struct KvmMmu {
    /// Currently active root page table.
    pub root: KvmMmuRootInfo,
    /// Role derived from the guest's CR0/CR4/EFER configuration.
    pub cpu_role: KvmCpuRole,
    /// Role of the root shadow/TDP page.
    pub root_role: KvmMmuPageRole,
    /// Page-fault handler for this MMU mode, if installed.
    pub page_fault: Option<KvmMmuPageFaultHandler>,
    pkru_mask: u32,
    // Previously used roots, kept to make root switches cheap.
    prev_roots: [KvmMmuRootInfo; Self::KVM_MMU_NUM_PREV_ROOTS],
    // Backing storage for the PAE root table (allocated only when needed).
    pae_root: Vec<u64>,
    /// Cached guest PDPTEs (PAE paging).
    pub pdptrs: [u64; 4],
}
impl KvmMmu {
    /// Refresh the cached guest PDPTEs from the current VMCS.
    pub fn _save_pdptrs(&mut self) {
        self.pdptrs[0] = VmxAsm::vmx_vmread(guest::PDPTE0_FULL);
        self.pdptrs[1] = VmxAsm::vmx_vmread(guest::PDPTE1_FULL);
        self.pdptrs[2] = VmxAsm::vmx_vmread(guest::PDPTE2_FULL);
        self.pdptrs[3] = VmxAsm::vmx_vmread(guest::PDPTE3_FULL);
    }

    const KVM_MMU_NUM_PREV_ROOTS: usize = 3;
    /// Sentinel marking an unset/invalid root page address.
    pub const INVALID_PAGE: u64 = u64::MAX;

    #[inline]
    pub fn tdp_enabled() -> bool {
        unsafe { TDP_ENABLED }
    }

    #[inline]
    pub fn tdp_root_level() -> usize {
        unsafe { TDP_ROOT_LEVEL }
    }

    #[inline]
    pub fn max_tdp_level() -> usize {
        unsafe { MAX_TDP_LEVEL }
    }

    /// Whether accessed/dirty bit tracking is enabled.
    #[inline]
    pub fn ad_enabled() -> bool {
        unsafe { SHADOW_ACCESSED_MASK != 0 }
    }

    /// Initialize the global MMU configuration. Because the backing statics
    /// are unsynchronized, this must only be called while initializing VMX.
    pub fn kvm_configure_mmu(
        enable_tdp: bool,
        tdp_forced_root_level: usize,
        tdp_max_root_level: usize,
        tdp_huge_page_level: PageLevel,
    ) {
        unsafe {
            TDP_ENABLED = enable_tdp;
            TDP_ROOT_LEVEL = tdp_forced_root_level;
            MAX_TDP_LEVEL = tdp_max_root_level;

            TDP_MMU_ENABLED = TDP_MMU_ALLOWED && TDP_ENABLED;

            // Without TDP, the max huge-page level comes from host CPU
            // capabilities (1 GiB pages if supported, else 2 MiB).
            if TDP_ENABLED {
                MAX_HUGE_PAGE_LEVEL = tdp_huge_page_level;
            } else if CpuId::new()
                .get_extended_processor_and_feature_identifiers()
                .unwrap()
                .has_1gib_pages()
            {
                MAX_HUGE_PAGE_LEVEL = PageLevel::Level1G;
            } else {
                MAX_HUGE_PAGE_LEVEL = PageLevel::Level2M;
            }
        }
    }
}
/// A root page table: the guest pgd it translates and the host physical
/// address of the root page.
#[derive(Debug, Default)]
pub struct KvmMmuRootInfo {
    pub pgd: u64,
    pub hpa: u64,
}
/// Complete description of the guest's paging configuration: the base page
/// role plus extended bits that do not fit in [`KvmMmuPageRole`].
#[derive(Debug, Default, Clone, Copy)]
pub struct KvmCpuRole {
    base: KvmMmuPageRole,
    extend: KvmMmuExtenedRole,
}
impl PartialEq for KvmCpuRole {
    /// Two roles are equal when both packed bitfields match bit-for-bit.
    fn eq(&self, other: &Self) -> bool {
        (self.base.0, self.extend.0) == (other.base.0, other.extend.0)
    }
}
/// ### Tracks the properties of a shadow page (including TDP pages) to decide
/// whether the page can be reused in a given MMU context.
#[bitfield(u32)]
pub struct KvmMmuPageRole {
    /// Page-table level, 4 bits. For regular page tables the values are
    /// 2 (two-level), 3, 4 and 5 (five-level paging).
    #[bits(4)]
    pub level: u32,
    /// Whether guest PTEs are 4 bytes wide (1 in non-PAE paging).
    has_4_byte_gpte: bool,
    /// Quadrant of the page table this page shadows, 2 bits; only meaningful
    /// when `has_4_byte_gpte` is set.
    #[bits(2)]
    quadrant: u32,
    /// Whether the page maps guest memory directly.
    direct: bool,
    /// Access permissions of the page, 3 bits.
    #[bits(3)]
    access: u32,
    /// Whether the page is invalid.
    invalid: bool,
    /// Whether NX (no-execute) is in effect for this page.
    efer_nx: bool,
    /// Whether CR0.WP (write protect) is set.
    cr0_wp: bool,
    /// SMEP (Supervisor Mode Execution Protection) combined with !CR0.WP.
    smep_andnot_wp: bool,
    /// SMAP (Supervisor Mode Access Prevention) combined with !CR0.WP.
    smap_andnot_wp: bool,
    /// Whether accessed/dirty bits are disabled for this page.
    ad_disabled: bool,
    /// Whether the page belongs to guest (nested) mode.
    guest_mode: bool,
    /// Whether this page is passed through to the guest.
    passthrough: bool,
    /// Unused bits.
    #[bits(5)]
    unused: u32,
    /// SMM (System Management Mode) indicator.
    #[bits(8)]
    pub smm: u32,
}
impl KvmMmuPageRole {
    /// Paging (CR0.PG) is considered enabled whenever a level is set.
    pub fn is_cr0_pg(&self) -> bool {
        self.level() > 0
    }

    /// PAE mode is implied by 8-byte (non-4-byte) guest PTEs.
    pub fn is_cr4_pae(&self) -> bool {
        !self.has_4_byte_gpte()
    }

    /// Whether this page maps guest memory directly.
    pub fn get_direct(&self) -> bool {
        self.direct()
    }
}
/// Extra role bits derived from CR4/EFER that do not fit in
/// [`KvmMmuPageRole`].
/// NOTE(review): "Extened" is a typo for "Extended", kept because the name is
/// referenced elsewhere.
#[bitfield(u32)]
pub struct KvmMmuExtenedRole {
    valid: bool,
    execonly: bool,
    cr4_pse: bool,
    cr4_pke: bool,
    cr4_smap: bool,
    cr4_smep: bool,
    cr4_la57: bool,
    efer_lma: bool,
    #[bits(24)]
    unused: u32,
}
/// Snapshot of the control registers that determine the MMU role.
pub struct KvmMmuRoleRegs {
    pub cr0: Cr0,
    pub cr4: Cr4,
    pub efer: EferFlags,
}
/// Return values of page-fault handling, used by handle_mmio_page_fault(),
/// mmu.page_fault(), fast_page_fault(), kvm_mmu_do_page_fault() and friends.
#[derive(Debug, Eq, PartialEq, FromPrimitive, Clone)]
#[repr(u32)]
pub enum PFRet {
    /// Everything is fine so far; keep handling the fault.
    Continue,
    /// Let the CPU fault again on the same address.
    Retry,
    /// MMIO page fault: emulate the instruction directly.
    Emulate,
    /// The SPTE is invalid; let the real page-fault path update it.
    Invalid,
    /// The faulting entry has been fixed.
    Fixed,
    /// The faulting entry was already fixed, e.g. by another vCPU.
    Spurious,
    /// Error; becomes -1 when converted to i32.
    Err = u32::MAX,
}
impl From<PFRet> for i32 {
    /// Cast the discriminant. `PFRet::Err` is `u32::MAX`, which wraps to
    /// `-1` here — the negative value callers test for failure.
    fn from(pf_ret: PFRet) -> Self {
        pf_ret as i32
    }
}
impl From<i32> for PFRet {
    fn from(value: i32) -> Self {
        match value {
            0 => PFRet::Continue,
            1 => PFRet::Retry,
            2 => PFRet::Emulate,
            3 => PFRet::Invalid,
            4 => PFRet::Fixed,
            5 => PFRet::Spurious,
            // Anything else (including negatives) maps to Err.
            _ => PFRet::Err,
        }
    }
}
impl VirtCpuArch {
    /// (Re)initialize the vCPU's MMU for the guest's current paging mode.
    /// Only the TDP path is implemented; nested and shadow paging are TODO.
    pub fn kvm_init_mmu(&mut self) {
        let regs = self.role_regs();
        let cpu_role = self.calc_cpu_role(&regs);

        if self.walk_mmu.is_some()
            && self.nested_mmu.is_some()
            && Arc::ptr_eq(
                self.walk_mmu.as_ref().unwrap(),
                self.nested_mmu.as_ref().unwrap(),
            )
        {
            todo!()
        } else if KvmMmu::tdp_enabled() {
            self.init_tdp_mmu(cpu_role);
        } else {
            todo!()
        }
    }

    /// Tear down the current MMU state (stub).
    fn unload_mmu(&mut self) {
        // TODO
    }

    /// Drop the current MMU context and rebuild it from scratch.
    pub fn reset_mmu_context(&mut self) {
        self.unload_mmu();
        self.kvm_init_mmu();
    }

    /// Snapshot the CR0/CR4/EFER bits that influence the MMU role.
    fn role_regs(&mut self) -> KvmMmuRoleRegs {
        KvmMmuRoleRegs {
            cr0: self.read_cr0_bits(Cr0::CR0_ENABLE_PAGING | Cr0::CR0_WRITE_PROTECT),
            cr4: self.read_cr4_bits(
                Cr4::CR4_ENABLE_PSE
                    | Cr4::CR4_ENABLE_PAE
                    | Cr4::CR4_ENABLE_LA57
                    | Cr4::CR4_ENABLE_SMEP
                    | Cr4::CR4_ENABLE_SMAP
                    | Cr4::CR4_ENABLE_PROTECTION_KEY,
            ),
            efer: self.efer,
        }
    }

    /// Derive the full CPU role from the register snapshot. With paging
    /// disabled only the direct bit matters; otherwise the level and the
    /// CR0/CR4/EFER-derived bits are filled in.
    fn calc_cpu_role(&self, regs: &KvmMmuRoleRegs) -> KvmCpuRole {
        let mut role = KvmCpuRole::default();
        let base = &mut role.base;
        let ext = &mut role.extend;
        base.set_access(0b111);
        base.set_smm(self.is_smm() as u32);
        base.set_guest_mode(self.is_guest_mode());
        ext.set_valid(true);

        // No paging: a direct role with everything else left at defaults.
        if !regs.cr0.contains(Cr0::CR0_ENABLE_PAGING) {
            base.set_direct(true);
            return role;
        }

        base.set_efer_nx(regs.efer.contains(EferFlags::NO_EXECUTE_ENABLE));
        base.set_cr0_wp(regs.cr0.contains(Cr0::CR0_WRITE_PROTECT));
        base.set_smep_andnot_wp(
            regs.cr4.contains(Cr4::CR4_ENABLE_SMEP) && !regs.cr0.contains(Cr0::CR0_WRITE_PROTECT),
        );
        base.set_smap_andnot_wp(
            regs.cr4.contains(Cr4::CR4_ENABLE_SMAP) && !regs.cr0.contains(Cr0::CR0_WRITE_PROTECT),
        );
        base.set_has_4_byte_gpte(!regs.cr4.contains(Cr4::CR4_ENABLE_PAE));

        // Level: long mode is 4 or 5 levels (LA57), PAE is 3, legacy is 2.
        if regs.efer.contains(EferFlags::LONG_MODE_ACTIVE) {
            let level = if regs.cr4.contains(Cr4::CR4_ENABLE_LA57) {
                PT64_ROOT_5LEVEL as u32
            } else {
                PT64_ROOT_4LEVEL as u32
            };
            base.set_level(level);
        } else if regs.cr4.contains(Cr4::CR4_ENABLE_PAE) {
            base.set_level(PT32E_ROOT_LEVEL as u32);
        } else {
            base.set_level(PT32_ROOT_LEVEL as u32);
        }

        ext.set_cr4_smep(regs.cr4.contains(Cr4::CR4_ENABLE_SMEP));
        ext.set_cr4_smap(regs.cr4.contains(Cr4::CR4_ENABLE_SMAP));
        ext.set_cr4_pse(regs.cr4.contains(Cr4::CR4_ENABLE_PSE));
        // PKEY and LA57 are only meaningful in long mode.
        ext.set_cr4_pke(
            regs.efer.contains(EferFlags::LONG_MODE_ACTIVE)
                && regs.cr4.contains(Cr4::CR4_ENABLE_PROTECTION_KEY),
        );
        ext.set_cr4_la57(
            regs.efer.contains(EferFlags::LONG_MODE_ACTIVE)
                && regs.cr4.contains(Cr4::CR4_ENABLE_LA57),
        );
        ext.set_efer_lma(regs.efer.contains(EferFlags::LONG_MODE_ACTIVE));

        role
    }

    /// Allocate the vCPU's MMU contexts.
    /// NOTE(review): `guset_mmu` is a typo for "guest_mmu" in the field name;
    /// kept because the field is declared elsewhere.
    /// https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/mmu/mmu.c#6019
    pub fn vcpu_arch_mmu_create(&mut self) {
        if vmx_info().tdp_enabled() {
            self.guset_mmu = Some(self._mmu_create());
        }

        self.root_mmu = Some(self._mmu_create());
        self.mmu = self.root_mmu.clone();
        self.walk_mmu = self.root_mmu.clone();
    }

    /// Build a fresh MMU with invalid roots; the PAE root table is only
    /// allocated when it can actually be needed (no TDP or low TDP level).
    fn _mmu_create(&self) -> Arc<LockedKvmMmu> {
        let mut mmu = KvmMmu::default();
        mmu.root.hpa = KvmMmu::INVALID_PAGE;
        mmu.root.pgd = 0;

        for role in &mut mmu.prev_roots {
            role.hpa = KvmMmu::INVALID_PAGE;
            role.pgd = KvmMmu::INVALID_PAGE;
        }

        if KvmMmu::tdp_enabled() && self.mmu_get_tdp_level() > PT32E_ROOT_LEVEL {
            return LockedKvmMmu::new(mmu);
        }

        mmu.pae_root
            .resize(MMArch::PAGE_SIZE / core::mem::size_of::<u64>(), 0);

        return LockedKvmMmu::new(mmu);
    }

    /// Pick the TDP root level: forced level if configured, otherwise the
    /// maximum, clamped to 4 when the CPU cannot address beyond 48 bits.
    fn mmu_get_tdp_level(&self) -> usize {
        if KvmMmu::tdp_root_level() != 0 {
            return KvmMmu::tdp_root_level();
        }

        if KvmMmu::max_tdp_level() == 5 && self.max_phyaddr <= 48 {
            return 4;
        }

        return KvmMmu::max_tdp_level();
    }

    /// Configure the root MMU for TDP mode; no-op when the roles are
    /// unchanged.
    pub fn init_tdp_mmu(&mut self, cpu_role: KvmCpuRole) {
        let context = self.root_mmu();
        let mut context = context.lock();

        let root_role = self.calc_tdp_mmu_root_page_role(cpu_role);

        if cpu_role == context.cpu_role && root_role.0 == context.root_role.0 {
            return;
        }

        context.cpu_role = cpu_role;
        context.root_role = root_role;

        // TODO: install the gva->gpa translation functions.
        if !context.cpu_role.base.is_cr0_pg() {
            // todo: context->gva_to_gpa = nonpaging_gva_to_gpa;
            warn!("context->gva_to_gpa = nonpaging_gva_to_gpa todo!");
        } else if context.cpu_role.base.is_cr4_pae() {
            // todo: context->gva_to_gpa = paging64_gva_to_gpa;
            warn!("context->gva_to_gpa = paging64_gva_to_gpa todo!");
        } else {
            // todo: context->gva_to_gpa = paging32_gva_to_gpa;
            warn!("context->gva_to_gpa = paging32_gva_to_gpa todo!");
        }

        // todo:
        // reset_guest_paging_metadata(vcpu, context);
        // reset_tdp_shadow_zero_bits_mask(context);
    }

    /// The root MMU handle; panics if `vcpu_arch_mmu_create` has not run.
    #[inline]
    pub fn root_mmu(&self) -> &Arc<LockedKvmMmu> {
        self.root_mmu.as_ref().unwrap()
    }

    /// Lock and return the currently active MMU.
    #[inline]
    pub fn mmu(&self) -> SpinLockGuard<KvmMmu> {
        self.mmu.as_ref().unwrap().lock()
    }

    /// Role of the TDP root page: always direct, with SMM/guest-mode and the
    /// chosen TDP level carried over from the CPU role.
    fn calc_tdp_mmu_root_page_role(&self, cpu_role: KvmCpuRole) -> KvmMmuPageRole {
        let mut role = KvmMmuPageRole::default();

        role.set_access(0b111);
        role.set_cr0_wp(true);
        role.set_efer_nx(true);
        role.set_smm(cpu_role.base.smm());
        role.set_guest_mode(cpu_role.base.guest_mode());
        role.set_ad_disabled(!KvmMmu::ad_enabled());
        role.set_level(self.mmu_get_tdp_level() as u32);
        role.set_direct(true);
        role.set_has_4_byte_gpte(false);

        role
    }
}
impl VirtCpu {
    /// Ensure a valid MMU root is loaded, (re)building it if necessary.
    pub fn kvm_mmu_reload(&mut self, vm: &Vm) -> Result<(), SystemError> {
        if likely(self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE) {
            return Ok(());
        }

        return self.kvm_mmu_load(vm);
    }

    /// Allocate the MMU roots (direct for TDP, shadow otherwise) and load the
    /// root page table into hardware.
    pub fn kvm_mmu_load(&mut self, vm: &Vm) -> Result<(), SystemError> {
        let direct = self.arch.mmu().root_role.direct();
        self.mmu_topup_memory_caches(!direct)?;
        self.mmu_alloc_special_roots()?;

        if direct {
            self.mmu_alloc_direct_roots(vm)?;
        } else {
            self.mmu_alloc_shadow_roots(vm)?;
        }

        // TODO: kvm_mmu_sync_roots

        self.kvm_mmu_load_pgd(vm);

        Ok(())
    }

    /// Point hardware (e.g. the EPT pointer) at the current root, if valid.
    pub fn kvm_mmu_load_pgd(&mut self, vm: &Vm) {
        let root_hpa = self.arch.mmu().root.hpa;
        debug!("kvm_mmu_load_pgd::root_hpa = {:#x}", root_hpa);
        if root_hpa == KvmMmu::INVALID_PAGE {
            return;
        }

        let level = self.arch.mmu().root_role.level();
        x86_kvm_ops().load_mmu_pgd(self, vm, root_hpa, level);
    }

    /// Pre-fill the MMU memory caches (stub).
    fn mmu_topup_memory_caches(&mut self, _maybe_indirect: bool) -> Result<(), SystemError> {
        // TODO
        Ok(())
    }

    /// Allocate special (PAE/PML4 helper) roots (stub).
    fn mmu_alloc_special_roots(&mut self) -> Result<(), SystemError> {
        // TODO
        Ok(())
    }

    /// Allocate the root for a direct (TDP) MMU. Only the TDP path is
    /// implemented; shadow levels are TODO.
    fn mmu_alloc_direct_roots(&mut self, vm: &Vm) -> Result<(), SystemError> {
        let shadow_root_level = self.arch.mmu().root_role.level();
        // NOTE(review): the result of make_mmu_pages_available is discarded
        // here, so an ENOSPC from it is silently ignored — confirm intent.
        let _r: Result<(), SystemError> = self.make_mmu_pages_available(vm);
        let root: PhysAddr;
        if KvmMmu::tdp_enabled() {
            root = self.kvm_tdp_mmu_get_vcpu_root_hpa().unwrap();
            let mut mmu = self.arch.mmu();
            mmu.root.hpa = root.data() as u64;
        } else if shadow_root_level >= PT64_ROOT_4LEVEL as u32 {
            todo!()
        } else if shadow_root_level == PT32E_ROOT_LEVEL as u32 {
            todo!()
        } else {
            error!("Bad TDP root level = {}", shadow_root_level);
            return Err(SystemError::EIO);
        }
        /* root.pgd is ignored for direct MMUs. */
        self.arch.mmu().root.pgd = 0;
        Ok(())
    }

    fn mmu_alloc_shadow_roots(&mut self, _vm: &Vm) -> Result<(), SystemError> {
        todo!();
    }

    /// Make sure enough MMU pages are free, reclaiming (TODO) if below the
    /// minimum.
    fn make_mmu_pages_available(&mut self, vm: &Vm) -> Result<(), SystemError> {
        let avail = Self::kvm_mmu_available_pages(vm);

        if likely(avail >= KVM_MIN_FREE_MMU_PAGES) {
            return Ok(());
        }

        //kvm_mmu_zap_oldest_mmu_pages(vm, KVM_REFILL_PAGES - avail);

        if Self::kvm_mmu_available_pages(vm) == 0 {
            return Err(SystemError::ENOSPC);
        }

        Ok(())
    }

    /// Remaining MMU page budget of the VM (0 when over budget).
    fn kvm_mmu_available_pages(vm: &Vm) -> usize {
        if vm.arch.n_max_mmu_pages > vm.arch.n_used_mmu_pages {
            return vm.arch.n_max_mmu_pages - vm.arch.n_used_mmu_pages;
        }

        return 0;
    }

    fn kvm_tdp_mmu_get_vcpu_root_hpa(&self) -> Result<PhysAddr, SystemError> {
        //todo Check for an existing root before allocating a new one. Note, the
        // role check prevents consuming an invalid root.
        let root = self.tdp_mmu_alloc_sp().unwrap();
        Ok(PhysAddr::new(root as usize))
    }

    /// Allocate a fresh EPT root page table and record its HPA as the root.
    fn tdp_mmu_alloc_sp(&self) -> Result<u64, SystemError> {
        // Allocate and create a new page table.
        let mapper: crate::mm::page::PageMapper<X86_64MMArch, LockedFrameAllocator> = unsafe {
            PageMapper::create(PageTableKind::EPT, LockedFrameAllocator)
                .ok_or(SystemError::ENOMEM)?
        };
        let ept_root_hpa = mapper.table().phys();

        self.arch.mmu().root.hpa = ept_root_hpa.data() as u64;
        debug!("ept_root_hpa:{:x}!", ept_root_hpa.data() as u64);
        return Ok(self.arch.mmu().root.hpa);
    }
}

View File

@ -0,0 +1,396 @@
use crate::mm::page::EntryFlags;
use alloc::sync::Arc;
use core::{intrinsics::unlikely, ops::Index};
use log::{debug, warn};
use x86::vmx::vmcs::{guest, host};
use system_error::SystemError;
use crate::{
arch::{
vm::{
asm::VmxAsm,
kvm_host::{EmulType, KVM_PFN_NOSLOT},
mmu::kvm_mmu::{PFRet, PageLevel},
mtrr::kvm_mtrr_check_gfn_range_consistency,
vmx::{ept::EptPageMapper, PageFaultErr},
},
MMArch,
},
mm::PhysAddr,
virt::{
kvm::host_mem::PAGE_SHIFT,
vm::kvm_host::{
mem::{LockedKvmMemSlot, LockedVmMemSlotSet, UserMemRegionFlag, __gfn_to_pfn_memslot},
search_memslots,
vcpu::VirtCpu,
Vm,
},
},
};
use super::kvm_mmu::{gfn_round_for_level, is_tdp_mmu_enabled, KvmMmuPageRole};
/// Metadata for one shadow/TDP page table page.
#[allow(dead_code)]
#[derive(Debug, Default)]
pub struct KvmMmuPage {
    /// Whether this is a TDP (two-dimensional paging) page-table page.
    pub tdp_mmu_page: bool,
    /// Guest frame number this page maps.
    pub gfn: u64,

    /*
     * The following two entries are used to key the shadow page in the
     * hash table.
     */
    pub role: KvmMmuPageRole,
    /// Pointer to the shadow page-table entries (SPTEs).
    pub spt: u64,

    pub mmu_seq: u64,
    pub map_writable: bool,
    pub write_fault_to_shadow_pgtable: bool,
}
/// All state for one guest page fault as it flows through the MMU.
#[allow(dead_code)]
#[derive(Debug, Default)]
pub struct KvmPageFault {
    // Arguments to vcpu.do_page_fault.
    // `addr` is the GPA supplied by the guest.
    addr: PhysAddr,
    error_code: u32,
    prefetch: bool,

    // Derived from error_code.
    exec: bool,
    write: bool,
    present: bool,
    rsvd: bool,
    user: bool,

    // Derived from the MMU and global state.
    is_tdp: bool,
    nx_huge_page_workaround_enabled: bool,

    // Whether a mapping larger than 4 KiB is allowed, or forbidden by NX
    // huge pages.
    huge_page_disallowed: bool,
    // Largest page size this fault may create.
    max_level: u8,
    // Page size creatable based on max_level and the host mapping.
    req_level: u8,
    // Page size that will actually be created, based on req_level and
    // huge_page_disallowed.
    goal_level: u8,

    // `addr` shifted right, or the result of the guest page-table walk if
    // `addr` is a GVA.
    gfn: u64,

    // Memslot containing `gfn`; may be None.
    slot: Option<Arc<LockedKvmMemSlot>>,

    // Outputs of kvm_faultin_pfn.
    mmu_seq: u64,
    // Host page frame number — effectively the HPA once known.
    pfn: u64,
    // Host virtual address backing the page.
    hva: u64,
    map_writable: bool,

    // The guest is trying to write a gfn that contains one or more PTEs used
    // to translate the write itself.
    write_fault_to_shadow_pgtable: bool,
}
#[allow(dead_code)]
impl KvmPageFault {
    /// Host page frame number resolved for the fault.
    pub fn pfn(&self) -> u64 {
        self.pfn
    }

    /// Guest frame number derived from the faulting address.
    pub fn gfn(&self) -> u64 {
        self.gfn
    }

    /// Faulting guest physical address.
    pub fn gpa(&self) -> u64 {
        self.addr.data() as u64
    }

    /// Host virtual address backing the faulting page.
    pub fn hva(&self) -> u64 {
        self.hva
    }
}
impl VirtCpu {
    /// Top-level guest page-fault entry point.
    ///
    /// Dispatches to `do_page_fault` and converts the `PFRet`-style result
    /// for the exit handler; the emulation tail is still `todo!()`.
    #[inline(never)]
    pub fn page_fault(
        &mut self,
        vm: &Vm,
        cr2_or_gpa: u64,
        mut error_code: u64,
        _insn: Option<u64>,
        _insn_len: usize,
    ) -> Result<i32, SystemError> {
        let emulation_type = EmulType::PF;
        let _direct = self.arch.mmu().root_role.get_direct();

        // Strip the implicit-access bit; it is informational only.
        if error_code & PageFaultErr::PFERR_IMPLICIT_ACCESS.bits() != 0 {
            warn!("Implicit access error code detected");
            error_code &= !PageFaultErr::PFERR_IMPLICIT_ACCESS.bits();
        }

        //if self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE {
        //    return Ok(PFRet::Retry as u64);
        //}

        let mut r = PFRet::Invalid;
        if unlikely(error_code & PageFaultErr::PFERR_RSVD.bits() != 0) {
            todo!();
            // r = self.handle_mmio_page_fault(cr2_or_gpa, direct)?;
            // if r == PFRes::Emulate{
            //     return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,insn_len) insn_len);
            // }
        }

        if r == PFRet::Invalid {
            r = self
                .do_page_fault(
                    vm,
                    cr2_or_gpa,
                    (error_code & 0xFFFFFFFF) as u32,
                    false,
                    emulation_type,
                )?
                .into();

            if r == PFRet::Invalid {
                return Err(SystemError::EIO);
            }
        }

        if i32::from(r.clone()) < 0 {
            return Ok(i32::from(r));
        }

        if r != PFRet::Emulate {
            return Ok(1);
        }

        // Before emulating, check whether the error code was caused by a
        // read-only (RO) violation while translating a guest page. This can
        // happen with nested virtualization + nested paging; if so, just
        // unprotect the page and resume the guest.
        let pferr_nested_guest_page = PageFaultErr::PFERR_GUEST_PAGE
            | PageFaultErr::PFERR_WRITE
            | PageFaultErr::PFERR_PRESENT;

        if self.arch.mmu().root_role.get_direct()
            && (error_code & pferr_nested_guest_page.bits()) == pferr_nested_guest_page.bits()
        {
            todo!()
        }

        // self.arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
        // optimistically try to unprotect the page and let the CPU re-execute
        // the faulting instruction. Retrying MMIO emulation is not allowed:
        // it is pointless and could loop forever on a nonexistent MMIO
        // address. Retrying an instruction from a nested guest is equally
        // pointless and dangerous, because we only explicitly shadow L1's
        // page tables — unprotecting for L1 does not magically fix L2.
        // if !self.mmio_info_in_cache(cr2_or_gpa, direct) && !self.arch.is_guest_mode() {
        //     emulation_type |= EmulType::ALLOW_RETRY_PF;
        // }

        // self.emulate_instruction(cr2_or_gpa, emulation_type, insn, insn_len)
        todo!("emulate_instruction")
    }
    /// Build a [`KvmPageFault`] from the raw fault info, dispatch it to the
    /// appropriate handler (TDP today), and update the vCPU fault counters.
    fn do_page_fault(
        &mut self,
        vm: &Vm,
        cr2_or_gpa: u64,
        error_code: u32,
        prefetch: bool,
        mut emultype: EmulType,
    ) -> Result<i32, SystemError> {
        // Initialize the page-fault descriptor.
        let mut page_fault = KvmPageFault {
            addr: PhysAddr::new(cr2_or_gpa as usize),
            error_code,
            exec: error_code & PageFaultErr::PFERR_FETCH.bits() as u32 != 0,
            write: error_code & PageFaultErr::PFERR_WRITE.bits() as u32 != 0,
            present: error_code & PageFaultErr::PFERR_PRESENT.bits() as u32 != 0,
            rsvd: error_code & PageFaultErr::PFERR_RSVD.bits() as u32 != 0,
            user: error_code & PageFaultErr::PFERR_USER.bits() as u32 != 0,
            prefetch,
            is_tdp: true,
            nx_huge_page_workaround_enabled: false, //todo
            max_level: PageLevel::Level1G as u8,
            req_level: PageLevel::Level4K as u8,
            goal_level: PageLevel::Level4K as u8,
            ..Default::default()
        };

        // Direct mapping: derive the gfn and look up its memslot.
        if self.arch.mmu().root_role.get_direct() {
            page_fault.gfn = (page_fault.addr.data() >> PAGE_SHIFT) as u64;
            debug!("page_fault.addr.data() : 0x{:x}", page_fault.addr.data());
            debug!("do_page_fault : gfn = 0x{:x}", page_fault.gfn);
            page_fault.slot = self.gfn_to_memslot(page_fault.gfn, vm); // kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); incomplete
        }

        // Async page faults (a.k.a. prefetch faults) are not faults from the
        // guest's point of view and were already counted when they first
        // occurred.
        if !prefetch {
            self.stat.pf_taken += 1;
        }

        let r = if page_fault.is_tdp {
            self.tdp_page_fault(vm, &mut page_fault).unwrap()
        } else {
            // Currently tdp_page_fault is the only installed handler, so this
            // branch is never taken.
            let handle = self.arch.mmu().page_fault.unwrap();
            handle(self, &page_fault).unwrap()
        };

        if page_fault.write_fault_to_shadow_pgtable {
            emultype |= EmulType::WRITE_PF_TO_SP;
        }

        // As above, prefetch faults are not truly spurious, and the async
        // page-fault path does not emulate. Faults fixed by the async handler
        // must still be counted here, or they would never be counted at all.
        match PFRet::from(r) {
            PFRet::Fixed => self.stat.pf_fixed += 1,
            PFRet::Emulate => self.stat.pf_emulate += 1,
            PFRet::Spurious => self.stat.pf_spurious += 1,
            _ => {}
        }
        debug!("do_page_fault return r = {}", r);
        Ok(r)
    }
/// Look up the memory slot covering `gfn` in this vCPU's active memslot set.
pub fn gfn_to_memslot(&self, gfn: u64, vm: &Vm) -> Option<Arc<LockedKvmMemSlot>> {
    // ...todo
    search_memslots(self.kvm_vcpu_memslots(vm), gfn)
}
/// Return the active memslot set for this vCPU.
///
/// Only address-space id 0 exists for now, so the first slot set is always
/// the one used.
pub fn kvm_vcpu_memslots(&self, vm: &Vm) -> Arc<LockedVmMemSlotSet> {
    vm.memslots[0].clone()
}
/// Two-dimensional (EPT) page-fault handler.
///
/// Lowers `max_level` until the whole candidate huge-page range backing the
/// fault has a consistent memory type, then forwards the fault to the TDP
/// MMU.
fn tdp_page_fault(
    &mut self,
    vm: &Vm,
    page_fault: &mut KvmPageFault,
) -> Result<i32, SystemError> {
    // If shadow_memtype_mask is set and the VM has non-coherent DMA:
    //if shadow_memtype_mask != 0 && self.kvm().lock().arch.noncoherent_dma_count > 0 {
    while page_fault.max_level > PageLevel::Level4K as u8 {
        let page_num = PageLevel::kvm_pages_per_hpage(page_fault.max_level);
        // Align the gfn down to the start of the candidate huge page.
        let base = gfn_round_for_level(page_fault.gfn, page_fault.max_level);
        // Check that memory types are consistent over the GFN range
        // (the check is currently a stub that always succeeds).
        if kvm_mtrr_check_gfn_range_consistency(self, base, page_num) {
            break;
        }
        page_fault.max_level -= 1;
    }
    //}
    if is_tdp_mmu_enabled() {
        return self.kvm_tdp_mmu_page_fault(vm, page_fault);
    }
    // Normally unreachable: EPT is supported, so the TDP MMU path above is
    // always taken.
    self.direct_page_fault(page_fault)
}
/// TDP-MMU entry point for a page fault: resolve and pin the backing pfn
/// for the faulting gfn, then establish the EPT mapping.
///
/// Returns the `PFRet` code as an `i32`, or the error produced by the pfn
/// lookup (e.g. `KVM_HVA_ERR_BAD` when no memslot covers the gfn).
fn kvm_tdp_mmu_page_fault(
    &self,
    vm: &Vm,
    page_fault: &mut KvmPageFault,
) -> Result<i32, SystemError> {
    //page_fault_handle_page_track(page_fault)
    //fast_page_fault(page_fault);
    //mmu_topup_memory_caches(false);
    // Propagate pfn-lookup failures to the caller instead of panicking the
    // kernel with unwrap(): __kvm_faultin_pfn legitimately returns Err for
    // a gfn with no memslot.
    // 0b111 = read | write | exec access mask — TODO: use named flags.
    let mut r = self.kvm_faultin_pfn(vm, page_fault, 1 | 1 << 1 | 1 << 2)?;
    if r != PFRet::Continue {
        return Ok(r.into());
    }
    //r = PFRet::Retry;
    //if self.is_page_fault_stale(page_fault) {return;}
    // Perform the actual mapping.
    r = self.tdp_mmu_map(page_fault)?.into();
    Ok(r.into())
}
// Huge pages are not implemented; mappings are always 4K.
/// Install the EPT mapping for the faulting guest-physical address with
/// read/write/execute permissions, via the global [`EptPageMapper`].
fn tdp_mmu_map(&self, page_fault: &mut KvmPageFault) -> Result<i32, SystemError> {
    // let ret = PFRet::Retry; // note: logic below differs from Linux; the
    // caller's interpretation of the return value may need revisiting.
    let mut mapper = EptPageMapper::lock();
    debug!("{:?}", &page_fault);
    // EPT entry flags: rwx plus extra bits.
    // NOTE(review): 0xb77 is a magic constant — presumably RWX plus
    // memory-type/ignore bits; should be built from named VMX_EPT_* masks.
    // TODO confirm against the EPT entry format.
    let page_flags: EntryFlags<MMArch> = unsafe { EntryFlags::from_data(0xb77) };
    mapper.map(PhysAddr::new(page_fault.gpa() as usize), page_flags);
    //debug_eptp();
    debug!("The ept_root_addr is {:?}", EptPageMapper::root_page_addr());
    // todo: update remaining bookkeeping (stats, dirty tracking, ...)
    Ok(PFRet::Fixed.into())
    //todo!()
}
/// Legacy (non-TDP) direct page-fault handler.
///
/// Unreachable in the current build: `tdp_page_fault` always takes the TDP
/// MMU path, so this remains a stub.
fn direct_page_fault(&self, _page_fault: &KvmPageFault) -> Result<i32, SystemError> {
    todo!()
}
/// Snapshot the VM's MMU invalidation sequence number into the fault, then
/// resolve the backing pfn.
///
/// `_access` is the requested access mask; it is currently unused.
fn kvm_faultin_pfn(
    &self,
    vm: &Vm,
    page_fault: &mut KvmPageFault,
    _access: u32,
) -> Result<PFRet, SystemError> {
    // Record the invalidation sequence — presumably so a concurrent memslot
    // change can be detected and retried later; retry logic is not yet
    // implemented. TODO confirm.
    page_fault.mmu_seq = vm.mmu_invalidate_seq;
    self.__kvm_faultin_pfn(page_fault)
}
/// Resolve the host pfn backing the faulting gfn from its memslot.
///
/// Returns `PFRet::Continue` when `page_fault.pfn` holds a usable pfn,
/// `PFRet::Retry` when the memslot is being invalidated, or
/// `Err(KVM_HVA_ERR_BAD)` when no memslot covers the gfn.
fn __kvm_faultin_pfn(&self, page_fault: &mut KvmPageFault) -> Result<PFRet, SystemError> {
    let slot = &page_fault.slot;
    let mut is_async = false;
    if slot.is_none() {
        return Err(SystemError::KVM_HVA_ERR_BAD);
    }
    let slot = slot.as_ref().unwrap().read();
    // A slot flagged INVALID is being deleted/moved; ask the caller to retry.
    if slot.get_flags().bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 {
        return Ok(PFRet::Retry);
    }
    if !slot.is_visible() {
        /* Do not expose private memslots to L2. */
        if self.arch.is_guest_mode() {
            // Release the read guard before mutating page_fault.slot.
            drop(slot);
            page_fault.slot = None;
            page_fault.pfn = KVM_PFN_NOSLOT;
            page_fault.map_writable = false;
            return Ok(PFRet::Continue);
        }
        // NOTE(review): a non-visible slot outside guest mode falls through
        // to the normal translation below — confirm this is intended.
    }
    // Translate the GFN to a host PFN through the memslot.
    let guest_cr3 = VmxAsm::vmx_vmread(guest::CR3);
    let host_cr3 = VmxAsm::vmx_vmread(host::CR3);
    debug!("guest_cr3={:x}, host_cr3={:x}", guest_cr3, host_cr3);
    page_fault.pfn = __gfn_to_pfn_memslot(
        Some(&slot),
        page_fault.gfn,
        (false, &mut is_async),
        false,
        page_fault.write,
        &mut page_fault.map_writable,
        &mut page_fault.hva,
    )?;
    if !is_async {
        return Ok(PFRet::Continue); /* *pfn already holds the correct page */
    }
    // Async page-fault handling (not yet implemented):
    // if !page_fault.prefetch && self.kvm_can_do_async_pf() {
    //     self.trace_kvm_try_async_get_page(page_fault.addr, page_fault.gfn);
    //     if self.kvm_find_async_pf_gfn(page_fault.gfn) {
    //         self.trace_kvm_async_pf_repeated_fault(page_fault.addr, page_fault.gfn);
    //         self.kvm_make_request(KVM_REQ_APF_HALT);
    //         return Ok(PFRet::Retry);
    //     } else if self.kvm_arch_setup_async_pf(page_fault.addr, page_fault.gfn) {
    //         return Ok(PFRet::Retry);
    //     }
    // }
    Ok(PFRet::Continue)
}
}

View File

@ -0,0 +1,3 @@
pub mod kvm_mmu;
pub mod mmu_internal;
pub mod tdp_iter;

View File

@ -0,0 +1,219 @@
// use crate::{
// arch::vm::mmu::mmu::gfn_round_for_level,
// mm::{virt_2_phys, PhysAddr, VirtAddr},
// time::sleep,
// virt::kvm::host_mem::PAGE_SHIFT,
// };
// use super::{
// mmu::{PageLevel, PAGE_SIZE},
// mmu_internal::KvmMmuPage,
// };
// pub const PT64_ROOT_MAX_LEVEL: usize = 5; //通常只用到4级但是确实有5级的情况
// pub const PT_LEVEL_BITS: u8 = 9; // 每个页表级别的位数
// pub const PT64_ENT_PER_PAGE: u32 = 1 << 9;
// pub const PTE_LEN: usize = 64;
// //Bits 51:12 are from the EPT PDPTE
// pub const PT64_BASE_ADDR_MASK: u64 = ((1u64 << 52) - 1) & !(PAGE_SIZE - 1);
// pub fn shadow_pt_index(addr: u64, level: u8) -> u64 {
// (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1)
// }
// pub fn is_last_spte(pte: u64, level: u8) -> bool {
// level == PageLevel::Level4K as u8 || is_large_pte(pte)
// }
// pub fn is_shadow_present_pte(pte: u64) -> bool {
// pte & 1 << 11 != 0 //在intel手冊中ept PTE:11 Ignored.不是很懂
// }
// pub fn is_large_pte(pte: u64) -> bool {
// pte & 1 << 7 != 0 //在intel手冊中ept PTE:7 Ignored.
// }
// ///Bits 51:12 are from the EPT PDPTE
// pub fn spte_to_pfn(pte: u64) -> u64 {
// (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
// }
// #[derive(Default)]
// pub struct TdpIter {
// inner: TdpIterInner,
// }
// impl TdpIter {
// pub fn start(
// &self,
// root_pt: usize,
// root_level: u8,
// min_level: u8,
// next_last_level_gfn: u64,
// ) -> Self {
// let mut inner = self.inner.clone();
// inner.start(root_pt, root_level, min_level, next_last_level_gfn);
// TdpIter { inner }
// }
// }
// ///迭代器将遍历分页结构,直到找到此 GFN 的映射。
// #[derive(Default, Clone)]
// pub struct TdpIterInner {
// next_last_level_gfn: u64,
// /// 线程上次让出时的 next_last_level_gfn。
// /// 仅当 next_last_level_gfn != yielded_gfn 时让出,有助于确保前进。
// pub yielded_gfn: u64,
// ///指向遍历到当前 SPTE 的页表的指针
// pt_path: [u64; PT64_ROOT_MAX_LEVEL],
// ///指向当前 SPTE 的指针 是hva吗
// sptep: PhysAddr,
// /// 当前 SPTE 映射的最低 GFN hpa>>shift?
// pub gfn: u64,
// ///给迭代器的根页级别
// pub root_level: u8,
// ///迭代器应遍历到的最低级别
// pub min_level: u8,
// ///迭代器在分页结构中的当前级别
// pub level: u8,
// ///sptep 处值的快照
// pub old_spte: u64,
// ///迭代器是否具有有效状态。如果迭代器走出分页结构的末端,则为 false。
// ///
// pub valid: bool,
// }
// impl TdpIterInner {
// ///初始化ept iter
// #[inline(never)]
// pub fn start(
// &mut self,
// root_pt: usize,
// root_level: u8,
// min_level: u8,
// next_last_level_gfn: u64,
// ) {
// // if root_pt.role.level() == 0 || root_pt.role.level() > PT64_ROOT_MAX_LEVEL as u32 {
// // self.valid = false;
// // return;
// // }
// if root_level < 1 || root_level > PT64_ROOT_MAX_LEVEL as u8 {
// self.valid = false;
// return;
// }
// self.next_last_level_gfn = next_last_level_gfn;
// self.root_level = root_level as u8;
// self.min_level = min_level as u8;
// self.pt_path[(self.root_level - 1) as usize] = root_pt as u64;
// self.yielded_gfn = self.next_last_level_gfn;
// self.level = self.root_level;
// self.gfn = gfn_round_for_level(self.next_last_level_gfn, self.level);
// self.tdp_iter_refresh_sptep();
// self.valid = true;
// }
// /*
// * 重新计算当前GFN和level和SPTE指针并重新读取SPTE。
// */
// fn tdp_iter_refresh_sptep(&mut self) {
// // self.sptep = PhysAddr::new(
// // (self.pt_path[self.level as usize - 1]
// // + shadow_pt_index(self.gfn << PAGE_SHIFT, self.level)) as usize,
// // );
// // self.old_spte = read_sptep(self.sptep);
// }
// pub fn _next(&mut self) {
// if self.try_step_down() {
// return;
// }
// loop {
// if self.try_step_side() {
// return;
// }
// if !self.try_step_up() {
// break;
// }
// }
// self.valid = false;
// }
// ///在分页结构中向目标GFN下降一级。如果迭代器能够下降一级则返回true否则返回false。
// fn try_step_down(&mut self) -> bool {
// if self.level == self.min_level {
// return false;
// }
// //在下降之前重新读取SPTE以避免遍历到不再从此条目链接的页表中。
// self.old_spte = read_sptep(self.sptep);
// match spte_to_child_pt(self.old_spte, self.level) {
// Some(child_pt) => {
// self.level -= 1;
// self.pt_path[self.level as usize - 1] = child_pt.data() as u64;
// self.gfn = gfn_round_for_level(self.gfn, self.level);
// self.tdp_iter_refresh_sptep();
// true
// }
// None => false,
// }
// }
// fn try_step_up(&mut self) -> bool {
// if self.level == self.root_level {
// return false;
// }
// self.level += 1;
// self.gfn = gfn_round_for_level(self.gfn, self.level);
// self.tdp_iter_refresh_sptep();
// true
// }
// ///在当前页表的当前级别中移动到下一个条目。下一个条目可以指向一个page backing guest memory
// ///或者另一个页表或者它可能是不存在的。如果迭代器能够移动到页表中的下一个条目则返回true
// ///如果迭代器已经在当前页表的末尾则返回false。
// fn try_step_side(&mut self) -> bool {
// //检查迭代器是否已经在当前页表的末尾。
// if shadow_pt_index(self.gfn << PAGE_SHIFT, self.level) == (PT64_ENT_PER_PAGE - 1) as u64 {
// return false;
// }
// self.gfn += PageLevel::kvm_pages_per_hpage(self.level);
// self.next_last_level_gfn = self.gfn;
// self.sptep.add(PTE_LEN); //指向下一个spte一个spte占64位
// self.old_spte = read_sptep(self.sptep);
// true
// }
// }
// impl Iterator for TdpIter {
// type Item = TdpIterInner; // 返回 (gfn, spte) 元组
// fn next(&mut self) -> Option<Self::Item> {
// let inner = &mut self.inner;
// if !inner.valid {
// return None;
// }
// inner._next();
// if inner.valid {
// Some(inner.clone())
// } else {
// None
// }
// }
// }
// ///给定一个 SPTE 及其级别,返回一个指针,该指针包含 SPTE 所引用的子页表的hva。
// ///如果没有这样的条目,则返回 null。
// ///
// fn spte_to_child_pt(spte: u64, level: u8) -> Option<VirtAddr> {
// //没有子页表
// if !is_shadow_present_pte(spte) || is_last_spte(spte, level) {
// return None;
// }
// Some(VirtAddr::new(virt_2_phys//__va
// ((spte_to_pfn(spte)<<PAGE_SHIFT) as usize
// )))
// }
// pub fn read_sptep(sptep: PhysAddr) -> u64 {
// unsafe { *(sptep.data() as *const u64) }
// }

View File

@ -0,0 +1,640 @@
use alloc::vec::Vec;
use log::{error, warn};
use raw_cpuid::CpuId;
use system_error::SystemError;
use x86::{
controlregs::{cr4, xcr0, Cr0, Cr4, Xcr0},
msr::{self, rdmsr, wrmsr},
};
use x86_64::registers::control::{Efer, EferFlags};
use crate::{
arch::vm::vmx::{VmxL1dFlushState, L1TF_VMX_MITIGATION},
libs::once::Once,
mm::percpu::{PerCpu, PerCpuVar},
};
use self::{
asm::{hyperv::*, kvm_msr::*, ArchCapabilities, VmxMsrEntry},
kvm_host::{KvmFunc, KvmInitFunc},
};
use super::driver::tsc::TSCManager;
mod asm;
mod cpuid;
pub(super) mod exit;
pub mod kvm_host;
pub mod mem;
pub mod mmu;
pub mod mtrr;
pub mod uapi;
pub mod vmx;
/// Global singleton holding the architecture-level KVM state.
///
/// Written exactly once inside `init_kvm_arch` (guarded by `Once`) and read
/// afterwards. NOTE(review): `x86_kvm_manager_mut` hands out `&'static mut`
/// from a `static mut` with no synchronization — confirm callers are
/// serialized.
static mut KVM_X86_MANAGER: Option<KvmArchManager> = None;
/// Vendor (VMX/SVM) runtime function table.
///
/// Panics if called before `init_kvm_arch` + `vendor_init` have run.
pub fn x86_kvm_ops() -> &'static dyn KvmFunc {
    unsafe { KVM_X86_MANAGER.as_ref().unwrap().funcs() }
}
/// Shared reference to the global arch manager (panics if uninitialized).
pub fn x86_kvm_manager() -> &'static KvmArchManager {
    unsafe { KVM_X86_MANAGER.as_ref().unwrap() }
}
/// Mutable reference to the global arch manager (panics if uninitialized).
pub fn x86_kvm_manager_mut() -> &'static mut KvmArchManager {
    unsafe { KVM_X86_MANAGER.as_mut().unwrap() }
}
/// One-time initialization of the global KVM arch manager and the per-CPU
/// user-return MSR storage. Safe to call repeatedly; only the first call
/// has any effect.
pub fn init_kvm_arch() {
    static ONCE: Once = Once::new();
    ONCE.call_once(|| unsafe {
        KVM_X86_MANAGER = Some(KvmArchManager::init());
        let mut user_return_msrs = Vec::new();
        user_return_msrs.resize(PerCpu::MAX_CPU_NUM as usize, KvmUserReturnMsrs::default());
        USER_RETURN_MSRS = Some(PerCpuVar::new(user_return_msrs).unwrap());
    })
}
/// Architecture-wide KVM manager: the vendor function table, cached host
/// register state, capability information and the MSR lists exported to
/// userspace.
///
/// fixme: do these members need locking?
#[derive(Debug)]
pub struct KvmArchManager {
    /// Vendor (VMX/SVM) runtime function table, installed by `vendor_init`.
    funcs: Option<&'static dyn KvmFunc>,
    /// Host XCR0 snapshot taken during `vendor_init`.
    host_xcr0: Xcr0,
    /// Host EFER snapshot.
    host_efer: EferFlags,
    /// Host XSS snapshot.
    host_xss: u64,
    /// Host IA32_ARCH_CAPABILITIES snapshot.
    host_arch_capabilities: u64,
    /// MSRs saved/restored around returns to userspace.
    kvm_uret_msrs_list: Vec<u32>,
    kvm_caps: KvmCapabilities,
    max_tsc_khz: u64,
    /// MSRs whose guest values KVM saves/restores.
    msrs_to_save: Vec<u32>,
    /// MSRs fully emulated by KVM.
    emulated_msrs: Vec<u32>,
    /// Feature MSRs exposed read-only to userspace.
    msr_based_features: Vec<u32>,
    has_noapic_vcpu: bool,
    enable_pmu: bool,
    // read-only after init
    possible_cr0_guest: Cr0,
    possible_cr4_guest: Cr4,
    cr4_tlbflush_bits: Cr4,
    cr4_pdptr_bits: Cr4,
}
impl KvmArchManager {
/// Build the manager with the architectural CR0/CR4 bit masks
/// pre-computed; all runtime state starts empty and is populated later by
/// `vendor_init`.
pub fn init() -> Self {
    Self {
        // CR0/CR4 bits that may be owned by the guest.
        possible_cr0_guest: Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT,
        possible_cr4_guest: Cr4::CR4_VIRTUAL_INTERRUPTS
            | Cr4::CR4_DEBUGGING_EXTENSIONS
            | Cr4::CR4_ENABLE_PPMC
            | Cr4::CR4_ENABLE_SSE
            | Cr4::CR4_UNMASKED_SSE
            | Cr4::CR4_ENABLE_GLOBAL_PAGES
            | Cr4::CR4_TIME_STAMP_DISABLE
            | Cr4::CR4_ENABLE_FSGSBASE,
        // CR4 bits whose change requires a TLB flush.
        cr4_tlbflush_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES
            | Cr4::CR4_ENABLE_PCID
            | Cr4::CR4_ENABLE_PAE
            | Cr4::CR4_ENABLE_SMEP,
        // CR4 bits whose change requires reloading the PDPTRs.
        cr4_pdptr_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES
            | Cr4::CR4_ENABLE_PSE
            | Cr4::CR4_ENABLE_PAE
            | Cr4::CR4_ENABLE_SMEP,
        host_xcr0: Xcr0::empty(),
        funcs: Default::default(),
        host_efer: EferFlags::empty(),
        host_xss: Default::default(),
        host_arch_capabilities: Default::default(),
        kvm_uret_msrs_list: Default::default(),
        kvm_caps: Default::default(),
        max_tsc_khz: Default::default(),
        msrs_to_save: Default::default(),
        emulated_msrs: Default::default(),
        msr_based_features: Default::default(),
        has_noapic_vcpu: Default::default(),
        enable_pmu: Default::default(),
    }
}
/// Install the vendor runtime function table (called once by `vendor_init`).
#[inline]
pub fn set_runtime_func(&mut self, funcs: &'static dyn KvmFunc) {
    self.funcs = Some(funcs);
}
/// Vendor runtime function table; panics if `set_runtime_func` was not
/// called first.
#[inline]
pub fn funcs(&self) -> &'static dyn KvmFunc {
    self.funcs.unwrap()
}
/// Find the index of `msr` in the user-return MSR list, or `None` when it
/// has not been registered.
pub fn find_user_return_msr_idx(&self, msr: u32) -> Option<usize> {
    // Idiomatic linear search; the list holds at most
    // KVM_MAX_NR_USER_RETURN_MSRS entries.
    self.kvm_uret_msrs_list.iter().position(|&val| val == msr)
}
/// True when both MPX xsave components (BNDREG and BNDCSR) are present in
/// the supported XCR0 mask.
pub fn mpx_supported(&self) -> bool {
    let mpx_bits = Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE;
    self.kvm_caps.supported_xcr0.contains(mpx_bits)
}
pub const KVM_MAX_VCPUS: usize = 1024;
pub const KVM_MAX_NR_USER_RETURN_MSRS: usize = 7;
const MSRS_TO_SAVE_BASE: &[u32] = &[
msr::IA32_SYSENTER_CS,
msr::IA32_SYSENTER_ESP,
msr::IA32_SYSENTER_EIP,
msr::IA32_STAR,
msr::IA32_CSTAR,
msr::IA32_KERNEL_GSBASE,
msr::IA32_FMASK,
msr::IA32_LSTAR,
msr::IA32_TIME_STAMP_COUNTER,
msr::IA32_PAT,
0xc0010117, // MSR_VM_HSAVE_PA?
msr::IA32_FEATURE_CONTROL,
msr::MSR_C1_PMON_EVNT_SEL0,
msr::IA32_TSC_AUX,
0x48, // MSR_IA32_SPEC_CTRL
msr::MSR_IA32_TSX_CTRL,
msr::MSR_IA32_RTIT_CTL,
msr::MSR_IA32_RTIT_STATUS,
msr::MSR_IA32_CR3_MATCH,
msr::MSR_IA32_RTIT_OUTPUT_BASE,
msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS,
msr::MSR_IA32_ADDR0_START,
msr::MSR_IA32_ADDR0_END,
msr::MSR_IA32_ADDR1_START,
msr::MSR_IA32_ADDR1_END,
msr::MSR_IA32_ADDR2_START,
msr::MSR_IA32_ADDR2_END,
msr::MSR_IA32_ADDR3_START,
msr::MSR_IA32_ADDR3_END,
0xe1, // MSR_IA32_UMWAIT_CONTROL
0x1c4, // MSR_IA32_XFD
0x1c5, // MSR_IA32_XFD_ERR
];
const EMULATED_MSRS_ALL: &[u32] = &[
MSR_KVM_SYSTEM_TIME,
MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW,
MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID,
HV_X64_MSR_HYPERCALL,
HV_REGISTER_TIME_REF_COUNT,
HV_REGISTER_REFERENCE_TSC,
HV_X64_MSR_TSC_FREQUENCY,
HV_X64_MSR_APIC_FREQUENCY,
HV_REGISTER_CRASH_P0,
HV_REGISTER_CRASH_P1,
HV_REGISTER_CRASH_P2,
HV_REGISTER_CRASH_P3,
HV_REGISTER_CRASH_P4,
HV_REGISTER_CRASH_CTL,
HV_X64_MSR_RESET,
HV_REGISTER_VP_INDEX,
HV_X64_MSR_VP_RUNTIME,
HV_REGISTER_SCONTROL,
HV_REGISTER_STIMER0_CONFIG,
HV_X64_MSR_VP_ASSIST_PAGE,
HV_X64_MSR_REENLIGHTENMENT_CONTROL,
HV_X64_MSR_TSC_EMULATION_CONTROL,
HV_X64_MSR_TSC_EMULATION_STATUS,
HV_X64_MSR_TSC_INVARIANT_CONTROL,
HV_X64_MSR_SYNDBG_OPTIONS,
HV_X64_MSR_SYNDBG_CONTROL,
HV_X64_MSR_SYNDBG_STATUS,
HV_X64_MSR_SYNDBG_SEND_BUFFER,
HV_X64_MSR_SYNDBG_RECV_BUFFER,
HV_X64_MSR_SYNDBG_PENDING_BUFFER,
MSR_KVM_ASYNC_PF_EN,
MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
MSR_KVM_ASYNC_PF_INT,
MSR_KVM_ASYNC_PF_ACK,
msr::IA32_TSC_ADJUST,
msr::IA32_TSC_DEADLINE,
msr::IA32_PERF_CAPABILITIES,
0x10a, // MSR_IA32_ARCH_CAPABILITIES,
msr::IA32_MISC_ENABLE,
msr::IA32_MCG_STATUS,
msr::IA32_MCG_CTL,
0x4d0, // MSR_IA32_MCG_EXT_CTL,
msr::IA32_SMBASE,
msr::MSR_SMI_COUNT,
msr::MSR_PLATFORM_INFO,
0x140, // MSR_MISC_FEATURES_ENABLES,
0xc001011f, // MSR_AMD64_VIRT_SPEC_CTRL,
0xc0000104, // MSR_AMD64_TSC_RATIO,
msr::MSR_POWER_CTL,
msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV,
/*
* KVM always supports the "true" VMX control MSRs, even if the host
* does not. The VMX MSRs as a whole are considered "emulated" as KVM
* doesn't strictly require them to exist in the host (ignoring that
* KVM would refuse to load in the first place if the core set of MSRs
* aren't supported).
*/
msr::IA32_VMX_BASIC,
msr::IA32_VMX_TRUE_PINBASED_CTLS,
msr::IA32_VMX_TRUE_PROCBASED_CTLS,
msr::IA32_VMX_TRUE_EXIT_CTLS,
msr::IA32_VMX_TRUE_ENTRY_CTLS,
msr::IA32_VMX_MISC,
msr::IA32_VMX_CR0_FIXED0,
msr::IA32_VMX_CR4_FIXED0,
msr::IA32_VMX_VMCS_ENUM,
msr::IA32_VMX_PROCBASED_CTLS2,
msr::IA32_VMX_EPT_VPID_CAP,
msr::IA32_VMX_VMFUNC,
0xc0010015, // MSR_K7_HWCR,
MSR_KVM_POLL_CONTROL,
];
const MSR_BASED_FEATURES_ALL_EXCEPT_VMX: &[u32] = &[
0xc0011029, // MSR_AMD64_DE_CFG
msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV
0x10a, // MSR_IA32_ARCH_CAPABILITIES,
msr::IA32_PERF_CAPABILITIES,
];
/// Enable virtualization hardware on the current CPU.
///
/// Snapshots the per-CPU user-return MSRs first, then calls into the
/// vendor hardware-enable hook (e.g. VMXON for Intel).
pub fn arch_hardware_enable(&self) -> Result<(), SystemError> {
    self.online_user_return_msr();
    x86_kvm_ops().hardware_enable()?;
    // TODO: a series of TSC checks is needed here
    Ok(())
}
/// ## Initialize the current CPU's user-return MSR snapshot
///
/// Reads the live value of every registered user-return MSR into the
/// per-CPU cache (`host` and `curr`), so the host values can be restored
/// on return to userspace.
fn online_user_return_msr(&self) {
    let user_return_msrs = user_return_msrs().get_mut();
    for (idx, msr) in self.kvm_uret_msrs_list.iter().enumerate() {
        let val = unsafe { rdmsr(*msr) };
        user_return_msrs.values[idx].host = val;
        user_return_msrs.values[idx].curr = val;
    }
}
/// Vendor-specific initialization.
///
/// Verifies host prerequisites (FPU, PAT), snapshots host register state
/// (XCR0, EFER, XSS, ARCH_CAPABILITIES), runs the vendor hardware setup,
/// initializes timers, finalizes TSC-related capabilities and builds the
/// exported MSR lists. Fails with `EEXIST` if a vendor module was already
/// loaded.
pub fn vendor_init(&mut self, init_ops: &'static dyn KvmInitFunc) -> Result<(), SystemError> {
    let cpuid = CpuId::new();
    let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?;
    let cpu_extend = cpuid.get_extended_state_info().ok_or(SystemError::ENOSYS)?;
    let extend_features = cpuid
        .get_extended_feature_info()
        .ok_or(SystemError::ENOSYS)?;
    let kvm_x86_ops = &self.funcs;
    // Has a vendor module already been installed?
    if kvm_x86_ops.is_some() {
        error!(
            "[KVM] already loaded vendor module {}",
            kvm_x86_ops.unwrap().name()
        );
        return Err(SystemError::EEXIST);
    }
    // Ensure the CPU supports the FPU and FXSAVE/FXRSTOR.
    if !cpu_feature.has_fpu() || !cpu_feature.has_fxsave_fxstor() {
        error!("[KVM] inadequate fpu");
        return Err(SystemError::ENOSYS);
    }
    // TODO: a realtime kernel needs additional TSC checks
    // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9472
    // Read the host Page Attribute Table.
    let host_pat = unsafe { rdmsr(msr::IA32_PAT) };
    // PAT[0] must be write-back, i.e. the low three bits are 0b110 (0x06).
    if host_pat & 0b111 != 0b110 {
        error!("[KVM] host PAT[0] is not WB");
        return Err(SystemError::EIO);
    }
    // TODO: mmu vendor init
    if cpu_feature.has_xsave() && unsafe { cr4() }.contains(Cr4::CR4_ENABLE_OS_XSAVE) {
        self.host_xcr0 = unsafe { xcr0() };
        self.kvm_caps.supported_xcr0 = self.host_xcr0;
    }
    // Save the host EFER.
    self.host_efer = Efer::read();
    // Save the host XSS.
    // NOTE(review): Linux reads MSR_IA32_XSS here; MSR_C5_PMON_BOX_CTRL
    // looks like the wrong constant — confirm.
    if cpu_extend.has_xsaves_xrstors() {
        self.host_xss = unsafe { rdmsr(msr::MSR_C5_PMON_BOX_CTRL) };
    }
    // TODO: initialize the performance monitoring unit (PMU)
    // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9518
    // NOTE(review): availability of IA32_ARCH_CAPABILITIES is indicated by
    // CPUID.7.EDX[29], not the SHA feature bit — confirm this gate.
    if extend_features.has_sha() {
        self.host_arch_capabilities = unsafe {
            // MSR_IA32_ARCH_CAPABILITIES
            rdmsr(0x10a)
        }
    }
    init_ops.hardware_setup()?;
    self.set_runtime_func(init_ops.runtime_funcs());
    self.kvm_timer_init()?;
    // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9544
    let kvm_caps = &mut self.kvm_caps;
    if !cpu_extend.has_xsaves_xrstors() {
        kvm_caps.supported_xss = 0;
    }
    // Clamp the maximum guest TSC rate derived from the scaling ratio.
    if kvm_caps.has_tsc_control {
        kvm_caps.max_guest_tsc_khz = 0x7fffffff.min(
            ((kvm_caps.max_tsc_scaling_ratio as i128 * TSCManager::tsc_khz() as i128)
                >> kvm_caps.tsc_scaling_ratio_frac_bits) as u32,
        );
    }
    kvm_caps.default_tsc_scaling_ratio = 1 << kvm_caps.tsc_scaling_ratio_frac_bits;
    self.kvm_init_msr_lists();
    warn!("vendor init over");
    Ok(())
}
/// Rebuild the three exported MSR lists (saved, emulated, feature-based)
/// by probing each candidate against the host CPU and vendor callbacks.
fn kvm_init_msr_lists(&mut self) {
    self.msrs_to_save.clear();
    self.emulated_msrs.clear();
    self.msr_based_features.clear();
    for msr in Self::MSRS_TO_SAVE_BASE {
        self.kvm_probe_msr_to_save(*msr);
    }
    // PMU MSR probing is unimplemented.
    if self.enable_pmu {
        todo!()
    }
    // Keep only emulated MSRs the vendor implementation actually handles.
    for msr in Self::EMULATED_MSRS_ALL {
        if !x86_kvm_ops().has_emulated_msr(*msr) {
            continue;
        }
        self.emulated_msrs.push(*msr);
    }
    // Probe every MSR in the architectural VMX control range, then the
    // non-VMX feature MSRs.
    for msr in msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC {
        self.kvm_prove_feature_msr(msr)
    }
    for msr in Self::MSR_BASED_FEATURES_ALL_EXCEPT_VMX {
        self.kvm_prove_feature_msr(*msr);
    }
}
/// Probe one candidate MSR against host CPUID features and append it to
/// `msrs_to_save` when the host can support it; feature-gated MSRs whose
/// feature is absent are silently skipped.
fn kvm_probe_msr_to_save(&mut self, msr: u32) {
    let cpuid = CpuId::new();
    let cpu_feat = cpuid.get_feature_info().unwrap();
    let cpu_extend = cpuid.get_extended_feature_info().unwrap();
    match msr {
        // NOTE(review): gated on MPX, which suggests this slot stands in
        // for MSR_IA32_BNDCFGS — confirm the constant.
        msr::MSR_C1_PMON_EVNT_SEL0 => {
            if !cpu_extend.has_mpx() {
                return;
            }
        }
        msr::IA32_TSC_AUX => {
            if !cpu_feat.has_tsc() {
                return;
            }
        }
        // MSR_IA32_UMWAIT_CONTROL
        0xe1 => {
            if !cpu_extend.has_waitpkg() {
                return;
            }
        }
        msr::MSR_IA32_RTIT_CTL | msr::MSR_IA32_RTIT_STATUS => {
            if !cpu_extend.has_processor_trace() {
                return;
            }
        }
        msr::MSR_IA32_CR3_MATCH => {
            // TODO: also check intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)
            if !cpu_extend.has_processor_trace() {
                return;
            }
        }
        msr::MSR_IA32_RTIT_OUTPUT_BASE | msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS => {
            // TODO: also check !intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output)
            if !cpu_extend.has_processor_trace() {
                return;
            }
        }
        // NOTE(review): the `..` range patterns below exclude their upper
        // bound, so ADDR3_END / PMC7 / PERFEVTSEL7 / FIXED_CTR2 bypass these
        // feature gates and fall to the default arm — confirm whether `..=`
        // was intended.
        msr::MSR_IA32_ADDR0_START..msr::MSR_IA32_ADDR3_END => {
            // TODO: also check msr_index - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
            if !cpu_extend.has_processor_trace() {
                return;
            }
        }
        msr::IA32_PMC0..msr::IA32_PMC7 => {
            // TODO: validate the MSR against the PMU configuration
        }
        msr::IA32_PERFEVTSEL0..msr::IA32_PERFEVTSEL7 => {
            // TODO: validate the MSR against the PMU configuration
        }
        msr::MSR_PERF_FIXED_CTR0..msr::MSR_PERF_FIXED_CTR2 => {
            // TODO: validate the MSR against the PMU configuration
        }
        msr::MSR_IA32_TSX_CTRL => {
            // TODO: !(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)
            // this register is currently unsupported; skip for now
            // return;
        }
        _ => {}
    }
    self.msrs_to_save.push(msr);
}
/// Probe a single feature MSR and, when its value can be produced,
/// append it to the exported `msr_based_features` list.
///
/// Mirrors Linux's `kvm_probe_feature_msr()`, which skips an MSR when the
/// read FAILS. `get_msr_feature` returns `true` on success, so the guard
/// must be negated — the previous `if self.get_msr_feature(..) { return; }`
/// skipped every successfully-probed MSR and exported only the failures.
fn kvm_prove_feature_msr(&mut self, index: u32) {
    let mut msr = VmxMsrEntry {
        index,
        reserved: Default::default(),
        data: Default::default(),
    };
    if !self.get_msr_feature(&mut msr) {
        return;
    }
    self.msr_based_features.push(index);
}
fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool {
match msr.index {
0x10a => {
// MSR_IA32_ARCH_CAPABILITIES,
msr.data = self.get_arch_capabilities();
}
msr::IA32_PERF_CAPABILITIES => {
msr.data = self.kvm_caps.supported_perf_cap;
}
msr::IA32_BIOS_SIGN_ID => {
// MSR_IA32_UCODE_REV
msr.data = unsafe { rdmsr(msr.index) };
}
_ => {
return x86_kvm_ops().get_msr_feature(msr);
}
}
return true;
}
/// Compose the IA32_ARCH_CAPABILITIES value exposed to guests: the host's
/// bits filtered through KVM's allow-list, plus bits KVM asserts itself.
fn get_arch_capabilities(&self) -> u64 {
    let host_caps = ArchCapabilities::from_bits_truncate(self.host_arch_capabilities);
    let mut caps = host_caps & ArchCapabilities::KVM_SUPPORTED_ARCH_CAP;
    caps |= ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO;
    if *L1TF_VMX_MITIGATION.read() != VmxL1dFlushState::Never {
        caps |= ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
    }
    // fixme: asserted unconditionally; should depend on whether the host
    // CPU is actually unaffected by these bugs.
    caps |= ArchCapabilities::ARCH_CAP_RDCL_NO
        | ArchCapabilities::ARCH_CAP_SSB_NO
        | ArchCapabilities::ARCH_CAP_MDS_NO
        | ArchCapabilities::ARCH_CAP_GDS_NO;
    caps.bits()
}
/// Register an MSR in the user-return list.
///
/// Panics when more than `KVM_MAX_NR_USER_RETURN_MSRS` are registered,
/// since the per-CPU value array is fixed-size.
pub fn add_user_return_msr(&mut self, msr: u32) {
    assert!(self.kvm_uret_msrs_list.len() < Self::KVM_MAX_NR_USER_RETURN_MSRS);
    self.kvm_uret_msrs_list.push(msr)
}
/// Cache the host TSC frequency when the CPU has a TSC.
fn kvm_timer_init(&mut self) -> Result<(), SystemError> {
    let cpuid = CpuId::new();
    let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?;
    if cpu_feature.has_tsc() {
        self.max_tsc_khz = TSCManager::tsc_khz();
    }
    // TODO: incomplete (e.g. cpufreq handling is missing)
    Ok(())
}
/// Write a user-return MSR, merging `value` (under `mask`) with the saved
/// host bits, and record it for restoration on return to userspace.
///
/// The hardware write is skipped when the merged value already matches the
/// cached current value.
pub fn kvm_set_user_return_msr(&self, slot: usize, mut value: u64, mask: u64) {
    let msrs = user_return_msrs().get_mut();
    // Keep the host's bits outside `mask` unchanged.
    value = (value & mask) | (msrs.values[slot].host & !mask);
    if value == msrs.values[slot].curr {
        return;
    }
    unsafe { wrmsr(self.kvm_uret_msrs_list[slot], value) };
    msrs.values[slot].curr = value;
    // Mark this CPU as having diverged from the host values.
    if !msrs.registered {
        msrs.registered = true;
    }
}
}
/// ### KVM feature capabilities
#[derive(Debug)]
pub struct KvmCapabilities {
    /// Whether the guest's TSC (time-stamp counter) rate can be controlled
    has_tsc_control: bool,
    /// Maximum TSC rate a guest may use, in kHz
    max_guest_tsc_khz: u32,
    /// Number of fractional bits in the TSC scaling ratio
    tsc_scaling_ratio_frac_bits: u8,
    /// Maximum allowed TSC scaling ratio
    max_tsc_scaling_ratio: u64,
    /// Default TSC scaling ratio, i.e. 1u64 << tsc_scaling_ratio_frac_bits
    default_tsc_scaling_ratio: u64,
    /// Whether bus-lock VM exits are supported
    has_bus_lock_exit: bool,
    /// Whether VM-exit notification is supported
    has_notify_vmexit: bool,
    /// Bitmask of supported MCE (machine-check exception) features
    supported_mce_cap: McgCap,
    /// Bitmask of supported XCR0 register bits
    supported_xcr0: Xcr0,
    /// Bitmask of supported XSS (XSAVE extended state) bits
    supported_xss: u64,
    /// Bitmask of supported performance-monitoring capabilities
    supported_perf_cap: u64,
}
impl Default for KvmCapabilities {
    /// Everything off/zero except the MCE capabilities, which always
    /// advertise MCG_CTL and software error recovery.
    fn default() -> Self {
        Self {
            has_tsc_control: Default::default(),
            max_guest_tsc_khz: Default::default(),
            tsc_scaling_ratio_frac_bits: Default::default(),
            max_tsc_scaling_ratio: Default::default(),
            default_tsc_scaling_ratio: Default::default(),
            has_bus_lock_exit: Default::default(),
            has_notify_vmexit: Default::default(),
            supported_mce_cap: McgCap::MCG_CTL_P | McgCap::MCG_SER_P,
            supported_xcr0: Xcr0::empty(),
            supported_xss: Default::default(),
            supported_perf_cap: Default::default(),
        }
    }
}
bitflags! {
pub struct McgCap: u64 {
const MCG_BANKCNT_MASK = 0xff; /* Number of Banks */
const MCG_CTL_P = 1 << 8; /* MCG_CTL register available */
const MCG_EXT_P = 1 << 9; /* Extended registers available */
const MCG_CMCI_P = 1 << 10; /* CMCI supported */
const MCG_EXT_CNT_MASK = 0xff0000; /* Number of Extended registers */
const MCG_EXT_CNT_SHIFT = 16;
const MCG_SER_P = 1 << 24; /* MCA recovery/new status bits */
const MCG_ELOG_P = 1 << 26; /* Extended error log supported */
const MCG_LMCE_P = 1 << 27; /* Local machine check supported */
}
}
/// Per-CPU storage for the user-return MSR snapshots, allocated once in
/// `init_kvm_arch`.
static mut USER_RETURN_MSRS: Option<PerCpuVar<KvmUserReturnMsrs>> = None;
/// Accessor for the per-CPU user-return MSR storage.
///
/// Panics if called before `init_kvm_arch`.
fn user_return_msrs() -> &'static PerCpuVar<KvmUserReturnMsrs> {
    unsafe { USER_RETURN_MSRS.as_ref().unwrap() }
}
/// One CPU's user-return MSR state.
#[derive(Debug, Default, Clone)]
struct KvmUserReturnMsrs {
    // set once any MSR on this CPU diverges from its host value
    pub registered: bool,
    pub values: [KvmUserReturnMsrsValues; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS],
}
/// One MSR's saved host value and the value currently loaded.
#[derive(Debug, Default, Clone)]
struct KvmUserReturnMsrsValues {
    pub host: u64,
    pub curr: u64,
}

View File

@ -0,0 +1,37 @@
use crate::virt::vm::kvm_host::vcpu::VirtCpu;
use super::kvm_host::gfn_to_gpa;
/// Check whether the MTRR memory type is consistent across the GFN range
/// `[gfn, gfn + page_num)`, so the fault handler may keep a huge mapping.
///
/// The real MTRR iteration is not implemented yet; this stub always
/// reports the range as consistent.
pub fn kvm_mtrr_check_gfn_range_consistency(_vcpu: &mut VirtCpu, gfn: u64, page_num: u64) -> bool {
    // let mtrr_state = &vcpu.arch.mtrr_state;
    // let mut iter = MtrrIter {
    //     mem_type: -1,
    //     mtrr_disabled: false,
    //     partial_map: false,
    // };
    let _start = gfn_to_gpa(gfn);
    let _end = gfn_to_gpa(gfn + page_num);
    // mtrr_for_each_mem_type(&mut iter, mtrr_state, start, end, |iter| {
    //     if iter.mem_type == -1 {
    //         iter.mem_type = iter.mem_type;
    //     } else if iter.mem_type != iter.mem_type {
    //         return false;
    //     }
    // });
    // if iter.mtrr_disabled {
    //     return true;
    // }
    // if !iter.partial_map {
    //     return true;
    // }
    // if iter.mem_type == -1 {
    //     return true;
    // }
    // iter.mem_type == mtrr_default_type(mtrr_state)
    true
}

View File

@ -0,0 +1,102 @@
#![allow(dead_code)]
use crate::virt::vm::user_api::UapiKvmSegment;
pub const DE_VECTOR: usize = 0;
pub const DB_VECTOR: usize = 1;
pub const BP_VECTOR: usize = 3;
pub const OF_VECTOR: usize = 4;
pub const BR_VECTOR: usize = 5;
pub const UD_VECTOR: usize = 6;
pub const NM_VECTOR: usize = 7;
pub const DF_VECTOR: usize = 8;
pub const TS_VECTOR: usize = 10;
pub const NP_VECTOR: usize = 11;
pub const SS_VECTOR: usize = 12;
pub const GP_VECTOR: usize = 13;
pub const PF_VECTOR: usize = 14;
pub const MF_VECTOR: usize = 16;
pub const AC_VECTOR: usize = 17;
pub const MC_VECTOR: usize = 18;
pub const XM_VECTOR: usize = 19;
pub const VE_VECTOR: usize = 20;
pub const KVM_SYNC_X86_REGS: u64 = 1 << 0;
pub const KVM_SYNC_X86_SREGS: u64 = 1 << 1;
pub const KVM_SYNC_X86_EVENTS: u64 = 1 << 2;
pub const KVM_SYNC_X86_VALID_FIELDS: u64 =
KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS | KVM_SYNC_X86_EVENTS;
/// Userspace ABI mirror of `struct kvm_sregs`: segment, descriptor-table
/// and control-register state exchanged with userspace via the KVM ioctls.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmSegmentRegs {
    pub cs: UapiKvmSegment,
    pub ds: UapiKvmSegment,
    pub es: UapiKvmSegment,
    pub fs: UapiKvmSegment,
    pub gs: UapiKvmSegment,
    pub ss: UapiKvmSegment,
    pub tr: UapiKvmSegment,
    pub ldt: UapiKvmSegment,
    pub gdt: UapiKvmDtable,
    pub idt: UapiKvmDtable,
    pub cr0: u64,
    pub cr2: u64,
    pub cr3: u64,
    pub cr4: u64,
    pub cr8: u64,
    pub efer: u64,
    pub apic_base: u64,
    pub interrupt_bitmap: [u64; 4usize],
}
/// Userspace ABI mirror of `struct kvm_dtable` (GDT/IDT descriptor table).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmDtable {
    pub base: u64,
    pub limit: u16,
    pub padding: [u16; 3usize],
}
#[allow(dead_code)]
pub mod kvm_exit {
pub const KVM_EXIT_UNKNOWN: u32 = 0;
pub const KVM_EXIT_EXCEPTION: u32 = 1;
pub const KVM_EXIT_IO: u32 = 2;
pub const KVM_EXIT_HYPERCALL: u32 = 3;
pub const KVM_EXIT_DEBUG: u32 = 4;
pub const KVM_EXIT_HLT: u32 = 5;
pub const KVM_EXIT_MMIO: u32 = 6;
pub const KVM_EXIT_IRQ_WINDOW_OPEN: u32 = 7;
pub const KVM_EXIT_SHUTDOWN: u32 = 8;
pub const KVM_EXIT_FAIL_ENTRY: u32 = 9;
pub const KVM_EXIT_INTR: u32 = 10;
pub const KVM_EXIT_SET_TPR: u32 = 11;
pub const KVM_EXIT_TPR_ACCESS: u32 = 12;
pub const KVM_EXIT_S390_SIEIC: u32 = 13;
pub const KVM_EXIT_S390_RESET: u32 = 14;
pub const KVM_EXIT_DCR: u32 = 15;
pub const KVM_EXIT_NMI: u32 = 16;
pub const KVM_EXIT_INTERNAL_ERROR: u32 = 17;
pub const KVM_EXIT_OSI: u32 = 18;
pub const KVM_EXIT_PAPR_HCALL: u32 = 19;
pub const KVM_EXIT_S390_UCONTROL: u32 = 20;
pub const KVM_EXIT_WATCHDOG: u32 = 21;
pub const KVM_EXIT_S390_TSCH: u32 = 22;
pub const KVM_EXIT_EPR: u32 = 23;
pub const KVM_EXIT_SYSTEM_EVENT: u32 = 24;
pub const KVM_EXIT_S390_STSI: u32 = 25;
pub const KVM_EXIT_IOAPIC_EOI: u32 = 26;
pub const KVM_EXIT_HYPERV: u32 = 27;
pub const KVM_EXIT_ARM_NISV: u32 = 28;
pub const KVM_EXIT_X86_RDMSR: u32 = 29;
pub const KVM_EXIT_X86_WRMSR: u32 = 30;
pub const KVM_EXIT_DIRTY_RING_FULL: u32 = 31;
pub const KVM_EXIT_AP_RESET_HOLD: u32 = 32;
pub const KVM_EXIT_X86_BUS_LOCK: u32 = 33;
pub const KVM_EXIT_XEN: u32 = 34;
pub const KVM_EXIT_RISCV_SBI: u32 = 35;
pub const KVM_EXIT_RISCV_CSR: u32 = 36;
pub const KVM_EXIT_NOTIFY: u32 = 37;
}

View File

@ -0,0 +1,19 @@
#![allow(dead_code)]
pub const VMX_EPT_MT_EPTE_SHIFT: u64 = 3;
pub const VMX_EPTP_PWL_MASK: u64 = 0x38;
pub const VMX_EPTP_PWL_4: u64 = 0x18;
pub const VMX_EPTP_PWL_5: u64 = 0x20;
pub const VMX_EPTP_AD_ENABLE_BIT: u64 = 1 << 6;
pub const VMX_EPTP_MT_MASK: u64 = 0x7;
pub const VMX_EPTP_MT_WB: u64 = 0x6;
pub const VMX_EPTP_MT_UC: u64 = 0x0;
pub const VMX_EPT_READABLE_MASK: u64 = 0x1;
pub const VMX_EPT_WRITABLE_MASK: u64 = 0x2;
pub const VMX_EPT_EXECUTABLE_MASK: u64 = 0x4;
pub const VMX_EPT_IPAT_BIT: u64 = 1 << 6;
pub const VMX_EPT_ACCESS_BIT: u64 = 1 << 8;
pub const VMX_EPT_DIRTY_BIT: u64 = 1 << 9;
pub const VMX_EPT_RWX_MASK: u64 =
VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | VMX_EPT_EXECUTABLE_MASK;
pub const VMX_EPT_MT_MASK: u64 = 7 << VMX_EPT_MT_EPTE_SHIFT;

View File

@ -0,0 +1,591 @@
use raw_cpuid::CpuId;
use x86::{
msr,
vmx::vmcs::control::{
EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls,
},
};
use crate::{
arch::vm::{
mmu::kvm_mmu::PageLevel, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR,
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR,
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR,
},
virt::vm::kvm_host::vcpu::VirtCpu,
};
use super::{vmcs::feat::VmxFeat, Vmx};
/// The VMCS configuration chosen for this host: VMCS region size/revision
/// and the adjusted pin-based, processor-based, VM-exit and VM-entry
/// control values, plus nested-VMX capability MSR values.
#[derive(Debug)]
pub struct VmcsConfig {
    pub size: u32,
    pub basic_cap: u32,
    pub revision_id: u32,
    pub pin_based_exec_ctrl: PinbasedControls,
    pub cpu_based_exec_ctrl: PrimaryControls,
    pub cpu_based_2nd_exec_ctrl: SecondaryControls,
    pub cpu_based_3rd_exec_ctrl: u32,
    pub vmexit_ctrl: ExitControls,
    pub vmentry_ctrl: EntryControls,
    pub misc: u64,
    pub nested: NestedVmxMsrs,
}
impl Default for VmcsConfig {
    // Manual impl because the bitflags control types do not derive Default.
    fn default() -> Self {
        Self {
            size: Default::default(),
            basic_cap: Default::default(),
            revision_id: Default::default(),
            pin_based_exec_ctrl: PinbasedControls::empty(),
            cpu_based_exec_ctrl: PrimaryControls::empty(),
            cpu_based_2nd_exec_ctrl: SecondaryControls::empty(),
            cpu_based_3rd_exec_ctrl: Default::default(),
            vmexit_ctrl: ExitControls::empty(),
            vmentry_ctrl: EntryControls::empty(),
            misc: Default::default(),
            nested: Default::default(),
        }
    }
}
/// Nested-VMX capability MSR values (the IA32_VMX_* MSRs exposed to L1).
#[derive(Debug, Default)]
pub struct NestedVmxMsrs {
    /// Primary processor-based controls, low 32 bits
    pub procbased_ctls_low: u32,
    /// Primary processor-based controls, high 32 bits
    pub procbased_ctls_high: u32,
    /// Secondary processor-based controls, low 32 bits
    pub secondary_ctls_low: u32,
    /// Secondary processor-based controls, high 32 bits
    pub secondary_ctls_high: u32,
    /// Pin-based controls, low 32 bits
    pub pinbased_ctls_low: u32,
    /// Pin-based controls, high 32 bits
    pub pinbased_ctls_high: u32,
    /// VM-exit controls, low 32 bits
    pub exit_ctls_low: u32,
    /// VM-exit controls, high 32 bits
    pub exit_ctls_high: u32,
    /// VM-entry controls, low 32 bits
    pub entry_ctls_low: u32,
    /// VM-entry controls, high 32 bits
    pub entry_ctls_high: u32,
    /// Miscellaneous controls, low 32 bits
    pub misc_low: u32,
    /// Miscellaneous controls, high 32 bits
    pub misc_high: u32,
    /// Extended page table (EPT) capabilities
    pub ept_caps: u32,
    /// Virtual processor identifier (VPID) capabilities
    pub vpid_caps: u32,
    /// Basic VMX capabilities (IA32_VMX_BASIC)
    pub basic: u64,
    /// CR0 bits fixed to 1 under VMX
    pub cr0_fixed0: u64,
    /// CR0 bits fixed to 0 under VMX
    pub cr0_fixed1: u64,
    /// CR4 bits fixed to 1 under VMX
    pub cr4_fixed0: u64,
    /// CR4 bits fixed to 0 under VMX
    pub cr4_fixed1: u64,
    /// VMCS field enumeration (IA32_VMX_VMCS_ENUM)
    pub vmcs_enum: u64,
    /// VM-function controls
    pub vmfunc_controls: u64,
}
impl NestedVmxMsrs {
pub fn control_msr(low: u32, high: u32) -> u64 {
(high as u64) << 32 | low as u64
}
pub fn get_vmx_msr(&self, msr_index: u32) -> Option<u64> {
match msr_index {
msr::IA32_VMX_BASIC => {
return Some(self.basic);
}
msr::IA32_VMX_TRUE_PINBASED_CTLS | msr::IA32_VMX_PINBASED_CTLS => {
let mut data =
NestedVmxMsrs::control_msr(self.pinbased_ctls_low, self.pinbased_ctls_high);
if msr_index == msr::IA32_VMX_PINBASED_CTLS {
data |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
}
return Some(data);
}
msr::IA32_VMX_TRUE_PROCBASED_CTLS | msr::IA32_VMX_PROCBASED_CTLS => {
let mut data =
NestedVmxMsrs::control_msr(self.procbased_ctls_low, self.procbased_ctls_high);
if msr_index == msr::IA32_VMX_PROCBASED_CTLS {
data |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
}
return Some(data);
}
msr::IA32_VMX_TRUE_EXIT_CTLS | msr::IA32_VMX_EXIT_CTLS => {
let mut data = NestedVmxMsrs::control_msr(self.exit_ctls_low, self.exit_ctls_high);
if msr_index == msr::IA32_VMX_EXIT_CTLS {
data |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
}
return Some(data);
}
msr::IA32_VMX_TRUE_ENTRY_CTLS | msr::IA32_VMX_ENTRY_CTLS => {
let mut data =
NestedVmxMsrs::control_msr(self.entry_ctls_low, self.entry_ctls_high);
if msr_index == msr::IA32_VMX_ENTRY_CTLS {
data |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
}
return Some(data);
}
msr::IA32_VMX_MISC => {
return Some(NestedVmxMsrs::control_msr(self.misc_low, self.misc_high));
}
msr::IA32_VMX_CR0_FIXED0 => {
return Some(self.cr0_fixed0);
}
msr::IA32_VMX_CR0_FIXED1 => {
return Some(self.cr0_fixed1);
}
msr::IA32_VMX_CR4_FIXED0 => {
return Some(self.cr4_fixed0);
}
msr::IA32_VMX_CR4_FIXED1 => {
return Some(self.cr4_fixed1);
}
msr::IA32_VMX_VMCS_ENUM => {
return Some(self.vmcs_enum);
}
msr::IA32_VMX_PROCBASED_CTLS2 => {
return Some(NestedVmxMsrs::control_msr(
self.secondary_ctls_low,
self.secondary_ctls_high,
));
}
msr::IA32_VMX_EPT_VPID_CAP => {
return Some(self.ept_caps as u64 | ((self.vpid_caps as u64) << 32));
}
msr::IA32_VMX_VMFUNC => {
return Some(self.vmfunc_controls);
}
_ => {
return None;
}
}
}
}
/// EPT and VPID capabilities decoded from the IA32_VMX_EPT_VPID_CAP MSR
/// (see `set_val_from_msr_val`: low 32 bits -> ept, high 32 bits -> vpid).
#[derive(Debug, Default)]
pub struct VmxCapability {
    /// EPT capability flags
    pub ept: EptFlag,
    /// VPID capability flags
    pub vpid: VpidFlag,
}
/// Mode in which Intel Processor Trace is exposed under virtualization.
/// NOTE(review): variant semantics inferred from the names only — confirm
/// at the use sites.
#[derive(Debug, PartialEq)]
pub enum ProcessorTraceMode {
    System,
    HostGuest,
}
bitflags! {
    #[derive(Default)]
    pub struct VpidFlag: u32 {
        /// The processor supports the INVVPID instruction
        const INVVPID = 1 << 0; /* (32 - 32) */
        /// INVVPID supports individual-address invalidation
        const EXTENT_INDIVIDUAL_ADDR = 1 << 8; /* (40 - 32) */
        /// INVVPID supports single-context invalidation
        const EXTENT_SINGLE_CONTEXT = 1 << 9; /* (41 - 32) */
        /// INVVPID supports all-context (global) invalidation
        const EXTENT_GLOBAL_CONTEXT = 1 << 10; /* (42 - 32) */
        /// INVVPID supports single-context invalidation retaining globals
        const EXTENT_SINGLE_NON_GLOBAL = 1 << 11; /* (43 - 32) */
    }

    #[derive(Default)]
    pub struct EptFlag: u32 {
        /// EPT entries may be execute-only
        const EPT_EXECUTE_ONLY = 1;
        /// 4-level EPT page walk supported
        const EPT_PAGE_WALK_4 = 1 << 6;
        /// 5-level EPT page walk supported
        const EPT_PAGE_WALK_5 = 1 << 7;
        /// EPT paging structures may use uncached (UC) memory type
        const EPTP_UC = 1 << 8;
        /// EPT paging structures may use write-back (WB) memory type
        const EPTP_WB = 1 << 14;
        /// 2MB EPT pages supported
        const EPT_2MB_PAGE = 1 << 16;
        /// 1GB EPT pages supported
        const EPT_1GB_PAGE = 1 << 17;
        /// INVEPT instruction supported (flushes EPT TLB entries)
        const EPT_INVEPT = 1 << 20;
        /// EPT accessed/dirty bits supported
        const EPT_AD = 1 << 21;
        /// INVEPT single-context invalidation supported
        const EPT_EXTENT_CONTEXT = 1 << 25;
        /// INVEPT global invalidation supported
        const EPT_EXTENT_GLOBAL = 1 << 26;
    }
}
impl VmxCapability {
    /// Decode a raw IA32_VMX_EPT_VPID_CAP MSR value: the low 32 bits hold
    /// the EPT capability flags, the high 32 bits the VPID flags. Unknown
    /// bits are silently dropped by `from_bits_truncate`.
    pub fn set_val_from_msr_val(&mut self, val: u64) {
        let (ept_raw, vpid_raw) = (val as u32, (val >> 32) as u32);
        self.ept = EptFlag::from_bits_truncate(ept_raw);
        self.vpid = VpidFlag::from_bits_truncate(vpid_raw);
    }
}
impl Vmx {
    /// Whether the processor reports the VMX "basic INOUT" capability in
    /// the high half of IA32_VMX_BASIC.
    #[inline]
    #[allow(dead_code)]
    pub fn has_basic_inout(&self) -> bool {
        return ((self.vmcs_config.basic_cap as u64) << 32) & VmxFeat::VMX_BASIC_INOUT != 0;
    }

    /// Whether virtual NMIs are usable: requires both the pin-based
    /// "virtual NMIs" control and the CPU-based "NMI-window exiting" control.
    #[inline]
    pub fn has_virtual_nmis(&self) -> bool {
        return self
            .vmcs_config
            .pin_based_exec_ctrl
            .contains(PinbasedControls::VIRTUAL_NMIS)
            && self
                .vmcs_config
                .cpu_based_exec_ctrl
                .contains(PrimaryControls::NMI_WINDOW_EXITING);
    }

    /// Whether the VMX preemption timer is enabled.
    #[inline]
    pub fn has_preemption_timer(&self) -> bool {
        return self
            .vmcs_config
            .pin_based_exec_ctrl
            .contains(PinbasedControls::VMX_PREEMPTION_TIMER);
    }

    /// Whether posted interrupts are enabled.
    #[inline]
    pub fn has_posted_intr(&self) -> bool {
        return self
            .vmcs_config
            .pin_based_exec_ctrl
            .contains(PinbasedControls::POSTED_INTERRUPTS);
    }

    /// Whether VM entry loads IA32_EFER from the VMCS.
    #[inline]
    pub fn has_load_ia32_efer(&self) -> bool {
        return self
            .vmcs_config
            .vmentry_ctrl
            .contains(EntryControls::LOAD_IA32_EFER);
    }

    /// Whether VM entry loads IA32_PERF_GLOBAL_CTRL from the VMCS.
    #[inline]
    pub fn has_load_perf_global_ctrl(&self) -> bool {
        return self
            .vmcs_config
            .vmentry_ctrl
            .contains(EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL);
    }

    /// Whether VM entry loads IA32_BNDCFGS (MPX bound configuration).
    #[inline]
    pub fn has_mpx(&self) -> bool {
        return self
            .vmcs_config
            .vmentry_ctrl
            .contains(EntryControls::LOAD_IA32_BNDCFGS);
    }

    /// Whether the TPR (task-priority register) shadow is enabled.
    #[inline]
    pub fn has_tpr_shadow(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_exec_ctrl
            .contains(PrimaryControls::USE_TPR_SHADOW);
    }

    /// Whether VPID (Virtual Processor ID) support is enabled.
    ///
    /// VPIDs let the VMM tag TLB entries with a per-vcpu identifier, so
    /// switching between virtual machines does not require flushing the
    /// whole TLB, which improves virtualization performance.
    #[inline]
    pub fn has_vpid(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::ENABLE_VPID);
    }

    /// Whether the INVVPID instruction is supported.
    ///
    /// INVVPID invalidates TLB entries associated with a given VPID.
    #[inline]
    pub fn has_invvpid(&self) -> bool {
        return self.vmx_cap.vpid.contains(VpidFlag::INVVPID);
    }

    /// Whether INVVPID supports individual-address invalidation.
    #[allow(dead_code)]
    #[inline]
    pub fn has_invvpid_individual_addr(&self) -> bool {
        return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_INDIVIDUAL_ADDR);
    }

    /// Whether INVVPID supports single-context invalidation.
    #[inline]
    pub fn has_invvpid_single(&self) -> bool {
        return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_SINGLE_CONTEXT);
    }

    /// Whether INVVPID supports all-context (global) invalidation.
    #[inline]
    pub fn has_invvpid_global(&self) -> bool {
        return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_GLOBAL_CONTEXT);
    }

    /// Whether EPT (Extended Page Tables) is enabled.
    ///
    /// EPT is the hardware second-level address translation: it lets the
    /// hypervisor control guest-physical to host-physical mappings directly
    /// in hardware, improving performance and isolation.
    #[inline]
    pub fn has_ept(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::ENABLE_EPT);
    }

    /// Whether 4-level EPT page walks are supported.
    #[inline]
    pub fn has_ept_4levels(&self) -> bool {
        return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_4);
    }

    /// Whether 5-level EPT page walks are supported.
    #[inline]
    pub fn has_ept_5levels(&self) -> bool {
        return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_5);
    }

    /// Maximum supported EPT page-walk depth: 5 if available, otherwise 4.
    pub fn get_max_ept_level(&self) -> usize {
        if self.has_ept_5levels() {
            return 5;
        }
        return 4;
    }

    /// Largest huge-page size the EPT supports, expressed as a `PageLevel`.
    pub fn ept_cap_to_lpage_level(&self) -> PageLevel {
        if self.vmx_cap.ept.contains(EptFlag::EPT_1GB_PAGE) {
            return PageLevel::Level1G;
        }
        if self.vmx_cap.ept.contains(EptFlag::EPT_2MB_PAGE) {
            return PageLevel::Level2M;
        }
        return PageLevel::Level4K;
    }

    /// Whether the EPT paging structures support write-back memory type.
    #[inline]
    pub fn has_ept_mt_wb(&self) -> bool {
        return self.vmx_cap.ept.contains(EptFlag::EPTP_WB);
    }

    /// Whether INVEPT supports single-context invalidation.
    #[inline]
    pub fn has_vmx_invept_context(&self) -> bool {
        self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_CONTEXT)
    }

    /// Whether INVEPT supports global (all-context) invalidation.
    #[inline]
    pub fn has_invept_global(&self) -> bool {
        return self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_GLOBAL);
    }

    /// Whether EPT accessed/dirty bits are supported.
    #[inline]
    pub fn has_ept_ad_bits(&self) -> bool {
        return self.vmx_cap.ept.contains(EptFlag::EPT_AD);
    }

    /// Whether the VMX "unrestricted guest" feature is enabled.
    ///
    /// Unrestricted guest lets the guest OS run (e.g. in real mode)
    /// without host intervention.
    #[inline]
    pub fn has_unrestricted_guest(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::UNRESTRICTED_GUEST);
    }

    /// Whether FlexPriority is available.
    ///
    /// FlexPriority is usable when the TPR shadow and virtualized APIC
    /// accesses are both available: the TPR shadow lets the VMM track and
    /// intercept guest TPR updates, and APIC-access virtualization lets it
    /// control guest APIC register accesses.
    // NOTE(review): method name is misspelled ("flexproirity"); renaming
    // would break callers, so it is only flagged here.
    #[inline]
    pub fn has_flexproirity(&self) -> bool {
        return self.has_tpr_shadow() && self.has_virtualize_apic_accesses();
    }

    /// Whether virtualized APIC accesses are enabled, letting the VMM
    /// control guest accesses to APIC registers.
    #[inline]
    pub fn has_virtualize_apic_accesses(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::VIRTUALIZE_APIC);
    }

    /// Whether guest ENCLS execution causes a VM exit.
    #[inline]
    pub fn has_encls_vmexit(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::ENCLS_EXITING);
    }

    /// Whether PLE (Pause-Loop Exiting) is enabled.
    #[inline]
    pub fn has_ple(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::PAUSE_LOOP_EXITING);
    }

    /// Whether full APIC virtualization (APICv) is available: requires APIC
    /// register virtualization, posted interrupts and virtual interrupt
    /// delivery.
    #[inline]
    pub fn has_apicv(&self) -> bool {
        return self.has_apic_register_virt()
            && self.has_posted_intr()
            && self.has_virtual_intr_delivery();
    }

    /// Whether APIC register virtualization is enabled.
    #[inline]
    pub fn has_apic_register_virt(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::VIRTUALIZE_APIC_REGISTER);
    }

    /// Whether virtual interrupt delivery is enabled.
    #[inline]
    pub fn has_virtual_intr_delivery(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY);
    }

    /// Whether IPI virtualization (IPIv) is supported.
    /// Not implemented yet: always reports false.
    #[inline]
    pub fn has_ipiv(&self) -> bool {
        return false;
    }

    /// Whether TSC scaling is enabled.
    #[inline]
    pub fn has_tsc_scaling(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::USE_TSC_SCALING);
    }

    /// Whether PML (Page Modification Logging) is enabled.
    #[inline]
    pub fn has_pml(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::ENABLE_PML);
    }

    /// Whether MSR bitmaps are used to control MSR intercepts.
    #[inline]
    pub fn has_msr_bitmap(&self) -> bool {
        return self
            .vmcs_config
            .cpu_based_exec_ctrl
            .contains(PrimaryControls::USE_MSR_BITMAPS);
    }

    /// Whether the secondary processor-based controls are activated.
    // NOTE(review): method name is misspelled ("sceondary"); renaming would
    // break callers, so it is only flagged here.
    #[inline]
    pub fn has_sceondary_exec_ctrls(&self) -> bool {
        self.vmcs_config
            .cpu_based_exec_ctrl
            .contains(PrimaryControls::SECONDARY_CONTROLS)
    }

    /// Whether guest RDTSCP is enabled.
    #[inline]
    pub fn has_rdtscp(&self) -> bool {
        self.vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::ENABLE_RDTSCP)
    }

    /// Whether VM functions (VMFUNC) are enabled.
    #[inline]
    pub fn has_vmfunc(&self) -> bool {
        self.vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::ENABLE_VM_FUNCTIONS)
    }

    /// Whether XSAVES/XRSTORS are enabled for the guest.
    #[inline]
    pub fn has_xsaves(&self) -> bool {
        self.vmcs_config
            .cpu_based_2nd_exec_ctrl
            .contains(SecondaryControls::ENABLE_XSAVES_XRSTORS)
    }

    /// Whether UMIP must be emulated via descriptor-table exiting: true when
    /// the CPU lacks native UMIP but DTABLE exiting is available.
    // NOTE(review): `.unwrap()` panics if the CPUID extended-feature leaf is
    // unavailable — consider handling `None` explicitly.
    #[inline]
    pub fn vmx_umip_emulated(&self) -> bool {
        let feat = CpuId::new().get_extended_feature_info().unwrap().has_umip();
        return !feat
            && (self
                .vmcs_config
                .cpu_based_2nd_exec_ctrl
                .contains(SecondaryControls::DTABLE_EXITING));
    }

    /// Whether tertiary execution controls are activated (not supported yet).
    #[inline]
    pub fn has_tertiary_exec_ctrls(&self) -> bool {
        false
    }

    /// Whether bus-lock detection is supported (not supported yet).
    #[inline]
    pub fn has_bus_lock_detection(&self) -> bool {
        false
    }

    /// Whether the notify VM exit is supported (not supported yet).
    #[inline]
    pub fn has_notify_vmexit(&self) -> bool {
        false
    }

    /// Whether #PF must be intercepted for this vcpu.
    /// Currently always false; the `enable_ept` check below is disabled.
    #[inline]
    pub fn vmx_need_pf_intercept(&self, _vcpu: &VirtCpu) -> bool {
        // if (!enable_ept)
        //     return true;
        false
    }
}

View File

@ -0,0 +1,466 @@
use crate::arch::mm::LockedFrameAllocator;
use crate::arch::vm::asm::VmxAsm;
use crate::arch::vm::mmu::kvm_mmu::PageLevel;
use crate::arch::vm::mmu::mmu_internal::KvmPageFault;
use crate::arch::MMArch;
use crate::libs::spinlock::SpinLockGuard;
use crate::mm::allocator::page_frame::FrameAllocator;
use crate::mm::page::{
page_manager_lock_irqsave, EntryFlags, PageEntry, PageFlags, PageFlush, PageManager, PageType,
};
use crate::mm::{MemoryManagementArch, PhysAddr, VirtAddr};
use crate::smp::core::smp_get_processor_id;
use crate::smp::cpu::AtomicProcessorId;
use crate::smp::cpu::ProcessorId;
use core::ops::Add;
use core::sync::atomic::{compiler_fence, AtomicUsize, Ordering};
use log::{debug, error, warn};
use system_error::SystemError;
use x86::msr;
use x86::vmx::vmcs::control;
// pub const VMX_EPT_MT_EPTE_SHIFT:u64 = 3;
/// Read/write/execute permission mask of an EPT entry (bits 2:0).
pub const VMX_EPT_RWX_MASK: u64 = 0x7;
// Exit Qualifications for EPT Violations
/// Bit position: the access causing the violation was a data read.
pub const EPT_VIOLATION_ACC_READ_BIT: u64 = 0;
/// Bit position: the access causing the violation was a data write.
pub const EPT_VIOLATION_ACC_WRITE_BIT: u64 = 1;
/// Bit position: the access causing the violation was an instruction fetch.
pub const EPT_VIOLATION_ACC_INSTR_BIT: u64 = 2;
/// Shift of the RWX permission bits within the exit qualification.
pub const EPT_VIOLATION_RWX_SHIFT: u64 = 3;
/// Bit position: the guest linear-address field is valid.
pub const EPT_VIOLATION_GVA_IS_VALID_BIT: u64 = 7;
/// Bit position: the violation occurred during linear-address translation.
pub const EPT_VIOLATION_GVA_TRANSLATED_BIT: u64 = 8;
bitflags! {
    /// Decoded exit-qualification bits reported on an EPT-violation VM exit.
    pub struct EptViolationExitQual :u64{
        /// The access was a data read
        const ACC_READ = 1 << EPT_VIOLATION_ACC_READ_BIT;
        /// The access was a data write
        const ACC_WRITE = 1 << EPT_VIOLATION_ACC_WRITE_BIT;
        /// The access was an instruction fetch
        const ACC_INSTR = 1 << EPT_VIOLATION_ACC_INSTR_BIT;
        /// RWX permissions the faulting gpa actually had
        const RWX_MASK = VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT;
        /// The guest linear-address field is valid
        const GVA_IS_VALID = 1 << EPT_VIOLATION_GVA_IS_VALID_BIT;
        /// The violation happened during linear-address translation
        const GVA_TRANSLATED = 1 << EPT_VIOLATION_GVA_TRANSLATED_BIT;
    }
}
// /// 全局EPT物理页信息管理器
// pub static mut EPT_PAGE_MANAGER: Option<SpinLock<EptPageManager>> = None;
// /// 初始化EPT_PAGE_MANAGER
// pub fn ept_page_manager_init() {
// kinfo!("page_manager_init");
// let page_manager = SpinLock::new(EptPageManager::new());
// compiler_fence(Ordering::SeqCst);
// unsafe { EPT_PAGE_MANAGER = Some(page_manager) };
// compiler_fence(Ordering::SeqCst);
// kinfo!("page_manager_init done");
// }
// pub fn ept_page_manager_lock_irqsave() -> SpinLockGuard<'static, EptPageManager> {
// unsafe { EPT_PAGE_MANAGER.as_ref().unwrap().lock_irqsave() }
// }
// EPT page-table data structure
#[derive(Debug)]
pub struct EptPageTable {
    /// Start of the virtual-address range this table covers; the kernel
    /// accesses EPT tables through virtual addresses as well.
    base: VirtAddr,
    /// Physical address of this page-table page
    phys: PhysAddr,
    /// Level of this table in the hierarchy
    /// (PageLevel::Level4K == 1)
    level: PageLevel,
}
impl EptPageTable {
    /// Physical address of this page-table page.
    pub fn phys(&self) -> PhysAddr {
        self.phys
    }

    /// Write entry `i` of this page table.
    ///
    /// Returns `None` if `i` is out of range.
    pub unsafe fn set_entry(&self, i: usize, entry: PageEntry<MMArch>) -> Option<()> {
        let entry_virt = self.entry_virt(i)?;
        MMArch::write::<PageEntry<MMArch>>(entry_virt, entry);
        // Read the entry back purely for the debug log below.
        let page_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
        debug!("Set EPT entry: {:?} , index : {:?}", page_entry, i);
        return Some(());
    }

    /// Whether entry `i` of this page table has already been filled in.
    ///
    /// ## Returns
    /// - Some(true) if the entry is non-zero
    /// - Some(false) if the entry is still zero
    /// - None if `i` is out of the entry range
    pub fn entry_mapped(&self, i: usize) -> Option<bool> {
        let etv = unsafe { self.entry_virt(i) }?;
        if unsafe { MMArch::read::<usize>(etv) } != 0 {
            return Some(true);
        } else {
            return Some(false);
        }
    }

    /// Level of this page table in the hierarchy.
    #[inline(always)]
    pub fn level(&self) -> PageLevel {
        self.level
    }

    /// Start of the address range covered by entry `i`.
    #[allow(dead_code)]
    pub fn entry_base(&self, i: usize) -> Option<VirtAddr> {
        if i < MMArch::PAGE_ENTRY_NUM {
            let shift = (self.level as usize - 1) * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
            return Some(self.base.add(i << shift));
        } else {
            return None;
        }
    }

    /// Kernel-virtual address of this page-table page itself.
    #[inline(always)]
    pub unsafe fn virt(&self) -> VirtAddr {
        return MMArch::phys_2_virt(self.phys).unwrap();
    }

    /// Kernel-virtual address where entry `i` is stored (distinct from
    /// `entry_base`, which is the address range the entry covers).
    pub unsafe fn entry_virt(&self, i: usize) -> Option<VirtAddr> {
        if i < MMArch::PAGE_ENTRY_NUM {
            return Some(self.virt().add(i * MMArch::PAGE_ENTRY_SIZE));
        } else {
            return None;
        }
    }

    /// Read entry `i` of this page table.
    pub unsafe fn entry(&self, i: usize) -> Option<PageEntry<MMArch>> {
        let entry_virt = self.entry_virt(i)?;
        return Some(PageEntry::from_usize(MMArch::read::<usize>(entry_virt)));
    }

    /// Build a table handle from its covered base, physical address and level.
    pub fn new(base: VirtAddr, phys: PhysAddr, level: PageLevel) -> Self {
        Self { base, phys, level }
    }

    /// Index of the entry covering the given guest-physical address.
    ///
    /// ## Parameters
    ///
    /// - gpa: guest-physical address to translate
    ///
    /// ## Returns
    ///
    /// Index of the matching entry within this table. The original range
    /// check against `self.base` is commented out below, so this currently
    /// never returns `None`.
    pub unsafe fn index_of(&self, gpa: PhysAddr) -> Option<usize> {
        let addr = VirtAddr::new(gpa.data() & MMArch::PAGE_ADDRESS_MASK);
        let shift = (self.level - 1) as usize * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
        //let mask = (MMArch::PAGE_ENTRY_NUM << shift) - 1;
        // if addr < self.base || addr >= self.base.add(mask) {
        //     return None;
        // } else {
        return Some((addr.data() >> shift) & MMArch::PAGE_ENTRY_MASK);
        //}
    }

    /// Page table one level below this one, reached through entry `index`.
    /// Returns `None` at the 4K level or when the entry is absent.
    pub fn next_level_table(&self, index: usize) -> Option<EptPageTable> {
        if self.level == PageLevel::Level4K {
            return None;
        }
        // Return the next-level table
        let phys = unsafe { self.entry(index)?.address() };
        let base;
        if let Ok(phys) = phys {
            base = unsafe { MMArch::phys_2_virt(PhysAddr::new(phys.data())).unwrap() };
        } else {
            base = unsafe { MMArch::phys_2_virt(PhysAddr::new(phys.unwrap_err().data())).unwrap() };
        }
        let level = self.level - 1;
        if let Err(_phys) = phys {
            debug!("EptPageTable::next_level_table: phys {:?}", phys);
            // The entry decodes as "not present". This detour exists because
            // during guest boot the faulting address is not-present yet
            // still needs to be mapped, so only treat the entry as truly
            // absent when its RWX bits are all clear.
            // NOTE(review): the author flagged this logic as possibly wrong.
            if _phys.data() & 0x7 == 0x000 {
                return None;
            }
            return Some(EptPageTable::new(base, PhysAddr::new(_phys.data()), level));
        }
        return Some(EptPageTable::new(
            base,
            PhysAddr::new(phys.unwrap().data()),
            level,
        ));
    }
}
// // EPT物理页管理器
// pub struct EptPageManager {
// phys2page: HashMap<PhysAddr, EptPageTable>,
// }
// impl EptPageManager {
// pub fn new() -> Self {
// Self {
// phys2page: HashMap::new(),
// }
// }
// }
/// Check that MTRRs are enabled on this processor.
///
/// Returns `EOPNOTSUPP_OR_ENOTSUP` when the global MTRR enable flag is clear.
#[allow(dead_code)]
pub fn check_ept_features() -> Result<(), SystemError> {
    // Bit 11 of IA32_MTRR_DEF_TYPE is the global MTRR enable flag.
    const MTRR_ENABLE_BIT: u64 = 1 << 11;
    let def_type = unsafe { msr::rdmsr(msr::IA32_MTRR_DEF_TYPE) };
    if def_type & MTRR_ENABLE_BIT != 0 {
        Ok(())
    } else {
        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
    }
}
/// Sentinel meaning "no processor currently holds the EPT mapper lock".
/// A plain `AtomicUsize::new(0)` cannot be used because 0 is a valid CPU id.
const EPT_MAPPER_NO_PROCESSOR: ProcessorId = ProcessorId::INVALID;
/// The processor that currently owns the EPT mapper lock.
static EPT_MAPPER_LOCK_OWNER: AtomicProcessorId = AtomicProcessorId::new(EPT_MAPPER_NO_PROCESSOR);
/// Re-entrancy counter for the EPT mapper lock.
static EPT_MAPPER_LOCK_COUNT: AtomicUsize = AtomicUsize::new(0);
/// Maps guest-physical addresses to host-physical frames in the EPT.
pub struct EptPageMapper {
    /// EPT page-table mapper
    //mapper: PageMapper,//PageTableKind::EPT, LockedFrameAllocator
    /// Whether this handle is read-only (true for a nested acquisition,
    /// i.e. the lock was already held by this CPU — see `lock_cpu`)
    readonly: bool,
    // Physical address of the EPT root table
    root_page_addr: PhysAddr,
    /// Frame allocator used for new table pages and leaf pages
    frame_allocator: LockedFrameAllocator,
}
impl EptPageMapper {
    /// Return the top-level (root) EPT page table.
    pub fn table(&self) -> EptPageTable {
        EptPageTable::new(
            unsafe { MMArch::phys_2_virt(self.root_page_addr).unwrap() },
            self.root_page_addr,
            PageLevel::Level512G,
        )
    }

    /// Physical address of the EPT root, read from the EPTP VMCS field.
    pub fn root_page_addr() -> PhysAddr {
        // Physical address of the PML4
        let eptp = VmxAsm::vmx_vmread(control::EPTP_FULL);
        let addr = eptp & 0xFFFF_FFFF_FFFF_F000; // strip the low 12 bits (EPTP attribute bits)
        PhysAddr::new(addr as usize)
    }

    /// Acquire the EPT mapper lock for `cpuid`, spinning until the lock is
    /// free or already owned by this CPU (re-entrant acquisition).
    fn lock_cpu(cpuid: ProcessorId) -> Self {
        loop {
            match EPT_MAPPER_LOCK_OWNER.compare_exchange_weak(
                EPT_MAPPER_NO_PROCESSOR,
                cpuid,
                Ordering::Acquire,
                Ordering::Relaxed,
            ) {
                Ok(_) => break,
                // This processor already holds the lock.
                Err(id) if id == cpuid => break,
                // either CAS failed, or some other hardware thread holds the lock
                Err(_) => core::hint::spin_loop(),
            }
        }
        let prev_count = EPT_MAPPER_LOCK_COUNT.fetch_add(1, Ordering::Relaxed);
        compiler_fence(Ordering::Acquire);
        // The local core already held the lock, so mark the mapper obtained
        // by this nested acquisition as read-only.
        let readonly = prev_count > 0;
        let root_page_addr = Self::root_page_addr();
        return Self {
            readonly,
            root_page_addr,
            frame_allocator: LockedFrameAllocator,
        };
    }

    /// Lock the EPT mapper and return a mapper object.
    /// This is currently the only way to obtain an `EptPageMapper`.
    #[inline(always)]
    pub fn lock() -> Self {
        // FIXME: is this the host cpu id or the vcpu id?
        let cpuid = smp_get_processor_id();
        return Self::lock_cpu(cpuid);
    }

    /// Check whether a gpa->hpa mapping already exists for the faulting gpa
    /// by walking the EPT down to the 4K level.
    // NOTE(review): `#[no_mangle]` on an inherent method looks unintended
    // and risks symbol clashes; nothing visible here needs the raw symbol.
    #[no_mangle]
    pub fn is_mapped(&self, page_fault: &mut KvmPageFault) -> bool {
        let gpa = page_fault.gpa() as usize;
        let mut page_table = self.table();
        let mut next_page_table;
        loop {
            let index: usize = unsafe {
                if let Some(i) = page_table.index_of(PhysAddr::new(gpa)) {
                    debug!("ept page table index: {:?}", i);
                    i
                } else {
                    error!("ept page table index_of failed");
                    return false;
                }
            };
            debug!("EPT table: index = {:?}, value = {:?}", index, page_table);
            if let Some(table) = page_table.next_level_table(index) {
                if table.level() == PageLevel::Level4K {
                    debug!("EPT table 4K: {:?}", table);
                    return true;
                }
                debug!("table.level(): {:?}", table.level());
                next_page_table = table;
            } else {
                return false;
            }
            page_table = next_page_table;
        }
    }

    /// Allocate a physical page (hpa) from this mapper's frame allocator
    /// and map it at the given gpa (masked to a page-aligned address first).
    pub fn map(&mut self, gpa: PhysAddr, flags: EntryFlags<MMArch>) -> Option<PageFlush<MMArch>> {
        let gpa = PhysAddr::new(gpa.data() & (!MMArch::PAGE_NEGATIVE_MASK) & !0xFFF);
        self.map_gpa(gpa, flags)
    }

    /// Map an hpa at the given gpa, allocating intermediate tables on demand.
    pub fn map_gpa(
        &mut self,
        gpa: PhysAddr,
        flags: EntryFlags<MMArch>,
    ) -> Option<PageFlush<MMArch>> {
        // Verify the address is page-aligned (logged, not fatal).
        if !(gpa.check_aligned(MMArch::PAGE_SIZE)) {
            error!("Try to map unaligned page: gpa={:?}", gpa);
        }
        // TODO: validate that `flags` is legal
        let mut table = self.table();
        debug!("ept page table: {:?}", table);
        loop {
            let i = unsafe { table.index_of(gpa).unwrap() };
            assert!(i < MMArch::PAGE_ENTRY_NUM);
            if table.level() == PageLevel::Level4K {
                // Check whether this 4K page has already been mapped.
                if table.entry_mapped(i).unwrap() {
                    unsafe {
                        let entry_virt = table.entry_virt(i)?;
                        let _set_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
                        warn!(
                            "index :: {:?} , Page gpa :: {:?} already mapped,content is: {:x}",
                            i,
                            gpa,
                            _set_entry.data()
                        );
                        return None;
                    };
                }
                // Allocate a physical page for the leaf entry.
                compiler_fence(Ordering::SeqCst);
                // let hpa: PhysAddr = unsafe { self.frame_allocator.allocate_one() }?;
                // debug!("Allocate hpa: {:?}", hpa);
                // Register the new page with the global page manager.
                let mut page_manager_guard: SpinLockGuard<'static, PageManager> =
                    page_manager_lock_irqsave();
                let page = page_manager_guard
                    .create_one_page(
                        PageType::Normal,
                        PageFlags::empty(),
                        &mut self.frame_allocator,
                    )
                    .ok()?;
                let hpa = page.phys_address();
                drop(page_manager_guard);
                // Zero the new frame.
                unsafe {
                    MMArch::write_bytes(MMArch::phys_2_virt(hpa).unwrap(), 0, MMArch::PAGE_SIZE)
                };
                let entry = PageEntry::new(hpa, flags);
                unsafe { table.set_entry(i, entry) };
                compiler_fence(Ordering::SeqCst);
                // Read the entry back for verification.
                unsafe {
                    let entry_virt = table.entry_virt(i)?;
                    let _set_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
                }
                return Some(PageFlush::new(unsafe { table.entry_virt(i)? }));
            } else {
                let next_table = table.next_level_table(i);
                if let Some(next_table) = next_table {
                    table = next_table;
                    debug!("already next table: {:?}", table);
                } else {
                    // Allocate the next-level table.
                    let frame = unsafe { self.frame_allocator.allocate_one() }?;
                    // Zero the new frame.
                    unsafe {
                        MMArch::write_bytes(
                            MMArch::phys_2_virt(frame).unwrap(),
                            0,
                            MMArch::PAGE_SIZE,
                        )
                    };
                    // FIXME: the flags (0x7 = RWX) used for intermediate
                    // entries may need revisiting.
                    let flags: EntryFlags<MMArch> = unsafe { EntryFlags::from_data(0x7) };
                    // Link the newly allocated table into the current one.
                    unsafe { table.set_entry(i, PageEntry::new(frame, flags)) };
                    // Descend into the newly allocated table.
                    table = table.next_level_table(i)?;
                }
            }
        }
    }
}
/// Debug helper: dump the EPT hierarchy rooted at the current EPTP.
#[allow(dead_code)]
pub fn debug_eptp() {
    let root_hpa: PhysAddr = EptPageMapper::lock().table().phys();
    debug!("Prepare to read EPTP address");
    let root_hva = unsafe { MMArch::phys_2_virt(PhysAddr::new(root_hpa.data())).unwrap() };
    debug!("PML4_hpa: 0x{:x}", root_hpa.data());
    debug!("PML4_hva: 0x{:x}", root_hva.data()); // Level512G table
    unsafe {
        let first_entry = MMArch::read::<u64>(root_hva);
        debug!("Value at EPTP address: 0x{:x}", first_entry); // Level2M
        // Walk every mapped entry starting from the top (level 4) table.
        traverse_ept_table(root_hva, 4);
    }
}
/// Recursively walk and log an EPT table for debugging.
///
/// `table_addr` is the kernel-virtual address of one 4KiB table page and
/// `level` counts down from 4 (root) to 0 (the 4KiB leaf level). The leaf
/// level recurses with `level - 1`, which wraps to `u8::MAX` and is caught
/// by the guard below, terminating the recursion.
unsafe fn traverse_ept_table(table_addr: VirtAddr, level: u8) {
    if level == (u8::MAX) {
        return;
    }
    // A 4KiB EPT table page holds 512 eight-byte entries; the previous
    // count of 511 silently skipped the last entry of every table.
    let entries = MMArch::read_array::<u64>(table_addr, 512);
    for (i, entry) in entries.iter().enumerate() {
        // Log present entries (any RWX bit set) and, at the leaf level,
        // every allocated entry.
        if *entry & 0x7 != 0 || level == 0 {
            let next_level_addr = if level != 0 {
                MMArch::phys_2_virt(PhysAddr::new((*entry & 0xFFFFFFFFF000) as usize))
            } else {
                // Leaf slot that has not been populated yet.
                if *entry == 0 {
                    continue;
                }
                MMArch::phys_2_virt(PhysAddr::new((*entry & 0xFFFFFFFFF000) as usize))
            };
            let entry_value = MMArch::read::<u64>(next_level_addr.unwrap());
            debug!(
                "Level {} - index {}: HPA: 0x{:016x}, read_to: 0x{:016x}",
                level, i, *entry, /*& 0xFFFFFFFFF000*/ entry_value,
            );
            // Recurse into the next-level table.
            traverse_ept_table(next_level_addr.unwrap(), level - 1);
        }
    }
}

View File

@ -0,0 +1,426 @@
use bitfield_struct::bitfield;
use system_error::SystemError;
use x86::vmx::vmcs::{guest, ro};
use crate::{
arch::vm::asm::{IntrInfo, VmxAsm},
virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
};
use super::{ept::EptViolationExitQual, vmx_info, PageFaultErr};
extern crate num_traits;
/// Decoded layout of the 32-bit VM_EXIT_REASON VMCS field: the low 16 bits
/// carry the basic exit reason, the high bits are status flags.
#[bitfield(u32)]
pub struct VmxExitReason {
    /// Basic exit reason code (see `VmxExitReasonBasic`)
    pub basic: u16,
    pub reserved16: bool,
    pub reserved17: bool,
    pub reserved18: bool,
    pub reserved19: bool,
    pub reserved20: bool,
    pub reserved21: bool,
    pub reserved22: bool,
    pub reserved23: bool,
    pub reserved24: bool,
    pub reserved25: bool,
    /// A bus lock was detected during the exit
    pub bus_lock_detected: bool,
    /// The exit happened from SGX enclave mode
    pub enclave_mode: bool,
    /// Pending MTF exit during SMI
    pub smi_pending_mtf: bool,
    /// SMI taken from VMX root operation
    pub smi_from_vmx_root: bool,
    pub reserved30: bool,
    /// Set when the VM entry itself failed
    pub failed_vmentry: bool,
}
// VMX basic exit-reason codes (low 16 bits of VM_EXIT_REASON).
/// Basic reason a VM exit occurred. Discriminants follow the hardware
/// encoding; codes not listed map to `UNKNOWN` via `From<u16>`.
#[derive(FromPrimitive, PartialEq, Clone, Copy)]
#[allow(non_camel_case_types)]
pub enum VmxExitReasonBasic {
    EXCEPTION_OR_NMI = 0,
    EXTERNAL_INTERRUPT = 1,
    TRIPLE_FAULT = 2,
    INIT_SIGNAL = 3,
    SIPI = 4,
    IO_SMI = 5,
    OTHER_SMI = 6,
    INTERRUPT_WINDOW = 7,
    NMI_WINDOW = 8,
    TASK_SWITCH = 9,
    CPUID = 10,
    GETSEC = 11,
    HLT = 12,
    INVD = 13,
    INVLPG = 14,
    RDPMC = 15,
    RDTSC = 16,
    RSM = 17,
    VMCALL = 18,
    VMCLEAR = 19,
    VMLAUNCH = 20,
    VMPTRLD = 21,
    VMPTRST = 22,
    VMREAD = 23,
    VMRESUME = 24,
    VMWRITE = 25,
    VMXOFF = 26,
    VMXON = 27,
    CR_ACCESS = 28,
    DR_ACCESS = 29,
    IO_INSTRUCTION = 30,
    RDMSR = 31,
    WRMSR = 32,
    VM_ENTRY_FAILURE_INVALID_GUEST_STATE = 33,
    VM_ENTRY_FAILURE_MSR_LOADING = 34,
    MWAIT = 36,
    MONITOR_TRAP_FLAG = 37,
    MONITOR = 39,
    PAUSE = 40,
    VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41,
    TPR_BELOW_THRESHOLD = 43,
    APIC_ACCESS = 44,
    VIRTUALIZED_EOI = 45, // "EOI_INDUCED"
    ACCESS_GDTR_OR_IDTR = 46,
    ACCESS_LDTR_OR_TR = 47,
    EPT_VIOLATION = 48,
    EPT_MISCONFIG = 49,
    INVEPT = 50,
    RDTSCP = 51,
    VMX_PREEMPTION_TIMER_EXPIRED = 52,
    INVVPID = 53,
    WBINVD = 54,
    XSETBV = 55,
    APIC_WRITE = 56,
    RDRAND = 57,
    INVPCID = 58,
    VMFUNC = 59,
    ENCLS = 60,
    RDSEED = 61,
    PML_FULL = 62,
    XSAVES = 63,
    XRSTORS = 64,
    UMWAIT = 67,
    TPAUSE = 68,
    BUS_LOCK = 74,
    NOTIFY = 75,
    /// Catch-all for codes this enum does not model
    UNKNOWN,
}
impl From<u16> for VmxExitReasonBasic {
    /// Map the 16-bit basic exit-reason field of VM_EXIT_REASON to the
    /// matching variant; any unhandled code maps to `UNKNOWN`.
    fn from(num: u16) -> Self {
        match num {
            0 => VmxExitReasonBasic::EXCEPTION_OR_NMI,
            1 => VmxExitReasonBasic::EXTERNAL_INTERRUPT,
            2 => VmxExitReasonBasic::TRIPLE_FAULT,
            3 => VmxExitReasonBasic::INIT_SIGNAL,
            4 => VmxExitReasonBasic::SIPI,
            5 => VmxExitReasonBasic::IO_SMI,
            6 => VmxExitReasonBasic::OTHER_SMI,
            7 => VmxExitReasonBasic::INTERRUPT_WINDOW,
            8 => VmxExitReasonBasic::NMI_WINDOW,
            9 => VmxExitReasonBasic::TASK_SWITCH,
            10 => VmxExitReasonBasic::CPUID,
            11 => VmxExitReasonBasic::GETSEC,
            12 => VmxExitReasonBasic::HLT,
            13 => VmxExitReasonBasic::INVD,
            14 => VmxExitReasonBasic::INVLPG,
            15 => VmxExitReasonBasic::RDPMC,
            16 => VmxExitReasonBasic::RDTSC,
            17 => VmxExitReasonBasic::RSM,
            18 => VmxExitReasonBasic::VMCALL,
            19 => VmxExitReasonBasic::VMCLEAR,
            20 => VmxExitReasonBasic::VMLAUNCH,
            21 => VmxExitReasonBasic::VMPTRLD,
            22 => VmxExitReasonBasic::VMPTRST,
            23 => VmxExitReasonBasic::VMREAD,
            24 => VmxExitReasonBasic::VMRESUME,
            25 => VmxExitReasonBasic::VMWRITE,
            26 => VmxExitReasonBasic::VMXOFF,
            27 => VmxExitReasonBasic::VMXON,
            28 => VmxExitReasonBasic::CR_ACCESS,
            29 => VmxExitReasonBasic::DR_ACCESS,
            30 => VmxExitReasonBasic::IO_INSTRUCTION,
            31 => VmxExitReasonBasic::RDMSR,
            32 => VmxExitReasonBasic::WRMSR,
            33 => VmxExitReasonBasic::VM_ENTRY_FAILURE_INVALID_GUEST_STATE,
            34 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MSR_LOADING,
            36 => VmxExitReasonBasic::MWAIT,
            37 => VmxExitReasonBasic::MONITOR_TRAP_FLAG,
            39 => VmxExitReasonBasic::MONITOR,
            40 => VmxExitReasonBasic::PAUSE,
            41 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT,
            43 => VmxExitReasonBasic::TPR_BELOW_THRESHOLD,
            44 => VmxExitReasonBasic::APIC_ACCESS,
            45 => VmxExitReasonBasic::VIRTUALIZED_EOI,
            46 => VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR,
            47 => VmxExitReasonBasic::ACCESS_LDTR_OR_TR,
            48 => VmxExitReasonBasic::EPT_VIOLATION,
            49 => VmxExitReasonBasic::EPT_MISCONFIG,
            50 => VmxExitReasonBasic::INVEPT,
            51 => VmxExitReasonBasic::RDTSCP,
            52 => VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED,
            53 => VmxExitReasonBasic::INVVPID,
            54 => VmxExitReasonBasic::WBINVD,
            55 => VmxExitReasonBasic::XSETBV,
            56 => VmxExitReasonBasic::APIC_WRITE,
            57 => VmxExitReasonBasic::RDRAND,
            58 => VmxExitReasonBasic::INVPCID,
            59 => VmxExitReasonBasic::VMFUNC,
            60 => VmxExitReasonBasic::ENCLS,
            61 => VmxExitReasonBasic::RDSEED,
            62 => VmxExitReasonBasic::PML_FULL,
            63 => VmxExitReasonBasic::XSAVES,
            64 => VmxExitReasonBasic::XRSTORS,
            67 => VmxExitReasonBasic::UMWAIT,
            68 => VmxExitReasonBasic::TPAUSE,
            74 => VmxExitReasonBasic::BUS_LOCK,
            75 => VmxExitReasonBasic::NOTIFY,
            _ => VmxExitReasonBasic::UNKNOWN,
        }
    }
}
/// Result of fast-path VM-exit handling.
/// NOTE(review): variant semantics inferred from the names — confirm at
/// the use sites.
#[derive(Debug, PartialEq)]
#[allow(dead_code)]
pub enum ExitFastpathCompletion {
    None,
    ReenterGuest,
    ExitHandled,
}
pub struct VmxExitHandlers {}
// //name 代表暂时不懂含义的(name linux=name DragonOS)
// ExceptionNmi = VmxExitReasonBasic::EXCEPTION_OR_NMI as isize,
// ExternalInterrupt = VmxExitReasonBasic::EXTERNAL_INTERRUPT as isize,
// TripleFault = VmxExitReasonBasic::TRIPLE_FAULT as isize,
// NmiWindow = VmxExitReasonBasic::NMI_WINDOW as isize,
// IoInstruction = VmxExitReasonBasic::IO_INSTRUCTION as isize,
// CrAccess = VmxExitReasonBasic::CR_ACCESS as isize,
// DrAccess = VmxExitReasonBasic::DR_ACCESS as isize,
// Cpuid = VmxExitReasonBasic::CPUID as isize,
// MsrRead = VmxExitReasonBasic::RDMSR as isize,
// MsrWrite = VmxExitReasonBasic::WRMSR as isize,
// InterruptWindow = VmxExitReasonBasic::INTERRUPT_WINDOW as isize,
// Hlt = VmxExitReasonBasic::HLT as isize,
// Invd = VmxExitReasonBasic::INVD as isize,
// Invlpg = VmxExitReasonBasic::INVLPG as isize,
// Rdpmc = VmxExitReasonBasic::RDPMC as isize,
// Vmcall = VmxExitReasonBasic::VMCALL as isize,
// Vmclear = VmxExitReasonBasic::VMCLEAR as isize,
// Vmlaunch = VmxExitReasonBasic::VMLAUNCH as isize,
// Vmptrld = VmxExitReasonBasic::VMPTRLD as isize,
// Vmptrst = VmxExitReasonBasic::VMPTRST as isize,
// Vmread = VmxExitReasonBasic::VMREAD as isize,
// Vmresume = VmxExitReasonBasic::VMRESUME as isize,
// Vmwrite = VmxExitReasonBasic::VMWRITE as isize,
// Vmoff = VmxExitReasonBasic::VMXOFF as isize,
// Vmon = VmxExitReasonBasic::VMXON as isize,
// TprBelowThreshold = VmxExitReasonBasic::TPR_BELOW_THRESHOLD as isize,
// ApicAccess = VmxExitReasonBasic::APIC_ACCESS as isize,
// ApicWrite = VmxExitReasonBasic::APIC_WRITE as isize,
// EoiInduced = VmxExitReasonBasic::VIRTUALIZED_EOI as isize, //name
// Wbinvd = VmxExitReasonBasic::WBINVD as isize,
// Xsetbv = VmxExitReasonBasic::XSETBV as isize,
// TaskSwitch = VmxExitReasonBasic::TASK_SWITCH as isize,
// MceDuringVmentry = VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT as isize, //name
// GdtrIdtr = VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR as isize,
// LdtrTr = VmxExitReasonBasic::ACCESS_LDTR_OR_TR as isize,
// EptViolation = VmxExitReasonBasic::EPT_VIOLATION as isize,
// EptMisconfig = VmxExitReasonBasic::EPT_MISCONFIG as isize,
// PauseInstruction = VmxExitReasonBasic::PAUSE as isize,
// MwaitInstruction = VmxExitReasonBasic::MWAIT as isize,
// MonitorTrapFlag = VmxExitReasonBasic::MONITOR_TRAP_FLAG as isize,
// MonitorInstruction = VmxExitReasonBasic::MONITOR as isize,
// Invept = VmxExitReasonBasic::INVEPT as isize,
// Invvpid = VmxExitReasonBasic::INVVPID as isize,
// Rdrand = VmxExitReasonBasic::RDRAND as isize,
// Rdseed = VmxExitReasonBasic::RDSEED as isize,
// PmlFull = VmxExitReasonBasic::PML_FULL as isize,
// Invpcid = VmxExitReasonBasic::INVPCID as isize,
// Vmfunc = VmxExitReasonBasic::VMFUNC as isize,
// PreemptionTimer = VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED as isize,
// Encls = VmxExitReasonBasic::ENCLS as isize,
// BusLock = VmxExitReasonBasic::BUS_LOCK as isize,
// Notify = VmxExitReasonBasic::NOTIFY as isize,
// Unknown,
impl VmxExitHandlers {
    /// Dispatch a VM exit to the matching handler, keyed on the basic exit
    /// reason taken from the VMCS exit-reason field.
    ///
    /// Returns `None` when no handler is implemented for `basic` yet (the
    /// caller decides how to treat an unhandled exit); otherwise returns the
    /// handler's result, where `Ok(1)` means "resume the guest".
    #[inline(never)]
    pub fn try_handle_exit(
        vcpu: &mut VirtCpu,
        vm: &Vm,
        basic: VmxExitReasonBasic,
    ) -> Option<Result<i32, SystemError>> {
        // let exit_reason = vmx_vmread(VmcsFields::VMEXIT_EXIT_REASON as u32).unwrap() as u32;
        // let exit_basic_reason = exit_reason & 0x0000_ffff;
        // let guest_rip = vmx_vmread(VmcsFields::GUEST_RIP as u32).unwrap();
        // let _guest_rflags = vmx_vmread(VmcsFields::GUEST_RFLAGS as u32).unwrap();
        match basic {
            VmxExitReasonBasic::IO_INSTRUCTION => {
                return Some(Self::handle_io(vcpu));
            }
            VmxExitReasonBasic::EPT_VIOLATION => {
                let r = Some(Self::handle_ept_violation(vcpu, vm));
                // `debug()` is a currently-empty hook kept as a convenient
                // place to dump VMCS fields while diagnosing EPT issues.
                debug();
                r
            }
            VmxExitReasonBasic::EXTERNAL_INTERRUPT => {
                return Some(Self::handle_external_interrupt(vcpu));
            }
            VmxExitReasonBasic::EXCEPTION_OR_NMI => {
                // Exception/NMI exits are not handled yet.
                todo!()
            }
            _ => None,
        }
    }

    /// Handler for IO_INSTRUCTION exits (port I/O emulation).
    /// Not implemented yet.
    fn handle_io(_vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
        todo!();
    }

    /// Handler for EXTERNAL_INTERRUPT exits: only accounts the event and
    /// resumes the guest; the interrupt itself is serviced by the host.
    fn handle_external_interrupt(vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
        vcpu.stat.irq_exits += 1;
        Ok(1)
    }

    /// Handler for EPT_VIOLATION exits: translates the exit qualification
    /// into a page-fault error code and forwards it to the MMU fault path.
    fn handle_ept_violation(vcpu: &mut VirtCpu, vm: &Vm) -> Result<i32, SystemError> {
        let exit_qualification = vcpu.get_exit_qual(); //0x184
        // If the EPT violation happened while executing IRET from an NMI,
        // the "blocked by NMI" bit must be set before the next VM entry.
        // Some errata may leave that bit cleared: AAK134, BY25.
        let vmx = vcpu.vmx();
        if vmx.idt_vectoring_info.bits() & IntrInfo::INTR_INFO_VALID_MASK.bits() != 0
            && vmx_info().enable_vnmi
            && exit_qualification & IntrInfo::INTR_INFO_UNBLOCK_NMI.bits() as u64 != 0
        {
            VmxAsm::vmx_vmwrite(guest::INTERRUPTIBILITY_STATE, 0x8); //GUEST_INTR_STATE_NMI
        }

        // Guest-physical address that faulted.
        let gpa = VmxAsm::vmx_vmread(ro::GUEST_PHYSICAL_ADDR_FULL);
        //let exit_qualification = VmxAsm::vmx_vmread(ro::EXIT_QUALIFICATION);
        // trace_kvm_page_fault(vcpu, gpa, exit_qualification);//

        // Derive the page-fault error code from the access-type bits of
        // the exit qualification.
        let mut error_code = if exit_qualification & (EptViolationExitQual::ACC_READ.bits()) != 0 {
            //debug!("error_code::ACC_READ");
            PageFaultErr::PFERR_USER.bits()
        } else {
            0
        };
        error_code |= if exit_qualification & (EptViolationExitQual::ACC_WRITE.bits()) != 0 {
            //debug!("error_code::ACC_WRITE");
            PageFaultErr::PFERR_WRITE.bits()
        } else {
            0
        };
        error_code |= if exit_qualification & (EptViolationExitQual::ACC_INSTR.bits()) != 0 {
            //debug!("error_code::ACC_INSTR");
            PageFaultErr::PFERR_FETCH.bits()
        } else {
            0
        };
        // Any permission bit present in the qualification means the
        // translation existed, i.e. the page was present.
        error_code |= if exit_qualification & (EptViolationExitQual::RWX_MASK.bits()) != 0 {
            //debug!("error_code::RWX_MASK");
            PageFaultErr::PFERR_PRESENT.bits()
        } else {
            0
        };
        // Debug aid only: whether the guest linear-address field is valid.
        if exit_qualification & (EptViolationExitQual::GVA_IS_VALID.bits()) != 0 {
            //debug!("GVA is valid");
        } else {
            //debug!("GVA is invalid");
        }
        error_code |= if exit_qualification & (EptViolationExitQual::GVA_TRANSLATED.bits()) != 0 {
            //debug!("error_code:GVA GVA_TRANSLATED");
            PageFaultErr::PFERR_GUEST_FINAL.bits()
        } else {
            PageFaultErr::PFERR_GUEST_PAGE.bits()
        };
        // FIXME: error_code was observed to be 0x100000011 at this point.
        vcpu.arch.exit_qual = exit_qualification;
        // Checking whether the GPA exceeds the physical-address limit would
        // require instruction emulation here, because EPT_VIOLATION_ACC_WRITE
        // is set when the illegal address is a paging-structure address.
        // Alternatively, when supported, the advanced VM-exit information
        // for EPT violations could be used to rebuild the error code.
        // if allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa) {
        //     return kvm_emulate_instruction(vcpu, 0);
        // }
        //debug!("EPT violation: error_code={:#x}", error_code);
        vcpu.page_fault(vm, gpa, error_code, None, 0)
    }
}
/// Post-EPT-violation debug hook.
///
/// Intentionally a no-op: the VMCS field dumps that used to live here
/// (VM-exit instruction length/info, exception bitmap, page-fault error-code
/// mask/match, IDT-vectoring info, interruption info, ...) were all
/// commented out once EPT mapping worked, so the dead commented-out code has
/// been removed. The empty function is kept so the call site in
/// `try_handle_exit` remains a convenient place to re-add
/// `VmxAsm::vmx_vmread(...)` + `debug!(...)` dumps when diagnosing VM exits.
fn debug() {
    // NOTE(review): previous experiments showed that reading
    // `control::EPTP_INDEX` and `control::VIRT_EXCEPTION_INFO_ADDR_FULL`
    // panicked (fields unsupported on the host CPU) — guard any such reads
    // behind a capability check when re-adding dumps here.
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,160 @@
use system_error::SystemError;
use x86::{
msr::{
IA32_VMX_ENTRY_CTLS, IA32_VMX_EXIT_CTLS, IA32_VMX_PINBASED_CTLS, IA32_VMX_PROCBASED_CTLS,
IA32_VMX_PROCBASED_CTLS2,
},
vmx::vmcs::control::{
EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls,
},
};
use crate::arch::vm::vmx::Vmx;
/// Namespace for the VMX control-field constants this KVM port requires or
/// optionally uses, plus helpers that adjust them against what the CPU
/// actually supports (via the IA32_VMX_* capability MSRs).
pub struct VmxFeat;
#[allow(dead_code)]
impl VmxFeat {
    /// Primary processor-based VM-execution controls that must be supported.
    pub const KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::HLT_EXITING.bits()
        | PrimaryControls::CR3_LOAD_EXITING.bits()
        | PrimaryControls::CR3_STORE_EXITING.bits()
        | PrimaryControls::UNCOND_IO_EXITING.bits()
        | PrimaryControls::MOV_DR_EXITING.bits()
        | PrimaryControls::USE_TSC_OFFSETTING.bits()
        | PrimaryControls::MWAIT_EXITING.bits()
        | PrimaryControls::MONITOR_EXITING.bits()
        | PrimaryControls::INVLPG_EXITING.bits()
        | PrimaryControls::RDPMC_EXITING.bits()
        | PrimaryControls::INTERRUPT_WINDOW_EXITING.bits()
        | PrimaryControls::CR8_LOAD_EXITING.bits()
        | PrimaryControls::CR8_STORE_EXITING.bits();
    /// Primary processor-based controls used when available.
    pub const KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::RDTSC_EXITING
        .bits()
        | PrimaryControls::USE_TPR_SHADOW.bits()
        | PrimaryControls::USE_IO_BITMAPS.bits()
        | PrimaryControls::MONITOR_TRAP_FLAG.bits()
        | PrimaryControls::USE_MSR_BITMAPS.bits()
        | PrimaryControls::NMI_WINDOW_EXITING.bits()
        | PrimaryControls::PAUSE_EXITING.bits()
        | PrimaryControls::SECONDARY_CONTROLS.bits();
    /// No secondary controls are strictly required.
    pub const KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = 0;
    /// Secondary processor-based controls used when available.
    pub const KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = SecondaryControls::VIRTUALIZE_APIC
        .bits()
        | SecondaryControls::VIRTUALIZE_X2APIC.bits()
        | SecondaryControls::WBINVD_EXITING.bits()
        | SecondaryControls::ENABLE_VPID.bits()
        | SecondaryControls::ENABLE_EPT.bits()
        | SecondaryControls::UNRESTRICTED_GUEST.bits()
        | SecondaryControls::PAUSE_LOOP_EXITING.bits()
        | SecondaryControls::DTABLE_EXITING.bits()
        | SecondaryControls::ENABLE_RDTSCP.bits()
        | SecondaryControls::ENABLE_INVPCID.bits()
        | SecondaryControls::VIRTUALIZE_APIC_REGISTER.bits()
        | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY.bits()
        | SecondaryControls::VMCS_SHADOWING.bits()
        | SecondaryControls::ENABLE_XSAVES_XRSTORS.bits()
        | SecondaryControls::RDSEED_EXITING.bits()
        | SecondaryControls::RDRAND_EXITING.bits()
        | SecondaryControls::ENABLE_PML.bits()
        | SecondaryControls::USE_TSC_SCALING.bits()
        | SecondaryControls::ENABLE_USER_WAIT_PAUSE.bits()
        | SecondaryControls::INTEL_PT_GUEST_PHYSICAL.bits()
        | SecondaryControls::CONCEAL_VMX_FROM_PT.bits()
        | SecondaryControls::ENABLE_VM_FUNCTIONS.bits()
        | SecondaryControls::ENCLS_EXITING.bits();
    // Not yet exposed by the x86 crate:
    // | SecondaryControls::BUS_LOCK_DETECTION.bits()
    // | SecondaryControls::NOTIFY_VM_EXITING.bits()
    /// VM-exit controls that must be supported.
    pub const KVM_REQUIRED_VMX_VM_EXIT_CONTROLS: u32 = ExitControls::SAVE_DEBUG_CONTROLS.bits()
        | ExitControls::ACK_INTERRUPT_ON_EXIT.bits()
        | ExitControls::HOST_ADDRESS_SPACE_SIZE.bits();
    /// VM-exit controls used when available.
    pub const KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS: u32 = ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL
        .bits()
        | ExitControls::SAVE_IA32_PAT.bits()
        | ExitControls::LOAD_IA32_PAT.bits()
        | ExitControls::SAVE_IA32_EFER.bits()
        | ExitControls::SAVE_VMX_PREEMPTION_TIMER.bits()
        | ExitControls::LOAD_IA32_EFER.bits()
        | ExitControls::CLEAR_IA32_BNDCFGS.bits()
        | ExitControls::CONCEAL_VMX_FROM_PT.bits()
        | ExitControls::CLEAR_IA32_RTIT_CTL.bits();
    /// Pin-based controls that must be supported.
    pub const KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 =
        PinbasedControls::EXTERNAL_INTERRUPT_EXITING.bits() | PinbasedControls::NMI_EXITING.bits();
    /// Pin-based controls used when available.
    pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 =
        PinbasedControls::VIRTUAL_NMIS.bits() | PinbasedControls::POSTED_INTERRUPTS.bits();
    /// VM-entry controls that must be supported.
    pub const KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS: u32 =
        EntryControls::LOAD_DEBUG_CONTROLS.bits() | EntryControls::IA32E_MODE_GUEST.bits();
    /// VM-entry controls used when available.
    pub const KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS: u32 = EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL
        .bits()
        | EntryControls::LOAD_IA32_PAT.bits()
        | EntryControls::LOAD_IA32_EFER.bits()
        | EntryControls::LOAD_IA32_BNDCFGS.bits()
        | EntryControls::CONCEAL_VMX_FROM_PT.bits()
        | EntryControls::LOAD_IA32_RTIT_CTL.bits();
    /* VMX_BASIC bits and bitmasks */
    pub const VMX_BASIC_VMCS_SIZE_SHIFT: u64 = 32;
    pub const VMX_BASIC_TRUE_CTLS: u64 = 1 << 55;
    pub const VMX_BASIC_64: u64 = 0x0001000000000000;
    pub const VMX_BASIC_MEM_TYPE_SHIFT: u64 = 50;
    pub const VMX_BASIC_MEM_TYPE_MASK: u64 = 0x003c000000000000;
    pub const VMX_BASIC_MEM_TYPE_WB: u64 = 6;
    pub const VMX_BASIC_INOUT: u64 = 0x0040000000000000;

    /// Primary processor-based controls, adjusted by IA32_VMX_PROCBASED_CTLS.
    pub fn adjust_primary_controls() -> Result<PrimaryControls, SystemError> {
        Ok(unsafe {
            // SAFETY: the MSR-adjusted value may carry bits not named in
            // PrimaryControls; the unchecked constructor keeps them instead
            // of truncating.
            PrimaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
                Self::KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
                Self::KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
                IA32_VMX_PROCBASED_CTLS,
            )?)
        })
    }

    /// Secondary processor-based controls, adjusted by IA32_VMX_PROCBASED_CTLS2.
    pub fn adjust_secondary_controls() -> Result<SecondaryControls, SystemError> {
        Ok(unsafe {
            // SAFETY: see adjust_primary_controls — unknown bits are retained.
            SecondaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
                Self::KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
                Self::KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
                IA32_VMX_PROCBASED_CTLS2,
            )?)
        })
    }

    /// VM-exit controls, adjusted by IA32_VMX_EXIT_CTLS.
    pub fn adjust_exit_controls() -> Result<ExitControls, SystemError> {
        Ok(unsafe {
            // SAFETY: see adjust_primary_controls — unknown bits are retained.
            ExitControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
                Self::KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
                Self::KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
                IA32_VMX_EXIT_CTLS,
            )?)
        })
    }

    /// VM-entry controls, adjusted by IA32_VMX_ENTRY_CTLS.
    pub fn adjust_entry_controls() -> Result<EntryControls, SystemError> {
        Ok(unsafe {
            // SAFETY: see adjust_primary_controls — unknown bits are retained.
            EntryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
                Self::KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
                Self::KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
                IA32_VMX_ENTRY_CTLS,
            )?)
        })
    }

    /// Pin-based controls, adjusted by IA32_VMX_PINBASED_CTLS.
    pub fn adjust_pin_based_controls() -> Result<PinbasedControls, SystemError> {
        Ok(unsafe {
            // SAFETY: see adjust_primary_controls — unknown bits are retained.
            PinbasedControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
                Self::KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
                Self::KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
                IA32_VMX_PINBASED_CTLS,
            )?)
        })
    }
}

View File

@ -0,0 +1,451 @@
use core::intrinsics::unlikely;
use alloc::{boxed::Box, collections::LinkedList, sync::Arc};
use bitmap::{traits::BitMapOps, AllocBitmap};
use x86::{
controlregs::Cr4,
vmx::vmcs::{
control::{self, PrimaryControls},
host,
},
};
use x86_64::{registers::control::Cr3Flags, structures::paging::PhysFrame};
use crate::{
arch::{
vm::asm::{IntrInfo, IntrType, VmxAsm},
MMArch,
},
libs::spinlock::{SpinLock, SpinLockGuard},
mm::{percpu::PerCpuVar, MemoryManagementArch, PhysAddr, VirtAddr},
smp::cpu::ProcessorId,
};
use super::vmx_info;
pub mod feat;
/// Per-CPU: the VMCS currently active on this CPU, if any.
pub static mut PERCPU_VMCS: Option<PerCpuVar<Option<Arc<LockedVMControlStructure>>>> = None;
/// Per-CPU: list of loaded-VMCS structures associated with this CPU.
pub static mut PERCPU_LOADED_VMCS_LIST: Option<PerCpuVar<LinkedList<Arc<LockedLoadedVmcs>>>> = None;
/// Per-CPU: the page-sized VMXON region for this CPU.
pub static mut VMXAREA: Option<PerCpuVar<Box<VMControlStructure>>> = None;

/// Current CPU's active-VMCS slot.
/// Panics if the per-CPU storage has not been initialized yet
/// (presumably during vmx_init — TODO confirm the init site).
pub fn current_vmcs() -> &'static Option<Arc<LockedVMControlStructure>> {
    unsafe { PERCPU_VMCS.as_ref().unwrap().get() }
}

/// Mutable access to the current CPU's active-VMCS slot.
pub fn current_vmcs_mut() -> &'static mut Option<Arc<LockedVMControlStructure>> {
    unsafe { PERCPU_VMCS.as_ref().unwrap().get_mut() }
}

/// Mutable access to this CPU's loaded-VMCS list.
pub fn current_loaded_vmcs_list_mut() -> &'static mut LinkedList<Arc<LockedLoadedVmcs>> {
    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get_mut() }
}

/// Shared access to this CPU's loaded-VMCS list.
#[allow(dead_code)]
pub fn current_loaded_vmcs_list() -> &'static LinkedList<Arc<LockedLoadedVmcs>> {
    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get() }
}

/// This CPU's VMXON region; panics if not yet initialized.
pub fn vmx_area() -> &'static PerCpuVar<Box<VMControlStructure>> {
    unsafe { VMXAREA.as_ref().unwrap() }
}
/// A page-sized, page-aligned VMX control structure (VMCS / VMXON region).
#[repr(C, align(4096))]
#[derive(Debug, Clone)]
pub struct VMControlStructure {
    /// Bits 30:0: VMCS revision identifier; bit 31: shadow-VMCS indicator.
    pub header: u32,
    /// VMX-abort indicator.
    pub abort: u32,
    /// Remainder of the page: implementation-specific VMCS data.
    pub data: [u8; MMArch::PAGE_SIZE - core::mem::size_of::<u32>() - core::mem::size_of::<u32>()],
}
impl VMControlStructure {
    /// Allocate a zeroed, page-aligned VMCS region and stamp it with the
    /// revision identifier cached in the VMCS configuration.
    pub fn new() -> Box<Self> {
        let mut vmcs: Box<VMControlStructure> = unsafe {
            // SAFETY: VMControlStructure is a #[repr(C)] byte container; the
            // all-zero bit pattern is a valid value for every field.
            Box::try_new_zeroed()
                .expect("alloc vmcs failed")
                .assume_init()
        };
        vmcs.set_revision_id(vmx_info().vmcs_config.revision_id);
        vmcs
    }

    /// VMCS revision identifier (bits 30:0 of the header).
    pub fn revision_id(&self) -> u32 {
        self.header & 0x7FFF_FFFF
    }

    /// Whether this region is marked as a shadow VMCS (bit 31 of the header).
    #[allow(dead_code)]
    pub fn is_shadow_vmcs(&self) -> bool {
        // Bug fix: the previous check `header & 0x8000_0000 == 1` was always
        // false, because masking bit 31 yields either 0 or 0x8000_0000.
        self.header & 0x8000_0000 != 0
    }

    /// Set or clear the shadow-VMCS indicator (bit 31 of the header).
    pub fn set_shadow_vmcs(&mut self, shadow: bool) {
        // Bug fix: use a read-modify-write so `shadow == false` actually
        // clears the bit; the old `|=` could only ever set it.
        self.header = (self.header & 0x7FFF_FFFF) | ((shadow as u32) << 31);
    }

    /// Overwrite the revision identifier, preserving the shadow bit.
    pub fn set_revision_id(&mut self, id: u32) {
        self.header = self.header & 0x8000_0000 | (id & 0x7FFF_FFFF);
    }
}
/// A VMCS behind a spinlock, paired with its precomputed physical address
/// (the address that VMPTRLD/VMCLEAR operate on).
#[derive(Debug)]
pub struct LockedVMControlStructure {
    /// Physical address of the inner VMCS page.
    phys_addr: PhysAddr,
    inner: SpinLock<Box<VMControlStructure>>,
}

impl LockedVMControlStructure {
    /// Allocate a VMCS (optionally marked as a shadow VMCS) and record its
    /// physical address.
    #[inline(never)]
    pub fn new(shadow: bool) -> Arc<Self> {
        let mut vmcs = VMControlStructure::new();
        // The box contents are 4 KiB-aligned (repr(align(4096))), so this
        // translation yields the start of the VMCS region.
        let phys_addr = unsafe {
            MMArch::virt_2_phys(VirtAddr::new(vmcs.as_ref() as *const _ as usize)).unwrap()
        };
        vmcs.set_shadow_vmcs(shadow);
        Arc::new(Self {
            phys_addr,
            inner: SpinLock::new(vmcs),
        })
    }

    /// Lock and access the VMCS contents.
    pub fn lock(&self) -> SpinLockGuard<Box<VMControlStructure>> {
        self.inner.lock()
    }

    /// Physical address of the VMCS region.
    pub fn phys_addr(&self) -> PhysAddr {
        self.phys_addr
    }
}
/// Cached copies of host-state VMCS fields, used to skip redundant VMWRITEs
/// when the host state has not changed between VM entries.
#[derive(Debug)]
pub struct VmcsHostState {
    /// Host CR3 (frame + flags) last written to the VMCS.
    pub cr3: (PhysFrame, Cr3Flags),
    /// Host CR4 last written to the VMCS.
    pub cr4: Cr4,
    /// Host GS base last written to the VMCS.
    pub gs_base: usize,
    /// Host FS base last written to the VMCS.
    pub fs_base: usize,
    /// Host RSP last written to the VMCS.
    pub rsp: usize,
    /// Raw FS selector last seen (the VMCS may hold 0 instead; see
    /// `set_host_fsgs`).
    pub fs_sel: u16,
    /// Raw GS selector last seen.
    pub gs_sel: u16,
    /// Host LDT selector.
    pub ldt_sel: u16,
    /// Host DS selector.
    pub ds_sel: u16,
    /// Host ES selector.
    pub es_sel: u16,
}
impl VmcsHostState {
    /// Sync the host FS/GS selectors and base addresses into the VMCS,
    /// writing each field only when it differs from the cached value.
    ///
    /// A selector with TI or RPL bits set (`(sel & 7) != 0`) is not written
    /// as-is: 0 is written to the VMCS instead, while the cache still
    /// remembers the raw selector so the next call with the same value can
    /// skip the VMWRITE. The base is written independently of the selector.
    pub fn set_host_fsgs(&mut self, fs_sel: u16, gs_sel: u16, fs_base: usize, gs_base: usize) {
        if unlikely(self.fs_sel != fs_sel) {
            if (fs_sel & 7) == 0 {
                VmxAsm::vmx_vmwrite(host::FS_SELECTOR, fs_sel as u64);
            } else {
                VmxAsm::vmx_vmwrite(host::FS_SELECTOR, 0);
            }
            self.fs_sel = fs_sel;
        }

        if unlikely(self.gs_sel != gs_sel) {
            if (gs_sel & 7) == 0 {
                VmxAsm::vmx_vmwrite(host::GS_SELECTOR, gs_sel as u64);
            } else {
                VmxAsm::vmx_vmwrite(host::GS_SELECTOR, 0);
            }
            self.gs_sel = gs_sel;
        }

        if unlikely(fs_base != self.fs_base) {
            VmxAsm::vmx_vmwrite(host::FS_BASE, fs_base as u64);
            self.fs_base = fs_base;
        }

        if unlikely(self.gs_base != gs_base) {
            VmxAsm::vmx_vmwrite(host::GS_BASE, gs_base as u64);
            self.gs_base = gs_base;
        }
    }
}
impl Default for VmcsHostState {
    /// All-clear cache: every selector/base is zero and CR3/CR4 are empty,
    /// so the first `set_host_fsgs`/sync after construction writes the
    /// actual host values into the VMCS.
    fn default() -> Self {
        let null_cr3 = (
            PhysFrame::containing_address(x86_64::PhysAddr::new(0)),
            Cr3Flags::empty(),
        );
        Self {
            cr3: null_cr3,
            cr4: Cr4::empty(),
            fs_base: 0,
            gs_base: 0,
            rsp: 0,
            fs_sel: 0,
            gs_sel: 0,
            ldt_sel: 0,
            ds_sel: 0,
            es_sel: 0,
        }
    }
}
/// Shadow copies of the VMCS control fields, letting `controls_set` skip a
/// VMWRITE when a control value has not actually changed.
#[derive(Debug, Default)]
pub struct VmcsControlsShadow {
    /// Cached VM-entry controls.
    vm_entry: u32,
    /// Cached VM-exit controls.
    vm_exit: u32,
    /// Cached pin-based execution controls.
    pin: u32,
    /// Cached primary processor-based execution controls.
    exec: u32,
    /// Cached secondary processor-based execution controls.
    secondary_exec: u32,
    /// Cached tertiary processor-based execution controls (64-bit field).
    tertiary_exec: u64,
}
/// Software bookkeeping that accompanies one hardware VMCS.
#[derive(Debug)]
#[allow(dead_code)]
pub struct LoadedVmcs {
    pub vmcs: Arc<LockedVMControlStructure>,
    pub shadow_vmcs: Option<Arc<LockedVMControlStructure>>,
    /// CPU this VMCS was last loaded on.
    pub cpu: ProcessorId,
    /// Whether VMLAUNCH has already been executed (so VMRESUME is used next).
    pub launched: bool,
    /// Whether NMIs are known to be unmasked.
    nmi_known_unmasked: bool,
    /// Whether the hypervisor timer is soft-disabled.
    hv_timer_soft_disabled: bool,
    /// For vnmi-less CPUs: whether virtual NMIs are soft-blocked.
    pub soft_vnmi_blocked: bool,
    /// VM-entry timestamp.
    entry_time: u64,
    /// Accumulated time VNMIs have been blocked.
    vnmi_blocked_time: u64,
    /// MSR interception bitmap.
    pub msr_bitmap: VmxMsrBitmap,
    /// Cached VMCS host-state fields.
    pub host_state: VmcsHostState,
    /// Shadow copies of the VMCS control fields.
    controls_shadow: VmcsControlsShadow,
}
impl LoadedVmcs {
    /// Write a VMCS control field, skipping the VMWRITE when the value
    /// matches the shadow copy.
    pub fn controls_set(&mut self, ctl_type: ControlsType, value: u64) {
        match ctl_type {
            ControlsType::VmEntry => {
                if self.controls_shadow.vm_entry != value as u32 {
                    VmxAsm::vmx_vmwrite(control::VMENTRY_CONTROLS, value);
                    self.controls_shadow.vm_entry = value as u32;
                }
            }
            ControlsType::VmExit => {
                if self.controls_shadow.vm_exit != value as u32 {
                    VmxAsm::vmx_vmwrite(control::VMEXIT_CONTROLS, value);
                    self.controls_shadow.vm_exit = value as u32;
                }
            }
            ControlsType::Pin => {
                if self.controls_shadow.pin != value as u32 {
                    VmxAsm::vmx_vmwrite(control::PINBASED_EXEC_CONTROLS, value);
                    self.controls_shadow.pin = value as u32;
                }
            }
            ControlsType::Exec => {
                if self.controls_shadow.exec != value as u32 {
                    VmxAsm::vmx_vmwrite(control::PRIMARY_PROCBASED_EXEC_CONTROLS, value);
                    self.controls_shadow.exec = value as u32;
                }
            }
            ControlsType::SecondaryExec => {
                if self.controls_shadow.secondary_exec != value as u32 {
                    VmxAsm::vmx_vmwrite(control::SECONDARY_PROCBASED_EXEC_CONTROLS, value);
                    self.controls_shadow.secondary_exec = value as u32;
                }
            }
            ControlsType::TertiaryExec => {
                if self.controls_shadow.tertiary_exec != value {
                    // 0x2034 = TERTIARY_PROCBASED_EXEC_CONTROLS_FULL; the
                    // `x86` crate does not export this field encoding yet.
                    VmxAsm::vmx_vmwrite(0x2034, value);
                    self.controls_shadow.tertiary_exec = value;
                }
            }
        }
    }

    /// Read the shadow copy of a control field (no VMREAD is performed).
    pub fn controls_get(&self, ctl_type: ControlsType) -> u64 {
        match ctl_type {
            ControlsType::VmEntry => self.controls_shadow.vm_entry as u64,
            ControlsType::VmExit => self.controls_shadow.vm_exit as u64,
            ControlsType::Pin => self.controls_shadow.pin as u64,
            ControlsType::Exec => self.controls_shadow.exec as u64,
            ControlsType::SecondaryExec => self.controls_shadow.secondary_exec as u64,
            ControlsType::TertiaryExec => self.controls_shadow.tertiary_exec,
        }
    }

    /// Set the given bits in a control field.
    pub fn controls_setbit(&mut self, ctl_type: ControlsType, value: u64) {
        let val = self.controls_get(ctl_type) | value;
        self.controls_set(ctl_type, val)
    }

    /// Clear the given bits in a control field.
    pub fn controls_clearbit(&mut self, ctl_type: ControlsType, value: u64) {
        let val = self.controls_get(ctl_type) & (!value);
        self.controls_set(ctl_type, val)
    }

    /// Whether a guest WRMSR to `msr` causes a VM exit.
    ///
    /// Bug fix: writes are unconditionally intercepted when the MSR-bitmap
    /// mechanism is *disabled*; only with USE_MSR_BITMAPS enabled does the
    /// per-MSR bitmap decide. The previous logic was inverted (it returned
    /// `true` when bitmaps were enabled, and consulted the bitmap when they
    /// were not), contrary to the Linux msr_write_intercepted() semantics
    /// this mirrors.
    pub fn msr_write_intercepted(&mut self, msr: u32) -> bool {
        let exec = self.controls_get(ControlsType::Exec) as u32;
        // SAFETY: unknown bits are retained, matching how the controls are
        // produced elsewhere in this module with from_bits_unchecked.
        let primary = unsafe { PrimaryControls::from_bits_unchecked(exec) };
        if !primary.contains(PrimaryControls::USE_MSR_BITMAPS) {
            return true;
        }

        self.msr_bitmap
            .ctl(msr, VmxMsrBitmapAction::Test, VmxMsrBitmapAccess::Write)
    }
}
/// `LoadedVmcs` behind a spinlock.
#[derive(Debug)]
pub struct LockedLoadedVmcs {
    inner: SpinLock<LoadedVmcs>,
}

/// Identifies one of the VMCS control fields mirrored by
/// `VmcsControlsShadow`.
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
pub enum ControlsType {
    VmEntry,
    VmExit,
    Pin,
    Exec,
    SecondaryExec,
    TertiaryExec,
}

impl LockedLoadedVmcs {
    /// Allocate a fresh VMCS (VMCLEARed, so it starts in the "clear" state)
    /// together with a page-sized MSR bitmap when the CPU supports one.
    pub fn new() -> Arc<Self> {
        // Idiom fix: the redundant `let bitmap = { let bitmap = ...; bitmap }`
        // binding (clippy::let_and_return) is collapsed.
        let bitmap = if vmx_info().has_msr_bitmap() {
            VmxMsrBitmap::new(true, MMArch::PAGE_SIZE * u8::BITS as usize)
        } else {
            VmxMsrBitmap::new(true, 0)
        };
        let vmcs = LockedVMControlStructure::new(false);
        VmxAsm::vmclear(vmcs.phys_addr);
        Arc::new(Self {
            inner: SpinLock::new(LoadedVmcs {
                vmcs,
                shadow_vmcs: None,
                cpu: ProcessorId::INVALID,
                launched: false,
                hv_timer_soft_disabled: false,
                msr_bitmap: bitmap,
                host_state: VmcsHostState::default(),
                controls_shadow: VmcsControlsShadow::default(),
                nmi_known_unmasked: false,
                soft_vnmi_blocked: false,
                entry_time: 0,
                vnmi_blocked_time: 0,
            }),
        })
    }

    /// Lock and access the inner `LoadedVmcs`.
    pub fn lock(&self) -> SpinLockGuard<LoadedVmcs> {
        self.inner.lock()
    }
}
/// MSR interception bitmap: backing bit storage plus the physical address
/// that gets programmed into the VMCS MSR-bitmap field.
#[derive(Debug)]
pub struct VmxMsrBitmap {
    /// Bit storage for the read/write interception bits.
    data: AllocBitmap,
    /// Physical address of the bitmap storage.
    phys_addr: usize,
}

/// Operation to perform on an MSR-bitmap bit.
pub enum VmxMsrBitmapAction {
    Test,
    Set,
    Clear,
}
/// Selects which half of the VMX MSR bitmap an operation targets.
pub enum VmxMsrBitmapAccess {
    Write,
    Read,
}

impl VmxMsrBitmapAccess {
    /// Bit offset at which the selected half of the bitmap begins: the read
    /// bitmaps start at bit 0, the write bitmaps at byte 0x800 (expressed
    /// here as a bit offset).
    pub const fn base(&self) -> usize {
        match self {
            Self::Read => 0,
            Self::Write => 0x800 * core::mem::size_of::<usize>(),
        }
    }
}
impl VmxMsrBitmap {
    /// Create a bitmap of `size` bits, all initialized to `init_val`, and
    /// record the physical address of its storage.
    pub fn new(init_val: bool, size: usize) -> Self {
        let mut data = AllocBitmap::new(size);
        data.set_all(init_val);
        let addr = data.data() as *const [usize] as *const usize as usize;
        Self {
            data,
            // NOTE(review): assumes the bitmap storage is physically
            // contiguous starting at this address — TODO confirm.
            phys_addr: unsafe { MMArch::virt_2_phys(VirtAddr::new(addr)).unwrap().data() },
        }
    }

    /// Physical address of the bitmap (programmed into the VMCS).
    pub fn phys_addr(&self) -> usize {
        self.phys_addr
    }

    /// Test/set/clear the interception bit for `msr` in the read or write
    /// half of the bitmap. MSRs outside the two architectural ranges are
    /// reported as intercepted (`true`).
    pub fn ctl(
        &mut self,
        msr: u32,
        action: VmxMsrBitmapAction,
        access: VmxMsrBitmapAccess,
    ) -> bool {
        if msr <= 0x1fff {
            // Low MSR range 0x0000_0000..=0x0000_1fff.
            return self.bit_op(msr as usize, access.base(), action);
        } else if (0xc0000000..=0xc0001fff).contains(&msr) {
            // High MSR range 0xc000_0000..=0xc000_1fff.
            // NOTE(review): this offset is known to be problematic and needs
            // a later re-check (in Linux, 0x400 is a *byte* offset into the
            // bitmap page, while `base()` here is measured in bits):
            // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h#450
            return self.bit_op(msr as usize & 0x1fff, access.base() + 0x400, action);
        } else {
            // Out-of-range MSRs are always intercepted.
            return true;
        }
    }

    /// Perform `action` on bit `base + msr` of the bitmap; `Test` returns
    /// the bit value (false when out of bounds), `Set`/`Clear` return true.
    fn bit_op(&mut self, msr: usize, base: usize, action: VmxMsrBitmapAction) -> bool {
        match action {
            VmxMsrBitmapAction::Test => {
                let ret = self.data.get(msr + base);
                ret.unwrap_or(false)
            }
            VmxMsrBitmapAction::Set => {
                self.data.set(msr + base, true);
                true
            }
            VmxMsrBitmapAction::Clear => {
                self.data.set(msr + base, false);
                true
            }
        }
    }
}
/// Helper predicates for decoding VM-exit / VM-entry interruption-info
/// fields.
pub struct VmcsIntrHelper;

impl VmcsIntrHelper {
    /// Whether `intr_info` describes a valid NMI event.
    pub fn is_nmi(intr_info: &IntrInfo) -> bool {
        Self::is_intr_type(intr_info, IntrType::INTR_TYPE_NMI_INTR)
    }

    /// Whether `intr_info` is valid and of the given interruption type:
    /// compares the (valid | type) bits against "valid + `intr_type`".
    pub fn is_intr_type(intr_info: &IntrInfo, intr_type: IntrType) -> bool {
        let relevant = IntrInfo::INTR_INFO_VALID_MASK | IntrInfo::INTR_INFO_INTR_TYPE_MASK;
        let expected = IntrInfo::INTR_INFO_VALID_MASK.bits() | intr_type.bits();
        (*intr_info & relevant).bits() == expected
    }

    /// Whether `intr_info` describes a valid external interrupt.
    pub fn is_external_intr(intr_info: &IntrInfo) -> bool {
        Self::is_intr_type(intr_info, IntrType::INTR_TYPE_EXT_INTR)
    }
}

View File

@ -0,0 +1,179 @@
#include "common/asm.h"

// Indices of the guest GPRs inside the register array handed to
// __vmx_vcpu_run (must match the Rust-side register-array layout).
#define __VCPU_REGS_RAX 0
#define __VCPU_REGS_RCX 1
#define __VCPU_REGS_RDX 2
#define __VCPU_REGS_RBX 3
#define __VCPU_REGS_RSP 4
#define __VCPU_REGS_RBP 5
#define __VCPU_REGS_RSI 6
#define __VCPU_REGS_RDI 7
#define __VCPU_REGS_R8 8
#define __VCPU_REGS_R9 9
#define __VCPU_REGS_R10 10
#define __VCPU_REGS_R11 11
#define __VCPU_REGS_R12 12
#define __VCPU_REGS_R13 13
#define __VCPU_REGS_R14 14
#define __VCPU_REGS_R15 15

// Byte offsets of each saved GPR (8 bytes per slot).
#define VCPU_RAX __VCPU_REGS_RAX * 8
#define VCPU_RCX __VCPU_REGS_RCX * 8
#define VCPU_RDX __VCPU_REGS_RDX * 8
#define VCPU_RBX __VCPU_REGS_RBX * 8
#define VCPU_RBP __VCPU_REGS_RBP * 8
#define VCPU_RSI __VCPU_REGS_RSI * 8
#define VCPU_RDI __VCPU_REGS_RDI * 8
#define VCPU_R8 __VCPU_REGS_R8 * 8
#define VCPU_R9 __VCPU_REGS_R9 * 8
#define VCPU_R10 __VCPU_REGS_R10 * 8
#define VCPU_R11 __VCPU_REGS_R11 * 8
#define VCPU_R12 __VCPU_REGS_R12 * 8
#define VCPU_R13 __VCPU_REGS_R13 * 8
#define VCPU_R14 __VCPU_REGS_R14 * 8
#define VCPU_R15 __VCPU_REGS_R15 * 8

// Flag bits of the run-flags argument passed to __vmx_vcpu_run.
#define VMX_RUN_VMRESUME_SHIFT 0
#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
#define VMX_RUN_VMRESUME 1 << VMX_RUN_VMRESUME_SHIFT
#define VMX_RUN_SAVE_SPEC_CTRL 1 << VMX_RUN_SAVE_SPEC_CTRL_SHIFT
// __vmx_vcpu_run: save host state, load the guest GPRs and enter the guest
// via VMLAUNCH/VMRESUME. The original comments were mojibake; argument
// meanings below are inferred from the Linux vmenter.S this file mirrors —
// NOTE(review): confirm (rdi = vmx, rsi = regs array, rdx = run flags).
ENTRY(__vmx_vcpu_run)
pushq %rbp
movq %rsp, %rbp
// Save host callee-saved registers.
pushq %r15
pushq %r14
pushq %r13
pushq %r12
push %rbx
// Save arg0 (vmx pointer).
push %rdi
// Save arg2 (run flags).
push %rdx
// Save arg1 (guest register array) — ends up at the top of the stack, so
// vmx_vmexit can find it at 8(%rsp) after pushing guest RAX.
push %rsi
// Keep the run flags in ebx across the call below.
mov %edx, %ebx
// Pass the current stack pointer so the host RSP VMCS field can be synced.
lea (%rsp), %rsi
call vmx_update_host_rsp
// TODO: spec_ctrl
.Lspec_ctrl_done:
mov %rsp, %rax
// CF <- VMRESUME flag; `bt` clobbers no GPRs, so the test survives the
// guest-register loads below.
bt $VMX_RUN_VMRESUME_SHIFT, %ebx
// Load guest GPRs from the register array (rax last — it holds the base).
mov VCPU_RCX(%rax), %rcx
mov VCPU_RDX(%rax), %rdx
mov VCPU_RBX(%rax), %rbx
mov VCPU_RBP(%rax), %rbp
mov VCPU_RSI(%rax), %rsi
mov VCPU_RDI(%rax), %rdi
mov VCPU_R8(%rax), %R8
mov VCPU_R9(%rax), %r9
mov VCPU_R10(%rax), %r10
mov VCPU_R11(%rax), %r11
mov VCPU_R12(%rax), %r12
mov VCPU_R13(%rax), %r13
mov VCPU_R14(%rax), %r14
mov VCPU_R15(%rax), %r15
mov VCPU_RAX(%rax), %rax
// TODO: clear cpu buffer
// CF clear => first entry => VMLAUNCH; CF set => VMRESUME.
jnc .Lvmlaunch
.Lvmresume:
vmresume
// Reaching here means VMRESUME failed.
jmp .Lvmfail
.Lvmlaunch:
// NOTE(review): unlike Linux, this calls a Rust wrapper instead of issuing
// `vmlaunch` inline — confirm the wrapper preserves the loaded GPRs.
call vmx_vmlaunch
// Reaching here means VMLAUNCH failed.
jmp .Lvmfail
// VM-exit landing pad: the CPU transfers here (host RIP) when the guest
// exits. Spills guest GPRs into the register array, scrubs registers, and
// returns to the caller of __vmx_vcpu_run.
ENTRY(vmx_vmexit)
// TODO: unwind hint restore
// Temporarily stash guest RAX.
push %rax
// Fetch the register-array pointer saved by __vmx_vcpu_run (now at rsp+8).
mov 8(%rsp), %rax
// Spill all guest GPRs into the register array.
pop VCPU_RAX(%rax)
mov %rcx, VCPU_RCX(%rax)
mov %rdx, VCPU_RDX(%rax)
mov %rbx, VCPU_RBX(%rax)
mov %rbp, VCPU_RBP(%rax)
mov %rsi, VCPU_RSI(%rax)
mov %rdi, VCPU_RDI(%rax)
mov %r8, VCPU_R8(%rax)
mov %r9, VCPU_R9(%rax)
mov %r10, VCPU_R10(%rax)
mov %r11, VCPU_R11(%rax)
mov %r12, VCPU_R12(%rax)
mov %r13, VCPU_R13(%rax)
mov %r14, VCPU_R14(%rax)
mov %r15, VCPU_R15(%rax)
// rbx = 0: normal VM exit (contrast with .Lvmfail below).
xor %ebx, %ebx
.Lclear_regs:
// Drop the saved regs pointer, then zero every GPR so no guest values
// linger in host registers.
pop %rax
xor %eax, %eax
xor %ecx, %ecx
xor %edx, %edx
xor %ebp, %ebp
xor %esi, %esi
xor %edi, %edi
xor %r8d, %r8d
xor %r9d, %r9d
xor %r10d, %r10d
xor %r11d, %r11d
xor %r12d, %r12d
xor %r13d, %r13d
xor %r14d, %r14d
xor %r15d, %r15d
// todo: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmenter.S#270
// Restore the saved run flags and vmx pointer as call arguments, then let
// Rust restore the host's speculation controls.
pop %rsi
pop %rdi
call vmx_spec_ctrl_restore_host
// Return value: 0 = VM exit, 1 = VMLAUNCH/VMRESUME failure.
mov %rbx, %rax
// Restore host callee-saved registers and return to the Rust caller.
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
pop %rbp
ret
.Lvmfail:
// VM entry failed: report 1 to the caller and scrub registers.
mov $1, %rbx
jmp .Lclear_regs

View File

@ -92,8 +92,11 @@ fn do_start_kernel() {
Futex::init(); Futex::init();
crate::bpf::init_bpf_system(); crate::bpf::init_bpf_system();
crate::debug::jump_label::static_keys_init(); crate::debug::jump_label::static_keys_init();
// #[cfg(all(target_arch = "x86_64", feature = "kvm"))]
// crate::virt::kvm::kvm_init();
#[cfg(all(target_arch = "x86_64", feature = "kvm"))] #[cfg(all(target_arch = "x86_64", feature = "kvm"))]
crate::virt::kvm::kvm_init(); crate::arch::vm::vmx::vmx_init().unwrap();
} }
/// 在内存管理初始化之前,执行的初始化 /// 在内存管理初始化之前,执行的初始化

View File

@ -1,5 +1,6 @@
#![no_main] // <1> #![no_main] // <1>
#![feature(alloc_error_handler)] #![feature(alloc_error_handler)]
#![feature(new_zeroed_alloc)]
#![feature(allocator_api)] #![feature(allocator_api)]
#![feature(arbitrary_self_types)] #![feature(arbitrary_self_types)]
#![feature(concat_idents)] #![feature(concat_idents)]

View File

@ -829,6 +829,15 @@ impl<K: Ord + Debug, V: Debug> IntoIterator for RBTree<K, V> {
} }
} }
impl<K: Ord + Debug, V: Debug> Default for RBTree<K, V> {
fn default() -> Self {
RBTree {
root: NodePtr::null(),
len: 0,
}
}
}
impl<K: Ord + Debug, V: Debug> RBTree<K, V> { impl<K: Ord + Debug, V: Debug> RBTree<K, V> {
/// Creates an empty `RBTree`. /// Creates an empty `RBTree`.
pub fn new() -> RBTree<K, V> { pub fn new() -> RBTree<K, V> {

View File

@ -155,7 +155,7 @@ pub enum PageTableKind {
} }
/// 物理内存地址 /// 物理内存地址
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)]
#[repr(transparent)] #[repr(transparent)]
pub struct PhysAddr(usize); pub struct PhysAddr(usize);
@ -277,7 +277,7 @@ impl core::ops::SubAssign<PhysAddr> for PhysAddr {
} }
/// 虚拟内存地址 /// 虚拟内存地址
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)]
#[repr(transparent)] #[repr(transparent)]
pub struct VirtAddr(usize); pub struct VirtAddr(usize);

View File

@ -874,6 +874,7 @@ impl<Arch: MemoryManagementArch> PageTable<Arch> {
} }
/// 页表项 /// 页表项
#[repr(C, align(8))]
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct PageEntry<Arch> { pub struct PageEntry<Arch> {
data: usize, data: usize,

View File

@ -36,7 +36,7 @@ impl Syscall {
Ok(check) Ok(check)
} }
#[allow(dead_code)]
pub fn sys_setns(_fd: i32, flags: u64) -> Result<usize, SystemError> { pub fn sys_setns(_fd: i32, flags: u64) -> Result<usize, SystemError> {
let check = check_unshare_flags(flags)?; let check = check_unshare_flags(flags)?;

View File

@ -1 +1,2 @@
pub mod kvm; pub mod kvm;
pub mod vm;

View File

@ -0,0 +1,491 @@
use core::intrinsics::unlikely;
use alloc::sync::{Arc, Weak};
use log::{debug, warn};
use system_error::SystemError;
use crate::{
arch::{
vm::{kvm_host::KvmCommonRegs, uapi::UapiKvmSegmentRegs},
MMArch,
},
driver::base::device::device_number::DeviceNumber,
filesystem::{
devfs::{devfs_register, DevFS, DeviceINode},
vfs::{
core::generate_inode_id,
file::{File, FileMode},
syscall::ModeType,
FileType, IndexNode, Metadata,
},
},
libs::spinlock::SpinLock,
mm::MemoryManagementArch,
process::ProcessManager,
syscall::user_access::{UserBufferReader, UserBufferWriter},
time::PosixTimeSpec,
virt::vm::user_api::{KvmUserspaceMemoryRegion, PosixKvmUserspaceMemoryRegion},
};
use super::kvm_host::{vcpu::LockedVirtCpu, LockedVm};
/// Inode data backing the /dev/kvm device node.
#[derive(Debug)]
pub struct KvmInode {
    /// Weak reference to the wrapping `LockedKvmInode`.
    self_ref: Weak<LockedKvmInode>,
    /// The devfs filesystem this inode belongs to.
    fs: Weak<DevFS>,
    /// INode metadata.
    metadata: Metadata,
}

/// `KvmInode` behind a spinlock — the /dev/kvm device inode type.
#[derive(Debug)]
pub struct LockedKvmInode {
    inner: SpinLock<KvmInode>,
}
impl LockedKvmInode {
    /// ioctl: create a new virtual machine.
    const KVM_CREATE_VM: u32 = 0xAE01;
    /// ioctl: query the size of the vCPU mmap region.
    const KVM_GET_VCPU_MMAP_SIZE: u32 = 0xAE04;

    /// Build the /dev/kvm inode and wire up its self-reference.
    pub fn new() -> Arc<Self> {
        let inode = KvmInode {
            self_ref: Weak::default(),
            fs: Weak::default(),
            metadata: Metadata {
                dev_id: 1,
                inode_id: generate_inode_id(),
                size: 0,
                blk_size: 0,
                blocks: 0,
                atime: PosixTimeSpec::default(),
                mtime: PosixTimeSpec::default(),
                ctime: PosixTimeSpec::default(),
                file_type: FileType::KvmDevice, // KVM character-device-like node
                mode: ModeType::S_IALLUGO,
                nlinks: 1,
                uid: 0,
                gid: 0,
                raw_dev: DeviceNumber::default(), // used as the device number
            },
        };
        let result = Arc::new(LockedKvmInode {
            inner: SpinLock::new(inode),
        });
        result.inner.lock().self_ref = Arc::downgrade(&result);
        return result;
    }

    /// Create a VM of `vm_type`, wrap it in a `KvmInstance` file, install it
    /// into the calling process's fd table, and return the new fd.
    fn create_vm(&self, vm_type: usize) -> Result<usize, SystemError> {
        let kvm = LockedVm::create(vm_type)?;
        let instance = KvmInstance::new(kvm);
        let current = ProcessManager::current_pcb();
        let file = File::new(instance, FileMode::O_RDWR)?;
        let fd = current.fd_table().write().alloc_fd(file, None)?;
        return Ok(fd as usize);
    }
}
impl DeviceINode for LockedKvmInode {
    /// Record the devfs instance this device node belongs to.
    fn set_fs(&self, fs: Weak<DevFS>) {
        self.inner.lock().fs = fs;
    }
}
impl IndexNode for LockedKvmInode {
fn open(
&self,
_data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
_mode: &FileMode,
) -> Result<(), SystemError> {
Ok(())
}
fn read_at(
&self,
_offset: usize,
_len: usize,
_buf: &mut [u8],
_data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
) -> Result<usize, system_error::SystemError> {
Err(SystemError::ENOSYS)
}
fn write_at(
&self,
_offset: usize,
_len: usize,
_buf: &[u8],
_data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
) -> Result<usize, system_error::SystemError> {
Err(SystemError::ENOSYS)
}
fn fs(&self) -> Arc<dyn crate::filesystem::vfs::FileSystem> {
self.inner.lock().fs.upgrade().unwrap()
}
fn as_any_ref(&self) -> &dyn core::any::Any {
self
}
fn list(&self) -> Result<alloc::vec::Vec<alloc::string::String>, system_error::SystemError> {
Err(SystemError::ENOSYS)
}
fn metadata(&self) -> Result<Metadata, system_error::SystemError> {
Ok(self.inner.lock().metadata.clone())
}
fn ioctl(
&self,
cmd: u32,
arg: usize,
_private_data: &crate::filesystem::vfs::FilePrivateData,
) -> Result<usize, SystemError> {
match cmd {
Self::KVM_CREATE_VM => {
let ret = self.create_vm(arg);
warn!("[KVM]: KVM_CREATE_VM {ret:?}");
return ret;
}
Self::KVM_GET_VCPU_MMAP_SIZE => {
if arg != 0 {
return Err(SystemError::EINVAL);
}
debug!("[KVM] KVM_GET_VCPU_MMAP_SIZE");
return Ok(MMArch::PAGE_SIZE);
}
_ => {
// TODO: arch_ioctl
warn!("[KVM]: unknown iooctl cmd {cmd:x}");
}
}
Ok(0)
}
fn close(
&self,
_data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
) -> Result<(), SystemError> {
Ok(())
}
}
/// Anonymous-file inode representing one created virtual machine; returned
/// as an fd by the KVM_CREATE_VM ioctl on /dev/kvm.
#[derive(Debug)]
pub struct KvmInstance {
    /// The VM this fd controls.
    kvm: Arc<LockedVm>,
    /// INode metadata.
    metadata: Metadata,
}

impl KvmInstance {
    /// ioctl: create a vCPU in this VM.
    const KVM_CREATE_VCPU: u32 = 0xAE41;
    /// ioctl: register a userspace memory region with this VM.
    const KVM_SET_USER_MEMORY_REGION: u32 = 0x4020AE46;

    /// Wrap a created VM in an inode suitable for installing into an fd table.
    pub fn new(vm: Arc<LockedVm>) -> Arc<Self> {
        Arc::new(Self {
            kvm: vm,
            metadata: Metadata {
                dev_id: 1,
                inode_id: generate_inode_id(),
                size: 0,
                blk_size: 0,
                blocks: 0,
                atime: PosixTimeSpec::default(),
                mtime: PosixTimeSpec::default(),
                ctime: PosixTimeSpec::default(),
                file_type: FileType::KvmDevice,
                mode: ModeType::S_IALLUGO,
                nlinks: 1,
                uid: 0,
                gid: 0,
                raw_dev: DeviceNumber::default(), // used as the device number
            },
        })
    }
}
impl IndexNode for KvmInstance {
    fn open(
        &self,
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
        _mode: &crate::filesystem::vfs::file::FileMode,
    ) -> Result<(), SystemError> {
        Ok(())
    }

    /// VM-level KVM ioctls (vCPU creation, memory-region setup).
    ///
    /// NOTE(review): unhandled commands currently fall through to `todo!()`
    /// and panic; they should likely return an error (e.g. ENOSYS) instead.
    #[inline(never)]
    fn ioctl(
        &self,
        cmd: u32,
        arg: usize,
        _private_data: &crate::filesystem::vfs::FilePrivateData,
    ) -> Result<usize, SystemError> {
        debug!("kvm instance ioctl cmd {cmd:x}");
        match cmd {
            Self::KVM_CREATE_VCPU => {
                let ret = self.kvm.lock().create_vcpu(arg);
                debug!("[KVM] create vcpu fd {ret:?}");
                return ret;
            }
            Self::KVM_SET_USER_MEMORY_REGION => {
                debug!("[KVM-INSTANCE] KVM_SET_USER_MEMORY_REGION");
                // `arg` is a userspace pointer to a PosixKvmUserspaceMemoryRegion.
                let user_reader = UserBufferReader::new(
                    arg as *const PosixKvmUserspaceMemoryRegion,
                    core::mem::size_of::<PosixKvmUserspaceMemoryRegion>(),
                    true,
                )?;

                let region = user_reader.read_one_from_user::<PosixKvmUserspaceMemoryRegion>(0)?;

                self.kvm
                    .lock()
                    .set_memory_region(KvmUserspaceMemoryRegion::from_posix(region)?)?;
                return Ok(0);
            }
            _ => {
                // arch_ioctl
            }
        }
        todo!()
    }

    /// Reading a VM fd is not supported yet.
    fn read_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &mut [u8],
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
    ) -> Result<usize, SystemError> {
        todo!()
    }

    /// Writing a VM fd is not supported yet.
    fn write_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &[u8],
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
    ) -> Result<usize, SystemError> {
        todo!()
    }

    fn fs(&self) -> Arc<dyn crate::filesystem::vfs::FileSystem> {
        todo!()
    }

    fn as_any_ref(&self) -> &dyn core::any::Any {
        todo!()
    }

    fn list(&self) -> Result<alloc::vec::Vec<alloc::string::String>, SystemError> {
        todo!()
    }

    fn metadata(&self) -> Result<Metadata, SystemError> {
        Ok(self.metadata.clone())
    }

    fn close(
        &self,
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
    ) -> Result<(), SystemError> {
        Ok(())
    }
}
/// Inode backing a vcpu file descriptor (returned by `KVM_CREATE_VCPU`).
#[derive(Debug)]
pub struct KvmVcpuDev {
    // The vcpu this fd controls.
    vcpu: Arc<LockedVirtCpu>,
    /// INode metadata
    metadata: Metadata,
}
impl KvmVcpuDev {
    // vcpu-level ioctl numbers, matching the Linux KVM UAPI values.
    const KVM_RUN: u32 = 0xAE80;
    const KVM_GET_REGS: u32 = 0x8090AE81;
    const KVM_SET_REGS: u32 = 0x4090AE82;
    const KVM_GET_SREGS: u32 = 0x8138AE83;
    const KVM_SET_SREGS: u32 = 0x4138AE84;

    /// Wrap `vcpu` in a new anonymous inode so it can be exposed as a file
    /// descriptor to userspace.
    pub fn new(vcpu: Arc<LockedVirtCpu>) -> Arc<Self> {
        Arc::new(Self {
            vcpu,
            metadata: Metadata {
                dev_id: 1,
                inode_id: generate_inode_id(),
                size: 0,
                blk_size: 0,
                blocks: 0,
                atime: PosixTimeSpec::default(),
                mtime: PosixTimeSpec::default(),
                ctime: PosixTimeSpec::default(),
                file_type: FileType::KvmDevice, // directory / block / char device kind
                mode: ModeType::S_IALLUGO,
                nlinks: 1,
                uid: 0,
                gid: 0,
                raw_dev: DeviceNumber::default(), // used as the device number
            },
        })
    }
}
impl IndexNode for KvmVcpuDev {
    fn open(
        &self,
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
        _mode: &FileMode,
    ) -> Result<(), SystemError> {
        Ok(())
    }

    fn close(
        &self,
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
    ) -> Result<(), SystemError> {
        Ok(())
    }

    /// Dispatch vcpu-level ioctls: KVM_RUN enters the guest; the GET/SET
    /// pairs copy general-purpose / segment register state to and from
    /// userspace buffers pointed to by `arg`.
    fn ioctl(
        &self,
        cmd: u32,
        arg: usize,
        _private_data: &crate::filesystem::vfs::FilePrivateData,
    ) -> Result<usize, SystemError> {
        match cmd {
            Self::KVM_RUN => {
                // KVM_RUN takes no argument.
                if arg != 0 {
                    return Err(SystemError::EINVAL);
                }
                let mut vcpu = self.vcpu.lock();
                // Record the current process as the vcpu's owner; the vcpu
                // lock is held across run() for the whole guest entry.
                let oldpid = vcpu.pid;
                if unlikely(oldpid != Some(ProcessManager::current_pid())) {
                    vcpu.pid = Some(ProcessManager::current_pid());
                }
                return vcpu.run();
            }
            Self::KVM_GET_REGS => {
                // Copy the guest's general-purpose registers out to `arg`.
                let kvm_regs = self.vcpu.lock().get_regs();
                let mut user_writer = UserBufferWriter::new(
                    arg as *const KvmCommonRegs as *mut KvmCommonRegs,
                    core::mem::size_of::<KvmCommonRegs>(),
                    true,
                )?;
                user_writer.copy_one_to_user(&kvm_regs, 0)?;
                return Ok(0);
            }
            Self::KVM_SET_REGS => {
                // Read a KvmCommonRegs from `arg` and install it in the vcpu.
                let user_reader = UserBufferReader::new(
                    arg as *const KvmCommonRegs,
                    core::mem::size_of::<KvmCommonRegs>(),
                    true,
                )?;
                let regs = user_reader.read_one_from_user::<KvmCommonRegs>(0)?;
                self.vcpu.lock().set_regs(regs)?;
                return Ok(0);
            }
            Self::KVM_GET_SREGS => {
                // Copy the guest's segment/system registers out to `arg`.
                let sregs = self.vcpu.lock().get_segment_regs();
                let mut writer = UserBufferWriter::new(
                    arg as *const UapiKvmSegmentRegs as *mut UapiKvmSegmentRegs,
                    core::mem::size_of::<UapiKvmSegmentRegs>(),
                    true,
                )?;
                writer.copy_one_to_user(&sregs, 0)?;
                return Ok(0);
            }
            Self::KVM_SET_SREGS => {
                // Read segment/system registers from `arg` and install them.
                let user_reader = UserBufferReader::new(
                    arg as *const UapiKvmSegmentRegs,
                    core::mem::size_of::<UapiKvmSegmentRegs>(),
                    true,
                )?;
                let mut sreg = UapiKvmSegmentRegs::default();
                user_reader.copy_one_from_user(&mut sreg, 0)?;
                if let Ok(_res) = self.vcpu.lock().set_segment_regs(&mut sreg) {
                    return Ok(0);
                } else {
                    debug!("set segment regs failed");
                    return Err(SystemError::EINVAL);
                }
            }
            _ => {
                // arch ioctl
                // NOTE(review): unknown commands return Ok(0); Linux returns
                // an error here — confirm this is intentional.
                warn!("[KVM-VCPU] unknown ioctl cmd {cmd:x}");
            }
        }
        Ok(0)
    }

    fn metadata(&self) -> Result<Metadata, SystemError> {
        Ok(self.metadata.clone())
    }

    // The stubs below are unimplemented and panic via todo!().
    fn read_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &mut [u8],
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
    ) -> Result<usize, SystemError> {
        todo!()
    }

    fn write_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &[u8],
        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
    ) -> Result<usize, SystemError> {
        todo!()
    }

    fn fs(&self) -> Arc<dyn crate::filesystem::vfs::FileSystem> {
        todo!()
    }

    fn as_any_ref(&self) -> &dyn core::any::Any {
        todo!()
    }

    fn list(&self) -> Result<alloc::vec::Vec<alloc::string::String>, SystemError> {
        todo!()
    }
}
/// Register the global KVM control device as `/dev/kvm`.
pub fn kvm_init() -> Result<(), SystemError> {
    devfs_register("kvm", LockedKvmInode::new())?;
    Ok(())
}

View File

@ -0,0 +1,714 @@
use alloc::{
sync::{Arc, Weak},
vec::Vec,
};
use bitmap::AllocBitmap;
use hashbrown::HashMap;
use log::debug;
use system_error::SystemError;
use crate::{
arch::{vm::mmu::kvm_mmu::PAGE_SIZE, MMArch},
libs::{
rbtree::RBTree,
rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard},
spinlock::{SpinLock, SpinLockGuard},
},
mm::{kernel_mapper::KernelMapper, page::EntryFlags, MemoryManagementArch, VirtAddr},
virt::{
kvm::host_mem::PAGE_SHIFT,
vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion},
},
};
use super::{LockedVm, Vm};
/// Maximum slot id usable from userspace.
pub const KVM_USER_MEM_SLOTS: u16 = u16::MAX;
/// Slots reserved for internal (kernel-created) memory regions.
pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3;
/// Number of slot ids actually available to userspace.
pub const KVM_MEM_SLOTS_NUM: u16 = KVM_USER_MEM_SLOTS - KVM_INTERNAL_MEM_SLOTS;
/// Upper bound on the number of pages in a single memory slot.
pub const KVM_MEM_MAX_NR_PAGES: usize = (1 << 31) - 1;
// pub const APIC_ACCESS_PAGE_PRIVATE_MEMSLOT: u16 = KVM_MEM_SLOTS_NUM + 1;
/// For an ordinary page frame number (PFN) the top 12 bits should be zero.
/// We can therefore use bits 62..52 as a mask to encode erroneous PFNs,
/// and bit 63 to mark a "no slot" PFN.
// const KVM_PFN_ERR_MASK: u64 = 0x7ff << 52; //0x7FF0000000000000
// const KVM_PFN_ERR_NOSLOT_MASK: u64 = 0xfff << 52; //0xFFF0000000000000
// const KVM_PFN_NOSLOT: u64 = 1 << 63; //0x8000000000000000
// const KVM_PFN_ERR_FAULT: u64 = KVM_PFN_ERR_MASK;
// const KVM_PFN_ERR_HWPOISON: u64 = KVM_PFN_ERR_MASK + 1;
// const KVM_PFN_ERR_RO_FAULT: u64 = KVM_PFN_ERR_MASK + 2;
// const KVM_PFN_ERR_SIGPENDING: u64 = KVM_PFN_ERR_MASK + 3;
/// Pre-allocated object cache for MMU page-table allocations, modeled on
/// Linux's `kvm_mmu_memory_cache`. Currently a placeholder: nothing fills it.
#[derive(Debug, Default)]
#[allow(dead_code)]
pub struct KvmMmuMemoryCache {
    // Allocation flags for zeroed allocations (unused yet).
    gfp_zero: u32,
    // Custom allocation flags (unused yet).
    gfp_custom: u32,
    // Total capacity of the cache.
    capacity: usize,
    // Number of objects currently cached.
    nobjs: usize,
    // Backing storage for the cached objects.
    objects: Option<Vec<u8>>,
}
impl KvmMmuMemoryCache {
    /// Top up the cache to at least `_min` objects (cf. Linux's
    /// `kvm_mmu_topup_memory_cache`). Currently a stub that always succeeds;
    /// the intended logic is kept below as commented-out scaffolding.
    // NOTE(review): "totup" in the name is a typo for "topup"; renaming would
    // change the public interface, so it is only flagged here.
    #[allow(dead_code)]
    pub fn kvm_mmu_totup_memory_cache(
        &mut self,
        _capacity: usize,
        _min: usize,
    ) -> Result<(), SystemError> {
        // let gfp = if self.gfp_custom != 0 {
        //     self.gfp_custom
        // } else {
        //     todo!();
        // };
        // if self.nobjs >= min {
        //     return Ok(());
        // }
        // if unlikely(self.objects.is_none()) {
        //     if self.capacity == 0 {
        //         return Err(SystemError::EIO);
        //     }
        //     // self.objects = Some(Box::new)
        // }
        Ok(())
    }
}
/// An inclusive host-virtual-address range `[start, last]`, used as the key
/// of the hva interval tree in `KvmMemSlotSet`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default)]
pub struct AddrRange {
    pub start: VirtAddr,
    pub last: VirtAddr,
}
/// One generation of a VM's memory-slot bookkeeping. Each address space has
/// two of these (an active and an inactive copy) so updates can be staged on
/// the inactive one and then swapped in.
#[derive(Debug, Default)]
pub struct KvmMemSlotSet {
    /// Most recently used memory slot (lookup fast path).
    pub last_use: Option<Arc<LockedKvmMemSlot>>,
    /// Maps host virtual address ranges (hva) to memory slots.
    hva_tree: RBTree<AddrRange, Arc<LockedKvmMemSlot>>,
    /// Maps guest frame numbers (gfn) to memory slots.
    pub gfn_tree: RBTree<u64, Arc<LockedKvmMemSlot>>,
    /// Maps slot ids to the corresponding memory slot.
    slots: HashMap<u16, Arc<LockedKvmMemSlot>>,
    // Which of the two copies this is (0 or 1).
    pub node_idx: usize,
    // Generation counter of this slot set.
    pub generation: u64,
}
impl KvmMemSlotSet {
pub fn get_slot(&self, id: u16) -> Option<Arc<LockedKvmMemSlot>> {
self.slots.get(&id).cloned()
}
}
/// A `KvmMemSlot` behind a read-write lock, shared between the slot sets
/// via `Arc`.
#[derive(Debug)]
pub struct LockedKvmMemSlot {
    inner: RwLock<KvmMemSlot>,
}
impl LockedKvmMemSlot {
    /// Allocate a new, default-initialized slot.
    pub fn new() -> Arc<Self> {
        Arc::new(Self {
            inner: RwLock::new(KvmMemSlot::default()),
        })
    }

    #[inline]
    pub fn read(&self) -> RwLockReadGuard<KvmMemSlot> {
        self.inner.read()
    }

    #[inline]
    pub fn write(&self) -> RwLockWriteGuard<KvmMemSlot> {
        self.inner.write()
    }

    /// Copy all slot fields except `hva_node_key` from `other` into `self`.
    /// Takes `self`'s write lock and then `other`'s read lock, in that order;
    /// callers must not hold either lock already.
    #[inline]
    pub fn copy_from(&self, other: &Arc<LockedKvmMemSlot>) {
        let mut guard = self.write();
        let other = other.read();
        guard.base_gfn = other.base_gfn;
        guard.npages = other.npages;
        guard.dirty_bitmap = other.dirty_bitmap.clone();
        guard.arch = other.arch;
        guard.userspace_addr = other.userspace_addr;
        guard.flags = other.flags;
        guard.id = other.id;
        guard.as_id = other.as_id;
    }
}
/// One guest memory slot: a contiguous gfn range backed by host user memory.
#[derive(Debug, Default)]
pub struct KvmMemSlot {
    /// First guest frame number covered by this slot.
    pub base_gfn: u64,
    /// Number of pages in the slot.
    pub npages: usize,
    /// Dirty-page bitmap (present only with LOG_DIRTY_PAGES).
    dirty_bitmap: Option<AllocBitmap>,
    /// Architecture-specific data (none yet).
    arch: (),
    // Host virtual address the slot is backed by.
    userspace_addr: VirtAddr,
    // Behavior flags (read-only, dirty logging, invalid marker).
    flags: UserMemRegionFlag,
    // Slot id within its address space.
    id: u16,
    // Address-space id this slot belongs to.
    as_id: u16,
    // One interval-tree key per slot-set copy (index = node_idx).
    hva_node_key: [AddrRange; 2],
}
#[allow(dead_code)]
impl KvmMemSlot {
    /// Whether the backing host address is aligned to `align` bytes.
    pub fn check_aligned_addr(&self, align: usize) -> bool {
        self.userspace_addr.data() % align == 0
    }

    pub fn get_flags(&self) -> UserMemRegionFlag {
        self.flags
    }

    pub fn get_id(&self) -> u16 {
        self.id
    }

    // A slot is visible when it is a userspace slot and not marked invalid.
    pub fn is_visible(&self) -> bool {
        self.id < KVM_USER_MEM_SLOTS
            && (self.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()) == 0
    }
}
/// A `KvmMemSlotSet` behind a spinlock, shared via `Arc` between the VM's
/// active/inactive slot-set slots.
#[derive(Debug)]
pub struct LockedVmMemSlotSet {
    inner: SpinLock<KvmMemSlotSet>,
}

impl LockedVmMemSlotSet {
    pub fn new(slots: KvmMemSlotSet) -> Arc<Self> {
        Arc::new(Self {
            inner: SpinLock::new(slots),
        })
    }

    pub fn lock(&self) -> SpinLockGuard<KvmMemSlotSet> {
        self.inner.lock()
    }
}
/// Cache for a single gfn -> hva (and pfn) translation, modeled on Linux's
/// `gfn_to_pfn_cache`. Currently only constructed, never refreshed.
#[derive(Debug, Default)]
#[allow(dead_code)]
pub struct GfnToHvaCache {
    // Slot-set generation the cached translation was made against.
    generation: u64,
    /// Guest physical address.
    gpa: u64,
    /// User-space host virtual address.
    uhva: Option<u64>,
    /// Kernel-space host virtual address.
    khva: u64,
    /// The memory slot the translation came from.
    memslot: Option<Arc<LockedKvmMemSlot>>,
    /// The backing host page frame number.
    pfn: Option<u64>,
    /// Who consumes the cached pfn (guest, host, or both).
    usage: PfnCacheUsage,
    /// Whether the cache entry is active.
    active: bool,
    /// Whether the cached translation is still valid.
    valid: bool,
    // Back-reference to the owning VM.
    vm: Option<Weak<LockedVm>>,
}
impl GfnToHvaCache {
    /// Build a zero-initialized cache bound to `vm` and tagged with `usage`.
    pub fn init(vm: Weak<LockedVm>, usage: PfnCacheUsage) -> Self {
        let mut cache = Self::default();
        cache.usage = usage;
        cache.vm = Some(vm);
        cache
    }
}
bitflags! {
    /// Who consumes the pfn held by a `GfnToHvaCache`.
    #[derive(Default)]
    pub struct PfnCacheUsage: u8 {
        const GUEST_USES_PFN = 1 << 0;
        const HOST_USES_PFN = 1 << 1;
        const GUEST_AND_HOST_USES_PFN = Self::GUEST_USES_PFN.bits | Self::HOST_USES_PFN.bits;
    }

    /// Flags of a user memory region (subset of Linux's KVM_MEM_* flags).
    pub struct UserMemRegionFlag: u32 {
        /// Enable dirty-page logging for this region.
        const LOG_DIRTY_PAGES = 1 << 0;
        /// Make the region read-only.
        const READONLY = 1 << 1;
        /// Internal marker: the slot is being invalidated.
        const KVM_MEMSLOT_INVALID = 1 << 16;
    }
}

impl Default for UserMemRegionFlag {
    fn default() -> Self {
        Self::empty()
    }
}
/// Kind of change a `KVM_SET_USER_MEMORY_REGION` request performs on a slot.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum KvmMemoryChangeMode {
    Create,
    Delete,
    Move,
    FlagsOnly,
}
impl Vm {
    /// Handle `KVM_SET_USER_MEMORY_REGION`: create, delete, move or change
    /// the flags of a guest memory slot (cf. Linux's
    /// `__kvm_set_memory_region`).
    ///
    /// # Errors
    /// `EINVAL` for malformed requests (unaligned/overflowing sizes, bad slot
    /// ids, illegal flag changes), `EEXIST` when creating or moving onto a
    /// gfn that is already mapped, `EIO` on inconsistent page accounting.
    #[inline(never)]
    pub fn set_memory_region(&mut self, mem: KvmUserspaceMemoryRegion) -> Result<(), SystemError> {
        if mem.slot >= u16::MAX as u32 {
            return Err(SystemError::EINVAL);
        }

        // High 16 bits select the address space, low 16 bits the slot id.
        let as_id = mem.slot >> 16;
        let id = mem.slot as u16;

        // The size must be page aligned and must round-trip through `usize`
        // (the latter only matters on 32-bit targets).
        //
        // Bugfix: alignment must be tested with `& (PAGE_SIZE - 1)`. The
        // previous `& PAGE_SIZE` tested one single bit, rejecting valid sizes
        // such as exactly one page (0x1000) and accepting misaligned ones.
        if (mem.memory_size as usize & (MMArch::PAGE_SIZE - 1) != 0)
            || mem.memory_size != mem.memory_size as usize as u64
        {
            return Err(SystemError::EINVAL);
        }

        if !mem.guest_phys_addr.check_aligned(MMArch::PAGE_SIZE) {
            return Err(SystemError::EINVAL);
        }

        if !mem.userspace_addr.check_aligned(MMArch::PAGE_SIZE) {
            // TODO: additionally verify that the whole range
            // userspace_addr .. userspace_addr + memory_size is valid.
            return Err(SystemError::EINVAL);
        }

        if as_id >= KVM_ADDRESS_SPACE_NUM as u32 || id >= KVM_MEM_SLOTS_NUM {
            return Err(SystemError::EINVAL);
        }

        if (mem.memory_size >> MMArch::PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES as u64 {
            return Err(SystemError::EINVAL);
        }

        let slots = self.memslot_set(as_id as usize).clone();

        let slots_guard = slots.lock();
        let old = slots_guard.get_slot(id);

        // A zero size means "delete this slot".
        if mem.memory_size == 0 {
            if let Some(old) = &old {
                let old_npages = old.read().npages;
                if old_npages == 0 {
                    return Err(SystemError::EINVAL);
                }

                if self.nr_memslot_pages < old_npages {
                    return Err(SystemError::EIO);
                }
                // Drop the lock first: set_memslot re-locks the slot sets.
                drop(slots_guard);
                return self.set_memslot(Some(old), None, KvmMemoryChangeMode::Delete);
            } else {
                return Err(SystemError::EINVAL);
            }
        }

        let base_gfn = (mem.guest_phys_addr.data() >> MMArch::PAGE_SHIFT) as u64;
        let npages = mem.memory_size >> MMArch::PAGE_SHIFT;

        // Classify the request relative to the existing slot (if any).
        let change;

        if let Some(old) = &old {
            let old_guard = old.read();
            if old_guard.npages == 0 {
                change = KvmMemoryChangeMode::Create;

                // Guard against the page counter overflowing.
                if let Some(new_pages) = self.nr_memslot_pages.checked_add(npages as usize) {
                    if new_pages < self.nr_memslot_pages {
                        return Err(SystemError::EINVAL);
                    }
                } else {
                    return Err(SystemError::EINVAL);
                }
            } else {
                // A populated slot may only move or change flags: size, host
                // address and the READONLY flag must stay fixed.
                if mem.userspace_addr != old_guard.userspace_addr
                    || npages != old_guard.npages as u64
                    || (mem.flags ^ old_guard.flags).contains(UserMemRegionFlag::READONLY)
                {
                    return Err(SystemError::EINVAL);
                }

                if base_gfn != old_guard.base_gfn {
                    change = KvmMemoryChangeMode::Move;
                } else if mem.flags != old_guard.flags {
                    change = KvmMemoryChangeMode::FlagsOnly;
                } else {
                    // Nothing changed.
                    return Ok(());
                }
            }
        } else {
            change = KvmMemoryChangeMode::Create;

            // Guard against the page counter overflowing.
            if let Some(new_pages) = self.nr_memslot_pages.checked_add(npages as usize) {
                if new_pages < self.nr_memslot_pages {
                    return Err(SystemError::EINVAL);
                }
            } else {
                return Err(SystemError::EINVAL);
            }
        };

        // The target gfn must not already belong to another slot.
        if (change == KvmMemoryChangeMode::Create || change == KvmMemoryChangeMode::Move)
            && slots_guard.gfn_tree.contains_key(&base_gfn)
        {
            return Err(SystemError::EEXIST);
        }

        let new = LockedKvmMemSlot::new();
        let mut new_guard = new.write();

        new_guard.as_id = as_id as u16;
        new_guard.id = id;
        new_guard.base_gfn = base_gfn;
        new_guard.npages = npages as usize;
        new_guard.flags = mem.flags;
        new_guard.userspace_addr = mem.userspace_addr;

        drop(new_guard);
        drop(slots_guard);

        return self.set_memslot(old.as_ref(), Some(&new), change);
    }

    /// Return the currently-active slot set for address space `id`.
    #[allow(clippy::modulo_one)]
    #[inline]
    fn memslot_set(&self, id: usize) -> &Arc<LockedVmMemSlotSet> {
        // Clamp to avoid out-of-bounds indexing.
        let id = id % KVM_ADDRESS_SPACE_NUM;
        &self.memslots[id]
    }

    /// Apply a classified slot change: invalidate the old slot if needed,
    /// prepare the new one, then commit via the change-specific helper.
    #[inline(never)]
    fn set_memslot(
        &mut self,
        old: Option<&Arc<LockedKvmMemSlot>>,
        new: Option<&Arc<LockedKvmMemSlot>>,
        change: KvmMemoryChangeMode,
    ) -> Result<(), SystemError> {
        let invalid_slot = LockedKvmMemSlot::new();
        if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move {
            self.invalidate_memslot(old.unwrap(), &invalid_slot)
        }

        match self.prepare_memory_region(old, new, change) {
            Ok(_) => {}
            Err(e) => {
                // Roll back the invalidation performed above.
                if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move {
                    self.active_memslot(Some(&invalid_slot), old)
                }
                return Err(e);
            }
        }

        match change {
            KvmMemoryChangeMode::Create => self.create_memslot(new),
            KvmMemoryChangeMode::Delete => self.delete_memslot(old, &invalid_slot),
            KvmMemoryChangeMode::Move => self.move_memslot(old, new, &invalid_slot),
            KvmMemoryChangeMode::FlagsOnly => self.update_flags_memslot(old, new),
        }

        // TODO: kvm_commit_memory_region(kvm, old, new, change);
        Ok(())
    }

    // Install a brand-new slot into both slot-set copies.
    fn create_memslot(&mut self, new: Option<&Arc<LockedKvmMemSlot>>) {
        self.replace_memslot(None, new);
        self.active_memslot(None, new);
    }

    // Remove a slot from both slot-set copies.
    fn delete_memslot(
        &mut self,
        old: Option<&Arc<LockedKvmMemSlot>>,
        invalid_slot: &Arc<LockedKvmMemSlot>,
    ) {
        self.replace_memslot(old, None);
        self.active_memslot(Some(invalid_slot), None);
    }

    // Replace a slot that moved to a different gfn range.
    fn move_memslot(
        &mut self,
        old: Option<&Arc<LockedKvmMemSlot>>,
        new: Option<&Arc<LockedKvmMemSlot>>,
        invalid_slot: &Arc<LockedKvmMemSlot>,
    ) {
        self.replace_memslot(old, new);
        self.active_memslot(Some(invalid_slot), new);
    }

    // Replace a slot whose flags changed (same gfn range and size).
    fn update_flags_memslot(
        &mut self,
        old: Option<&Arc<LockedKvmMemSlot>>,
        new: Option<&Arc<LockedKvmMemSlot>>,
    ) {
        self.replace_memslot(old, new);
        self.active_memslot(old, new);
    }

    /// Set up the dirty bitmap of the new slot (reusing the old one when
    /// possible) and delegate to the architecture hook.
    fn prepare_memory_region(
        &self,
        old: Option<&Arc<LockedKvmMemSlot>>,
        new: Option<&Arc<LockedKvmMemSlot>>,
        change: KvmMemoryChangeMode,
    ) -> Result<(), SystemError> {
        if change != KvmMemoryChangeMode::Delete {
            let new = new.unwrap();
            let mut new_guard = new.write();
            if !new_guard.flags.contains(UserMemRegionFlag::LOG_DIRTY_PAGES) {
                new_guard.dirty_bitmap = None;
            } else if old.is_some() {
                let old_guard = old.unwrap().read();
                if old_guard.dirty_bitmap.is_some() {
                    // Inherit the old slot's bitmap.
                    new_guard.dirty_bitmap = old_guard.dirty_bitmap.clone();
                } else {
                    // Two bits per page, as in Linux.
                    new_guard.dirty_bitmap = Some(AllocBitmap::new(new_guard.npages * 2));
                }
            }
        }

        return self.arch_prepare_memory_region(old, new, change);
    }

    /// Stage `invalid_slot` as an INVALID copy of `old` and make it visible
    /// by swapping the active slot set.
    fn invalidate_memslot(
        &mut self,
        old: &Arc<LockedKvmMemSlot>,
        invalid_slot: &Arc<LockedKvmMemSlot>,
    ) {
        invalid_slot.copy_from(old);

        let mut old_guard = old.write();
        let mut invalid_slot_guard = invalid_slot.write();
        invalid_slot_guard
            .flags
            .insert(UserMemRegionFlag::KVM_MEMSLOT_INVALID);
        self.swap_active_memslots(old_guard.as_id as usize);

        old_guard.arch = invalid_slot_guard.arch;
    }

    /// Swap the active slot set for the affected address space, then apply
    /// the replacement to the (new) inactive copy as well.
    #[inline(never)]
    fn active_memslot(
        &mut self,
        old: Option<&Arc<LockedKvmMemSlot>>,
        new: Option<&Arc<LockedKvmMemSlot>>,
    ) {
        let as_id = if let Some(slot) = old.or(new) {
            slot.read().as_id
        } else {
            0
        };

        self.swap_active_memslots(as_id as usize);

        self.replace_memslot(old, new);
    }

    /// Replace `old` with `new` in the *inactive* slot set's hva and gfn
    /// trees. `new` may only be `None` when `old` is `Some` (pure deletion).
    #[inline(never)]
    fn replace_memslot(
        &self,
        old: Option<&Arc<LockedKvmMemSlot>>,
        new: Option<&Arc<LockedKvmMemSlot>>,
    ) {
        let as_id = if let Some(slot) = old.or(new) {
            slot.read().as_id
        } else {
            0
        };

        let slot_set = self.get_inactive_memslot_set(as_id as usize);

        let mut slots_guard = slot_set.lock();
        let idx = slots_guard.node_idx;
        if let Some(old) = old {
            slots_guard.hva_tree.remove(&old.read().hva_node_key[idx]);

            if let Some(last) = &slots_guard.last_use {
                if Arc::ptr_eq(last, old) {
                    slots_guard.last_use = new.cloned();
                }
            }

            if new.is_none() {
                slots_guard.gfn_tree.remove(&old.read().base_gfn);
                return;
            }
        }

        let new = new.unwrap();
        let mut new_guard = new.write();
        // Record this copy's interval-tree key covering the slot's hva range.
        new_guard.hva_node_key[idx].start = new_guard.userspace_addr;
        new_guard.hva_node_key[idx].last =
            new_guard.userspace_addr + VirtAddr::new((new_guard.npages << MMArch::PAGE_SHIFT) - 1);

        slots_guard
            .hva_tree
            .insert(new_guard.hva_node_key[idx], new.clone());

        if let Some(old) = old {
            slots_guard.gfn_tree.remove(&old.read().base_gfn);
        }

        slots_guard.gfn_tree.insert(new_guard.base_gfn, new.clone());
    }

    // Return the slot-set copy that is currently *not* active for `as_id`.
    fn get_inactive_memslot_set(&self, as_id: usize) -> Arc<LockedVmMemSlotSet> {
        let active = self.memslot_set(as_id);

        let inactive_idx = active.lock().node_idx ^ 1;
        return self.memslots_set[as_id][inactive_idx].clone();
    }

    // Promote the inactive slot-set copy of `as_id` to active.
    fn swap_active_memslots(&mut self, as_id: usize) {
        self.memslots[as_id] = self.get_inactive_memslot_set(as_id);
    }
}
/// Translate a guest frame number (GFN) to a host user-space virtual address
/// (HVA), validating the memory slot along the way.
///
/// # Parameters
/// - `slot`: the memory slot to translate against (may be `None`).
/// - `gfn`: the guest frame number to translate.
/// - `nr_pages`: if provided, receives the number of pages left in the slot
///   starting at `gfn`.
/// - `write`: whether the access is a write.
///
/// # Errors
/// `SystemError::KVM_HVA_ERR_BAD` when the slot is absent, marked invalid,
/// or read-only while a write was requested.
pub fn __gfn_to_hva_many(
    slot: &Option<&KvmMemSlot>,
    gfn: u64,
    nr_pages: Option<&mut u64>,
    write: bool,
) -> Result<u64, SystemError> {
    debug!("__gfn_to_hva_many");

    // Reject a missing slot up front.
    let slot = match slot {
        Some(s) => *s,
        None => return Err(SystemError::KVM_HVA_ERR_BAD),
    };

    // An invalidated slot is never usable; a read-only slot rejects writes.
    let invalid = slot.flags.contains(UserMemRegionFlag::KVM_MEMSLOT_INVALID);
    let readonly_write = write && slot.flags.contains(UserMemRegionFlag::READONLY);
    if invalid || readonly_write {
        return Err(SystemError::KVM_HVA_ERR_BAD);
    }

    // Report how many pages remain in the slot from `gfn` onwards.
    if let Some(out) = nr_pages {
        *out = slot.npages as u64 - (gfn - slot.base_gfn);
    }

    Ok(__gfn_to_hva_memslot(slot, gfn))
}
/// Translate a guest frame number (GFN) to a host user-space virtual
/// address (HVA), without any validity checks.
///
/// # Parameters
/// - `slot`: the memory slot containing `gfn`.
/// - `gfn`: the guest frame number to translate.
///
/// # Returns
/// The host virtual address backing `gfn`. The caller must guarantee that
/// `gfn >= slot.base_gfn`; otherwise the subtraction wraps.
fn __gfn_to_hva_memslot(slot: &KvmMemSlot, gfn: u64) -> u64 {
    return slot.userspace_addr.data() as u64 + (gfn - slot.base_gfn) * PAGE_SIZE;
}
/// Translate a guest frame number (GFN) to a host page frame number (PFN),
/// validating the memory slot on the way.
///
/// # Parameters
/// - `slot`: the memory slot to translate against.
/// - `gfn`: the guest frame number to translate.
/// - `atomic_or_async`: `(atomic, is_async)` — the two are mutually
///   exclusive (asserted in `hva_to_pfn`).
/// - `interruptible`: whether the operation may be interrupted (unused yet).
/// - `write`: whether the access is a write.
/// - `writable`: set to `false` when the slot is read-only.
/// - `hva`: receives the intermediate host virtual address.
///
/// # Errors
/// Propagates `KVM_HVA_ERR_BAD` from the gfn->hva step and any error from
/// `hva_to_pfn`.
pub fn __gfn_to_pfn_memslot(
    slot: Option<&KvmMemSlot>,
    gfn: u64,
    atomic_or_async: (bool, &mut bool),
    interruptible: bool,
    write: bool,
    writable: &mut bool,
    hva: &mut u64,
) -> Result<u64, SystemError> {
    // A `None` slot makes this return an error, so the unwrap below is safe.
    let addr = __gfn_to_hva_many(&slot, gfn, None, write)?;
    *hva = addr;

    // TODO: check whether the address is an error value.

    // Read-only slots can never yield a writable mapping.
    if slot.unwrap().flags.bits() & UserMemRegionFlag::READONLY.bits() != 0 {
        *writable = false;
    }

    let pfn = hva_to_pfn(addr, atomic_or_async, interruptible, write, writable)?;
    return Ok(pfn);
}
/// Translate a host virtual address (HVA) to a host page frame number (PFN),
/// mapping a fresh page when the address is not yet mapped.
///
/// # Parameters
/// - `addr`: the host virtual address.
/// - `atomic_or_async`: `(atomic, is_async)` — mutually exclusive.
/// - `_interruptible`, `_write_fault`, `_writable`: currently unused.
///
/// # Returns
/// The page frame number backing `addr`.
// Correctness still to be verified (original author's note).
pub fn hva_to_pfn(
    addr: u64,
    atomic_or_async: (bool, &mut bool),
    _interruptible: bool,
    _write_fault: bool,
    _writable: &mut bool,
) -> Result<u64, SystemError> {
    // We may operate atomically or asynchronously, but not both at once.
    assert!(
        !(atomic_or_async.0 && *atomic_or_async.1),
        "Cannot be both atomic and async"
    );

    debug!("hva_to_pfn");

    // let hpa = MMArch::virt_2_phys(VirtAddr::new(addr)).unwrap().data() as u64;
    let hva = VirtAddr::new(addr as usize);
    let mut mapper = KernelMapper::lock();
    let mapper = mapper.as_mut().unwrap();
    // Fast path: the address is already mapped in the kernel page tables.
    if let Some((hpa, _)) = mapper.translate(hva) {
        return Ok(hpa.data() as u64 >> PAGE_SHIFT);
    }
    debug!("hva_to_pfn NOT FOUND,try map a new pfn");
    // NOTE(review): the flusher returned by map() is discarded, so the TLB
    // may not be flushed for this mapping, and mmio_flags() for ordinary
    // guest memory looks suspicious — confirm both against KernelMapper's API.
    unsafe {
        mapper.map(hva, EntryFlags::mmio_flags());
    }
    let (hpa, _) = mapper.translate(hva).unwrap();
    return Ok(hpa.data() as u64 >> PAGE_SHIFT);
}

View File

@ -0,0 +1,268 @@
use core::{
fmt::Debug,
sync::atomic::{AtomicUsize, Ordering},
};
use alloc::{
boxed::Box,
sync::{Arc, Weak},
vec::Vec,
};
use hashbrown::HashMap;
use log::debug;
use mem::LockedKvmMemSlot;
use system_error::SystemError;
use crate::{
arch::{
vm::{kvm_host::vcpu::VirtCpuRequest, vmx::KvmVmx, x86_kvm_manager},
CurrentKvmManager, KvmArch, VirtCpuArch,
},
filesystem::vfs::file::{File, FileMode},
libs::spinlock::{SpinLock, SpinLockGuard},
mm::ucontext::AddressSpace,
process::ProcessManager,
smp::cpu::ProcessorId,
virt::vm::{
kvm_dev::KvmVcpuDev,
kvm_host::vcpu::{LockedVirtCpu, VirtCpu},
},
};
use self::{
mem::{GfnToHvaCache, KvmMemSlotSet, LockedVmMemSlotSet, PfnCacheUsage},
vcpu::{GuestDebug, VcpuMode},
};
pub mod mem;
pub mod vcpu;
/// Number of guest physical address spaces per VM (x86 SMM would need 2).
const KVM_ADDRESS_SPACE_NUM: usize = 1;
// NOTE(review): "USERSAPCE" is a typo for "USERSPACE"; renaming would break
// the public interface, so it is only flagged here.
pub const KVM_USERSAPCE_IRQ_SOURCE_ID: usize = 0;
pub const KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID: usize = 1;

/// A `Vm` behind a spinlock, shared via `Arc`.
#[derive(Debug)]
pub struct LockedVm {
    inner: SpinLock<Vm>,
}

// Global count of live KVM users; hardware virtualization is enabled when it
// rises from zero.
static KVM_USAGE_COUNT: AtomicUsize = AtomicUsize::new(0);
impl LockedVm {
    pub fn lock(&self) -> SpinLockGuard<Vm> {
        self.inner.lock()
    }

    /// Create a new VM of the given `vm_type`, cloning the calling process's
    /// address space and setting up one active/inactive slot-set pair per
    /// guest address space.
    pub fn create(vm_type: usize) -> Result<Arc<Self>, SystemError> {
        let mut memslots_set = vec![];
        let mut memslots = vec![];
        for i in 0..KVM_ADDRESS_SPACE_NUM {
            let mut tmp = vec![];
            for j in 0..2 {
                let mut slots = KvmMemSlotSet::default();
                slots.last_use = None;
                slots.node_idx = j;
                slots.generation = i as u64;
                tmp.push(LockedVmMemSlotSet::new(slots));
            }
            memslots_set.push(tmp);
            // Copy [0] of each pair starts out as the active set.
            memslots.push(memslots_set[i][0].clone());
        }

        let kvm = Vm {
            mm: ProcessManager::current_pcb()
                .basic()
                .user_vm()
                .unwrap()
                .write()
                .try_clone()?,
            max_vcpus: CurrentKvmManager::KVM_MAX_VCPUS,
            memslots_set,
            memslots,
            arch: KvmArch::init(vm_type)?,
            created_vcpus: 0,
            lock_vm_ref: Weak::new(),
            nr_memslot_pages: 0,
            online_vcpus: 0,
            dirty_ring_size: 0,
            dirty_ring_with_bitmap: false,
            vcpus: HashMap::new(),
            #[cfg(target_arch = "x86_64")]
            kvm_vmx: KvmVmx::default(),
            nr_memslots_dirty_logging: 0,
            mmu_invalidate_seq: 0,
        };

        let ret = Arc::new(Self {
            inner: SpinLock::new(kvm),
        });

        Self::hardware_enable_all()?;

        // Stash a weak self-reference so vcpus can reach their VM later.
        ret.lock().lock_vm_ref = Arc::downgrade(&ret);
        return Ok(ret);
    }

    /// Bump the global KVM user count and, for the very first user, enable
    /// virtualization hardware.
    fn hardware_enable_all() -> Result<(), SystemError> {
        // Bugfix: decide "am I the first user?" from the value returned by
        // fetch_add. The previous fetch_add followed by a separate load()
        // raced with concurrent creators: two of them could both observe a
        // count of 2 and neither would enable the hardware.
        if KVM_USAGE_COUNT.fetch_add(1, Ordering::SeqCst) == 0 {
            // FIXME!!!!
            // Hardware should be enabled on every CPU; currently only the
            // local CPU goes through the init path.
            x86_kvm_manager().arch_hardware_enable()?;
        }
        Ok(())
    }
}
/// A virtual machine: its memory slots, vcpus and architecture state.
#[derive(Debug)]
#[allow(dead_code)]
pub struct Vm {
    // Weak self-reference, filled in right after construction.
    lock_vm_ref: Weak<LockedVm>,
    // Address space of the owning userspace process.
    mm: Arc<AddressSpace>,
    // Upper bound on vcpus for this VM.
    max_vcpus: usize,
    // vcpus reserved so far (includes in-flight creations).
    created_vcpus: usize,
    // vcpus fully created and registered.
    online_vcpus: usize,
    /// vcpu collection, keyed by vcpu index
    vcpus: HashMap<usize, Arc<LockedVirtCpu>>,
    // name: String,
    /// Active and inactive slot sets; conceptually
    /// `[[Arc<LockedVmMemSlots>; 2]; KVM_ADDRESS_SPACE_NUM]`, kept as Vec for now
    memslots_set: Vec<Vec<Arc<LockedVmMemSlotSet>>>,
    /// Currently active slot sets; conceptually
    /// `[Arc<LockedVmMemSlots>; KVM_ADDRESS_SPACE_NUM]`, kept as Vec for now
    pub memslots: Vec<Arc<LockedVmMemSlotSet>>,
    /// Total number of pages across all memory slots
    nr_memslot_pages: usize,
    pub arch: KvmArch,
    pub dirty_ring_size: u32,
    pub nr_memslots_dirty_logging: u32,
    dirty_ring_with_bitmap: bool,
    #[cfg(target_arch = "x86_64")]
    pub kvm_vmx: KvmVmx,
    pub mmu_invalidate_seq: u64, // MMU invalidation sequence number
}
impl Vm {
    /// Create a vcpu with the requested `id`, register it with the VM and
    /// return a file descriptor for it.
    ///
    /// # Errors
    /// `EINVAL` when `id` is out of range or the vcpu limit is reached;
    /// any error from vcpu construction or fd allocation is propagated.
    #[inline(never)]
    pub fn create_vcpu(&mut self, id: usize) -> Result<usize, SystemError> {
        if id >= self.max_vcpus {
            return Err(SystemError::EINVAL);
        }

        if self.created_vcpus >= self.max_vcpus {
            return Err(SystemError::EINVAL);
        }

        self.created_vcpus += 1;

        // Bugfix: roll the reservation back if construction fails; the count
        // was previously leaked on error, permanently burning a vcpu slot.
        let vcpu = match self._create_vcpu(id) {
            Ok(vcpu) => vcpu,
            Err(e) => {
                self.created_vcpus -= 1;
                return Err(e);
            }
        };
        if self.dirty_ring_size != 0 {
            todo!()
        }

        vcpu.lock().vcpu_id = self.online_vcpus;

        self.vcpus.insert(self.online_vcpus, vcpu.clone());

        self.online_vcpus += 1;

        let vcpu_inode = KvmVcpuDev::new(vcpu);

        // NOTE(review): 0x777 is hexadecimal; if POSIX-style permission bits
        // were intended this should be 0o777 — confirm against FileMode.
        let file = File::new(vcpu_inode, FileMode::from_bits_truncate(0x777))?;

        let fd = ProcessManager::current_pcb()
            .fd_table()
            .write()
            .alloc_fd(file, None)?;

        Ok(fd as usize)
    }

    /// ### Create a vcpu and initialize part of its state
    #[inline(never)]
    pub fn _create_vcpu(&mut self, id: usize) -> Result<Arc<LockedVirtCpu>, SystemError> {
        let mut vcpu = self.new_vcpu(id);

        vcpu.init_arch(self, id)?;

        Ok(Arc::new(LockedVirtCpu::new(vcpu)))
    }

    /// Build a blank `VirtCpu` bound to this VM with the requested id.
    #[inline(never)]
    pub fn new_vcpu(&self, id: usize) -> VirtCpu {
        return VirtCpu {
            cpu: ProcessorId::INVALID,
            kvm: Some(self.lock_vm_ref.clone()),
            vcpu_id: id,
            pid: None,
            preempted: false,
            ready: false,
            last_used_slot: None,
            stats_id: format!("kvm-{}/vcpu-{}", ProcessManager::current_pid().data(), id),
            pv_time: GfnToHvaCache::init(self.lock_vm_ref.clone(), PfnCacheUsage::HOST_USES_PFN),
            arch: VirtCpuArch::new(),
            private: None,
            request: VirtCpuRequest::empty(),
            guest_debug: GuestDebug::empty(),
            run: unsafe { Some(Box::new_zeroed().assume_init()) },
            vcpu_idx: 0,
            mode: VcpuMode::OutsideGuestMode,
            stat: Default::default(),
        };
    }

    #[cfg(target_arch = "x86_64")]
    pub fn kvm_vmx_mut(&mut self) -> &mut KvmVmx {
        &mut self.kvm_vmx
    }

    #[cfg(target_arch = "x86_64")]
    pub fn kvm_vmx(&self) -> &KvmVmx {
        &self.kvm_vmx
    }
}
/// ## Multiprocessor state of a vcpu (some states are invalid on some
/// architectures).
// NOTE(review): "Mutil" is a typo for "Multi"; renaming would change the
// public interface, so it is only flagged here.
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(dead_code)]
pub enum MutilProcessorState {
    Runnable,
    Uninitialized,
    InitReceived,
    Halted,
    SipiReceived,
    Stopped,
    CheckStop,
    Operating,
    Load,
    ApResetHold,
    Suspended,
}
/// Return the memslot containing `gfn`, or `None` if there is none.
/// (With `approx` set, the Linux original also returns a memslot when the
/// address falls into a hole — one of the hole's boundary slots.)
/// Simplistic first version: should be a binary search, currently a linear scan.
pub fn search_memslots(
    slot_set: Arc<LockedVmMemSlotSet>,
    gfn: u64, /*_approx:bool*/
) -> Option<Arc<LockedKvmMemSlot>> {
    let slots = slot_set.lock();
    let node = &slots.gfn_tree;
    //let(start,end)=(0,node.len()-1);
    // Walk every slot and return the first whose gfn range covers `gfn`.
    for (_gfn_num, slot) in node.iter() {
        let slot_guard = slot.read();
        debug!(
            "gfn:{gfn},slot base_gfn: {},slot npages: {}",
            slot_guard.base_gfn, slot_guard.npages
        );
        if gfn >= slot_guard.base_gfn && gfn < slot_guard.base_gfn + slot_guard.npages as u64 {
            return Some(slot.clone());
        }
    }
    return None;
}

View File

@ -0,0 +1,117 @@
use alloc::{
boxed::Box,
string::String,
sync::{Arc, Weak},
};
use crate::{
arch::{
vm::{
kvm_host::{vcpu::VirtCpuRequest, KvmReg},
vmx::VmxVCpuPriv,
},
VirtCpuArch, VirtCpuStat,
},
libs::spinlock::{SpinLock, SpinLockGuard},
process::Pid,
smp::cpu::ProcessorId,
virt::vm::user_api::UapiKvmRun,
};
use super::{
mem::{GfnToHvaCache, KvmMemSlot},
LockedVm,
};
/// A `VirtCpu` behind a spinlock; the lock is held for the duration of
/// guest entry/exit in `KVM_RUN`.
#[derive(Debug)]
pub struct LockedVirtCpu {
    inner: SpinLock<VirtCpu>,
}

impl LockedVirtCpu {
    pub fn new(vcpu: VirtCpu) -> Self {
        Self {
            inner: SpinLock::new(vcpu),
        }
    }

    pub fn lock(&self) -> SpinLockGuard<VirtCpu> {
        self.inner.lock()
    }
}
/// Execution mode of a vcpu relative to the guest.
#[derive(Debug, PartialEq)]
#[allow(dead_code)]
pub enum VcpuMode {
    OutsideGuestMode,
    InGuestMode,
    ExitingGuestMode,
    ReadingShadowPageTables,
}
/// State of a single virtual CPU.
#[derive(Debug)]
pub struct VirtCpu {
    // Physical CPU the vcpu last ran on.
    pub cpu: ProcessorId,
    // Back-reference to the owning VM.
    pub kvm: Option<Weak<LockedVm>>,
    /// Provided by userspace
    pub vcpu_id: usize,
    /// Allocated by the id allocator
    pub vcpu_idx: usize,
    // Process currently driving this vcpu (set on KVM_RUN).
    pub pid: Option<Pid>,
    pub preempted: bool,
    pub ready: bool,
    pub last_used_slot: Option<Arc<KvmMemSlot>>,
    pub stats_id: String,
    pub pv_time: GfnToHvaCache,
    pub arch: VirtCpuArch,
    pub stat: VirtCpuStat,
    pub mode: VcpuMode,
    pub guest_debug: GuestDebug,
    #[cfg(target_arch = "x86_64")]
    pub private: Option<VmxVCpuPriv>,
    /// Pending requests for this vcpu
    pub request: VirtCpuRequest,
    // Shared kvm_run structure mapped to userspace.
    pub run: Option<Box<UapiKvmRun>>,
}
impl VirtCpu {
    /// Upgrade the weak VM back-reference.
    /// Panics if the vcpu is unbound or the VM has been dropped.
    #[inline]
    pub fn kvm(&self) -> Arc<LockedVm> {
        self.kvm.as_ref().unwrap().upgrade().unwrap()
    }

    // Panics if the VMX private state has not been initialized yet.
    #[cfg(target_arch = "x86_64")]
    pub fn vmx(&self) -> &VmxVCpuPriv {
        self.private.as_ref().unwrap()
    }

    #[cfg(target_arch = "x86_64")]
    pub fn vmx_mut(&mut self) -> &mut VmxVCpuPriv {
        self.private.as_mut().unwrap()
    }

    /// Return the VM-exit qualification, reading it from the VMCS only once
    /// per exit (cached via the EXIT_INFO_1 register-availability bit).
    //https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h?fi=vmx_get_exit_qual#677
    #[inline]
    pub fn get_exit_qual(&mut self) -> u64 {
        if !self
            .arch
            .test_and_mark_available(KvmReg::VcpuExregExitInfo1)
        {
            self.vmx_mut().vmread_exit_qual();
        }
        let vmx = self.vmx();
        vmx.get_exit_qual()
        //vmx.
    }
}
bitflags! {
    /// Guest-debug control flags (subset of Linux's KVM_GUESTDBG_* values).
    pub struct GuestDebug: usize {
        const ENABLE = 0x00000001;
        const SINGLESTEP = 0x00000002;
        const USE_SW_BP = 0x00010000;
    }
}

View File

@ -0,0 +1,3 @@
pub mod kvm_dev;
pub mod kvm_host;
pub mod user_api;

View File

@ -0,0 +1,466 @@
///
/// 该文件定义了暴露给用户空间的结构体
///
use core::fmt::Debug;
use system_error::SystemError;
use crate::mm::{PhysAddr, VirtAddr};
use super::kvm_host::mem::UserMemRegionFlag;
/// One segment register as exchanged with userspace (KVM UAPI layout).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmSegment {
    pub base: u64,
    pub limit: u32,
    pub selector: u16,
    pub type_: u8,
    pub present: u8,
    pub dpl: u8,
    pub db: u8,
    pub s: u8,
    pub l: u8,
    pub g: u8,
    pub avl: u8,
    pub unusable: u8,
    pub padding: u8,
}

impl UapiKvmSegment {
    /// Pack this segment descriptor into the VMX "access rights" format used
    /// by the guest-segment VMCS fields: type in bits 0..3, S bit 4, DPL
    /// bits 5..6, P bit 7, AVL bit 12, L bit 13, D/B bit 14, G bit 15, and
    /// the "unusable" flag in bit 16.
    pub fn vmx_segment_access_rights(&self) -> u32 {
        let mut ar = u32::from(self.type_) & 15;
        ar |= (u32::from(self.s) & 1) << 4;
        ar |= (u32::from(self.dpl) & 3) << 5;
        ar |= (u32::from(self.present) & 1) << 7;
        ar |= (u32::from(self.avl) & 1) << 12;
        ar |= (u32::from(self.l) & 1) << 13;
        ar |= (u32::from(self.db) & 1) << 14;
        ar |= (u32::from(self.g) & 1) << 15;
        // A segment is "unusable" when explicitly marked so or not present.
        if self.unusable != 0 || self.present == 0 {
            ar |= 1 << 16;
        }
        ar
    }
}
/// Maps a range of guest physical memory to the owning process's virtual
/// address space; describes one region of guest physical memory.
#[repr(C)]
#[derive(Default)]
pub struct PosixKvmUserspaceMemoryRegion {
    /// Which slot to register the memory range in
    pub slot: u32,
    /// Either KVM_MEM_LOG_DIRTY_PAGES or KVM_MEM_READONLY, telling kvm what
    /// to do with this memory: KVM_MEM_LOG_DIRTY_PAGES enables dirty-page
    /// tracking, KVM_MEM_READONLY makes the region read-only.
    pub flags: u32,
    /// Starting guest physical address of the range
    pub guest_phys_addr: u64,
    /// Size of the range
    pub memory_size: u64,
    /// Host virtual address backing the range
    pub userspace_addr: u64,
}
/// Kernel-side representation of `PosixKvmUserspaceMemoryRegion`
pub struct KvmUserspaceMemoryRegion {
    /// Which slot to register the memory range in
    pub slot: u32,
    /// What kvm should do with this memory:
    /// KVM_MEM_LOG_DIRTY_PAGES enables dirty-page tracking,
    /// KVM_MEM_READONLY makes the region read-only.
    pub flags: UserMemRegionFlag,
    /// Starting guest physical address of the range
    pub guest_phys_addr: PhysAddr,
    /// Size of the range
    pub memory_size: u64,
    /// Host virtual address backing the range
    pub userspace_addr: VirtAddr,
}

impl KvmUserspaceMemoryRegion {
    /// Validate and convert the raw userspace struct; rejects unknown flag
    /// bits with `EINVAL`.
    pub fn from_posix(posix: &PosixKvmUserspaceMemoryRegion) -> Result<Self, SystemError> {
        let flags = UserMemRegionFlag::from_bits(posix.flags).ok_or(SystemError::EINVAL)?;
        Ok(Self {
            slot: posix.slot,
            flags,
            guest_phys_addr: PhysAddr::new(posix.guest_phys_addr as usize),
            memory_size: posix.memory_size,
            userspace_addr: VirtAddr::new(posix.userspace_addr as usize),
        })
    }
}
/// The `kvm_run` structure shared with userspace: run parameters on entry,
/// exit reason and exit-specific data after a VM exit (KVM UAPI layout).
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct UapiKvmRun {
    pub request_interrupt_window: u8,
    pub immediate_exit: u8,
    pub padding1: [u8; 6usize],
    pub exit_reason: u32,
    pub ready_for_interrupt_injection: u8,
    pub if_flag: u8,
    pub flags: u16,
    pub cr8: u64,
    pub apic_base: u64,
    // Exit-reason-specific payload (union in the C UAPI).
    pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1,
    pub kvm_valid_regs: u64,
    pub kvm_dirty_regs: u64,
    pub s: uapi_kvm_run__bindgen_ty_2,
}

/// Synchronized-register area of `kvm_run` (union in the C UAPI).
#[repr(C)]
#[derive(Copy, Clone)]
pub union uapi_kvm_run__bindgen_ty_2 {
    pub regs: UapiKvmSyncRegs,
    pub padding: [u8; 2048usize],
}

// Unions cannot derive Debug; print only the type name.
impl Debug for uapi_kvm_run__bindgen_ty_2 {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("uapi_kvm_run__bindgen_ty_2").finish()
    }
}
/// Register payload exchanged through [`UapiKvmRun`]'s sync-regs area.
///
/// NOTE(review): a single `device_irq_level` matches the arm64 layout of
/// `kvm_sync_regs`, while the x86 layout carries regs/sregs/events — confirm
/// this is the intended architecture's definition.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmSyncRegs {
    pub device_irq_level: u64,
}
/// Payload for `KVM_EXIT_UNKNOWN` (union member `hw`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy1 {
    pub hardware_exit_reason: u64,
}
/// Payload for `KVM_EXIT_FAIL_ENTRY` (union member `fail_entry`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy2 {
    pub hardware_entry_failure_reason: u64,
    pub cpu: u32,
}
/// Payload for `KVM_EXIT_EXCEPTION` (union member `ex`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy3 {
    pub exception: u32,
    pub error_code: u32,
}
/// Payload for `KVM_EXIT_IO` (union member `io`): a port-I/O access that
/// must be completed by userspace.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy4 {
    /// 0 = in (read from port), 1 = out (write to port).
    pub direction: u8,
    /// Access width in bytes.
    pub size: u8,
    pub port: u16,
    pub count: u32,
    /// Offset of the data buffer, relative to the start of the run area.
    pub data_offset: u64,
}
/// Architecture-specific debug-exit information.
///
/// NOTE(review): `hsr`/`hsr_high`/`far` is the arm64 layout of
/// `kvm_debug_exit_arch`; the x86 layout differs — confirm this matches the
/// target architecture.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmDebugExitArch {
    pub hsr: u32,
    pub hsr_high: u32,
    pub far: u64,
}
/// Payload for `KVM_EXIT_DEBUG` (union member `debug`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy5 {
    pub arch: UapiKvmDebugExitArch,
}
/// Payload for `KVM_EXIT_MMIO` (union member `mmio`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy6 {
    pub phys_addr: u64,
    pub data: [u8; 8usize],
    pub len: u32,
    pub is_write: u8,
}
/// Payload for `KVM_EXIT_HYPERCALL` (union member `hypercall`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy7 {
    pub nr: u64,
    pub args: [u64; 6usize],
    pub ret: u64,
    pub longmode: u32,
    pub pad: u32,
}
/// Payload for `KVM_EXIT_TPR_ACCESS` (union member `tpr_access`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy8 {
    pub rip: u64,
    pub is_write: u32,
    pub pad: u32,
}
/// Payload for `KVM_EXIT_S390_SIEIC` (union member `s390_sieic`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy9 {
    pub icptcode: u8,
    pub ipa: u16,
    pub ipb: u32,
}
/// Payload for `KVM_EXIT_S390_UCONTROL` (union member `s390_ucontrol`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy10 {
    pub trans_exc_code: u64,
    pub pgm_code: u32,
}
/// Payload for the deprecated `KVM_EXIT_DCR` (union member `dcr`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy11 {
    pub dcrn: u32,
    pub data: u32,
    pub is_write: u8,
}
/// Payload for `KVM_EXIT_INTERNAL_ERROR` (union member `internal`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy12 {
    pub suberror: u32,
    /// Number of valid entries in `data` (with `KVM_CAP_INTERNAL_ERROR_DATA`).
    pub ndata: u32,
    pub data: [u64; 16usize],
}
/// Payload for the `KVM_INTERNAL_ERROR_EMULATION` sub-type of
/// `KVM_EXIT_INTERNAL_ERROR` (union member `emulation_failure`); an overlay
/// of the `internal` member that additionally carries the instruction bytes
/// that failed to emulate.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct UapiKvmRunBindgenTy1BindgenTy13 {
    pub suberror: u32,
    pub ndata: u32,
    /// Flag bits enumerating which of the following fields are valid.
    pub flags: u64,
    pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1,
}
/// Anonymous union inside the emulation-failure payload.
#[repr(C)]
#[derive(Copy, Clone)]
pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1 {
    pub __bindgen_anon_1: UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1,
}
/// Raw bytes of the instruction that failed to emulate.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1 {
    /// Number of valid bytes in `insn_bytes`.
    pub insn_size: u8,
    pub insn_bytes: [u8; 15usize],
}
/// Payload for `KVM_EXIT_OSI` (union member `osi`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy14 {
    pub gprs: [u64; 32usize],
}
/// Payload for `KVM_EXIT_PAPR_HCALL` (union member `papr_hcall`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy15 {
    pub nr: u64,
    pub ret: u64,
    pub args: [u64; 9usize],
}
/// Payload for `KVM_EXIT_S390_TSCH` (union member `s390_tsch`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy16 {
    pub subchannel_id: u16,
    pub subchannel_nr: u16,
    pub io_int_parm: u32,
    pub io_int_word: u32,
    pub ipb: u32,
    pub dequeued: u8,
}
/// Payload for `KVM_EXIT_EPR` (union member `epr`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy17 {
    pub epr: u32,
}
/// Payload for `KVM_EXIT_SYSTEM_EVENT` (union member `system_event`).
#[repr(C)]
#[derive(Copy, Clone)]
pub struct UapiKvmRunBindgenTy1BindgenTy18 {
    /// Event type (shutdown / reset / crash, ...).
    pub type_: u32,
    pub ndata: u32,
    pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1,
}
/// Anonymous union inside the system-event payload.
#[repr(C)]
#[derive(Copy, Clone)]
pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1 {
    pub flags: u64,
    pub data: [u64; 16usize],
}
/// Payload for `KVM_EXIT_S390_STSI` (union member `s390_stsi`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy19 {
    pub addr: u64,
    pub ar: u8,
    pub reserved: u8,
    pub fc: u8,
    pub sel1: u8,
    pub sel2: u16,
}
/// Payload for `KVM_EXIT_IOAPIC_EOI` (union member `eoi`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy20 {
    pub vector: u8,
}
/// Payload for `KVM_EXIT_ARM_NISV` (union member `arm_nisv`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy21 {
    pub esr_iss: u64,
    pub fault_ipa: u64,
}
/// Payload for `KVM_EXIT_X86_RDMSR` / `KVM_EXIT_X86_WRMSR` (union member
/// `msr`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy22 {
    /// Userspace -> kernel: set when userspace could not complete the access.
    pub error: u8,
    pub pad: [u8; 7usize],
    /// Kernel -> userspace: why the access exited (`KVM_MSR_EXIT_REASON_*`).
    pub reason: u32,
    /// Kernel -> userspace: index of the MSR being accessed.
    pub index: u32,
    /// Kernel <-> userspace: the MSR value.
    pub data: u64,
}
/// Payload for a RISC-V SBI call exit (union member `riscv_sbi`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy23 {
    pub extension_id: usize,
    pub function_id: usize,
    pub args: [usize; 6usize],
    pub ret: [usize; 2usize],
}
/// Payload for a RISC-V CSR access exit (union member `riscv_csr`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy24 {
    pub csr_num: usize,
    pub new_value: usize,
    pub write_mask: usize,
    pub ret_value: usize,
}
/// Payload for a notify exit (union member `notify`; `KVM_EXIT_NOTIFY` in
/// the Linux UAPI).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmRunBindgenTy1BindgenTy25 {
    pub flags: u32,
}
/// Mirror of the anonymous exit-payload union in the Linux UAPI
/// `struct kvm_run`. Exactly one member is meaningful for a given
/// [`UapiKvmRun::exit_reason`]; `padding` pins the union's size at 256 bytes.
#[repr(C)]
#[derive(Copy, Clone)]
pub union uapi_kvm_run__bindgen_ty_1 {
    /// `KVM_EXIT_UNKNOWN`
    pub hw: UapiKvmRunBindgenTy1BindgenTy1,
    /// `KVM_EXIT_FAIL_ENTRY`
    pub fail_entry: UapiKvmRunBindgenTy1BindgenTy2,
    /// `KVM_EXIT_EXCEPTION`
    pub ex: UapiKvmRunBindgenTy1BindgenTy3,
    /// `KVM_EXIT_IO`
    pub io: UapiKvmRunBindgenTy1BindgenTy4,
    /// `KVM_EXIT_DEBUG`
    pub debug: UapiKvmRunBindgenTy1BindgenTy5,
    /// `KVM_EXIT_MMIO`
    pub mmio: UapiKvmRunBindgenTy1BindgenTy6,
    /// `KVM_EXIT_HYPERCALL`
    pub hypercall: UapiKvmRunBindgenTy1BindgenTy7,
    /// `KVM_EXIT_TPR_ACCESS`
    pub tpr_access: UapiKvmRunBindgenTy1BindgenTy8,
    /// `KVM_EXIT_S390_SIEIC`
    pub s390_sieic: UapiKvmRunBindgenTy1BindgenTy9,
    /// `KVM_EXIT_S390_RESET`
    pub s390_reset_flags: u64,
    /// `KVM_EXIT_S390_UCONTROL`
    pub s390_ucontrol: UapiKvmRunBindgenTy1BindgenTy10,
    /// `KVM_EXIT_DCR` (deprecated)
    pub dcr: UapiKvmRunBindgenTy1BindgenTy11,
    /// `KVM_EXIT_INTERNAL_ERROR`
    pub internal: UapiKvmRunBindgenTy1BindgenTy12,
    /// `KVM_INTERNAL_ERROR_EMULATION` overlay of `internal`
    pub emulation_failure: UapiKvmRunBindgenTy1BindgenTy13,
    /// `KVM_EXIT_OSI`
    pub osi: UapiKvmRunBindgenTy1BindgenTy14,
    /// `KVM_EXIT_PAPR_HCALL`
    pub papr_hcall: UapiKvmRunBindgenTy1BindgenTy15,
    /// `KVM_EXIT_S390_TSCH`
    pub s390_tsch: UapiKvmRunBindgenTy1BindgenTy16,
    /// `KVM_EXIT_EPR`
    pub epr: UapiKvmRunBindgenTy1BindgenTy17,
    /// `KVM_EXIT_SYSTEM_EVENT`
    pub system_event: UapiKvmRunBindgenTy1BindgenTy18,
    /// `KVM_EXIT_S390_STSI`
    pub s390_stsi: UapiKvmRunBindgenTy1BindgenTy19,
    /// `KVM_EXIT_IOAPIC_EOI`
    pub eoi: UapiKvmRunBindgenTy1BindgenTy20,
    /// `KVM_EXIT_HYPERV`
    pub hyperv: UapiKvmHypervExit,
    /// `KVM_EXIT_ARM_NISV`
    pub arm_nisv: UapiKvmRunBindgenTy1BindgenTy21,
    /// `KVM_EXIT_X86_RDMSR` / `KVM_EXIT_X86_WRMSR`
    pub msr: UapiKvmRunBindgenTy1BindgenTy22,
    /// `KVM_EXIT_XEN`
    pub xen: UapiKvmXenExit,
    /// RISC-V SBI call exit
    pub riscv_sbi: UapiKvmRunBindgenTy1BindgenTy23,
    /// RISC-V CSR access exit
    pub riscv_csr: UapiKvmRunBindgenTy1BindgenTy24,
    /// Notify exit
    pub notify: UapiKvmRunBindgenTy1BindgenTy25,
    /// Pins the size of the union.
    pub padding: [u8; 256usize],
}
impl Debug for uapi_kvm_run__bindgen_ty_1 {
    // `Debug` cannot be derived for unions; picking a member to print would
    // require knowing the active exit reason, so only the type name is
    // written.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str("uapi_kvm_run__bindgen_ty_1")
    }
}
/// Mirror of the Linux UAPI `struct kvm_hyperv_exit`: payload for
/// `KVM_EXIT_HYPERV`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct UapiKvmHypervExit {
    /// Which Hyper-V exit this is (`KVM_EXIT_HYPERV_SYNIC` / `_HCALL` /
    /// `_SYNDBG`).
    pub type_: u32,
    pub pad1: u32,
    /// Payload selected by `type_`.
    pub u: uapi_kvm_hyperv_exit__bindgen_ty_1,
}
/// Exit-type-specific payload of [`UapiKvmHypervExit`].
#[repr(C)]
#[derive(Copy, Clone)]
pub union uapi_kvm_hyperv_exit__bindgen_ty_1 {
    pub synic: UapiKvmHypervExitBindgenTy1BindgenTy1,
    pub hcall: UapiKvmHypervExitBindgenTy1BindgenTy2,
    pub syndbg: UapiKvmHypervExitBindgenTy1BindgenTy3,
}
/// `KVM_EXIT_HYPERV_SYNIC` payload: synthetic-interrupt-controller MSR state.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmHypervExitBindgenTy1BindgenTy1 {
    pub msr: u32,
    pub pad2: u32,
    pub control: u64,
    pub evt_page: u64,
    pub msg_page: u64,
}
/// `KVM_EXIT_HYPERV_HCALL` payload: a Hyper-V hypercall.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmHypervExitBindgenTy1BindgenTy2 {
    pub input: u64,
    pub result: u64,
    pub params: [u64; 2usize],
}
/// `KVM_EXIT_HYPERV_SYNDBG` payload: synthetic debugger state.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmHypervExitBindgenTy1BindgenTy3 {
    pub msr: u32,
    pub pad2: u32,
    pub control: u64,
    pub status: u64,
    pub send_page: u64,
    pub recv_page: u64,
    pub pending_page: u64,
}
/// Mirror of the Linux UAPI `struct kvm_xen_exit`: payload for
/// `KVM_EXIT_XEN`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct UapiKvmXenExit {
    /// Which Xen exit this is (`KVM_EXIT_XEN_HCALL`).
    pub type_: u32,
    /// Payload selected by `type_`.
    pub u: uapi_kvm_xen_exit__bindgen_ty_1,
}
/// Exit-type-specific payload of [`UapiKvmXenExit`].
#[repr(C)]
#[derive(Copy, Clone)]
pub union uapi_kvm_xen_exit__bindgen_ty_1 {
    pub hcall: UapiKvmXenExitBindgenTy1BindgenTy1,
}
/// `KVM_EXIT_XEN_HCALL` payload: a Xen hypercall.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct UapiKvmXenExitBindgenTy1BindgenTy1 {
    pub longmode: u32,
    pub cpl: u32,
    pub input: u64,
    pub result: u64,
    pub params: [u64; 6usize],
}

17
package-lock.json generated Normal file
View File

@ -0,0 +1,17 @@
{
"name": "DragonOS",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"dependencies": {
"crypto-js": "^4.2.0"
}
},
"node_modules/crypto-js": {
"version": "4.2.0",
"resolved": "https://mirrors.huaweicloud.com/repository/npm/crypto-js/-/crypto-js-4.2.0.tgz",
"integrity": "sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q=="
}
}
}

View File

@ -1,3 +0,0 @@
target remote localhost:1234
file bin/kernel/kernel.elf
set follow-fork-mode child

View File

@ -1,115 +1,540 @@
/**
* @file main.c
* @author xiaoyez (xiaoyez@zju.edu.cn)
* @brief kvm的程序
* @version 0.1
* @date 2023-07-13
*
* @copyright Copyright (c) 2023
*
*/
/**
* kvm命令的方法:
* 1.DragonOS的控制台输入 exec bin/test_kvm.elf
*
*/
#include <fcntl.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <unistd.h> #include <sys/mman.h>
#include <fcntl.h>
//#include <linux/kvm.h>
#define KVM_CREATE_VCPU 0x00 typedef __signed__ char __s8;
#define KVM_SET_USER_MEMORY_REGION 0x01 typedef unsigned char __u8;
#define KVM_RUN 0x00 typedef __signed__ short __s16;
#define KVM_GET_REGS 0x01 typedef unsigned short __u16;
#define KVM_SET_REGS 0x02
struct kvm_userspace_memory_region { typedef __signed__ int __s32;
uint32_t slot; // 要在哪个slot上注册内存区间 typedef unsigned int __u32;
// flags有两个取值KVM_MEM_LOG_DIRTY_PAGES和KVM_MEM_READONLY用来指示kvm针对这段内存应该做的事情。
// KVM_MEM_LOG_DIRTY_PAGES用来开启内存脏页KVM_MEM_READONLY用来开启内存只读。 #ifdef __GNUC__
uint32_t flags; __extension__ typedef __signed__ long long __s64;
uint64_t guest_phys_addr; // 虚机内存区间起始物理地址 __extension__ typedef unsigned long long __u64;
uint64_t memory_size; // 虚机内存区间大小 #else
uint64_t userspace_addr; // 虚机内存区间对应的主机虚拟地址 typedef __signed__ long long __s64;
typedef unsigned long long __u64;
#endif
//from linux/kvm.h
#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
#define KVM_RUN _IO(KVMIO, 0x80)
#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs)
#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs)
#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs)
#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs)
#define KVMIO 0xAE
#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
struct kvm_userspace_memory_region)
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
struct kvm_hyperv_exit {
#define KVM_EXIT_HYPERV_SYNIC 1
#define KVM_EXIT_HYPERV_HCALL 2
#define KVM_EXIT_HYPERV_SYNDBG 3
__u32 type;
__u32 pad1;
union {
struct {
__u32 msr;
__u32 pad2;
__u64 control;
__u64 evt_page;
__u64 msg_page;
} synic;
struct {
__u64 input;
__u64 result;
__u64 params[2];
} hcall;
struct {
__u32 msr;
__u32 pad2;
__u64 control;
__u64 status;
__u64 send_page;
__u64 recv_page;
__u64 pending_page;
} syndbg;
} u;
}; };
struct kvm_debug_exit_arch {
__u32 exception;
__u32 pad;
__u64 pc;
__u64 dr6;
__u64 dr7;
};
/* for KVM_SET_USER_MEMORY_REGION */
struct kvm_userspace_memory_region {
__u32 slot;
__u32 flags;
__u64 guest_phys_addr;
__u64 memory_size; /* bytes */
__u64 userspace_addr; /* start of the userspace allocated memory */
};
struct kvm_xen_exit {
#define KVM_EXIT_XEN_HCALL 1
__u32 type;
union {
struct {
__u32 longmode;
__u32 cpl;
__u64 input;
__u64 result;
__u64 params[6];
} hcall;
} u;
};
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs { struct kvm_regs {
/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
uint64_t rax, rbx, rcx, rdx; __u64 rax, rbx, rcx, rdx;
uint64_t rsi, rdi, rsp, rbp; __u64 rsi, rdi, rsp, rbp;
uint64_t r8, r9, r10, r11; __u64 r8, r9, r10, r11;
uint64_t r12, r13, r14, r15; __u64 r12, r13, r14, r15;
uint64_t rip, rflags; __u64 rip, rflags;
};
struct my_kvm_segment {
__u64 base;
__u32 limit;
__u16 selector;
__u8 type;
__u8 present, dpl, db, s, l, g, avl;
__u8 unusable;
__u8 padding;
};
struct kvm_dtable {
__u64 base;
__u16 limit;
__u16 padding[3];
};
/* for KVM_GET_SREGS and KVM_SET_SREGS */
struct kvm_sregs {
/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
struct my_kvm_segment cs, ds, es, fs, gs, ss;
struct my_kvm_segment tr, ldt;
struct kvm_dtable gdt, idt;
__u64 cr0, cr2, cr3, cr4, cr8;
__u64 efer;
__u64 apic_base;
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
}; };
int guest_code(){ /* for KVM_GET/SET_VCPU_EVENTS */
while (1) struct kvm_vcpu_events {
{ struct {
// printf("guest code\n"); __u8 injected;
__asm__ __volatile__ ( __u8 nr;
"mov %rax, 0\n\t" __u8 has_error_code;
"mov %rcx, 0\n\t" __u8 pending;
"cpuid\n\t" __u32 error_code;
); } exception;
} struct {
__u8 injected;
__u8 nr;
__u8 soft;
__u8 shadow;
} interrupt;
struct {
__u8 injected;
__u8 pending;
__u8 masked;
__u8 pad;
} nmi;
__u32 sipi_vector;
__u32 flags;
struct {
__u8 smm;
__u8 pending;
__u8 smm_inside_nmi;
__u8 latched_init;
} smi;
__u8 reserved[27];
__u8 exception_has_payload;
__u64 exception_payload;
};
/* kvm_sync_regs struct included by kvm_run struct */
struct kvm_sync_regs {
/* Members of this structure are potentially malicious.
* Care must be taken by code reading, esp. interpreting,
* data fields from them inside KVM to prevent TOCTOU and
* double-fetch types of vulnerabilities.
*/
struct kvm_regs regs;
struct kvm_sregs sregs;
struct kvm_vcpu_events events;
};
/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
struct kvm_run {
/* in */
__u8 request_interrupt_window;
__u8 immediate_exit;
__u8 padding1[6];
/* out */
__u32 exit_reason;
__u8 ready_for_interrupt_injection;
__u8 if_flag;
__u16 flags;
/* in (pre_kvm_run), out (post_kvm_run) */
__u64 cr8;
__u64 apic_base;
#ifdef __KVM_S390
/* the processor status word for s390 */
__u64 psw_mask; /* psw upper half */
__u64 psw_addr; /* psw lower half */
#endif
union {
/* KVM_EXIT_UNKNOWN */
struct {
__u64 hardware_exit_reason;
} hw;
/* KVM_EXIT_FAIL_ENTRY */
struct {
__u64 hardware_entry_failure_reason;
__u32 cpu;
} fail_entry;
/* KVM_EXIT_EXCEPTION */
struct {
__u32 exception;
__u32 error_code;
} ex;
/* KVM_EXIT_IO */
struct {
#define KVM_EXIT_IO_IN 0
#define KVM_EXIT_IO_OUT 1
__u8 direction;
__u8 size; /* bytes */
__u16 port;
__u32 count;
__u64 data_offset; /* relative to kvm_run start */
} io;
/* KVM_EXIT_DEBUG */
struct {
struct kvm_debug_exit_arch arch;
} debug;
/* KVM_EXIT_MMIO */
struct {
__u64 phys_addr;
__u8 data[8];
__u32 len;
__u8 is_write;
} mmio;
/* KVM_EXIT_HYPERCALL */
struct {
__u64 nr;
__u64 args[6];
__u64 ret;
__u32 longmode;
__u32 pad;
} hypercall;
/* KVM_EXIT_TPR_ACCESS */
struct {
__u64 rip;
__u32 is_write;
__u32 pad;
} tpr_access;
/* KVM_EXIT_S390_SIEIC */
struct {
__u8 icptcode;
__u16 ipa;
__u32 ipb;
} s390_sieic;
/* KVM_EXIT_S390_RESET */
#define KVM_S390_RESET_POR 1
#define KVM_S390_RESET_CLEAR 2
#define KVM_S390_RESET_SUBSYSTEM 4
#define KVM_S390_RESET_CPU_INIT 8
#define KVM_S390_RESET_IPL 16
__u64 s390_reset_flags;
/* KVM_EXIT_S390_UCONTROL */
struct {
__u64 trans_exc_code;
__u32 pgm_code;
} s390_ucontrol;
/* KVM_EXIT_DCR (deprecated) */
struct {
__u32 dcrn;
__u32 data;
__u8 is_write;
} dcr;
/* KVM_EXIT_INTERNAL_ERROR */
struct {
__u32 suberror;
/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
__u32 ndata;
__u64 data[16];
} internal;
/*
* KVM_INTERNAL_ERROR_EMULATION
*
* "struct emulation_failure" is an overlay of "struct internal"
* that is used for the KVM_INTERNAL_ERROR_EMULATION sub-type of
* KVM_EXIT_INTERNAL_ERROR. Note, unlike other internal error
* sub-types, this struct is ABI! It also needs to be backwards
* compatible with "struct internal". Take special care that
* "ndata" is correct, that new fields are enumerated in "flags",
* and that each flag enumerates fields that are 64-bit aligned
* and sized (so that ndata+internal.data[] is valid/accurate).
*/
struct {
__u32 suberror;
__u32 ndata;
__u64 flags;
__u8 insn_size;
__u8 insn_bytes[15];
} emulation_failure;
/* KVM_EXIT_OSI */
struct {
__u64 gprs[32];
} osi;
/* KVM_EXIT_PAPR_HCALL */
struct {
__u64 nr;
__u64 ret;
__u64 args[9];
} papr_hcall;
/* KVM_EXIT_S390_TSCH */
struct {
__u16 subchannel_id;
__u16 subchannel_nr;
__u32 io_int_parm;
__u32 io_int_word;
__u32 ipb;
__u8 dequeued;
} s390_tsch;
/* KVM_EXIT_EPR */
struct {
__u32 epr;
} epr;
/* KVM_EXIT_SYSTEM_EVENT */
struct {
#define KVM_SYSTEM_EVENT_SHUTDOWN 1
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
__u32 type;
__u64 flags;
} system_event;
/* KVM_EXIT_S390_STSI */
struct {
__u64 addr;
__u8 ar;
__u8 reserved;
__u8 fc;
__u8 sel1;
__u16 sel2;
} s390_stsi;
/* KVM_EXIT_IOAPIC_EOI */
struct {
__u8 vector;
} eoi;
/* KVM_EXIT_HYPERV */
struct kvm_hyperv_exit hyperv;
/* KVM_EXIT_ARM_NISV */
struct {
__u64 esr_iss;
__u64 fault_ipa;
} arm_nisv;
/* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
struct {
__u8 error; /* user -> kernel */
__u8 pad[7];
#define KVM_MSR_EXIT_REASON_INVAL (1 << 0)
#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1)
#define KVM_MSR_EXIT_REASON_FILTER (1 << 2)
__u32 reason; /* kernel -> user */
__u32 index; /* kernel -> user */
__u64 data; /* kernel <-> user */
} msr;
/* KVM_EXIT_XEN */
struct kvm_xen_exit xen;
/* Fix the size of the union. */
char padding[256];
};
/* 2048 is the size of the char array used to bound/pad the size
* of the union that holds sync regs.
*/
#define SYNC_REGS_SIZE_BYTES 2048
/*
* shared registers between kvm and userspace.
* kvm_valid_regs specifies the register classes set by the host
* kvm_dirty_regs specified the register classes dirtied by userspace
* struct kvm_sync_regs is architecture specific, as well as the
* bits for kvm_valid_regs and kvm_dirty_regs
*/
__u64 kvm_valid_regs;
__u64 kvm_dirty_regs;
union {
struct kvm_sync_regs regs;
char padding[SYNC_REGS_SIZE_BYTES];
} s;
};
int kvm(uint8_t code[], size_t code_len)
{
// step 1, open /dev/kvm
int kvmfd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
if (kvmfd == -1)
{
printf("failed to open /dev/kvm\n");
return 0; return 0;
}
// step 2, create VM
int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
printf("vmfd %d\n", vmfd);
// step 3, set up user memory region
size_t mem_size = 0x100000; // size of user memory you want to assign
void *mem = mmap(0, mem_size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
printf("map mem %p\n", mem);
int user_entry = 0x0;
memcpy((void *)((size_t)mem + user_entry), code, code_len);
struct kvm_userspace_memory_region region = {
.slot = 0,
.flags = 0,
.guest_phys_addr = 0,
.memory_size = mem_size,
.userspace_addr = (size_t)mem};
ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
/* end of step 3 */
// step 4, create vCPU
int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
printf("create vcpu,fd: %p\n", vcpufd);
// step 5, set up memory for vCPU
size_t vcpu_mmap_size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, NULL);
struct kvm_run *run = (struct kvm_run *)mmap(0, vcpu_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
// step 6, set up vCPU's registers
/* standard registers include general-purpose registers and flags */
struct kvm_regs regs;
ioctl(vcpufd, KVM_GET_REGS, &regs);
regs.rip = user_entry;
regs.rsp = 0x200000; // stack address
regs.rflags = 0x2; // in x86 the 0x2 bit should always be set
ioctl(vcpufd, KVM_SET_REGS, &regs); // set registers
/* special registers include segment registers */
struct kvm_sregs sregs;
ioctl(vcpufd, KVM_GET_SREGS, &sregs);
sregs.cs.base = sregs.cs.selector = 0; // let base of code segment equal to zero
ioctl(vcpufd, KVM_SET_SREGS, &sregs);
ioctl(vcpufd, KVM_GET_SREGS, &sregs);
// step 7, execute vm and handle exit reason
#define KVM_EXIT_UNKNOWN 0
#define KVM_EXIT_EXCEPTION 1
#define KVM_EXIT_IO 2
#define KVM_EXIT_HYPERCALL 3
#define KVM_EXIT_DEBUG 4
#define KVM_EXIT_HLT 5
#define KVM_EXIT_MMIO 6
#define KVM_EXIT_IRQ_WINDOW_OPEN 7
#define KVM_EXIT_SHUTDOWN 8
#define KVM_EXIT_FAIL_ENTRY 9
#define KVM_EXIT_INTR 10
#define KVM_EXIT_SET_TPR 11
#define KVM_EXIT_TPR_ACCESS 12
#define KVM_EXIT_S390_SIEIC 13
#define KVM_EXIT_S390_RESET 14
#define KVM_EXIT_DCR 15 /* deprecated */
#define KVM_EXIT_NMI 16
#define KVM_EXIT_INTERNAL_ERROR 17
#define KVM_EXIT_OSI 18
#define KVM_EXIT_PAPR_HCALL 19
#define KVM_EXIT_S390_UCONTROL 20
#define KVM_EXIT_WATCHDOG 21
#define KVM_EXIT_S390_TSCH 22
#define KVM_EXIT_EPR 23
#define KVM_EXIT_SYSTEM_EVENT 24
#define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
#define KVM_EXIT_HYPERV 27
#define KVM_EXIT_ARM_NISV 28
#define KVM_EXIT_X86_RDMSR 29
#define KVM_EXIT_X86_WRMSR 30
#define KVM_EXIT_DIRTY_RING_FULL 31
#define KVM_EXIT_AP_RESET_HOLD 32
#define KVM_EXIT_X86_BUS_LOCK 33
#define KVM_EXIT_XEN 34
while (1)
{
ioctl(vcpufd, KVM_RUN, NULL);
ioctl(vcpufd, KVM_GET_SREGS, &sregs);
printf("Guest CR3: 0x%llx\n", sregs.cr3);
switch (run->exit_reason)
{
case KVM_EXIT_HLT:
fputs("KVM_EXIT_HLT \n", stderr);
return 0;
case KVM_EXIT_IO:
/* TODO: check port and direction here */
putchar(*(((char *)run) + run->io.data_offset));
printf("KVM_EXIT_IO: run->io.port = %lx \n",
run->io.port);
break;
case KVM_EXIT_FAIL_ENTRY:
printf("KVM_EXIT_FAIL_ENTRY: hardware_entry_failure_reason = 0x%lx",
run->fail_entry.hardware_entry_failure_reason);
return 0;
case KVM_EXIT_INTERNAL_ERROR:
printf("KVM_EXIT_INTERNAL_ERROR: suberror = 0x%x",
run->internal.suberror);
return 0;
case KVM_EXIT_SHUTDOWN:
printf("KVM_EXIT_SHUTDOWN");
return 0;
default:
printf("Unhandled reason: %d", run->exit_reason);
return 0;
}
}
} }
/*汇编指令解释
0xB0 0x61 (mov al, 0x61)
0x61ASCII 'a' AL
0xBA 0x17 0x02 (mov dx, 0x0217)
Linux: ilen = 3 EPT_VIOLATION
0x0217 DX
0xEE (out dx, al)
AL DX
0xB0 0x0A (mov al, 0x0A)
0x0A AL
0xEE (out dx, al)
AL DX
0xF4 (hlt)
hlt 使*/
int main() int main()
{ {
printf("Test kvm running...\n"); //uint8_t code[] = "\xB0\x61\xBA\x17\x02\xEE\xB0\n\xEE\xF4";
printf("Open /dev/kvm\n"); //uint8_t code[] = "\xB0\x61\xBA\x17\x02\xEE\xF4";
int kvm_fd = open("/dev/kvm", O_RDWR|O_CLOEXEC); uint8_t code[] = "\xB0\x61\xF4";
int vmfd = ioctl(kvm_fd, 0x01, 0); kvm(code, sizeof(code));
printf("vmfd=%d\n", vmfd); return 0;
/*
__asm__ __volatile__ (
"mov %rax, 0\n\t"
"mov %rcx, 0\n\t"
"cpuid\n\t"
);
*/
const uint8_t code[] = {
0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
0x00, 0xd8, /* add %bl, %al */
0x04, '0', /* add $'0', %al */
0xee, /* out %al, (%dx) */
0xb0, '\n', /* mov $'\n', %al */
0xee, /* out %al, (%dx) */
0xf4, /* hlt */
};
size_t mem_size = 0x4000; // size of user memory you want to assign
printf("code=%p\n", code);
// void *mem = mmap(0, mem_size, 0x7, -1, 0);
// memcpy(mem, code, sizeof(code));
struct kvm_userspace_memory_region region = {
.slot = 0,
.flags = 0,
.guest_phys_addr = 0,
.memory_size = mem_size,
.userspace_addr = (size_t)code
};
ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
printf("vcpufd=%d\n", vcpufd);
int user_entry = 0x0;
struct kvm_regs regs = {0};
regs.rip = user_entry;
regs.rsp = 0x3000; // stack address
regs.rflags = 0x2; // in x86 the 0x2 bit should always be set
ioctl(vcpufd, KVM_SET_REGS, &regs); // set registers
ioctl(vcpufd, KVM_RUN, 0);
return 0;
} }