feat(virtualization): 内核虚拟化支持 (#1073)

* 几个结构体

* 通过vmx_init以及create_vm,create_vcpu部分TODO

* kvm_run完成一半

* 能够成功vmlaunch,但是在vmexit时候还有些问题未排查出来

* 解决了vmlaunch导致的cpu_reset的问题

* 整理代码

* 暂时性push到hyc仓库

* 修改内存虚拟化部分参数传入,解决死锁问题

* 初步完成ept映射.但不停EPT_VIOLATION

* 初步完成了EPT映射,但是读写内存还是有点问题

* fixme

* 更新了一些truncate到from_bits_unchecked的实现

* 完成内存虚拟化EPT_VIOLATION的映射

* fmt

* Remove /fixme from .gitignore

* Remove /fixme file

* Update kernel/src/init/init.rs

Co-authored-by: Samuel Dai <samuka007@dragonos.org>

* Update kernel/src/init/init.rs

Co-authored-by: Samuel Dai <samuka007@dragonos.org>

* 修改了注释格式,删除了附带的一些文件操作

* feat(syscall): 实现syscall restart (#1075)

能够在系统调用返回ERESTARTSYS时,信号处理结束后,自动重启系统调用.

TODO: 实现wait等需要restart_block的系统调用的重启

Signed-off-by: longjin <longjin@DragonOS.org>

* chore: update docker image version in script && update doc (#1076)

* chore: update docker image version in script

* chore: replace lots of spaces with newline in doc

* fix: 修复wait4系统调用部分语义与Linux不一致的问题 (#1080)

* fix: 修复wait4系统调用部分语义与Linux不一致的问题

解决wait不住/wait之后卡死的bug

---------

Signed-off-by: longjin <longjin@DragonOS.org>

* feat(fs/syscall): 实现fchdir系统调用 (#1081)

Signed-off-by: longjin <longjin@DragonOS.org>

* fix(mm): 修复fat文件系统的PageCache同步问题 (#1005)


---------

Co-authored-by: longjin <longjin@DragonOS.org>

* fix: 修正nographic启动时,控制台日志未能输出到文件的问题 (#1082)

Signed-off-by: longjin <longjin@DragonOS.org>

* fix(process): 修复copy_process的一些bug & 支持默认init进程传参 (#1083)

- 修复`copy_process`函数对标志位处理不正确的bug
- init进程搜索列表中,支持为默认init程序传入参数

Signed-off-by: longjin <longjin@DragonOS.org>

* feat: 完善sys_reboot (#1084)

* fix(process): 修复copy_process的一些bug & 支持默认init进程传参

- 修复`copy_process`函数对标志位处理不正确的bug
- init进程搜索列表中,支持为默认init程序传入参数

Signed-off-by: longjin <longjin@DragonOS.org>

* feat: 完善sys_reboot

- 校验magic number
- 支持多个cmd (具体内容未实现)

Signed-off-by: longjin <longjin@DragonOS.org>

---------

Signed-off-by: longjin <longjin@DragonOS.org>

* fix: 修复do_wait函数在wait所有子进程时,忘了释放锁就sleep的bug (#1089)

Signed-off-by: longjin <longjin@DragonOS.org>

* pull主线并且fmt

---------

Signed-off-by: longjin <longjin@DragonOS.org>
Co-authored-by: GnoCiYeH <heyicong@dragonos.org>
Co-authored-by: Samuel Dai <samuka007@dragonos.org>
Co-authored-by: LoGin <longjin@DragonOS.org>
Co-authored-by: LIU Yuwei <22045841+Marsman1996@users.noreply.github.com>
Co-authored-by: MemoryShore <1353318529@qq.com>
This commit is contained in:
Z Fan
2025-03-04 10:56:20 +08:00
committed by GitHub
parent 01dcb5d7a8
commit 597315b04d
50 changed files with 13675 additions and 126 deletions

View File

@@ -0,0 +1,463 @@
use core::{fmt::Debug, sync::atomic::AtomicU32};
use alloc::{boxed::Box, vec::Vec};
use bit_field::BitField;
use bitmap::{traits::BitMapOps, AllocBitmap};
use system_error::SystemError;
use x86::{
bits64::rflags::RFlags,
controlregs::{Cr0, Cr4},
dtables::DescriptorTablePointer,
};
use x86_64::registers::control::EferFlags;
use crate::{
smp::cpu::ProcessorId,
virt::vm::{
kvm_host::{
vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID,
},
user_api::UapiKvmSegment,
},
};
use crate::arch::VirtCpuArch;
use super::{
asm::{MsrData, VcpuSegment, VmxMsrEntry},
vmx::{exit::ExitFastpathCompletion, vmx_info},
x86_kvm_manager, x86_kvm_ops,
};
pub mod lapic;
pub mod page;
pub mod vcpu;
// TSS layout sizes and a PFN sentinel used by the x86 KVM host code.
// NOTE(review): the `RMODE_` prefix suggests these describe the real-mode TSS
// prepared for a guest — confirm against the users of `RMODE_TSS_SIZE`.

/// Presumably the byte offset of the I/O-bitmap base field inside the TSS — TODO confirm.
#[allow(dead_code)]
pub const TSS_IOPB_BASE_OFFSET: usize = 0x66;
/// Base size of the TSS: 0x68 (104) bytes.
pub const TSS_BASE_SIZE: usize = 0x68;
/// Size of the I/O bitmap: 65536 bits = 8192 bytes (presumably one bit per I/O port).
pub const TSS_IOPB_SIZE: usize = 65536 / 8;
/// Size of the redirection bitmap: 256 bits = 32 bytes (presumably one bit per interrupt vector).
pub const TSS_REDIRECTION_SIZE: usize = 256 / 8;
/// Total size: base + redirection bitmap + I/O bitmap, plus one extra trailing byte.
pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1;
/// A `u64` with only bit 63 set.
/// NOTE(review): by name, a sentinel PFN meaning "no memory slot" — confirm with the MMU code.
pub const KVM_PFN_NOSLOT: u64 = 0x1 << 63;
/// x86-specific, per-VM KVM state.
///
/// Instances are produced by `X86KvmArch::init`, which starts from the value
/// returned by the backend's `vm_init()` and then fills in the interrupt-source
/// bitmap, default TSC frequency and APICv inhibit reasons.
#[allow(dead_code)]
#[derive(Debug, Default)]
pub struct X86KvmArch {
/// Interrupt chip (irqchip) mode
pub irqchip_mode: KvmIrqChipMode,
/// ID of the vCPU responsible for bootstrapping (the BSP) this VM
bsp_vcpu_id: usize,
pub pause_in_guest: bool,
pub cstate_in_guest: bool,
pub mwait_in_guest: bool,
pub hlt_in_guest: bool,
pub bus_lock_detection_enabled: bool,
// Bitmap of interrupt source IDs; `init` sets the userspace and
// irqfd-resample source bits.
irq_sources_bitmap: u64,
// Copied from `x86_kvm_manager().max_tsc_khz` in `init`.
default_tsc_khz: u64,
// Set to `true` in `init`.
guest_can_read_msr_platform_info: bool,
// Bit set indexed by the `KvmApicvInhibit` constants; populated by `apicv_init`.
apicv_inhibit_reasons: usize,
// Exclusive upper bound for vCPU ids; `Vm::vcpu_precreate` treats 0 as
// "not configured" and replaces it with 4096.
pub max_vcpu_ids: usize,
pub notify_vmexit_flags: NotifyVmExitFlags,
pub notify_window: u32,
// Optional MSR filter consulted by `msr_allowed`; `None` means every MSR is allowed.
msr_fliter: Option<Box<KvmX86MsrFilter>>,
pub noncoherent_dma_count: AtomicU32,
// NOTE(review): presumably bookkeeping for shadow/EPT MMU pages — confirm with the MMU code.
pub active_mmu_pages: Vec<u64>,
pub n_max_mmu_pages: usize,
pub n_used_mmu_pages: usize,
}
impl X86KvmArch {
    /// Builds the x86-specific state for a new VM.
    ///
    /// Starts from the backend's `vm_init()` value, marks the userspace and
    /// irqfd-resample interrupt sources in the source bitmap, records the
    /// default TSC frequency and initialises the APICv inhibit reasons.
    ///
    /// # Errors
    ///
    /// Returns `SystemError::EINVAL` when `kvm_type` is anything other than `0`.
    pub fn init(kvm_type: usize) -> Result<Self, SystemError> {
        if kvm_type != 0 {
            return Err(SystemError::EINVAL);
        }
        let mut arch = x86_kvm_ops().vm_init();
        // Set up the interrupt-source bitmap.
        arch.irq_sources_bitmap
            .set_bit(KVM_USERSAPCE_IRQ_SOURCE_ID, true)
            .set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, true);
        arch.default_tsc_khz = x86_kvm_manager().max_tsc_khz;
        arch.guest_can_read_msr_platform_info = true;
        arch.apicv_init();
        Ok(arch)
    }

    /// Records the initial APICv inhibit reasons.
    ///
    /// `ABSENT` is always set; `DISABLE` is additionally set when the VMX
    /// configuration reports that APICv is not enabled.
    fn apicv_init(&mut self) {
        self.apicv_inhibit_reasons
            .set_bit(KvmApicvInhibit::ABSENT, true);
        if !vmx_info().enable_apicv {
            self.apicv_inhibit_reasons
                .set_bit(KvmApicvInhibit::DISABLE, true);
        }
    }

    /// Reports whether an access of kind `ftype` to `msr` is permitted by the
    /// VM's MSR filter.
    ///
    /// * MSRs in `0x800..=0x8ff` (x2APIC) are always allowed.
    /// * Without an installed filter every MSR is allowed.
    /// * Otherwise the first range that covers `msr` and whose flags contain
    ///   `ftype` decides via its bitmap (a missing bit counts as "denied");
    ///   if no range matches, the filter's `default_allow` is returned.
    pub fn msr_allowed(&self, msr: u32, ftype: MsrFilterType) -> bool {
        // x2APIC MSRs
        if (0x800..=0x8ff).contains(&msr) {
            return true;
        }
        let filter = match &self.msr_fliter {
            Some(filter) => filter,
            None => return true,
        };
        filter
            .ranges
            .iter()
            // Never look past the ranges that actually exist, even if `count`
            // claims more (indexing would panic).
            .take(usize::from(filter.count))
            .find_map(|range| {
                // Compare offsets instead of computing `base + nmsrs`, which
                // can overflow `u32` for ranges near the top of the MSR space.
                let offset = msr.checked_sub(range.base)?;
                if offset < range.nmsrs && range.flags.contains(ftype) {
                    Some(range.bitmap.get(offset as usize).unwrap_or(false))
                } else {
                    None
                }
            })
            .unwrap_or(filter.default_allow)
    }
}
/// Interrupt-controller (irqchip) mode of a VM.
///
/// The default is [`KvmIrqChipMode::None`].
// NOTE(review): the variants presumably mirror Linux's KVM_IRQCHIP_NONE /
// KVM_IRQCHIP_KERNEL / KVM_IRQCHIP_SPLIT — confirm against the irqchip code.
#[derive(Debug, Default, Clone, Copy, PartialEq)]
#[allow(dead_code)]
pub enum KvmIrqChipMode {
    /// No irqchip mode selected (the default).
    #[default]
    None,
    Kernel,
    Split,
}
/// One-time initialisation hooks provided by a hardware virtualization backend.
// NOTE(review): the per-method notes below are inferred from names and
// signatures only — confirm against the implementor.
#[allow(dead_code)]
pub trait KvmInitFunc {
/// Performs backend hardware setup; returns an error on failure.
fn hardware_setup(&self) -> Result<(), SystemError>;
/// Presumably handles an Intel PT interrupt — TODO confirm the meaning of the returned value.
fn handle_intel_pt_intr(&self) -> u32;
/// Returns the backend's runtime operation table (`KvmFunc`).
fn runtime_funcs(&self) -> &'static dyn KvmFunc;
}
/// Runtime operations a hardware virtualization backend must provide to the
/// x86 KVM core (VM/vCPU lifecycle, register and MSR access, guest entry and
/// exit handling).
///
/// Implementors must be `Send + Sync + Debug`.
pub trait KvmFunc: Send + Sync + Debug {
/// Returns the name of this hardware backend, e.g. "Vmx"
fn name(&self) -> &'static str;
/// Enables hardware virtualization support
fn hardware_enable(&self) -> Result<(), SystemError>;
/// Produces the initial arch state for a new VM; used by `X86KvmArch::init`.
fn vm_init(&self) -> X86KvmArch;
/// Backend hook invoked by `Vm::vcpu_precreate` after the vCPU id was range-checked.
fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError>;
fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm);
fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId);
fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, vm: &Vm, root_hpa: u64, root_level: u32);
fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg);
fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu);
fn set_msr(&self, vcpu: &mut VirtCpu, msr: MsrData) -> Result<(), SystemError>;
fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: RFlags);
fn get_rflags(&self, vcpu: &mut VirtCpu) -> RFlags;
fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: Cr0);
fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool;
fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: Cr4);
fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64);
fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool;
fn set_efer(&self, vcpu: &mut VirtCpu, efer: EferFlags);
fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment);
fn get_segment(
&self,
vcpu: &mut VirtCpu,
var: UapiKvmSegment,
seg: VcpuSegment,
) -> UapiKvmSegment;
/// This function does not use the vCPU; it is taken here only to make sure the caller holds the lock
fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);
fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);
fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);
fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);
fn update_exception_bitmap(&self, vcpu: &mut VirtCpu);
fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool);
fn has_emulated_msr(&self, msr: u32) -> bool;
fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool;
fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu);
fn flush_tlb_all(&self, vcpu: &mut VirtCpu);
/// Runs the vCPU and returns the fast-path completion status of the resulting exit.
fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion;
fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu);
/// Handles a VM exit, given the fast-path status returned by `vcpu_run`.
// NOTE(review): the meaning of the `i32` in `Ok` is not visible here — confirm with the caller.
fn handle_exit(
&self,
vcpu: &mut VirtCpu,
vm: &Vm,
fastpath: ExitFastpathCompletion,
) -> Result<i32, SystemError>;
}
/// ## APICv inhibit reason bits
///
/// Each constant is a bit index into `X86KvmArch::apicv_inhibit_reasons`.
#[derive(Debug)]
pub struct KvmApicvInhibit;
#[allow(dead_code)]
impl KvmApicvInhibit {
// Shared by Intel and AMD
/// APIC acceleration is disabled by a module parameter, or the hardware does not support it
pub const DISABLE: usize = 0;
/// A Hyper-V guest is using the AutoEOI feature, so APIC acceleration is disabled.
pub const HYPERV: usize = 1;
/// APIC acceleration is disabled because userspace has not yet enabled an in-kernel or split interrupt controller.
pub const ABSENT: usize = 2;
/// KVM_GUESTDBG_BLOCKIRQ (a debugging measure that blocks all interrupts on this vCPU) is enabled; inhibited so that AVIC/APICv cannot bypass it.
pub const BLOCKIRQ: usize = 3;
/// APIC acceleration is disabled when the 1:1 mapping between APIC ID and vCPU is changed for all vCPUs and KVM has not applied its x2APIC hotplug fix-up.
pub const PHYSICAL_ID_ALIASED: usize = 4;
/// APIC acceleration is inhibited the first time a vCPU's APIC ID or APIC base is changed from its reset value.
pub const APIC_ID_MODIFIED: usize = 5;
/// APIC acceleration is inhibited the first time a vCPU's APIC ID or APIC base is changed from its reset value.
pub const APIC_BASE_MODIFIED: usize = 6;
// AMD only
/// AVIC is disabled while the vCPU runs a nested guest. Unlike APICv, while a vCPU runs nested its sibling vCPUs cannot use the doorbell mechanism to signal interrupts through AVIC.
pub const NESTED: usize = 7;
/// On SVM, waiting for the IRQ window is implemented with pending virtual interrupts, which cannot be injected while KVM waits for the IRQ window, so AVIC is disabled while waiting for the IRQ window.
pub const IRQWIN: usize = 8;
/// The PIT (i8254) "re-inject" mode relies on EOI interception, and AVIC does not support EOI interception for edge-triggered interrupts.
pub const PIT_REINJ: usize = 9;
/// SEV does not support AVIC, so AVIC is disabled.
pub const SEV: usize = 10;
/// AVIC is disabled when the 1:1 mapping between logical ID and vCPU is changed among all vCPUs that have a valid LDR.
pub const LOGICAL_ID_ALIASED: usize = 11;
}
/// Per-VM MSR filter consulted by `X86KvmArch::msr_allowed`.
#[derive(Debug)]
pub struct KvmX86MsrFilter {
// Number of entries of `ranges` that `msr_allowed` examines.
count: u8,
// Result returned by `msr_allowed` when no range matches the MSR.
default_allow: bool,
// Filter ranges, checked in order; the first matching range decides.
ranges: Vec<KernelMsrRange>,
}
/// One contiguous range of MSRs inside a `KvmX86MsrFilter`.
///
/// As used by `X86KvmArch::msr_allowed`, the range covers the MSR indices
/// `base .. base + nmsrs`, and bit `msr - base` of `bitmap` says whether that
/// MSR is allowed.
#[derive(Debug)]
pub struct KernelMsrRange {
/// Access kinds (read and/or write) this range applies to.
pub flags: MsrFilterType,
/// Number of MSRs covered by the range.
pub nmsrs: u32,
/// Index of the first MSR in the range.
pub base: u32,
/// Per-MSR allow bits, indexed by `msr - base`.
pub bitmap: AllocBitmap,
}
/// C-layout (`#[repr(C)]`) description of an MSR filter range.
// NOTE(review): presumably the userspace-facing counterpart of
// `KernelMsrRange`, with `bitmap` pointing at caller-provided memory — confirm
// with the ioctl handling code; nothing in this file dereferences it.
#[repr(C)]
#[allow(dead_code)]
pub struct PosixMsrFilterRange {
/// Raw flag bits (presumably the `MsrFilterType` bits — TODO confirm).
pub flags: u32,
/// Number of MSRs in the range.
pub nmsrs: u32,
/// Index of the first MSR in the range.
pub base: u32,
/// Raw pointer to the range's bitmap bytes.
pub bitmap: *const u8,
}
bitflags! {
/// Kind of MSR access an MSR filter range applies to; passed to
/// `X86KvmArch::msr_allowed` and stored in `KernelMsrRange::flags`.
pub struct MsrFilterType: u8 {
/// The filter applies to MSR reads.
const KVM_MSR_FILTER_READ = 1 << 0;
/// The filter applies to MSR writes.
const KVM_MSR_FILTER_WRITE = 1 << 1;
}
/// Flags stored in `X86KvmArch::notify_vmexit_flags`.
// NOTE(review): by name these configure the "notify VM exit" feature — confirm semantics with the VMX code.
pub struct NotifyVmExitFlags: u8 {
const KVM_X86_NOTIFY_VMEXIT_ENABLED = 1 << 0;
const KVM_X86_NOTIFY_VMEXIT_USER = 1 << 1;
}
}
impl Default for NotifyVmExitFlags {
fn default() -> Self {
NotifyVmExitFlags::empty()
}
}
/// Identifiers of the vCPU registers that can be cached (see `KvmFunc::cache_reg`).
///
/// Values 0..=16 name the general-purpose registers and RIP, `NrVcpuRegs`
/// (17) is the count of those, and the remaining variants continue the
/// numbering implicitly from 18.
#[derive(Debug, Clone, Copy)]
pub enum KvmReg {
VcpuRegsRax = 0,
VcpuRegsRcx = 1,
VcpuRegsRdx = 2,
VcpuRegsRbx = 3,
VcpuRegsRsp = 4,
VcpuRegsRbp = 5,
VcpuRegsRsi = 6,
VcpuRegsRdi = 7,
VcpuRegsR8 = 8,
VcpuRegsR9 = 9,
VcpuRegsR10 = 10,
VcpuRegsR11 = 11,
VcpuRegsR12 = 12,
VcpuRegsR13 = 13,
VcpuRegsR14 = 14,
VcpuRegsR15 = 15,
VcpuRegsRip = 16,
/// Number of regular registers above; also occupies discriminant 17.
NrVcpuRegs = 17,
//VcpuExregPdptr = NrVcpuRegs,
// The "extended" registers below get discriminants 18, 19, ... implicitly.
VcpuExregCr0,
VcpuExregCr3,
VcpuExregCr4,
VcpuExregRflags,
VcpuExregSegments,
VcpuExregExitInfo1, //EXITINFO1 provides the linear address of the memory operand.
VcpuExregExitInfo2, //EXITINFO2 provides the contents of the register operand.
}
bitflags! {
/// Per-vCPU "hidden" state flags.
// NOTE(review): SMM = System Management Mode by the usual x86 naming — confirm usage in the vCPU code.
pub struct HFlags: u8 {
const HF_GUEST_MASK = 1 << 0; /* VCPU is in guest-mode */
const HF_SMM_MASK = 1 << 1;
const HF_SMM_INSIDE_NMI_MASK = 1 << 2;
}
}
/// ### General-purpose registers of the virtual machine
///
/// `#[repr(C)]`, so the fields are laid out in exactly the declared order.
// NOTE(review): the field order matches Linux's `struct kvm_regs`; presumably
// this is exchanged with userspace — confirm before reordering anything.
#[derive(Debug, Default, Clone, Copy)]
#[repr(C)]
pub struct KvmCommonRegs {
rax: u64,
rbx: u64,
rcx: u64,
rdx: u64,
rsi: u64,
rdi: u64,
rsp: u64,
rbp: u64,
r8: u64,
r9: u64,
r10: u64,
r11: u64,
r12: u64,
r13: u64,
r14: u64,
r15: u64,
rip: u64,
rflags: u64,
}
impl Vm {
    /// Checks that a vCPU with the given `id` may be created on this VM.
    ///
    /// When no vCPU-id limit has been configured yet (`max_vcpu_ids == 0`) the
    /// limit is first initialised to 4096. The backend's `vcpu_precreate`
    /// hook is then invoked for ids below the limit.
    ///
    /// # Errors
    ///
    /// Returns `SystemError::EINVAL` if `id` is not below the limit, or
    /// whatever error the backend hook reports.
    pub fn vcpu_precreate(&mut self, id: usize) -> Result<(), SystemError> {
        let limit = {
            let arch = &mut self.arch;
            if arch.max_vcpu_ids == 0 {
                arch.max_vcpu_ids = 1024 * 4;
            }
            arch.max_vcpu_ids
        };
        if id < limit {
            x86_kvm_ops().vcpu_precreate(self)
        } else {
            Err(SystemError::EINVAL)
        }
    }
}
bitflags! {
/// Flags describing how/why an instruction emulation is requested.
// NOTE(review): names match Linux's EMULTYPE_* flags; the exact semantics
// are not visible in this file — confirm with the emulator code.
pub struct EmulType: u32 {
const NO_DECODE = 1 << 0;
const TRAP_UD = 1 << 1;
const SKIP = 1 << 2;
const ALLOW_RETRY_PF = 1 << 3;
const TRAP_UD_FORCED = 1 << 4;
const VMWARE_GP = 1 << 5;
const PF = 1 << 6;
const COMPLETE_USER_EXIT = 1 << 7;
const WRITE_PF_TO_SP = 1 << 8;
}
}
#[allow(dead_code)]
#[derive(Default, Debug)]
/// Tracks and records various statistics of a vCPU.
///
/// Every counter is a `u64` and starts at zero via `Default`.
pub struct KvmVcpuStat {
//pub generic: KvmVcpuStatGeneric,
// Page-fault related counters (`pf_*`).
pub pf_taken: u64,
pub pf_fixed: u64,
pub pf_emulate: u64,
pub pf_spurious: u64,
pub pf_fast: u64,
pub pf_mmio_spte_created: u64,
pub pf_guest: u64,
pub tlb_flush: u64,
pub invlpg: u64,
// VM-exit counters, by exit kind.
pub exits: u64,
pub io_exits: u64,
pub mmio_exits: u64,
pub signal_exits: u64,
pub irq_window_exits: u64,
pub nmi_window_exits: u64,
pub l1d_flush: u64,
pub halt_exits: u64,
pub request_irq_exits: u64,
pub irq_exits: u64,
pub host_state_reload: u64,
pub fpu_reload: u64,
pub insn_emulation: u64,
pub insn_emulation_fail: u64,
pub hypercalls: u64,
pub irq_injections: u64,
pub nmi_injections: u64,
pub req_event: u64,
pub nested_run: u64,
pub directed_yield_attempted: u64,
pub directed_yield_successful: u64,
pub preemption_reported: u64,
pub preemption_other: u64,
pub guest_mode: u64,
pub notify_window_exits: u64,
}
/// Converts a guest frame number (GFN) into a guest physical address (GPA).
///
/// The frame number is shifted left by 12 bits (i.e. multiplied by 4096);
/// bits shifted past bit 63 are discarded.
#[inline]
pub fn gfn_to_gpa(gfn: u64) -> u64 {
    // log2 of the 4 KiB frame size used for the conversion.
    const FRAME_SHIFT: u32 = 12;
    gfn << FRAME_SHIFT
}
/// Converts a guest physical address (GPA) into a guest frame number (GFN).
///
/// The address is shifted right by 12 bits, i.e. the offset within the
/// 4 KiB frame is dropped.
#[allow(dead_code)]
#[inline]
pub fn gpa_to_gfn(gpa: u64) -> u64 {
    gpa >> 12
}