diff --git a/ostd/libs/linux-bzimage/setup/src/x86/amd64_efi/efi.rs b/ostd/libs/linux-bzimage/setup/src/x86/amd64_efi/efi.rs
index 80a6795e9..c26d05ad5 100644
--- a/ostd/libs/linux-bzimage/setup/src/x86/amd64_efi/efi.rs
+++ b/ostd/libs/linux-bzimage/setup/src/x86/amd64_efi/efi.rs
@@ -135,6 +135,7 @@ fn efi_phase_runtime(memory_map: MemoryMapOwned, boot_params: &mut BootParams) -
             #[cfg(feature = "cvm_guest")]
             uefi::table::boot::MemoryType::UNACCEPTED => {
                 unsafe {
+                    crate::console::print_str("[EFI stub] Accepting pending pages...\n");
                     for page_idx in 0..md.page_count {
                         tdx_guest::tdcall::accept_page(0, md.phys_start + page_idx * PAGE_SIZE)
                             .unwrap();
diff --git a/ostd/src/arch/x86/boot/ap_boot.S b/ostd/src/arch/x86/boot/ap_boot.S
index 76ab6c4df..76d2671e9 100644
--- a/ostd/src/arch/x86/boot/ap_boot.S
+++ b/ostd/src/arch/x86/boot/ap_boot.S
@@ -14,13 +14,77 @@
 IA32_APIC_BASE = 0x1B
 IA32_X2APIC_APICID = 0x802
 MMIO_XAPIC_APICID = 0xFEE00020
 
-ap_real_mode_boot:
+start:
     cli // disable interrupts
     cld
     xor ax, ax // clear ax
     mov ds, ax // clear ds
 
+    // In an Intel Trust Domain, the APs woken up by the operating system start in long mode.
+    // We can detect this case from the value of the CS register.
+    // FIXME: This check does not affect booting via linux-efi-handover64,
+    // multiboot, or multiboot2 in non-TDX environments, but we cannot guarantee
+    // that it will not affect other boot methods added in the future.
+    mov ax, cs
+    cmp ax, 0x38
+    jne ap_real_mode_boot
+
+.code64
+ap_long_mode_tdx:
+    // The Local APIC ID information is stored in r8d by the Intel TDX Virtual Firmware.
+    mov edi, r8d
+
+    lgdt [boot_gdtr]
+
+    // Enable PAE and PGE.
+    mov rax, cr4
+    or rax, 0xa0
+    mov cr4, rax
+
+    // Set the page table. The application processors use
+    // the same page table as the bootstrap processor's
+    // boot phase page table.
+    mov rax, 0
+    mov rax, __boot_page_table_pointer
+    mov cr3, rax
+
+    push 0x8
+    mov rax, offset ap_long_mode_in_low_address
+    push rax
+    retfq
+
+ap_long_mode_in_low_address:
+    mov ax, 0
+    mov ds, ax
+    mov ss, ax
+    mov es, ax
+    mov fs, ax
+    mov gs, ax
+
+    // Update RIP to use the virtual address.
+    mov rax, offset ap_long_mode
+    jmp rax
+
+ap_long_mode:
+    // The local APIC ID is in the RDI.
+    mov rax, rdi
+    shl rax, 3
+
+    // Setup the stack.
+    mov rbx, [__ap_boot_stack_array_pointer]
+    mov rsp, [rbx + rax]
+    xor rbp, rbp
+
+    // Go to Rust code.
+    mov rax, offset ap_early_entry
+    call rax
+
+.extern halt # bsp_boot.S
+    jmp halt
+
+.code16
+ap_real_mode_boot:
     lgdt [ap_gdtr] // load gdt
 
     mov eax, cr0
@@ -86,9 +150,9 @@ x2apic_mode:
 
 // This is a pointer to the page table used by the APs.
 // The BSP will fill this pointer before kicking the APs.
 .global __boot_page_table_pointer
-.align 4
+.align 8
 __boot_page_table_pointer:
-    .skip 4
+    .skip 8
 
 ap_protect:
@@ -125,19 +189,6 @@ ap_protect:
 
     ljmp 0x8, offset ap_long_mode_in_low_address
 
-.code64
-ap_long_mode_in_low_address:
-    mov ax, 0
-    mov ds, ax
-    mov ss, ax
-    mov es, ax
-    mov fs, ax
-    mov gs, ax
-
-    // Update RIP to use the virtual address.
-    mov rax, offset ap_long_mode
-    jmp rax
-
 .data
 // This is a pointer to be filled by the BSP when boot stacks
 // of all APs are allocated and initialized.
@@ -145,22 +196,3 @@ ap_long_mode_in_low_address:
 .align 8
 __ap_boot_stack_array_pointer:
     .skip 8
-
-.text
-.code64
-ap_long_mode:
-    // The local APIC ID is in the RDI.
-    mov rax, rdi
-    shl rax, 3
-
-    // Setup the stack.
-    mov rbx, [__ap_boot_stack_array_pointer]
-    mov rsp, [rbx + rax]
-    xor rbp, rbp
-
-    // Go to Rust code.
-    mov rax, offset ap_early_entry
-    call rax
-
-.extern halt # bsp_boot.S
-    jmp halt
diff --git a/ostd/src/arch/x86/boot/smp.rs b/ostd/src/arch/x86/boot/smp.rs
index bb8bfc197..cf4e3b628 100644
--- a/ostd/src/arch/x86/boot/smp.rs
+++ b/ostd/src/arch/x86/boot/smp.rs
@@ -27,6 +27,8 @@
 //! This sequence does not need to be strictly followed, and there may be
 //! different considerations in different systems.
 
+use cfg_if::cfg_if;
+
 use crate::{
     arch::x86::kernel::{
         acpi::get_acpi_tables,
@@ -38,6 +40,14 @@ use crate::{
     mm::{paddr_to_vaddr, PAGE_SIZE},
 };
 
+cfg_if! {
+    if #[cfg(feature = "cvm_guest")] {
+        use tdx_guest::tdx_is_enabled;
+        use crate::arch::x86::kernel::acpi::AcpiMemoryHandler;
+        use acpi::platform::wakeup_aps;
+    }
+}
+
 /// Get the number of processors
 ///
 /// This function needs to be called after the OS initializes the ACPI table.
@@ -59,11 +69,30 @@ pub(crate) fn get_num_processors() -> Option<u32> {
 }
 
 /// Brings up all application processors.
-pub(crate) fn bringup_all_aps() {
+pub(crate) fn bringup_all_aps(num_cpus: u32) {
     copy_ap_boot_code();
     fill_boot_stack_array_ptr();
     fill_boot_pt_ptr();
-    send_boot_ipis();
+    cfg_if! {
+        if #[cfg(feature = "cvm_guest")] {
+            if tdx_is_enabled() {
+                for ap_num in 1..num_cpus {
+                    wakeup_aps(
+                        &ACPI_TABLES.get().unwrap().lock(),
+                        AcpiMemoryHandler {},
+                        ap_num,
+                        AP_BOOT_START_PA as u64,
+                        1000,
+                    )
+                    .unwrap();
+                }
+            } else {
+                send_boot_ipis();
+            }
+        } else {
+            send_boot_ipis();
+        }
+    }
 }
 
 /// This is where the linker load the symbols in the `.ap_boot` section.
diff --git a/ostd/src/boot/smp.rs b/ostd/src/boot/smp.rs
index 3a22cb256..7924c0e24 100644
--- a/ostd/src/boot/smp.rs
+++ b/ostd/src/boot/smp.rs
@@ -7,6 +7,8 @@ use core::sync::atomic::{AtomicBool, Ordering};
 
 use spin::Once;
 
+#[cfg(feature = "cvm_guest")]
+use crate::mm::frame::allocator;
 use crate::{
     arch::boot::smp::{bringup_all_aps, get_num_processors},
     cpu,
@@ -98,9 +100,12 @@ pub fn boot_all_aps() {
 
     log::info!("Booting all application processors...");
 
-    bringup_all_aps();
+    bringup_all_aps(num_cpus);
     wait_for_all_aps_started();
 
+    #[cfg(feature = "cvm_guest")]
+    allocator::reclaim_tdx_ap_boot_memory();
+
     log::info!("All application processors started. The BSP continues to run.");
 }
 
diff --git a/ostd/src/mm/frame/allocator.rs b/ostd/src/mm/frame/allocator.rs
index 006e24b49..25a63e9ca 100644
--- a/ostd/src/mm/frame/allocator.rs
+++ b/ostd/src/mm/frame/allocator.rs
@@ -356,3 +356,38 @@ pub(crate) unsafe fn init_early_allocator() {
     let mut early_allocator = EARLY_ALLOCATOR.lock();
     *early_allocator = Some(EarlyFrameAllocator::new());
 }
+
+#[cfg(feature = "cvm_guest")]
+pub(crate) fn reclaim_tdx_ap_boot_memory() {
+    let regions = &crate::boot::EARLY_INFO.get().unwrap().memory_regions;
+    for region in regions.iter() {
+        if region.typ() == MemoryRegionType::Usable {
+            // Make the memory region page-aligned, and skip it if it is too small.
+            let start = region.base().align_up(PAGE_SIZE) / PAGE_SIZE;
+            let region_end = region.base().checked_add(region.len()).unwrap();
+            let end = region_end.align_down(PAGE_SIZE) / PAGE_SIZE;
+            if end <= start {
+                continue;
+            }
+            // 0x800000 is temporarily used for AP boot in the Intel TDX environment.
+            // We should include this frame in the page allocator after AP initialization.
+            if (start..end).contains(&(0x800000 / PAGE_SIZE)) {
+                info!(
+                    "Found usable region, start:{:x}, end:{:x}",
+                    region.base(),
+                    region.base() + region.len()
+                );
+                FRAME_ALLOCATOR
+                    .get()
+                    .unwrap()
+                    .disable_irq()
+                    .lock()
+                    .allocator
+                    .add_frame(start, end);
+
+                FRAME_ALLOCATOR.get().unwrap().disable_irq().lock().total +=
+                    (end - start) * PAGE_SIZE;
+            }
+        }
+    }
+}
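
Note on the `cmp ax, 0x38` check added to ap_boot.S: an x86 segment selector encodes a
descriptor-table index in bits 15:3, a table indicator in bit 2, and an RPL in bits 1:0,
so 0x38 selects GDT entry 7 with RPL 0, a 64-bit code segment presumably installed by the
TDX Virtual Firmware before it hands the AP to the OS. An AP started by INIT-SIPI-SIPI
instead enters real mode with a CS selector derived from the SIPI vector (a multiple of
0x100), so the comparison cleanly separates the two entry paths. Below is a minimal,
standalone sketch of the selector decoding; it is not OSTD code, and only the
architectural selector layout is assumed.

/// Decode an x86 segment selector into (descriptor index, table indicator, RPL).
fn decode_selector(sel: u16) -> (u16, bool, u8) {
    let index = sel >> 3;            // bits 15:3: index into the GDT/LDT
    let uses_ldt = sel & 0b100 != 0; // bit 2: 0 = GDT, 1 = LDT
    let rpl = (sel & 0b11) as u8;    // bits 1:0: requested privilege level
    (index, uses_ldt, rpl)
}

fn main() {
    // The value compared against CS in the TDX path of ap_boot.S.
    let (index, uses_ldt, rpl) = decode_selector(0x38);
    assert_eq!((index, uses_ldt, rpl), (7, false, 0));
    println!("selector 0x38 -> GDT index {index}, LDT? {uses_ldt}, RPL {rpl}");
}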
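
The frame arithmetic in `reclaim_tdx_ap_boot_memory` rounds a usable region's base up and
its end down to page boundaries, skips regions that vanish after rounding, and then checks
whether the frame at physical address 0x800000 (temporarily reserved for AP boot under TDX)
lies inside the region; if so, the whole region is added back to `FRAME_ALLOCATOR`. The
following is a self-contained sketch of that arithmetic only, assuming 4 KiB pages and using
local `align_up`/`align_down` helpers in place of the ostd implementations; the region values
are made up for illustration.

const PAGE_SIZE: usize = 4096;

fn align_up(x: usize, align: usize) -> usize {
    x.div_ceil(align) * align
}

fn align_down(x: usize, align: usize) -> usize {
    x / align * align
}

fn main() {
    // A hypothetical usable region reported by the bootloader: (base, length).
    let (base, len) = (0x7ff800usize, 0x200000usize);

    // Frame-number bounds, mirroring the logic in reclaim_tdx_ap_boot_memory.
    let start = align_up(base, PAGE_SIZE) / PAGE_SIZE;
    let end = align_down(base + len, PAGE_SIZE) / PAGE_SIZE;
    assert!(end > start, "region too small after page alignment");

    // The frame that holds the AP boot code under TDX.
    let ap_boot_frame = 0x800000 / PAGE_SIZE;
    assert!((start..end).contains(&ap_boot_frame));
    println!("frames {start:#x}..{end:#x} cover AP boot frame {ap_boot_frame:#x}");
}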