Mirror of https://github.com/asterinas/asterinas.git (synced 2025-06-22 00:43:24 +00:00)

commit 3ae884081f (parent 00b9bf9610)
Fix both the preempt count atomicity and the CPU-local init problem

Committed by: Tate, Hongliang Tian
@@ -58,6 +58,16 @@ SECTIONS
     # areas for the application processors.
     .cpu_local : AT(ADDR(.cpu_local) - KERNEL_VMA) {
         __cpu_local_start = .;
+
+        # These 4 bytes are used to store the CPU ID.
+        . += 4;
+
+        # These 4 bytes are used to store the number of preemption locks held.
+        # The reason is stated in the Rust documentation of
+        # [`ostd::task::processor::PreemptInfo`].
+        __cpu_local_preempt_lock_count = . - __cpu_local_start;
+        . += 4;
+
         KEEP(*(SORT(.cpu_local)))
         __cpu_local_end = .;
     }
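Note: the layout above fixes the CPU ID at offset 0 of the CPU-local area and the preempt lock count at offset 4. As a hedged sketch (not part of this commit; `current_cpu_id` is a hypothetical helper, and it assumes initialization code has already stored the ID into the reserved slot), the CPU ID slot could be read with the same single-instruction, FS-relative technique that `preempt_lock_count` uses below:

    pub(crate) fn current_cpu_id() -> u32 {
        let id: u32;
        // SAFETY: Reads the first 4 bytes of the CPU-local area (the CPU ID
        // slot reserved by the linker script) in one instruction.
        unsafe {
            core::arch::asm!(
                "mov {0:e}, fs:[0]",
                out(reg) id,
                options(nostack, readonly),
            );
        }
        id
    }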
ostd/src/arch/x86/cpu/local.rs (new file, 87 lines)
@@ -0,0 +1,87 @@
// SPDX-License-Identifier: MPL-2.0

//! Architecture dependent CPU-local information utilities.

use x86_64::registers::segmentation::{Segment64, FS};

/// Sets the base address for the CPU local storage by writing to the FS base model-specific register.
/// This operation is marked as `unsafe` because it directly interfaces with low-level CPU registers.
///
/// # Safety
///
/// - This function is safe to call provided that the FS register is dedicated entirely for CPU local storage
///   and is not concurrently accessed for other purposes.
/// - The caller must ensure that `addr` is a valid address and properly aligned, as required by the CPU.
/// - This function should only be called in contexts where the CPU is in a state to accept such changes,
///   such as during processor initialization.
pub(crate) unsafe fn set_base(addr: u64) {
    FS::write_base(x86_64::addr::VirtAddr::new(addr));
}

/// Gets the base address for the CPU local storage by reading the FS base model-specific register.
pub(crate) fn get_base() -> u64 {
    FS::read_base().as_u64()
}

pub mod preempt_lock_count {
    //! We need to increment/decrement the per-CPU preemption lock count using
    //! a single instruction. This requirement is stated by
    //! [`crate::task::processor::PreemptInfo`].

    /// The GDT ensures that the FS segment is initialized to zero on boot.
    /// This assertion checks that the base address has been set.
    macro_rules! debug_assert_initialized {
        () => {
            // The compiler may think that [`super::get_base`] has side effects
            // so it may not be optimized out. We make sure that it will be
            // conditionally compiled only in debug builds.
            #[cfg(debug_assertions)]
            debug_assert_ne!(super::get_base(), 0);
        };
    }

    /// Increments the per-CPU preemption lock count using one instruction.
    pub(crate) fn inc() {
        debug_assert_initialized!();

        // SAFETY: The inline assembly increments the lock count in one
        // instruction without side effects.
        unsafe {
            core::arch::asm!(
                "add dword ptr fs:[__cpu_local_preempt_lock_count], 1",
                options(nostack),
            );
        }
    }

    /// Decrements the per-CPU preemption lock count using one instruction.
    pub(crate) fn dec() {
        debug_assert_initialized!();

        // SAFETY: The inline assembly decrements the lock count in one
        // instruction without side effects.
        unsafe {
            core::arch::asm!(
                "sub dword ptr fs:[__cpu_local_preempt_lock_count], 1",
                options(nostack),
            );
        }
    }

    /// Gets the per-CPU preemption lock count using one instruction.
    pub(crate) fn get() -> u32 {
        debug_assert_initialized!();

        let count: u32;
        // SAFETY: The inline assembly reads the lock count in one instruction
        // without side effects.
        unsafe {
            core::arch::asm!(
                "mov {0:e}, fs:[__cpu_local_preempt_lock_count]",
                out(reg) count,
                options(nostack, readonly),
            );
        }
        count
    }
}
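Note: a minimal usage sketch (not from this commit; `PreemptGuard` is hypothetical, but it mirrors what the guard behind [`crate::task::disable_preempt`] is expected to do with these primitives):

    struct PreemptGuard;

    impl PreemptGuard {
        fn new() -> Self {
            // A single `add` instruction: there is no window in which an
            // interrupt could migrate the task between the address
            // calculation and the write.
            preempt_lock_count::inc();
            Self
        }
    }

    impl Drop for PreemptGuard {
        fn drop(&mut self) {
            preempt_lock_count::dec();
        }
    }

While any such guard is alive the count is non-zero, so the scheduler refuses to preempt the current task.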
@@ -2,6 +2,8 @@
 
 //! CPU.
 
+pub mod local;
+
 use alloc::vec::Vec;
 use core::{
     arch::x86_64::{_fxrstor, _fxsave},
@@ -18,10 +20,7 @@ use log::debug;
 use tdx_guest::tdcall;
 pub use trapframe::GeneralRegs as RawGeneralRegs;
 use trapframe::UserContext as RawUserContext;
-use x86_64::registers::{
-    rflags::RFlags,
-    segmentation::{Segment64, FS},
-};
+use x86_64::registers::rflags::RFlags;
 
 #[cfg(feature = "intel_tdx")]
 use crate::arch::tdx_guest::{handle_virtual_exception, TdxTrapFrame};
@@ -673,22 +672,3 @@ impl Default for FpRegs {
 struct FxsaveArea {
     data: [u8; 512], // 512 bytes
 }
-
-/// Sets the base address for the CPU local storage by writing to the FS base model-specific register.
-/// This operation is marked as `unsafe` because it directly interfaces with low-level CPU registers.
-///
-/// # Safety
-///
-/// - This function is safe to call provided that the FS register is dedicated entirely for CPU local storage
-///   and is not concurrently accessed for other purposes.
-/// - The caller must ensure that `addr` is a valid address and properly aligned, as required by the CPU.
-/// - This function should only be called in contexts where the CPU is in a state to accept such changes,
-///   such as during processor initialization.
-pub(crate) unsafe fn set_cpu_local_base(addr: u64) {
-    FS::write_base(x86_64::addr::VirtAddr::new(addr));
-}
-
-/// Gets the base address for the CPU local storage by reading the FS base model-specific register.
-pub(crate) fn get_cpu_local_base() -> u64 {
-    FS::read_base().as_u64()
-}
@@ -21,7 +21,7 @@
 use core::ops::Deref;
 
 use crate::{
-    cpu::{get_cpu_local_base, set_cpu_local_base},
+    arch,
     trap::{disable_local, DisabledLocalIrqGuard},
 };
 
@@ -82,6 +82,13 @@ impl<T> !Clone for CpuLocal<T> {}
 // other tasks as they should live on other CPUs to make sending useful.
 impl<T> !Send for CpuLocal<T> {}
 
+// A check to ensure that the CPU-local object is never accessed before the
+// initialization for all CPUs.
+#[cfg(debug_assertions)]
+use core::sync::atomic::{AtomicBool, Ordering};
+#[cfg(debug_assertions)]
+static IS_INITIALIZED: AtomicBool = AtomicBool::new(false);
+
 impl<T> CpuLocal<T> {
     /// Initialize a CPU-local object.
     ///
@@ -115,6 +122,11 @@ impl<T> CpuLocal<T> {
     /// This function calculates the virtual address of the CPU-local object based on the
     /// per-CPU base address and the offset in the BSP.
     fn get(&self) -> *const T {
+        // CPU-local objects should be initialized before being accessed. It should be
+        // ensured by the implementation of OSTD initialization.
+        #[cfg(debug_assertions)]
+        debug_assert!(IS_INITIALIZED.load(Ordering::Relaxed));
+
         let offset = {
             let bsp_va = self as *const _ as usize;
             let bsp_base = __cpu_local_start as usize;
@@ -124,7 +136,7 @@ impl<T> CpuLocal<T> {
             bsp_va - bsp_base as usize
         };
 
-        let local_base = get_cpu_local_base() as usize;
+        let local_base = arch::cpu::local::get_base() as usize;
         let local_va = local_base + offset;
 
         // A sanity check about the alignment.
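Note: a worked example of the address arithmetic in `get` (all addresses are hypothetical, chosen only to illustrate the rebasing):

    fn local_va(bsp_va: usize, bsp_base: usize, local_base: usize) -> usize {
        // The object's offset inside the BSP's `.cpu_local` image is preserved
        // in every CPU's copy, so rebasing is a single addition.
        local_base + (bsp_va - bsp_base)
    }

    fn demo() {
        // An object 0x40 bytes into the BSP image is found 0x40 bytes past
        // the FS base of whichever CPU is asking.
        assert_eq!(
            local_va(0xffff_a000_0000_0040, 0xffff_a000_0000_0000, 0xffff_a000_0001_0000),
            0xffff_a000_0001_0040,
        );
    }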
@@ -170,6 +182,24 @@ impl<T> Deref for CpuLocalDerefGuard<'_, T> {
     }
 }
 
+/// Sets the base address of the CPU-local storage for the bootstrap processor.
+///
+/// It should be called early to let [`crate::task::disable_preempt`] work,
+/// which needs to update a CPU-local preempt lock count. Otherwise it may
+/// panic when calling [`crate::task::disable_preempt`].
+///
+/// # Safety
+///
+/// It should be called only once and only on the BSP.
+pub(crate) unsafe fn early_init_bsp_local_base() {
+    let start_base_va = __cpu_local_start as usize as u64;
+    // SAFETY: The base to be set is the start of the `.cpu_local` section,
+    // where accessing the CPU-local objects has defined behavior.
+    unsafe {
+        arch::cpu::local::set_base(start_base_va);
+    }
+}
+
 /// Initializes the CPU local data for the bootstrap processor (BSP).
 ///
 /// # Safety
@@ -179,9 +209,14 @@ impl<T> Deref for CpuLocalDerefGuard<'_, T> {
 /// It must be guaranteed that the BSP will not access local data before
 /// this function is called, otherwise copying non-constant values
 /// will result in pretty bad undefined behavior.
-pub unsafe fn init_on_bsp() {
-    let start_base_va = __cpu_local_start as usize as u64;
-    set_cpu_local_base(start_base_va);
+pub(crate) unsafe fn init_on_bsp() {
+    // TODO: allocate the pages for application processors and copy the
+    // CPU-local objects to the allocated pages.
+
+    #[cfg(debug_assertions)]
+    {
+        IS_INITIALIZED.store(true, Ordering::Relaxed);
+    }
 }
 
 // These symbols are provided by the linker script.
@@ -59,6 +59,9 @@ pub use self::{cpu::cpu_local::CpuLocal, error::Error, prelude::Result};
 pub fn init() {
     arch::before_all_init();
 
+    // SAFETY: This function is called only once and only on the BSP.
+    unsafe { cpu::cpu_local::early_init_bsp_local_base() };
+
     mm::heap_allocator::init();
 
     boot::init();
@@ -1,17 +1,14 @@
 // SPDX-License-Identifier: MPL-2.0
 
 use alloc::sync::Arc;
-use core::{
-    cell::RefCell,
-    sync::atomic::{AtomicUsize, Ordering::Relaxed},
-};
+use core::cell::RefCell;
 
 use super::{
     scheduler::{fetch_task, GLOBAL_SCHEDULER},
     task::{context_switch, TaskContext},
     Task, TaskStatus,
 };
-use crate::cpu_local;
+use crate::{arch, cpu_local};
 
 pub struct Processor {
     current: Option<Arc<Task>>,
@@ -154,38 +151,50 @@ fn switch_to_task(next_task: Arc<Task>) {
     // to the next task switching.
 }
 
 cpu_local! {
     static PREEMPT_COUNT: PreemptInfo = PreemptInfo::new();
 }
 
-/// Currently, ``PreemptInfo`` only holds the number of spin
-/// locks held by the current CPU. When it has a non-zero value,
-/// the CPU cannot call ``schedule()``.
-struct PreemptInfo {
-    num_locks: AtomicUsize,
-}
+/// Currently, it only holds the number of preemption locks held by the
+/// current CPU. When it has a non-zero value, the CPU cannot call
+/// [`schedule()`].
+///
+/// For the per-CPU preemption lock count, we cannot afford two non-atomic
+/// operations to increment and decrement the count. The [`crate::cpu_local`]
+/// implementation is free to read the base register and then calculate the
+/// address of the per-CPU variable using an additional instruction. Interrupts
+/// can happen between the address calculation and the modification to that
+/// address. If the task is preempted to another CPU by this interrupt, the
+/// count of the original CPU will be mistakenly modified. To avoid this, we
+/// introduce [`crate::arch::cpu::local::preempt_lock_count`]. For x86_64 we
+/// can implement this using one instruction. On other, less expressive
+/// architectures, we may need to disable interrupts.
+///
+/// Also, the preemption count is reserved in the `.cpu_local` section
+/// specified in the linker script. The reason is that we need to access the
+/// preemption count before we can copy the section for the application
+/// processors. So the preemption count is not copied from the bootstrap
+/// processor's section during initialization. Instead, it is initialized to
+/// zero for the application processors.
+struct PreemptInfo {}
 
 impl PreemptInfo {
     const fn new() -> Self {
-        Self {
-            num_locks: AtomicUsize::new(0),
-        }
+        Self {}
     }
 
     fn increase_num_locks(&self) {
-        self.num_locks.fetch_add(1, Relaxed);
+        arch::cpu::local::preempt_lock_count::inc();
     }
 
     fn decrease_num_locks(&self) {
-        self.num_locks.fetch_sub(1, Relaxed);
+        arch::cpu::local::preempt_lock_count::dec();
     }
 
     fn is_preemptive(&self) -> bool {
-        self.num_locks.load(Relaxed) == 0
+        arch::cpu::local::preempt_lock_count::get() == 0
     }
 
     fn num_locks(&self) -> usize {
-        self.num_locks.load(Relaxed)
+        arch::cpu::local::preempt_lock_count::get() as usize
     }
 }
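Note: the atomicity argument in the `PreemptInfo` documentation can be made concrete with an illustrative instruction sequence (not generated by any real compiler; offsets and register choices are hypothetical):

    ; A generic two-instruction per-CPU update leaves a window:
    mov  rax, <per-CPU base>              ; 1. compute the variable's address
    add  dword ptr [rax + OFFSET], 1      ; 2. write through that address
    ; An interrupt arriving between 1 and 2 may migrate the task to another
    ; CPU, after which step 2 corrupts the count of the CPU it left behind.

    ; The FS-relative form used by `preempt_lock_count::inc` has no window:
    add  dword ptr fs:[__cpu_local_preempt_lock_count], 1

This is why the count lives at a fixed, linker-assigned offset and is updated in one instruction, rather than going through the generic `cpu_local!` address calculation.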