Strict TLB coherence

Zhang Junyang
2025-03-21 17:22:05 +08:00
committed by Tate, Hongliang Tian
parent 30ec0be210
commit 4f0acddfd4
8 changed files with 109 additions and 19 deletions

View File

@@ -457,6 +457,7 @@ impl Vmar_ {
}
cur_cursor.flusher().issue_tlb_flush(TlbFlushOp::All);
cur_cursor.flusher().dispatch_tlb_flush();
cur_cursor.flusher().sync_tlb_flush();
}
Ok(new_vmar_)

View File

@@ -188,6 +188,7 @@ impl VmMapping {
prop.flags |= new_flags;
cursor.map(new_frame.into(), prop);
}
cursor.flusher().sync_tlb_flush();
}
VmItem::NotMapped { .. } => {
// Map a new frame to the page fault address.
@@ -385,6 +386,8 @@ impl VmMapping {
let range = self.range();
let mut cursor = vm_space.cursor_mut(&range)?;
cursor.unmap(range.len());
cursor.flusher().dispatch_tlb_flush();
cursor.flusher().sync_tlb_flush();
Ok(())
}
@@ -404,6 +407,7 @@ impl VmMapping {
}
}
cursor.flusher().dispatch_tlb_flush();
cursor.flusher().sync_tlb_flush();
Self { perms, ..self }
}

View File

@@ -136,6 +136,8 @@ fn ap_early_entry(local_apic_id: u32) -> ! {
crate::arch::irq::enable_local();
crate::mm::tlb::register_timer_callbacks_this_cpu();
// SAFETY: this function is only called once on this AP.
unsafe {
crate::mm::kspace::activate_kernel_page_table();

View File

@@ -106,14 +106,25 @@ pub fn all_cpus() -> impl Iterator<Item = CpuId> {
/// The implementor must ensure that the current task is pinned to the current
/// CPU while any one of the instances of the implemented structure exists.
pub unsafe trait PinCurrentCpu {
/// Returns the number of the current CPU.
/// Returns the ID of the current CPU.
fn current_cpu(&self) -> CpuId {
let id = CURRENT_CPU.load();
debug_assert_ne!(id, u32::MAX, "This CPU is not initialized");
CpuId(id)
current_cpu_racy()
}
}
/// Returns the ID of the current CPU.
///
/// This function is safe to call, but is vulnerable to races. The returned CPU
/// ID may be outdated if the task migrates to another CPU.
///
/// To ensure that the CPU ID stays up-to-date, read it while holding a guard
/// that implements the [`PinCurrentCpu`] trait.
pub fn current_cpu_racy() -> CpuId {
let id = CURRENT_CPU.load();
debug_assert_ne!(id, u32::MAX, "This CPU is not initialized");
CpuId(id)
}
// SAFETY: When IRQs are disabled, the task cannot be passively preempted and
// migrate to another CPU. If the task actively calls `yield`, the yield will
// not succeed either.

View File

@@ -106,6 +106,7 @@ unsafe fn init() {
boot::init_after_heap();
mm::dma::init();
mm::tlb::register_timer_callbacks_this_cpu();
unsafe { arch::late_init_on_bsp() };

View File

@@ -215,7 +215,7 @@ impl KVirtArea<Tracked> {
assert!(self.start() <= range.start && self.end() >= range.end);
let page_table = KERNEL_PAGE_TABLE.get().unwrap();
let mut cursor = page_table.cursor_mut(&range).unwrap();
let flusher = TlbFlusher::new(CpuSet::new_full(), disable_preempt());
let mut flusher = TlbFlusher::new(CpuSet::new_full(), disable_preempt());
let mut va = self.start();
for page in pages.into_iter() {
// SAFETY: The constructor of the `KVirtArea<Tracked>` structure has already ensured this
@@ -223,11 +223,11 @@ impl KVirtArea<Tracked> {
if let Some(old) = unsafe { cursor.map(page.into(), prop) } {
flusher.issue_tlb_flush_with(TlbFlushOp::Address(va), old);
flusher.dispatch_tlb_flush();
// FIXME: We should synchronize the TLB here. But IRQs may be
// disabled at this point, making deadlocks very likely.
}
va += PAGE_SIZE;
}
flusher.issue_tlb_flush(TlbFlushOp::Range(range));
flusher.dispatch_tlb_flush();
}
/// Gets the mapped tracked page.
@@ -278,13 +278,15 @@ impl KVirtArea<Untracked> {
let page_table = KERNEL_PAGE_TABLE.get().unwrap();
let mut cursor = page_table.cursor_mut(&va_range).unwrap();
let flusher = TlbFlusher::new(CpuSet::new_full(), disable_preempt());
let mut flusher = TlbFlusher::new(CpuSet::new_full(), disable_preempt());
// SAFETY: The caller of `map_untracked_pages` has ensured the safety of this mapping.
unsafe {
cursor.map_pa(&pa_range, prop);
}
flusher.issue_tlb_flush(TlbFlushOp::Range(va_range.clone()));
flusher.dispatch_tlb_flush();
// FIXME: We should synchronize the TLB here. But IRQs may be
// disabled at this point, making deadlocks very likely.
}
/// Gets the mapped untracked page.
@@ -318,7 +320,7 @@ impl<M: AllocatorSelector + 'static> Drop for KVirtArea<M> {
let page_table = KERNEL_PAGE_TABLE.get().unwrap();
let range = self.start()..self.end();
let mut cursor = page_table.cursor_mut(&range).unwrap();
let flusher = TlbFlusher::new(CpuSet::new_full(), disable_preempt());
let mut flusher = TlbFlusher::new(CpuSet::new_full(), disable_preempt());
let tlb_prefer_flush_all = self.end() - self.start() > FLUSH_ALL_RANGE_THRESHOLD;
loop {
@@ -361,6 +363,8 @@ impl<M: AllocatorSelector + 'static> Drop for KVirtArea<M> {
}
flusher.dispatch_tlb_flush();
// FIXME: We should synchronize the TLB here. But IRQs may be
// disabled at this point, making deadlocks very likely.
// 2. free the virtual block
let allocator = M::select_allocator();

View File

@@ -3,17 +3,20 @@
//! TLB flush operations.
use alloc::vec::Vec;
use core::ops::Range;
use core::{
ops::Range,
sync::atomic::{AtomicBool, Ordering},
};
use super::{
frame::{meta::AnyFrameMeta, Frame},
Vaddr, PAGE_SIZE,
};
use crate::{
arch::irq,
cpu::{CpuSet, PinCurrentCpu},
cpu_local,
sync::{LocalIrqDisabled, SpinLock},
task::disable_preempt,
};
/// A TLB flusher that is aware of which CPUs are needed to be flushed.
@@ -25,6 +28,7 @@ pub struct TlbFlusher<G: PinCurrentCpu> {
// list brings non-trivial overhead.
need_remote_flush: bool,
need_self_flush: bool,
have_unsynced_flush: bool,
_pin_current: G,
}
@@ -50,27 +54,74 @@ impl<G: PinCurrentCpu> TlbFlusher<G> {
target_cpus,
need_remote_flush,
need_self_flush,
have_unsynced_flush: false,
_pin_current: pin_current_guard,
}
}
/// Issues a pending TLB flush request.
///
/// On SMP systems, the notification is sent to all the relevant CPUs only
/// when [`Self::dispatch_tlb_flush`] is called.
/// This function does not guarantee that the TLB entries on either this CPU
/// or remote CPUs are flushed. The flush requests are only performed when
/// [`Self::dispatch_tlb_flush`] is called.
pub fn issue_tlb_flush(&self, op: TlbFlushOp) {
self.issue_tlb_flush_(op, None);
}
/// Dispatches all the pending TLB flush requests.
///
/// The pending requests are issued by [`Self::issue_tlb_flush`].
pub fn dispatch_tlb_flush(&self) {
/// All pending requests previously issued by [`Self::issue_tlb_flush`]
/// start to be processed after this function is called. However, the
/// processing may not be synchronous: when this function returns, the TLB
/// entries may not yet be coherent.
pub fn dispatch_tlb_flush(&mut self) {
if !self.need_remote_flush {
return;
}
for cpu in self.target_cpus.iter() {
ACK_REMOTE_FLUSH
.get_on_cpu(cpu)
.store(false, Ordering::Relaxed);
}
crate::smp::inter_processor_call(&self.target_cpus, do_remote_flush);
self.have_unsynced_flush = true;
}
/// Waits for all the previously dispatched TLB flush requests to be completed.
///
/// After this function returns, all TLB entries corresponding to previously
/// dispatched TLB flush requests are guaranteed to be coherent.
///
/// The TLB flush requests are issued with [`Self::issue_tlb_flush`] and
/// dispatched with [`Self::dispatch_tlb_flush`]. This method does not
/// dispatch any issued requests, so it cannot guarantee TLB coherence
/// for requests that have not been dispatched.
///
/// # Panics
///
/// This method panics if IRQs are disabled. Since remote flushes are
/// processed in IRQ handlers, two CPUs waiting for each other's TLB
/// coherence with IRQs disabled would deadlock.
pub fn sync_tlb_flush(&mut self) {
if !self.have_unsynced_flush {
return;
}
assert!(
irq::is_local_enabled(),
"Waiting for remote flush with IRQs disabled"
);
for cpu in self.target_cpus.iter() {
while !ACK_REMOTE_FLUSH.get_on_cpu(cpu).load(Ordering::Acquire) {
core::hint::spin_loop();
}
}
self.have_unsynced_flush = false;
}
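Taken together, the flusher is now driven in three steps: issue, dispatch, and sync. A minimal sketch of a caller, modeled on the `VmMapping`/`Vmar_` changes above (the function name and error plumbing are illustrative, not part of this commit; `unmap` is expected to queue the per-page flush requests itself, which is why the `VmMapping::unmap` change only dispatches and syncs):

fn unmap_and_wait(vm_space: &VmSpace, range: Range<Vaddr>) -> Result<()> {
    let mut cursor = vm_space.cursor_mut(&range)?;
    // Unmap the range; the cursor records the needed flush requests on its
    // flusher. A caller may also issue its own request, e.g. `TlbFlushOp::All`.
    cursor.unmap(range.len());
    // Send IPIs to the CPUs that may still cache the stale entries.
    cursor.flusher().dispatch_tlb_flush();
    // Wait until every target CPU acknowledges. IRQs must be enabled here,
    // otherwise `sync_tlb_flush` panics to avoid cross-CPU deadlocks.
    cursor.flusher().sync_tlb_flush();
    Ok(())
}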
/// Issues a TLB flush request that must happen before dropping the page.
@@ -154,21 +205,37 @@ impl TlbFlushOp {
}
}
/// Registers the timer callbacks for the TLB flush operations.
///
/// The timer callback checks whether there are pending TLB flush requests on
/// the current CPU. This ensures that the requests are eventually processed
/// even if the IPIs are not received.
///
/// This function should be called once on each CPU during initialization.
pub(crate) fn register_timer_callbacks_this_cpu() {
crate::timer::register_callback(do_remote_flush);
}
// The queues of pending requests on each CPU.
//
// Lock ordering: lock FLUSH_OPS before PAGE_KEEPER.
cpu_local! {
static FLUSH_OPS: SpinLock<OpsStack, LocalIrqDisabled> = SpinLock::new(OpsStack::new());
static PAGE_KEEPER: SpinLock<Vec<Frame<dyn AnyFrameMeta>>, LocalIrqDisabled> = SpinLock::new(Vec::new());
/// Whether this CPU has finished the last remote flush request.
static ACK_REMOTE_FLUSH: AtomicBool = AtomicBool::new(true);
}
fn do_remote_flush() {
let preempt_guard = disable_preempt();
let current_cpu = preempt_guard.current_cpu();
let current_cpu = crate::cpu::current_cpu_racy(); // Not racy here: we are in IRQ context, so the task cannot migrate.
let mut op_queue = FLUSH_OPS.get_on_cpu(current_cpu).lock();
op_queue.flush_all();
PAGE_KEEPER.get_on_cpu(current_cpu).lock().clear();
ACK_REMOTE_FLUSH
.get_on_cpu(current_cpu)
.store(true, Ordering::Release);
}
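The ack flag is the only synchronization between `sync_tlb_flush` and this IRQ-context responder: the `Release` store above pairs with the `Acquire` load in the spin loop, so the flushes and frame releases performed before the store are visible to the waiter once it observes `true`. The same handshake in miniature, as a standalone (std, non-kernel) illustration:

use std::{
    hint,
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
    thread,
};

fn main() {
    let ack = Arc::new(AtomicBool::new(false));
    let responder_ack = Arc::clone(&ack);

    let responder = thread::spawn(move || {
        // ... the "remote flush" work would happen here ...
        responder_ack.store(true, Ordering::Release); // publish completion
    });

    // The waiter side (cf. `sync_tlb_flush`): spin until acknowledged.
    while !ack.load(Ordering::Acquire) {
        hint::spin_loop();
    }

    responder.join().unwrap();
}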
/// If a TLB flushing request exceeds this threshold, we flush all.

View File

@@ -318,8 +318,8 @@ impl CursorMut<'_, '_> {
}
/// Gets the dedicated TLB flusher for this cursor.
pub fn flusher(&self) -> &TlbFlusher<DisabledPreemptGuard> {
&self.flusher
pub fn flusher(&mut self) -> &mut TlbFlusher<DisabledPreemptGuard> {
&mut self.flusher
}
/// Map a frame into the current slot.