From 3472560c55540aafad785ee7e7a59081f2857b83 Mon Sep 17 00:00:00 2001
From: Zhang Junyang
Date: Sun, 23 Mar 2025 21:44:30 +0800
Subject: [PATCH] Make TLB issuing less contended

---
 ostd/src/mm/tlb.rs | 151 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 96 insertions(+), 55 deletions(-)

diff --git a/ostd/src/mm/tlb.rs b/ostd/src/mm/tlb.rs
index 8151eb51c..73407fcdb 100644
--- a/ostd/src/mm/tlb.rs
+++ b/ostd/src/mm/tlb.rs
@@ -23,24 +23,26 @@ use crate::{
 ///
 /// The flusher needs to stick to the current CPU.
 pub struct TlbFlusher<'a, G: PinCurrentCpu> {
-    /// The CPUs to be flushed.
-    ///
-    /// If the targets is `None`, the flusher will flush all the CPUs.
     target_cpus: &'a AtomicCpuSet,
     have_unsynced_flush: CpuSet,
-    pin_current: G,
+    ops_stack: OpsStack,
+    _pin_current: G,
 }
 
 impl<'a, G: PinCurrentCpu> TlbFlusher<'a, G> {
     /// Creates a new TLB flusher with the specified CPUs to be flushed.
     ///
+    /// The target CPUs should be a reference to an [`AtomicCpuSet`] that will
+    /// be loaded upon [`Self::dispatch_tlb_flush`].
+    ///
     /// The flusher needs to stick to the current CPU. So please provide a
    /// guard that implements [`PinCurrentCpu`].
     pub fn new(target_cpus: &'a AtomicCpuSet, pin_current_guard: G) -> Self {
         Self {
             target_cpus,
             have_unsynced_flush: CpuSet::new_empty(),
-            pin_current: pin_current_guard,
+            ops_stack: OpsStack::new(),
+            _pin_current: pin_current_guard,
         }
     }
 
@@ -49,22 +51,43 @@ impl<'a, G: PinCurrentCpu> TlbFlusher<'a, G> {
     /// This function does not guarantee to flush the TLB entries on either
     /// this CPU or remote CPUs. The flush requests are only performed when
     /// [`Self::dispatch_tlb_flush`] is called.
-    pub fn issue_tlb_flush(&self, op: TlbFlushOp) {
-        self.issue_tlb_flush_(op, None);
+    pub fn issue_tlb_flush(&mut self, op: TlbFlushOp) {
+        self.ops_stack.push(op, None);
+    }
+
+    /// Issues a TLB flush request that must happen before dropping the page.
+    ///
+    /// If we need to remove a mapped page from the page table, we can only
+    /// recycle the page after all the relevant TLB entries on all CPUs are
+    /// flushed. Otherwise, if the page is recycled for other purposes, the
+    /// user space program can still access the page through the stale TLB
+    /// entries. This method is designed to be used in such cases.
+    pub fn issue_tlb_flush_with(
+        &mut self,
+        op: TlbFlushOp,
+        drop_after_flush: Frame<dyn AnyFrameMeta>,
+    ) {
+        self.ops_stack.push(op, Some(drop_after_flush));
     }
 
     /// Dispatches all the pending TLB flush requests.
     ///
-    /// All previous pending requests issued by [`Self::issue_tlb_flush`]
-    /// starts to be processed after this function. But it may not be
-    /// synchronous. Upon the return of this function, the TLB entries may not
-    /// be coherent.
+    /// All previous pending requests issued by [`Self::issue_tlb_flush`] or
+    /// [`Self::issue_tlb_flush_with`] start to be processed after this
+    /// function. But the processing may not be synchronous: upon the return
+    /// of this function, the TLB entries may not yet be coherent.
     pub fn dispatch_tlb_flush(&mut self) {
+        let irq_guard = crate::trap::disable_local();
+
+        if self.ops_stack.is_empty() {
+            return;
+        }
+
         // `Release` to make sure our modification on the PT is visible to CPUs
         // that are going to activate the PT.
         let mut target_cpus = self.target_cpus.load(Ordering::Release);
 
-        let cur_cpu = self.pin_current.current_cpu();
+        let cur_cpu = irq_guard.current_cpu();
         let mut need_flush_on_self = false;
 
         if target_cpus.contains(cur_cpu) {
@@ -73,9 +96,15 @@ impl<'a, G: PinCurrentCpu> TlbFlusher<'a, G> {
         }
 
         for cpu in target_cpus.iter() {
-            ACK_REMOTE_FLUSH
-                .get_on_cpu(cpu)
-                .store(false, Ordering::Relaxed);
+            {
+                let mut flush_ops = FLUSH_OPS.get_on_cpu(cpu).lock();
+                flush_ops.push_from(&self.ops_stack);
+
+                // Clear ACK before dropping the lock to avoid false ACKs.
+                ACK_REMOTE_FLUSH
+                    .get_on_cpu(cpu)
+                    .store(false, Ordering::Relaxed);
+            }
             self.have_unsynced_flush.add(cpu);
         }
 
@@ -83,7 +112,9 @@ impl<'a, G: PinCurrentCpu> TlbFlusher<'a, G> {
 
         // Flush ourselves after sending all IPIs to save some time.
         if need_flush_on_self {
-            do_remote_flush();
+            self.ops_stack.flush_all();
+        } else {
+            self.ops_stack.clear_without_flush();
         }
     }
 
@@ -109,42 +140,17 @@ impl<'a, G: PinCurrentCpu> TlbFlusher<'a, G> {
         );
 
         for cpu in self.have_unsynced_flush.iter() {
-            while !ACK_REMOTE_FLUSH.get_on_cpu(cpu).load(Ordering::Acquire) {
+            while !ACK_REMOTE_FLUSH.get_on_cpu(cpu).load(Ordering::Relaxed) {
                 core::hint::spin_loop();
             }
         }
 
         self.have_unsynced_flush = CpuSet::new_empty();
     }
-
-    /// Issues a TLB flush request that must happen before dropping the page.
-    ///
-    /// If we need to remove a mapped page from the page table, we can only
-    /// recycle the page after all the relevant TLB entries in all CPUs are
-    /// flushed. Otherwise if the page is recycled for other purposes, the user
-    /// space program can still access the page through the TLB entries. This
-    /// method is designed to be used in such cases.
-    pub fn issue_tlb_flush_with(&self, op: TlbFlushOp, drop_after_flush: Frame<dyn AnyFrameMeta>) {
-        self.issue_tlb_flush_(op, Some(drop_after_flush));
-    }
-
-    fn issue_tlb_flush_(&self, op: TlbFlushOp, drop_after_flush: Option<Frame<dyn AnyFrameMeta>>) {
-        let op = op.optimize_for_large_range();
-
-        // `Release` to make sure our modification on the PT is visible to CPUs
-        // that are going to activate the PT.
-        let target_cpus = self.target_cpus.load(Ordering::Release);
-
-        // Slow path for multi-CPU cases.
-        for cpu in target_cpus.iter() {
-            let mut op_queue = FLUSH_OPS.get_on_cpu(cpu).lock();
-            op_queue.push(op.clone(), drop_after_flush.clone());
-        }
-    }
 }
 
 /// The operation to flush TLB entries.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum TlbFlushOp {
     /// Flush all TLB entries except for the global entries.
     All,
@@ -189,14 +195,23 @@ cpu_local! {
 }
 
 fn do_remote_flush() {
-    let current_cpu = crate::cpu::current_cpu_racy(); // Safe because we are in IRQs.
+    // No races because we are in IRQ context or have preemption disabled.
+    let current_cpu = crate::cpu::current_cpu_racy();
 
-    let mut op_queue = FLUSH_OPS.get_on_cpu(current_cpu).lock();
-    op_queue.flush_all();
+    let mut new_op_queue = OpsStack::new();
+    {
+        let mut op_queue = FLUSH_OPS.get_on_cpu(current_cpu).lock();
 
-    ACK_REMOTE_FLUSH
-        .get_on_cpu(current_cpu)
-        .store(true, Ordering::Release);
+        core::mem::swap(&mut *op_queue, &mut new_op_queue);
+
+        // ACK before dropping the lock so that we won't miss flush requests.
+        ACK_REMOTE_FLUSH
+            .get_on_cpu(current_cpu)
+            .store(true, Ordering::Relaxed);
+    }
+    // Drop the lock quickly to avoid contention. ACKing before flushing is
+    // fine since we cannot switch back to user space now.
+    new_op_queue.flush_all();
 }
 
 /// If a TLB flushing request exceeds this threshold, we flush all.
@@ -223,6 +238,10 @@ impl OpsStack {
         }
     }
 
+    fn is_empty(&self) -> bool {
+        !self.need_flush_all && self.size == 0
+    }
+
     fn push(&mut self, op: TlbFlushOp, drop_after_flush: Option<Frame<dyn AnyFrameMeta>>) {
         if let Some(frame) = drop_after_flush {
             self.page_keeper.push(frame);
@@ -231,13 +250,32 @@ impl OpsStack {
         if self.need_flush_all {
             return;
         }
-
-        if self.size < FLUSH_ALL_OPS_THRESHOLD {
-            self.ops[self.size] = Some(op);
-            self.size += 1;
-        } else {
+        let op = op.optimize_for_large_range();
+        if op == TlbFlushOp::All || self.size >= FLUSH_ALL_OPS_THRESHOLD {
             self.need_flush_all = true;
             self.size = 0;
+            return;
+        }
+
+        self.ops[self.size] = Some(op);
+        self.size += 1;
+    }
+
+    fn push_from(&mut self, other: &OpsStack) {
+        self.page_keeper.extend(other.page_keeper.iter().cloned());
+
+        if self.need_flush_all {
+            return;
+        }
+        if other.need_flush_all || self.size + other.size >= FLUSH_ALL_OPS_THRESHOLD {
+            self.need_flush_all = true;
+            self.size = 0;
+            return;
+        }
+
+        for i in 0..other.size {
+            self.ops[self.size] = other.ops[i].clone();
+            self.size += 1;
         }
     }
 
@@ -252,9 +290,12 @@ impl OpsStack {
             }
         }
 
+        self.clear_without_flush();
+    }
+
+    fn clear_without_flush(&mut self) {
         self.need_flush_all = false;
         self.size = 0;
-        self.page_keeper.clear();
     }
 }
 
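Usage note (an illustration, not part of the diff above): the reworked API is meant to be driven in an issue/dispatch/sync pattern. Issuing now only appends to the flusher's private `OpsStack` without touching any per-CPU lock; a single `dispatch_tlb_flush` publishes the whole batch to each target CPU's `FLUSH_OPS` queue, clears the per-CPU ACK flags, and sends the IPIs; waiting for the ACKs is only needed when the caller must know remote TLBs are coherent, e.g. before reusing unmapped frames. The sketch below shows such a caller under assumptions: the helper name `unmap_and_recycle`, its parameters, and the exact import paths are hypothetical, and `TlbFlushOp::Address`, `disable_preempt`, and the waiting method `sync_tlb_flush` come from the surrounding ostd code rather than from this patch.

// A hypothetical caller inside ostd, sketched for illustration only.
use alloc::vec::Vec;

use crate::{
    cpu::AtomicCpuSet,                      // assumed import path
    mm::{
        frame::{meta::AnyFrameMeta, Frame}, // assumed import path
        tlb::{TlbFlushOp, TlbFlusher},
        Vaddr,
    },
    task::disable_preempt,                  // assumed to yield a `PinCurrentCpu` guard
};

fn unmap_and_recycle(
    activated_cpus: &AtomicCpuSet,                   // CPUs that may cache the old mappings
    unmapped: Vec<(Vaddr, Frame<dyn AnyFrameMeta>)>, // pages already removed from the page table
) {
    // Pin to the current CPU for the lifetime of the flusher.
    let mut flusher = TlbFlusher::new(activated_cpus, disable_preempt());

    for (va, frame) in unmapped {
        // Each issue only appends to the flusher's CPU-local `OpsStack`; the
        // frame is kept alive by the flusher until the flush is performed.
        flusher.issue_tlb_flush_with(TlbFlushOp::Address(va), frame);
    }

    // One pass over the target CPUs: lock each remote `FLUSH_OPS` queue once,
    // push the whole batch, clear its ACK flag, then send the IPIs and flush
    // the local TLB from the same batch.
    flusher.dispatch_tlb_flush();

    // Spin until every remote CPU has ACKed its flush; after this, the frames
    // queued above are safe to recycle.
    flusher.sync_tlb_flush();
}

The contention win comes from the batching: each remote `FLUSH_OPS` spin lock is taken once per dispatch (and once per remote drain in `do_remote_flush`) instead of once per issued operation, and a dispatch whose only target is the current CPU takes no remote lock at all.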