From a1accf43044ad5825e401234b765eb39ada5d957 Mon Sep 17 00:00:00 2001
From: Ruihan Li
Date: Thu, 19 Jun 2025 12:46:28 +0800
Subject: [PATCH] Do some miscellaneous page table cleanups

---
 ostd/src/mm/page_table/cursor/locking.rs |  6 +-
 ostd/src/mm/page_table/cursor/mod.rs     | 36 +++++-------
 ostd/src/mm/page_table/mod.rs            | 70 ++++++++++++------------
 ostd/src/mm/page_table/node/entry.rs     | 62 ++++++++++-----------
 ostd/src/mm/page_table/node/mod.rs       | 20 +++----
 ostd/src/mm/page_table/test.rs           | 35 ++++++++++++
 6 files changed, 128 insertions(+), 101 deletions(-)

diff --git a/ostd/src/mm/page_table/cursor/locking.rs b/ostd/src/mm/page_table/cursor/locking.rs
index 1b2f394a7..b34da824f 100644
--- a/ostd/src/mm/page_table/cursor/locking.rs
+++ b/ostd/src/mm/page_table/cursor/locking.rs
@@ -63,8 +63,10 @@ pub(super) fn unlock_range<C: PageTableConfig>(cursor: &mut Cursor<'_, C>) {
         }
     }
     let guard_node = cursor.path[cursor.guard_level as usize - 1].take().unwrap();
-    let cur_node_va = cursor.barrier_va.start / page_size::<C>(cursor.guard_level + 1)
-        * page_size::<C>(cursor.guard_level + 1);
+    let cur_node_va = cursor
+        .barrier_va
+        .start
+        .align_down(page_size::<C>(cursor.guard_level + 1));
 
     // SAFETY: A cursor maintains that its corresponding sub-tree is locked.
     unsafe {
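The `align_down` computation above replaces the old divide-then-multiply form; the two are equivalent whenever the alignment is a power of two, which holds for every page-table node size. A minimal standalone sketch of the arithmetic (the free function `align_down` is an illustrative stand-in for the address helper that OSTD provides as a method):

    fn align_down(addr: usize, align: usize) -> usize {
        // Only valid for power-of-two alignments, which page sizes always are.
        debug_assert!(align.is_power_of_two());
        addr & !(align - 1)
    }

    fn main() {
        const NODE_SIZE: usize = 512 * 4096; // a level-2 entry covers 2 MiB
        // Masking the low bits and divide-then-multiply agree:
        assert_eq!(align_down(0x20_1234, NODE_SIZE), 0x20_0000);
        assert_eq!(0x20_1234 / NODE_SIZE * NODE_SIZE, 0x20_0000);
    }

The same helper recurs in the cursor changes below, where it also computes the start of the node or entry that covers a given virtual address.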
diff --git a/ostd/src/mm/page_table/cursor/mod.rs b/ostd/src/mm/page_table/cursor/mod.rs
index 19e7b5fbe..930e018d8 100644
--- a/ostd/src/mm/page_table/cursor/mod.rs
+++ b/ostd/src/mm/page_table/cursor/mod.rs
@@ -101,6 +101,8 @@ impl<C: PageTableConfig> PageTableFrag<C> {
         match self {
             PageTableFrag::Mapped { va, item } => {
                 let (pa, level, prop) = C::item_into_raw(item.clone());
+                // SAFETY: All the arguments match those returned from the previous call
+                // to `item_into_raw`, and we are taking ownership of the cloned item.
                 drop(unsafe { C::item_from_raw(pa, level, prop) });
                 *va..*va + page_size::<C>(level)
             }
@@ -149,11 +151,10 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
         let rcu_guard = self.rcu_guard;
 
         loop {
-            let cur_va = self.va;
             let level = self.level;
-            let cur_child = self.cur_entry().to_ref();
 
-            let item = match cur_child {
+            let cur_entry = self.cur_entry();
+            let item = match cur_entry.to_ref() {
                 ChildRef::PageTable(pt) => {
                     // SAFETY: The `pt` must be locked and no other guards exist.
                     let guard = unsafe { pt.make_guard_unchecked(rcu_guard) };
@@ -179,7 +180,7 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
                 }
             };
 
-            return Ok((cur_va..cur_va + page_size::<C>(level), item));
+            return Ok((self.cur_va_range(), item));
         }
     }
@@ -287,22 +288,14 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
         }
 
         loop {
-            let cur_node_start = self.va & !(page_size::<C>(self.level + 1) - 1);
-            let cur_node_end = cur_node_start + page_size::<C>(self.level + 1);
+            let node_size = page_size::<C>(self.level + 1);
+            let node_start = self.va.align_down(node_size);
             // If the address is within the current node, we can jump directly.
-            if cur_node_start <= va && va < cur_node_end {
+            if node_start <= va && va < node_start + node_size {
                 self.va = va;
                 return Ok(());
             }
 
-            // There is a corner case that the cursor is depleted, sitting at the start of the
-            // next node, but the next node is not locked because the parent is not locked.
-            if self.va >= self.barrier_va.end && self.level == self.guard_level {
-                self.va = va;
-                return Ok(());
-            }
-
-            debug_assert!(self.level < self.guard_level);
             self.pop_level();
         }
     }
@@ -321,11 +314,12 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
     /// Goes up a level.
     fn pop_level(&mut self) {
-        let Some(taken) = self.path[self.level as usize - 1].take() else {
-            panic!("Popping a level without a lock");
-        };
+        let taken = self.path[self.level as usize - 1]
+            .take()
+            .expect("Popping a level without a lock");
         let _ = ManuallyDrop::new(taken);
 
+        debug_assert!(self.level < self.guard_level);
         self.level += 1;
     }
@@ -345,9 +339,9 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
 
     /// Gets the virtual address range that the current entry covers.
     fn cur_va_range(&self) -> Range<Vaddr> {
-        let page_size = page_size::<C>(self.level);
-        let start = self.va.align_down(page_size);
-        start..start + page_size
+        let entry_size = page_size::<C>(self.level);
+        let entry_start = self.va.align_down(entry_size);
+        entry_start..entry_start + entry_size
     }
 }
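In `pop_level`, wrapping the taken guard in `ManuallyDrop` suppresses its destructor, so moving the cursor up one level does not unlock the child node; the whole locked sub-tree is only released later by `unlock_range`. The effect of suppressing a guard's destructor can be demonstrated with plain standard-library types (this sketch uses `std::sync::Mutex`, not the page-table lock):

    use std::mem::ManuallyDrop;
    use std::sync::Mutex;

    fn main() {
        let lock = Mutex::new(0u32);

        let guard = lock.lock().unwrap();
        // Suppress the guard's destructor: the mutex is never unlocked.
        let _ = ManuallyDrop::new(guard);

        // The lock is still held, so trying to take it again fails.
        assert!(lock.try_lock().is_err());
    }

The `debug_assert!(self.level < self.guard_level)` now lives inside `pop_level` itself, so every caller gets the invariant check instead of repeating it at each call site.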
diff --git a/ostd/src/mm/page_table/mod.rs b/ostd/src/mm/page_table/mod.rs
index 656fb028f..b2cf92a9a 100644
--- a/ostd/src/mm/page_table/mod.rs
+++ b/ostd/src/mm/page_table/mod.rs
@@ -341,8 +341,11 @@ impl PageTable<KernelPtConfig> {
             // See also `<PageTablePageMeta as AnyFrameMeta>::on_drop`.
             let pt_addr = pt.start_paddr();
             let pte = PageTableEntry::new_pt(pt_addr);
-            // SAFETY: The index is within the bounds and the level of the new
-            // PTE matches the node.
+            // SAFETY: The index is within the bounds and the PTE is at the
+            // correct paging level. However, the child is not a `UserPtConfig`
+            // child, nor does the node own it. It is still safe because
+            // `UserPtConfig::TOP_LEVEL_INDEX_RANGE` guarantees that the
+            // cursor won't access it.
             unsafe { new_node.write_pte(i, pte) };
         }
         drop(new_node);
@@ -446,23 +449,27 @@
 }
 
 /// A software emulation of the MMU address translation process.
-/// It returns the physical address of the given virtual address and the mapping info
-/// if a valid mapping exists for the given virtual address.
+///
+/// This function returns the physical address of the given virtual address and
+/// the page property if a valid mapping exists for the given virtual address.
 ///
 /// # Safety
 ///
-/// The caller must ensure that the root_paddr is a valid pointer to the root
+/// The caller must ensure that `root_paddr` points to a valid root
 /// page table node.
 ///
-/// # Notes on the page table free-reuse-then-read problem
+/// # Notes on the page table use-after-free problem
 ///
-/// Because neither the hardware MMU nor the software page walk method
-/// would get the locks of the page table while reading, they can enter
-/// a to-be-recycled page table node and read the page table entries
-/// after the node is recycled and reused.
+/// Neither the hardware MMU nor the software page walk method acquires the page
+/// table locks while reading. They can enter a to-be-recycled page table node
+/// and read the page table entries after the node is recycled and reused.
 ///
-/// To mitigate this problem, the page table nodes are by default not
-/// actively recycled, until we find an appropriate solution.
+/// For the hardware MMU page walk, we mitigate this problem by dropping the page
+/// table nodes only after the TLBs have been flushed on all the CPUs that
+/// activate the page table.
+///
+/// For the software page walk, we only need to disable preemption at the beginning,
+/// since the page table nodes won't be recycled during the RCU critical section.
 #[cfg(ktest)]
 pub(super) unsafe fn page_walk<C: PageTableConfig>(
     root_paddr: Paddr,
@@ -470,43 +477,34 @@ pub(super) unsafe fn page_walk<C: PageTableConfig>(
     vaddr: Vaddr,
 ) -> Option<(Paddr, PageProperty)> {
     use super::paddr_to_vaddr;
 
-    let _guard = crate::trap::disable_local();
+    let _rcu_guard = disable_preempt();
 
-    let mut cur_level = C::NR_LEVELS;
-    let mut cur_pte = {
-        let node_addr = paddr_to_vaddr(root_paddr);
+    let mut pt_addr = paddr_to_vaddr(root_paddr);
+    for cur_level in (1..=C::NR_LEVELS).rev() {
         let offset = pte_index::<C>(vaddr, cur_level);
-        // SAFETY: The offset does not exceed the value of PAGE_SIZE.
-        unsafe { (node_addr as *const C::E).add(offset).read() }
-    };
+        // SAFETY:
+        // - The page table node is alive because (1) the root node is alive and
+        //   (2) all child nodes cannot be recycled because we're in the RCU critical section.
+        // - The index is inside the bound, so the page table entry is valid.
+        // - All page table entries are aligned and accessed with atomic operations only.
+        let cur_pte = unsafe { load_pte((pt_addr as *mut C::E).add(offset), Ordering::Acquire) };
 
-    while cur_level > 1 {
         if !cur_pte.is_present() {
            return None;
         }
         if cur_pte.is_last(cur_level) {
             debug_assert!(cur_level <= C::HIGHEST_TRANSLATION_LEVEL);
-            break;
+            return Some((
+                cur_pte.paddr() + (vaddr & (page_size::<C>(cur_level) - 1)),
+                cur_pte.prop(),
+            ));
         }
-        cur_level -= 1;
-        cur_pte = {
-            let node_addr = paddr_to_vaddr(cur_pte.paddr());
-            let offset = pte_index::<C>(vaddr, cur_level);
-            // SAFETY: The offset does not exceed the value of PAGE_SIZE.
-            unsafe { (node_addr as *const C::E).add(offset).read() }
-        };
+        pt_addr = paddr_to_vaddr(cur_pte.paddr());
     }
 
-    if cur_pte.is_present() {
-        Some((
-            cur_pte.paddr() + (vaddr & (page_size::<C>(cur_level) - 1)),
-            cur_pte.prop(),
-        ))
-    } else {
-        None
-    }
+    unreachable!("All present PTEs at level 1 must be last-level PTEs");
 }
 
 /// The interface for defining architecture-specific page table entries.
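The restructured `page_walk` does the whole translation in one top-down `for` loop and returns as soon as it reaches a last-level PTE; falling out of the loop is impossible because a present level-1 PTE is always terminal. The control flow can be sketched with a toy two-level table (all constants and types here are illustrative, not OSTD's; the real walk reads PTEs from physical memory with atomic loads):

    const NR_LEVELS: usize = 2;
    const NR_ENTRIES: usize = 512;

    fn page_size(level: usize) -> usize {
        4096 << (9 * (level - 1)) // 4 KiB at level 1, 2 MiB at level 2
    }

    fn pte_index(va: usize, level: usize) -> usize {
        (va >> (12 + 9 * (level - 1))) & (NR_ENTRIES - 1)
    }

    enum Pte {
        Absent,
        Table(Vec<Pte>), // next-level table; only meaningful above level 1
        Leaf(usize),     // terminal mapping; holds the physical address
    }

    fn walk(root: &[Pte], va: usize) -> Option<usize> {
        let mut table = root;
        for level in (1..=NR_LEVELS).rev() {
            match &table[pte_index(va, level)] {
                Pte::Absent => return None,
                Pte::Leaf(pa) => return Some(*pa + (va & (page_size(level) - 1))),
                Pte::Table(next) => table = next.as_slice(),
            }
        }
        unreachable!("a present level-1 PTE must be a leaf");
    }

    fn main() {
        let mut root: Vec<Pte> = (0..NR_ENTRIES).map(|_| Pte::Absent).collect();
        // One level-2 "huge" leaf mapping VA 0x20_0000..0x40_0000 to PA 0x8000_0000.
        root[pte_index(0x20_0000, 2)] = Pte::Leaf(0x8000_0000);

        assert_eq!(walk(&root, 0x20_1234), Some(0x8000_1234));
        assert_eq!(walk(&root, 0x40_0000), None);
    }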
diff --git a/ostd/src/mm/page_table/node/entry.rs b/ostd/src/mm/page_table/node/entry.rs
index ea2babc48..b80425e71 100644
--- a/ostd/src/mm/page_table/node/entry.rs
+++ b/ostd/src/mm/page_table/node/entry.rs
@@ -2,8 +2,6 @@
 
 //! This module provides accessors to the page table entries in a node.
 
-use core::mem::ManuallyDrop;
-
 use super::{Child, ChildRef, PageTableEntryTrait, PageTableGuard, PageTableNode};
 use crate::{
     mm::{
@@ -78,8 +76,8 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         // SAFETY:
         // 1. The index is within the bounds.
         // 2. We replace the PTE with a new one, which differs only in
-        //    `PageProperty`, so the level still matches the current
-        //    page table node.
+        //    `PageProperty`, so it's in `C` and at the correct paging level.
+        // 3. The child is still owned by the page table node.
         unsafe { self.node.write_pte(self.idx, self.pte) };
     }
 
@@ -117,7 +115,8 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
 
         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe { self.node.write_pte(self.idx, new_pte) };
 
         self.pte = new_pte;
@@ -138,28 +137,28 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         }
 
         let level = self.node.level();
-        let new_page = PageTableNode::<C>::alloc(level - 1);
+        let new_page = RcuDrop::new(PageTableNode::<C>::alloc(level - 1));
 
         let paddr = new_page.start_paddr();
-        let _ = ManuallyDrop::new(new_page.borrow().lock(guard));
+        // SAFETY: The page table won't be dropped before the RCU grace period
+        // ends, so it outlives `'rcu`.
+        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
+
+        // Lock before writing the PTE, so no one else can operate on it.
+        let pt_lock_guard = pt_ref.lock(guard);
 
         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe {
-            self.node.write_pte(
-                self.idx,
-                Child::PageTable(RcuDrop::new(new_page)).into_pte(),
-            )
+            self.node
+                .write_pte(self.idx, Child::PageTable(new_page).into_pte())
         };
 
         *self.node.nr_children_mut() += 1;
 
-        // SAFETY: The page table won't be dropped before the RCU grace period
-        // ends, so it outlives `'rcu`.
-        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
-        // SAFETY: The node is locked and there are no other guards.
-        Some(unsafe { pt_ref.make_guard_unchecked(guard) })
+        Some(pt_lock_guard)
     }
 
     /// Splits the entry to smaller pages if it maps to a huge page.
@@ -183,8 +182,15 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         let pa = self.pte.paddr();
         let prop = self.pte.prop();
 
-        let new_page = PageTableNode::<C>::alloc(level - 1);
-        let mut pt_lock_guard = new_page.borrow().lock(guard);
+        let new_page = RcuDrop::new(PageTableNode::<C>::alloc(level - 1));
+
+        let paddr = new_page.start_paddr();
+        // SAFETY: The page table won't be dropped before the RCU grace period
+        // ends, so it outlives `'rcu`.
+        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
+
+        // Lock before writing the PTE, so no one else can operate on it.
+        let mut pt_lock_guard = pt_ref.lock(guard);
 
         for i in 0..nr_subpage_per_huge::<C>() {
             let small_pa = pa + i * page_size::<C>(level - 1);
@@ -193,24 +199,16 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
             debug_assert!(old.is_none());
         }
 
-        let paddr = new_page.start_paddr();
-        let _ = ManuallyDrop::new(pt_lock_guard);
-
         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe {
-            self.node.write_pte(
-                self.idx,
-                Child::PageTable(RcuDrop::new(new_page)).into_pte(),
-            )
+            self.node
+                .write_pte(self.idx, Child::PageTable(new_page).into_pte())
         };
 
-        // SAFETY: The page table won't be dropped before the RCU grace period
-        // ends, so it outlives `'rcu`.
-        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
-        // SAFETY: The node is locked and there are no other guards.
-        Some(unsafe { pt_ref.make_guard_unchecked(guard) })
+        Some(pt_lock_guard)
     }
 
     /// Create a new entry at the node with guard.
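Both `alloc_if_none` and `split_if_mapped_huge` now follow the same discipline: allocate the child node, lock it, and only then publish it by writing the PTE into the parent. Publishing before locking would leave a window in which another thread could find the child and operate on it first. A sketch of this ordering with standard-library types (the `HashMap` parent and `Vec` child are stand-ins assumed for illustration, not OSTD's node types):

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    type Child = Arc<Mutex<Vec<u64>>>;
    type Parent = Mutex<HashMap<usize, Child>>;

    fn install_child(parent: &Parent, idx: usize) {
        let child: Child = Arc::new(Mutex::new(Vec::new()));

        // Lock the child *before* publishing it, so anyone who finds it
        // through the parent must wait until initialization is complete.
        let mut child_guard = child.lock().unwrap();
        parent.lock().unwrap().insert(idx, Arc::clone(&child)); // publish

        child_guard.extend([1, 2, 3]); // initialize while still holding the lock
        // `child_guard` drops here; only now can others use the child.
    }

    fn main() {
        let parent = Parent::default();
        install_child(&parent, 7);
        assert_eq!(parent.lock().unwrap()[&7].lock().unwrap().len(), 3);
    }

Returning the already-held `pt_lock_guard` also removes the second `unsafe` block that `make_guard_unchecked` previously required.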
diff --git a/ostd/src/mm/page_table/node/mod.rs b/ostd/src/mm/page_table/node/mod.rs
index 522346e1a..016f90eac 100644
--- a/ostd/src/mm/page_table/node/mod.rs
+++ b/ostd/src/mm/page_table/node/mod.rs
@@ -225,15 +225,14 @@ impl<'rcu, C: PageTableConfig> PageTableGuard<'rcu, C> {
     ///
     /// This operation will leak the old child if the old PTE is present.
     ///
-    /// The child represented by the given PTE will handover the ownership to
-    /// the node. The PTE will be rendered invalid after this operation.
-    ///
     /// # Safety
     ///
     /// The caller must ensure that:
     /// 1. The index must be within the bound;
-    /// 2. The PTE must represent a valid [`Child`] whose level is compatible
-    ///    with the page table node.
+    /// 2. The PTE must represent a [`Child`] in the same [`PageTableConfig`]
+    ///    and at the right paging level (`self.level() - 1`);
+    /// 3. The page table node takes ownership of the [`Child`]
+    ///    after this method.
     pub(super) unsafe fn write_pte(&mut self, idx: usize, pte: C::E) {
         debug_assert!(idx < nr_subpage_per_huge::<C>());
         let ptr = paddr_to_vaddr(self.start_paddr()) as *mut C::E;
@@ -296,24 +295,25 @@ impl<C: PageTableConfig> PageTablePageMeta<C> {
     }
 }
 
-// SAFETY: We can read the page table node because the page table pages are
-// accessed as untyped memory.
+// FIXME: The safe APIs in the `page_table/node` module allow `Child::Frame`s with
+// arbitrary addresses to be stored in the page table nodes. Therefore, they may not
+// be valid `C::Item`s. The soundness of the following `on_drop` implementation must
+// be reasoned about in conjunction with the `page_table/cursor` implementation.
 unsafe impl<C: PageTableConfig> AnyFrameMeta for PageTablePageMeta<C> {
     fn on_drop(&mut self, reader: &mut VmReader) {
         let nr_children = self.nr_children.get_mut();
-
         if *nr_children == 0 {
             return;
         }
 
         let level = self.level;
-
-        // Drop the children.
         let range = if level == C::NR_LEVELS {
             C::TOP_LEVEL_INDEX_RANGE.clone()
         } else {
             0..nr_subpage_per_huge::<C>()
         };
+
+        // Drop the children.
         reader.skip(range.start * size_of::<C::E>());
         for _ in range {
             // Non-atomic read is OK because we have mutable access.
diff --git a/ostd/src/mm/page_table/test.rs b/ostd/src/mm/page_table/test.rs
index f6c49b04e..c1e428622 100644
--- a/ostd/src/mm/page_table/test.rs
+++ b/ostd/src/mm/page_table/test.rs
@@ -432,6 +432,41 @@ mod navigation {
         );
     }
 
+    #[ktest]
+    fn jump_from_end_and_query_huge_middle() {
+        let page_table = PageTable::<TestPtConfig>::empty();
+
+        const HUGE_PAGE_SIZE: usize = PAGE_SIZE * 512; // 2M
+
+        let virt_range = 0..HUGE_PAGE_SIZE * 2; // lock at level 2
+        let map_va = virt_range.end - HUGE_PAGE_SIZE;
+        let map_item = (
+            0,
+            2,
+            PageProperty::new_user(PageFlags::RW, CachePolicy::Writeback),
+        );
+
+        let preempt_guard = disable_preempt();
+        let mut cursor = page_table.cursor_mut(&preempt_guard, &virt_range).unwrap();
+
+        cursor.jump(map_va).unwrap();
+        unsafe { cursor.map(map_item).unwrap() };
+
+        // Now the cursor is at the end of the range with level 2.
+        assert!(cursor.query().is_err());
+
+        // Jump from the end.
+        cursor.jump(virt_range.start).unwrap();
+        assert!(cursor.query().unwrap().1.is_none());
+
+        // Query in the middle of the huge page.
+        cursor.jump(virt_range.end - HUGE_PAGE_SIZE / 2).unwrap();
+        assert_eq!(
+            cursor.query().unwrap().0,
+            virt_range.end - HUGE_PAGE_SIZE..virt_range.end
+        );
+    }
+
     #[ktest]
     fn find_next() {
         let (page_table, _, _) = setup_page_table_with_two_frames();
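The reshuffled `on_drop` computes the child index range before the "Drop the children" step: a top-level node only owns the entries inside `TOP_LEVEL_INDEX_RANGE`, while the remaining top-level slots may hold shared children (such as the shared kernel page-table entries written in the `mod.rs` hunk above) that must not be dropped with the node. A sketch of the range selection, with illustrative constants standing in for `C::NR_LEVELS`, `C::TOP_LEVEL_INDEX_RANGE`, and `nr_subpage_per_huge::<C>()`:

    use std::ops::Range;

    const NR_LEVELS: u8 = 4;
    const NR_ENTRIES: usize = 512;
    const TOP_LEVEL_INDEX_RANGE: Range<usize> = 0..256;

    /// Returns the slots that a node of `level` owns and must visit on drop.
    fn owned_slots(level: u8) -> Range<usize> {
        if level == NR_LEVELS {
            // Slots outside this range may be shared with other page tables
            // and are skipped; see the FIXME above for the soundness caveat.
            TOP_LEVEL_INDEX_RANGE
        } else {
            0..NR_ENTRIES
        }
    }

    fn main() {
        assert_eq!(owned_slots(NR_LEVELS), 0..256);
        assert_eq!(owned_slots(1), 0..512);
    }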