Do some miscellaneous page table cleanups

Ruihan Li
2025-06-19 12:46:28 +08:00
committed by Junyang Zhang
parent 11f9675f37
commit a1accf4304
6 changed files with 128 additions and 101 deletions

View File

@@ -63,8 +63,10 @@ pub(super) fn unlock_range<C: PageTableConfig>(cursor: &mut Cursor<'_, C>) {
         }
     }
     let guard_node = cursor.path[cursor.guard_level as usize - 1].take().unwrap();
-    let cur_node_va = cursor.barrier_va.start / page_size::<C>(cursor.guard_level + 1)
-        * page_size::<C>(cursor.guard_level + 1);
+    let cur_node_va = cursor
+        .barrier_va
+        .start
+        .align_down(page_size::<C>(cursor.guard_level + 1));

     // SAFETY: A cursor maintains that its corresponding sub-tree is locked.
     unsafe {
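The `align_down` call replaces the divide-then-multiply idiom; for power-of-two sizes (which page sizes always are) the two are equivalent. A minimal standalone sketch, using a free function in place of ostd's `Vaddr` alignment method (whose exact definition is assumed here):

```rust
/// Rounds `addr` down to the nearest multiple of `align`.
/// Assumes `align` is a power of two, as page sizes always are.
fn align_down(addr: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    addr & !(align - 1)
}

fn main() {
    let page = 4096;
    for addr in [0, 1, 4095, 4096, 123_456] {
        // The old idiom, the bitmask form, and the helper all agree.
        assert_eq!(align_down(addr, page), addr / page * page);
        assert_eq!(align_down(addr, page), addr & !(page - 1));
    }
}
```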

View File

@@ -101,6 +101,8 @@ impl<C: PageTableConfig> PageTableFrag<C> {
         match self {
             PageTableFrag::Mapped { va, item } => {
                 let (pa, level, prop) = C::item_into_raw(item.clone());
+                // SAFETY: All the arguments match those returned from the previous call
+                // to `item_into_raw`, and we are taking ownership of the cloned item.
                 drop(unsafe { C::item_from_raw(pa, level, prop) });
                 *va..*va + page_size::<C>(level)
             }
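The clone-then-drop dance exists because `item_into_raw` consumes its argument and leaks one ownership into the raw PTE parts, so the clone must be revived with `item_from_raw` and dropped to keep the reference count balanced. A toy sketch of the same round-trip using `Arc` (the `Item` type and both helpers are stand-ins, not the ostd API):

```rust
use std::sync::Arc;

// Hypothetical stand-in for a `C::Item`: a cloneable frame handle.
type Item = Arc<u64>;

// Analogue of `C::item_into_raw`: consumes the item and leaks its
// ownership into a raw pointer (the "PTE" side).
fn item_into_raw(item: Item) -> *const u64 {
    Arc::into_raw(item)
}

// Analogue of `C::item_from_raw`.
// SAFETY: `raw` must come from a previous `item_into_raw` call; the
// caller takes back the leaked ownership.
unsafe fn item_from_raw(raw: *const u64) -> Item {
    unsafe { Arc::from_raw(raw) }
}

fn main() {
    let item = Arc::new(0x8000_u64);
    // Inspecting the raw parts of a *clone* leaks one reference, so the
    // revived clone must be dropped, mirroring
    // `drop(unsafe { C::item_from_raw(pa, level, prop) })` above.
    let raw = item_into_raw(item.clone());
    drop(unsafe { item_from_raw(raw) });
    assert_eq!(Arc::strong_count(&item), 1); // balanced: no leak
}
```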
@@ -149,11 +151,10 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
         let rcu_guard = self.rcu_guard;

         loop {
-            let cur_va = self.va;
             let level = self.level;
-            let cur_child = self.cur_entry().to_ref();
-            let item = match cur_child {
+            let cur_entry = self.cur_entry();
+            let item = match cur_entry.to_ref() {
                 ChildRef::PageTable(pt) => {
                     // SAFETY: The `pt` must be locked and no other guards exist.
                     let guard = unsafe { pt.make_guard_unchecked(rcu_guard) };
@@ -179,7 +180,7 @@
                 }
             };

-            return Ok((cur_va..cur_va + page_size::<C>(level), item));
+            return Ok((self.cur_va_range(), item));
         }
     }
@@ -287,22 +288,14 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
         }

         loop {
-            let cur_node_start = self.va & !(page_size::<C>(self.level + 1) - 1);
-            let cur_node_end = cur_node_start + page_size::<C>(self.level + 1);
+            let node_size = page_size::<C>(self.level + 1);
+            let node_start = self.va.align_down(node_size);

             // If the address is within the current node, we can jump directly.
-            if cur_node_start <= va && va < cur_node_end {
+            if node_start <= va && va < node_start + node_size {
                 self.va = va;
                 return Ok(());
             }
-
-            // There is a corner case that the cursor is depleted, sitting at the start of the
-            // next node but the next node is not locked because the parent is not locked.
-            if self.va >= self.barrier_va.end && self.level == self.guard_level {
-                self.va = va;
-                return Ok(());
-            }
-
-            debug_assert!(self.level < self.guard_level);
             self.pop_level();
         }
     }
@@ -321,11 +314,12 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
     /// Goes up a level.
     fn pop_level(&mut self) {
-        let Some(taken) = self.path[self.level as usize - 1].take() else {
-            panic!("Popping a level without a lock");
-        };
+        let taken = self.path[self.level as usize - 1]
+            .take()
+            .expect("Popping a level without a lock");
         let _ = ManuallyDrop::new(taken);

+        debug_assert!(self.level < self.guard_level);
         self.level += 1;
     }
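`ManuallyDrop::new(taken)` deliberately leaks the lock guard: the cursor moves up a level while the child node stays locked, and the lock is released later through a different path (see `unlock_range`). A standalone sketch of the leak idiom only, with a stand-in guard type:

```rust
use core::mem::ManuallyDrop;

/// Stand-in for a lock guard that releases its lock on drop.
struct Guard(&'static str);

impl Drop for Guard {
    fn drop(&mut self) {
        println!("unlocked {}", self.0);
    }
}

fn main() {
    {
        let _g = Guard("a");
    } // prints "unlocked a": normal RAII release

    {
        let g = Guard("b");
        // Suppresses the destructor: the "lock" stays held past this
        // scope and must be released by other means.
        let _ = ManuallyDrop::new(g);
    } // prints nothing
}
```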
@@ -345,9 +339,9 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
     /// Gets the virtual address range that the current entry covers.
     fn cur_va_range(&self) -> Range<Vaddr> {
-        let page_size = page_size::<C>(self.level);
-        let start = self.va.align_down(page_size);
-        start..start + page_size
+        let entry_size = page_size::<C>(self.level);
+        let entry_start = self.va.align_down(entry_size);
+        entry_start..entry_start + entry_size
     }
 }

View File

@@ -341,8 +341,11 @@ impl PageTable<KernelPtConfig> {
             // See also `<PageTablePageMeta as AnyFrameMeta>::on_drop`.
             let pt_addr = pt.start_paddr();
             let pte = PageTableEntry::new_pt(pt_addr);
-            // SAFETY: The index is within the bounds and the level of the new
-            // PTE matches the node.
+            // SAFETY: The index is within the bounds and the PTE is at the
+            // correct paging level. However, it is neither a `UserPtConfig`
+            // child nor owned by the node. This is still safe because
+            // `UserPtConfig::TOP_LEVEL_INDEX_RANGE` guarantees that the
+            // cursor won't access it.
             unsafe { new_node.write_pte(i, pte) };
         }
         drop(new_node);
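Sharing kernel-space PTEs into every page table is sound only because the user and kernel configs partition the top-level indices. For a typical 48-bit, 4-level layout the top-level index is bits 39..47 of the address, so the two halves split at index 256; the constants below are illustrative assumptions, not values taken from ostd:

```rust
const NR_LEVELS: u32 = 4;
const BITS_PER_LEVEL: u32 = 9;
const PAGE_BITS: u32 = 12;

/// Index of `va` within the page table node at `level` (level 1 = leaf).
fn pte_index(va: u64, level: u32) -> usize {
    let shift = PAGE_BITS + BITS_PER_LEVEL * (level - 1);
    ((va >> shift) & ((1u64 << BITS_PER_LEVEL) - 1)) as usize
}

fn main() {
    // Under the usual split, user space is the lower half and kernel
    // space the upper half: top-level indices 0..256 vs 256..512.
    assert_eq!(pte_index(0x0000_7fff_ffff_f000, NR_LEVELS), 255); // last user slot
    assert_eq!(pte_index(0xffff_8000_0000_0000, NR_LEVELS), 256); // first kernel slot
    // A user cursor confined to a `TOP_LEVEL_INDEX_RANGE` of 0..256 can
    // never reach the shared kernel PTEs written above, which is what the
    // safety comment relies on.
}
```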
@@ -446,23 +449,27 @@ impl<C: PageTableConfig> PageTable<C> {
 }

 /// A software emulation of the MMU address translation process.
-/// It returns the physical address of the given virtual address and the mapping info
-/// if a valid mapping exists for the given virtual address.
+///
+/// This function returns the physical address of the given virtual address and
+/// the page property if a valid mapping exists for the given virtual address.
 ///
 /// # Safety
 ///
-/// The caller must ensure that the root_paddr is a valid pointer to the root
+/// The caller must ensure that the `root_paddr` is a pointer to a valid root
 /// page table node.
 ///
-/// # Notes on the page table free-reuse-then-read problem
+/// # Notes on the page table use-after-free problem
 ///
-/// Because neither the hardware MMU nor the software page walk method
-/// would get the locks of the page table while reading, they can enter
-/// a to-be-recycled page table node and read the page table entries
-/// after the node is recycled and reused.
+/// Neither the hardware MMU nor the software page walk method acquires the page
+/// table locks while reading. They can enter a to-be-recycled page table node
+/// and read the page table entries after the node is recycled and reused.
 ///
-/// To mitigate this problem, the page table nodes are by default not
-/// actively recycled, until we find an appropriate solution.
+/// For the hardware MMU page walk, we mitigate this problem by dropping the page
+/// table nodes only after the TLBs have been flushed on all the CPUs that
+/// activate the page table.
+///
+/// For the software page walk, we only need to disable preemption at the beginning
+/// since the page table nodes won't be recycled in the RCU critical section.
 #[cfg(ktest)]
 pub(super) unsafe fn page_walk<C: PageTableConfig>(
     root_paddr: Paddr,
@@ -470,43 +477,34 @@ pub(super) unsafe fn page_walk<C: PageTableConfig>(
 ) -> Option<(Paddr, PageProperty)> {
     use super::paddr_to_vaddr;

-    let _guard = crate::trap::disable_local();
+    let _rcu_guard = disable_preempt();

-    let mut cur_level = C::NR_LEVELS;
-    let mut cur_pte = {
-        let node_addr = paddr_to_vaddr(root_paddr);
-        let offset = pte_index::<C>(vaddr, cur_level);
-        // SAFETY: The offset does not exceed the value of PAGE_SIZE.
-        unsafe { (node_addr as *const C::E).add(offset).read() }
-    };
-
-    while cur_level > 1 {
+    let mut pt_addr = paddr_to_vaddr(root_paddr);
+    for cur_level in (1..=C::NR_LEVELS).rev() {
+        let offset = pte_index::<C>(vaddr, cur_level);
+        // SAFETY:
+        // - The page table node is alive because (1) the root node is alive and
+        //   (2) all child nodes cannot be recycled because we're in the RCU critical section.
+        // - The index is inside the bound, so the page table entry is valid.
+        // - All page table entries are aligned and accessed with atomic operations only.
+        let cur_pte = unsafe { load_pte((pt_addr as *mut C::E).add(offset), Ordering::Acquire) };
+
         if !cur_pte.is_present() {
             return None;
         }
         if cur_pte.is_last(cur_level) {
             debug_assert!(cur_level <= C::HIGHEST_TRANSLATION_LEVEL);
-            break;
+            return Some((
+                cur_pte.paddr() + (vaddr & (page_size::<C>(cur_level) - 1)),
+                cur_pte.prop(),
+            ));
         }
-        cur_level -= 1;
-        cur_pte = {
-            let node_addr = paddr_to_vaddr(cur_pte.paddr());
-            let offset = pte_index::<C>(vaddr, cur_level);
-            // SAFETY: The offset does not exceed the value of PAGE_SIZE.
-            unsafe { (node_addr as *const C::E).add(offset).read() }
-        };
-    }
-
-    if cur_pte.is_present() {
-        Some((
-            cur_pte.paddr() + (vaddr & (page_size::<C>(cur_level) - 1)),
-            cur_pte.prop(),
-        ))
-    } else {
-        None
-    }
+        pt_addr = paddr_to_vaddr(cur_pte.paddr());
+    }
+    unreachable!("All present PTEs at the level 1 must be last-level PTEs");
 }

 /// The interface for defining architecture-specific page table entries.
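The rewrite collapses the two duplicated PTE-read blocks into a single loop that walks from the top level down to level 1, returning early on non-present or leaf PTEs. The same control flow can be reproduced standalone over a toy table of plain `u64` PTEs; all constants, bit layouts, and helpers below are made up for illustration (the real code additionally reads PTEs atomically via `load_pte(..., Ordering::Acquire)`):

```rust
const NR_LEVELS: u64 = 2;
const ENTRIES: u64 = 512; // PTEs per node
const BASE_PAGE: u64 = 4096;

const PRESENT: u64 = 1 << 0;
const LAST: u64 = 1 << 1; // terminal (leaf) mapping
const ADDR_MASK: u64 = !0xfff;

fn page_size(level: u64) -> u64 {
    BASE_PAGE * ENTRIES.pow(level as u32 - 1)
}

fn pte_index(vaddr: u64, level: u64) -> usize {
    ((vaddr / page_size(level)) % ENTRIES) as usize
}

/// Mirrors the loop shape of the new `page_walk`: one PTE read per level,
/// early returns for non-present and leaf PTEs, and an unreachable arm
/// for present-but-not-leaf PTEs at level 1.
fn walk(tables: &[[u64; 512]], root: usize, vaddr: u64) -> Option<u64> {
    let mut node = root;
    for level in (1..=NR_LEVELS).rev() {
        let pte = tables[node][pte_index(vaddr, level)];
        if pte & PRESENT == 0 {
            return None;
        }
        if pte & LAST != 0 {
            return Some((pte & ADDR_MASK) + vaddr % page_size(level));
        }
        // Non-leaf: the address field points at the next-level table.
        node = ((pte & ADDR_MASK) / BASE_PAGE) as usize;
    }
    unreachable!("all present PTEs at level 1 must be last-level PTEs");
}

fn main() {
    let mut tables = vec![[0u64; 512]; 2];
    // Level-2 entry 1 -> table 1 (at toy "paddr" 0x1000);
    // table 1, entry 0 -> frame 0x8000, leaf.
    tables[0][1] = BASE_PAGE | PRESENT;
    tables[1][0] = 0x8000 | PRESENT | LAST;

    assert_eq!(walk(&tables, 0, 0x20_0123), Some(0x8123));
    assert_eq!(walk(&tables, 0, 0x40_0000), None); // nothing mapped there
}
```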

View File

@@ -2,8 +2,6 @@

 //! This module provides accessors to the page table entries in a node.

-use core::mem::ManuallyDrop;
-
 use super::{Child, ChildRef, PageTableEntryTrait, PageTableGuard, PageTableNode};
 use crate::{
     mm::{
@@ -78,8 +76,8 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         // SAFETY:
         // 1. The index is within the bounds.
         // 2. We replace the PTE with a new one, which differs only in
-        //    `PageProperty`, so the level still matches the current
-        //    page table node.
+        //    `PageProperty`, so it's in `C` and at the correct paging level.
+        // 3. The child is still owned by the page table node.
         unsafe { self.node.write_pte(self.idx, self.pte) };
     }
@@ -117,7 +115,8 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe { self.node.write_pte(self.idx, new_pte) };
         self.pte = new_pte;
@@ -138,28 +137,28 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         }

         let level = self.node.level();
-        let new_page = PageTableNode::<C>::alloc(level - 1);
+        let new_page = RcuDrop::new(PageTableNode::<C>::alloc(level - 1));

         let paddr = new_page.start_paddr();
-        let _ = ManuallyDrop::new(new_page.borrow().lock(guard));
+        // SAFETY: The page table won't be dropped before the RCU grace period
+        // ends, so it outlives `'rcu`.
+        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
+        // Lock before writing the PTE, so no one else can operate on it.
+        let pt_lock_guard = pt_ref.lock(guard);

         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe {
-            self.node.write_pte(
-                self.idx,
-                Child::PageTable(RcuDrop::new(new_page)).into_pte(),
-            )
+            self.node
+                .write_pte(self.idx, Child::PageTable(new_page).into_pte())
         };

         *self.node.nr_children_mut() += 1;

-        // SAFETY: The page table won't be dropped before the RCU grace period
-        // ends, so it outlives `'rcu`.
-        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
-        // SAFETY: The node is locked and there are no other guards.
-        Some(unsafe { pt_ref.make_guard_unchecked(guard) })
+        Some(pt_lock_guard)
     }

     /// Splits the entry to smaller pages if it maps to a huge page.
@@ -183,8 +182,15 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         let pa = self.pte.paddr();
         let prop = self.pte.prop();

-        let new_page = PageTableNode::<C>::alloc(level - 1);
-        let mut pt_lock_guard = new_page.borrow().lock(guard);
+        let new_page = RcuDrop::new(PageTableNode::<C>::alloc(level - 1));
+
+        let paddr = new_page.start_paddr();
+        // SAFETY: The page table won't be dropped before the RCU grace period
+        // ends, so it outlives `'rcu`.
+        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
+        // Lock before writing the PTE, so no one else can operate on it.
+        let mut pt_lock_guard = pt_ref.lock(guard);

         for i in 0..nr_subpage_per_huge::<C>() {
             let small_pa = pa + i * page_size::<C>(level - 1);
@@ -193,24 +199,16 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
             debug_assert!(old.is_none());
         }

-        let paddr = new_page.start_paddr();
-        let _ = ManuallyDrop::new(pt_lock_guard);
-
         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe {
-            self.node.write_pte(
-                self.idx,
-                Child::PageTable(RcuDrop::new(new_page)).into_pte(),
-            )
+            self.node
+                .write_pte(self.idx, Child::PageTable(new_page).into_pte())
         };

-        // SAFETY: The page table won't be dropped before the RCU grace period
-        // ends, so it outlives `'rcu`.
-        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
-        // SAFETY: The node is locked and there are no other guards.
-        Some(unsafe { pt_ref.make_guard_unchecked(guard) })
+        Some(pt_lock_guard)
     }

     /// Create a new entry at the node with guard.
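In both rewritten paths the fresh node is now locked *before* the `write_pte` call that publishes it. Once the PTE is visible, a concurrent cursor may reach the node, so taking the lock first guarantees nobody can operate on a half-initialized node. A minimal sketch of the publish-after-lock ordering, using standard-library `Mutex`/`Arc` as stand-ins for the RCU-based page-table guards:

```rust
use std::sync::{Arc, Mutex};
use std::thread;

/// Stand-in for a page table node: a per-node lock guarding its entries.
struct Node {
    entries: Mutex<Vec<u64>>,
}

fn main() {
    // Stand-in for the parent PTE slot through which the node is published.
    let slot: Arc<Mutex<Option<Arc<Node>>>> = Arc::new(Mutex::new(None));

    let node = Arc::new(Node {
        entries: Mutex::new(Vec::new()),
    });

    // Lock the new node BEFORE publishing it, mirroring `alloc_if_none`:
    // readers may now find the node but cannot operate on it yet.
    let mut guard = node.entries.lock().unwrap();
    *slot.lock().unwrap() = Some(Arc::clone(&node)); // the "write_pte"

    let reader = {
        let slot = Arc::clone(&slot);
        thread::spawn(move || loop {
            if let Some(n) = slot.lock().unwrap().clone() {
                // Blocks until the writer has finished initializing.
                return n.entries.lock().unwrap().len();
            }
        })
    };

    // Finish initialization while still holding the lock, then release.
    guard.extend([1, 2, 3]);
    drop(guard);

    assert_eq!(reader.join().unwrap(), 3); // never observed half-initialized
}
```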

View File

@@ -225,15 +225,14 @@ impl<'rcu, C: PageTableConfig> PageTableGuard<'rcu, C> {
     ///
     /// This operation will leak the old child if the old PTE is present.
     ///
-    /// The child represented by the given PTE will handover the ownership to
-    /// the node. The PTE will be rendered invalid after this operation.
-    ///
     /// # Safety
     ///
     /// The caller must ensure that:
     /// 1. The index must be within the bound;
-    /// 2. The PTE must represent a valid [`Child`] whose level is compatible
-    ///    with the page table node.
+    /// 2. The PTE must represent a [`Child`] in the same [`PageTableConfig`]
+    ///    and at the right paging level (`self.level() - 1`);
+    /// 3. The page table node will have the ownership of the [`Child`]
+    ///    after this method.
     pub(super) unsafe fn write_pte(&mut self, idx: usize, pte: C::E) {
         debug_assert!(idx < nr_subpage_per_huge::<C>());
         let ptr = paddr_to_vaddr(self.start_paddr()) as *mut C::E;
@@ -296,24 +295,25 @@ impl<C: PageTableConfig> PageTablePageMeta<C> {
     }
 }

-// SAFETY: We can read the page table node because the page table pages are
-// accessed as untyped memory.
+// FIXME: The safe APIs in the `page_table/node` module allow `Child::Frame`s with
+// arbitrary addresses to be stored in the page table nodes. Therefore, they may not
+// be valid `C::Item`s. The soundness of the following `on_drop` implementation must
+// be reasoned in conjunction with the `page_table/cursor` implementation.
 unsafe impl<C: PageTableConfig> AnyFrameMeta for PageTablePageMeta<C> {
     fn on_drop(&mut self, reader: &mut VmReader<Infallible>) {
         let nr_children = self.nr_children.get_mut();

         if *nr_children == 0 {
             return;
         }

         let level = self.level;

-        // Drop the children.
         let range = if level == C::NR_LEVELS {
             C::TOP_LEVEL_INDEX_RANGE.clone()
         } else {
             0..nr_subpage_per_huge::<C>()
         };

+        // Drop the children.
         reader.skip(range.start * size_of::<C::E>());
         for _ in range {
             // Non-atomic read is OK because we have mutable access.

View File

@@ -432,6 +432,41 @@ mod navigation {
         );
     }

+    #[ktest]
+    fn jump_from_end_and_query_huge_middle() {
+        let page_table = PageTable::<TestPtConfig>::empty();
+
+        const HUGE_PAGE_SIZE: usize = PAGE_SIZE * 512; // 2M
+
+        let virt_range = 0..HUGE_PAGE_SIZE * 2; // lock at level 2
+        let map_va = virt_range.end - HUGE_PAGE_SIZE;
+        let map_item = (
+            0,
+            2,
+            PageProperty::new_user(PageFlags::RW, CachePolicy::Writeback),
+        );
+
+        let preempt_guard = disable_preempt();
+        let mut cursor = page_table.cursor_mut(&preempt_guard, &virt_range).unwrap();
+
+        cursor.jump(map_va).unwrap();
+        unsafe { cursor.map(map_item).unwrap() };
+
+        // Now the cursor is at the end of the range with level 2.
+        assert!(cursor.query().is_err());
+
+        // Jump from the end.
+        cursor.jump(virt_range.start).unwrap();
+        assert!(cursor.query().unwrap().1.is_none());
+
+        // Query in the middle of the huge page.
+        cursor.jump(virt_range.end - HUGE_PAGE_SIZE / 2).unwrap();
+        assert_eq!(
+            cursor.query().unwrap().0,
+            virt_range.end - HUGE_PAGE_SIZE..virt_range.end
+        );
+    }
+
     #[ktest]
     fn find_next() {
         let (page_table, _, _) = setup_page_table_with_two_frames();