Do some miscellaneous page table cleanups

commit a1accf4304
parent 11f9675f37
Author:    Ruihan Li
Date:      2025-06-19 12:46:28 +08:00
Committer: Junyang Zhang

6 changed files with 128 additions and 101 deletions


@@ -63,8 +63,10 @@ pub(super) fn unlock_range<C: PageTableConfig>(cursor: &mut Cursor<'_, C>) {
         }
     }
     let guard_node = cursor.path[cursor.guard_level as usize - 1].take().unwrap();
-    let cur_node_va = cursor.barrier_va.start / page_size::<C>(cursor.guard_level + 1)
-        * page_size::<C>(cursor.guard_level + 1);
+    let cur_node_va = cursor
+        .barrier_va
+        .start
+        .align_down(page_size::<C>(cursor.guard_level + 1));

     // SAFETY: A cursor maintains that its corresponding sub-tree is locked.
     unsafe {

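The `align_down` form above computes the same value as the removed divide-then-multiply expression whenever the size is a power of two, which page sizes always are. A minimal sketch of the equivalence (this standalone `align_down` function is hypothetical; in ostd it is a method on address types):

```rust
/// A minimal model of `align_down` for power-of-two sizes (hypothetical
/// free function; in ostd it is a method on address types).
fn align_down(addr: usize, size: usize) -> usize {
    debug_assert!(size.is_power_of_two());
    addr & !(size - 1)
}

fn main() {
    let (addr, size) = (0x20_1234, 0x20_0000); // a 2 MiB node size
    // Equivalent to the removed divide-then-multiply form...
    assert_eq!(align_down(addr, size), addr / size * size);
    // ...and to the bitwise form that `jump` used below.
    assert_eq!(align_down(addr, size), addr & !(size - 1));
    assert_eq!(align_down(addr, size), 0x20_0000);
}
```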
View File

@@ -101,6 +101,8 @@ impl<C: PageTableConfig> PageTableFrag<C> {
         match self {
             PageTableFrag::Mapped { va, item } => {
                 let (pa, level, prop) = C::item_into_raw(item.clone());
+                // SAFETY: All the arguments match those returned from the previous call
+                // to `item_into_raw`, and we are taking ownership of the cloned item.
                 drop(unsafe { C::item_from_raw(pa, level, prop) });
                 *va..*va + page_size::<C>(level)
             }
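The new SAFETY comment documents a round-trip ownership rule: `C::item_into_raw` leaks the cloned item into raw parts, and `C::item_from_raw` reclaims exactly that ownership, so the `drop` releases the clone. `Arc::into_raw`/`Arc::from_raw` follow the same discipline, so a sketch by analogy:

```rust
use std::sync::Arc;

fn main() {
    let item = Arc::new([0u8; 16]);
    // Leak one clone into raw form, as `C::item_into_raw(item.clone())` does.
    let raw = Arc::into_raw(item.clone());
    // SAFETY: `raw` came from `Arc::into_raw` and is reclaimed exactly once,
    // mirroring the "arguments match the previous call" condition in the diff.
    drop(unsafe { Arc::from_raw(raw) });
    // The original `item` is alive and uniquely counted again.
    assert_eq!(Arc::strong_count(&item), 1);
}
```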
@@ -149,11 +151,10 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
         let rcu_guard = self.rcu_guard;

         loop {
             let cur_va = self.va;
-            let level = self.level;
-            let cur_child = self.cur_entry().to_ref();
-            let item = match cur_child {
+            let cur_entry = self.cur_entry();
+            let item = match cur_entry.to_ref() {
                 ChildRef::PageTable(pt) => {
                     // SAFETY: The `pt` must be locked and no other guards exist.
                     let guard = unsafe { pt.make_guard_unchecked(rcu_guard) };
@@ -179,7 +180,7 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
                 }
             };

-            return Ok((cur_va..cur_va + page_size::<C>(level), item));
+            return Ok((self.cur_va_range(), item));
         }
     }
@@ -287,22 +288,14 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
         }

         loop {
-            let cur_node_start = self.va & !(page_size::<C>(self.level + 1) - 1);
-            let cur_node_end = cur_node_start + page_size::<C>(self.level + 1);
+            let node_size = page_size::<C>(self.level + 1);
+            let node_start = self.va.align_down(node_size);
             // If the address is within the current node, we can jump directly.
-            if cur_node_start <= va && va < cur_node_end {
+            if node_start <= va && va < node_start + node_size {
                 self.va = va;
                 return Ok(());
             }

-            // There is a corner case that the cursor is depleted, sitting at the start of the
-            // next node but the next node is not locked because the parent is not locked.
-            if self.va >= self.barrier_va.end && self.level == self.guard_level {
-                self.va = va;
-                return Ok(());
-            }
-
-            debug_assert!(self.level < self.guard_level);
             self.pop_level();
         }
     }
@@ -321,11 +314,12 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
     /// Goes up a level.
     fn pop_level(&mut self) {
-        let Some(taken) = self.path[self.level as usize - 1].take() else {
-            panic!("Popping a level without a lock");
-        };
+        let taken = self.path[self.level as usize - 1]
+            .take()
+            .expect("Popping a level without a lock");
         let _ = ManuallyDrop::new(taken);

+        debug_assert!(self.level < self.guard_level);
         self.level += 1;
     }
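The `let _ = ManuallyDrop::new(taken)` idiom deliberately suppresses the guard's destructor, so the node stays locked even though its slot in `path` has been vacated. A toy model of the idiom (the guard type here is invented):

```rust
use core::mem::ManuallyDrop;

/// A toy guard whose destructor would release a lock (hypothetical type).
struct NodeGuard;

impl Drop for NodeGuard {
    fn drop(&mut self) {
        println!("lock released");
    }
}

fn main() {
    let taken = NodeGuard;
    // Suppress the destructor: the lock is intentionally left held, which is
    // exactly what `pop_level` wants after removing the guard from `path`.
    let _ = ManuallyDrop::new(taken);
    // Nothing is printed: the guard was forgotten, not dropped.
}
```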
@@ -345,9 +339,9 @@ impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
     /// Gets the virtual address range that the current entry covers.
     fn cur_va_range(&self) -> Range<Vaddr> {
-        let page_size = page_size::<C>(self.level);
-        let start = self.va.align_down(page_size);
-        start..start + page_size
+        let entry_size = page_size::<C>(self.level);
+        let entry_start = self.va.align_down(entry_size);
+        entry_start..entry_start + entry_size
     }
 }

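The renaming in `cur_va_range` makes the intent plainer: the returned range covers the whole entry at the cursor's current level, not just the base page containing `va`. A standalone check of the arithmetic, with `align_down` written out as bit operations:

```rust
use core::ops::Range;

/// The `cur_va_range` arithmetic, extracted for illustration; `entry_size`
/// stands in for `page_size::<C>(self.level)`.
fn cur_va_range(va: usize, entry_size: usize) -> Range<usize> {
    let entry_start = va & !(entry_size - 1); // `align_down` for powers of two
    entry_start..entry_start + entry_size
}

fn main() {
    const MIB: usize = 1 << 20;
    // A va in the middle of a 2 MiB entry maps to the entry's full range.
    assert_eq!(cur_va_range(3 * MIB + 0x1234, 2 * MIB), 2 * MIB..4 * MIB);
}
```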

@@ -341,8 +341,11 @@ impl PageTable<KernelPtConfig> {
             // See also `<PageTablePageMeta as AnyFrameMeta>::on_drop`.
             let pt_addr = pt.start_paddr();
             let pte = PageTableEntry::new_pt(pt_addr);
-            // SAFETY: The index is within the bounds and the level of the new
-            // PTE matches the node.
+            // SAFETY: The index is within the bounds and the PTE is at the
+            // correct paging level. However, the PTE is neither a `UserPtConfig`
+            // child nor owned by the node. This is still safe because
+            // `UserPtConfig::TOP_LEVEL_INDEX_RANGE` guarantees that the cursor
+            // will never access it.
             unsafe { new_node.write_pte(i, pte) };
         }
         drop(new_node);
@@ -446,23 +449,27 @@ impl<C: PageTableConfig> PageTable<C> {
 }

 /// A software emulation of the MMU address translation process.
-/// It returns the physical address of the given virtual address and the mapping info
-/// if a valid mapping exists for the given virtual address.
+///
+/// This method returns the physical address of the given virtual address and
+/// the page property if a valid mapping exists for the given virtual address.
 ///
 /// # Safety
 ///
-/// The caller must ensure that the root_paddr is a valid pointer to the root
+/// The caller must ensure that the `root_paddr` is a pointer to a valid root
 /// page table node.
 ///
-/// # Notes on the page table free-reuse-then-read problem
+/// # Notes on the page table use-after-free problem
 ///
-/// Because neither the hardware MMU nor the software page walk method
-/// would get the locks of the page table while reading, they can enter
-/// a to-be-recycled page table node and read the page table entries
-/// after the node is recycled and reused.
+/// Neither the hardware MMU nor the software page walk method acquires the page
+/// table locks while reading. They can enter a to-be-recycled page table node
+/// and read the page table entries after the node is recycled and reused.
 ///
-/// To mitigate this problem, the page table nodes are by default not
-/// actively recycled, until we find an appropriate solution.
+/// For the hardware MMU page walk, we mitigate this problem by dropping the page
+/// table nodes only after the TLBs have been flushed on all the CPUs that
+/// activate the page table.
+///
+/// For the software page walk, we only need to disable preemption at the beginning
+/// since the page table nodes won't be recycled in the RCU critical section.
 #[cfg(ktest)]
 pub(super) unsafe fn page_walk<C: PageTableConfig>(
     root_paddr: Paddr,
@@ -470,43 +477,34 @@ pub(super) unsafe fn page_walk<C: PageTableConfig>(
 ) -> Option<(Paddr, PageProperty)> {
     use super::paddr_to_vaddr;

-    let _guard = crate::trap::disable_local();
+    let _rcu_guard = disable_preempt();

-    let mut cur_level = C::NR_LEVELS;
-    let mut cur_pte = {
-        let node_addr = paddr_to_vaddr(root_paddr);
+    let mut pt_addr = paddr_to_vaddr(root_paddr);
+    for cur_level in (1..=C::NR_LEVELS).rev() {
         let offset = pte_index::<C>(vaddr, cur_level);
-        // SAFETY: The offset does not exceed the value of PAGE_SIZE.
-        unsafe { (node_addr as *const C::E).add(offset).read() }
-    };
+        // SAFETY:
+        // - The page table node is alive because (1) the root node is alive and
+        //   (2) all child nodes cannot be recycled because we're in the RCU critical section.
+        // - The index is inside the bound, so the page table entry is valid.
+        // - All page table entries are aligned and accessed with atomic operations only.
+        let cur_pte = unsafe { load_pte((pt_addr as *mut C::E).add(offset), Ordering::Acquire) };

-    while cur_level > 1 {
         if !cur_pte.is_present() {
             return None;
         }
         if cur_pte.is_last(cur_level) {
             debug_assert!(cur_level <= C::HIGHEST_TRANSLATION_LEVEL);
-            break;
+            return Some((
+                cur_pte.paddr() + (vaddr & (page_size::<C>(cur_level) - 1)),
+                cur_pte.prop(),
+            ));
         }
-        cur_level -= 1;
-        cur_pte = {
-            let node_addr = paddr_to_vaddr(cur_pte.paddr());
-            let offset = pte_index::<C>(vaddr, cur_level);
-            // SAFETY: The offset does not exceed the value of PAGE_SIZE.
-            unsafe { (node_addr as *const C::E).add(offset).read() }
-        };
-    }
-
-    if cur_pte.is_present() {
-        Some((
-            cur_pte.paddr() + (vaddr & (page_size::<C>(cur_level) - 1)),
-            cur_pte.prop(),
-        ))
-    } else {
-        None
+
+        pt_addr = paddr_to_vaddr(cur_pte.paddr());
     }
+
+    unreachable!("All present PTEs at the level 1 must be last-level PTEs");
 }

 /// The interface for defining architecture-specific page table entries.

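The rewritten `page_walk` folds the initial lookup and the descent into a single `for` loop that returns at the first absent or last-level PTE. A self-contained sketch of that control flow, with a toy `Pte` encoding standing in for `C::E` and the atomic `load_pte`:

```rust
/// Toy PTE: `None` = absent, `Some((paddr, is_last))` = present. This stands
/// in for `C::E` and the atomic `load_pte`; the encoding is hypothetical.
type Pte = Option<(usize, bool)>;

const NR_LEVELS: usize = 4; // e.g. x86-64 with 4 paging levels
const BITS_PER_LEVEL: usize = 9;
const PAGE_BITS: usize = 12;

fn pte_index(vaddr: usize, level: usize) -> usize {
    (vaddr >> (PAGE_BITS + BITS_PER_LEVEL * (level - 1))) & ((1 << BITS_PER_LEVEL) - 1)
}

fn page_size(level: usize) -> usize {
    1 << (PAGE_BITS + BITS_PER_LEVEL * (level - 1))
}

/// Mirrors the new loop shape: one PTE read per level, top level downwards.
fn page_walk(read_pte: impl Fn(usize, usize) -> Pte, root: usize, vaddr: usize) -> Option<usize> {
    let mut pt_addr = root;
    for cur_level in (1..=NR_LEVELS).rev() {
        let (paddr, is_last) = read_pte(pt_addr, pte_index(vaddr, cur_level))?;
        if is_last {
            // Keep the offset within the (possibly huge) page, as the diff does.
            return Some(paddr + (vaddr & (page_size(cur_level) - 1)));
        }
        pt_addr = paddr; // descend into the child node
    }
    unreachable!("present level-1 PTEs must be last-level PTEs");
}

fn main() {
    // A degenerate table whose root entry directly maps a huge page.
    let read_pte = |_node: usize, _idx: usize| Some((0x8000_0000, true));
    let pa = page_walk(read_pte, 0x1000, 0xdead_beef).unwrap();
    // The offset below the top-level page size is preserved.
    assert_eq!(pa, 0x8000_0000 + (0xdead_beef & (page_size(NR_LEVELS) - 1)));
}
```

As in the diff, falling out of the loop would mean a present level-1 PTE was not marked last, which the real code treats as an impossible (corrupted) state.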

@@ -2,8 +2,6 @@
 //! This module provides accessors to the page table entries in a node.

-use core::mem::ManuallyDrop;
-
 use super::{Child, ChildRef, PageTableEntryTrait, PageTableGuard, PageTableNode};
 use crate::{
     mm::{
@@ -78,8 +76,8 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         // SAFETY:
         // 1. The index is within the bounds.
         // 2. We replace the PTE with a new one, which differs only in
-        //    `PageProperty`, so the level still matches the current
-        //    page table node.
+        //    `PageProperty`, so it's in `C` and at the correct paging level.
+        // 3. The child is still owned by the page table node.
         unsafe { self.node.write_pte(self.idx, self.pte) };
     }
@@ -117,7 +115,8 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe { self.node.write_pte(self.idx, new_pte) };
         self.pte = new_pte;
@@ -138,28 +137,28 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         }

         let level = self.node.level();
-        let new_page = PageTableNode::<C>::alloc(level - 1);
+        let new_page = RcuDrop::new(PageTableNode::<C>::alloc(level - 1));

         let paddr = new_page.start_paddr();
-        let _ = ManuallyDrop::new(new_page.borrow().lock(guard));
+        // SAFETY: The page table won't be dropped before the RCU grace period
+        // ends, so it outlives `'rcu`.
+        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
+        // Lock before writing the PTE, so no one else can operate on it.
+        let pt_lock_guard = pt_ref.lock(guard);

         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe {
-            self.node.write_pte(
-                self.idx,
-                Child::PageTable(RcuDrop::new(new_page)).into_pte(),
-            )
+            self.node
+                .write_pte(self.idx, Child::PageTable(new_page).into_pte())
         };
         *self.node.nr_children_mut() += 1;

-        // SAFETY: The page table won't be dropped before the RCU grace period
-        // ends, so it outlives `'rcu`.
-        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
-        // SAFETY: The node is locked and there are no other guards.
-        Some(unsafe { pt_ref.make_guard_unchecked(guard) })
+        Some(pt_lock_guard)
     }

     /// Splits the entry to smaller pages if it maps to a huge page.
@@ -183,8 +182,15 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
         let pa = self.pte.paddr();
         let prop = self.pte.prop();

-        let new_page = PageTableNode::<C>::alloc(level - 1);
-        let mut pt_lock_guard = new_page.borrow().lock(guard);
+        let new_page = RcuDrop::new(PageTableNode::<C>::alloc(level - 1));
+        let paddr = new_page.start_paddr();
+
+        // SAFETY: The page table won't be dropped before the RCU grace period
+        // ends, so it outlives `'rcu`.
+        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
+        // Lock before writing the PTE, so no one else can operate on it.
+        let mut pt_lock_guard = pt_ref.lock(guard);

         for i in 0..nr_subpage_per_huge::<C>() {
             let small_pa = pa + i * page_size::<C>(level - 1);
@@ -193,24 +199,16 @@ impl<'a, 'rcu, C: PageTableConfig> Entry<'a, 'rcu, C> {
             debug_assert!(old.is_none());
         }

-        let paddr = new_page.start_paddr();
-        let _ = ManuallyDrop::new(pt_lock_guard);
-
         // SAFETY:
         // 1. The index is within the bounds.
-        // 2. The new PTE is a valid child whose level matches the current page table node.
+        // 2. The new PTE is a child in `C` and at the correct paging level.
+        // 3. The ownership of the child is passed to the page table node.
         unsafe {
-            self.node.write_pte(
-                self.idx,
-                Child::PageTable(RcuDrop::new(new_page)).into_pte(),
-            )
+            self.node
+                .write_pte(self.idx, Child::PageTable(new_page).into_pte())
         };

-        // SAFETY: The page table won't be dropped before the RCU grace period
-        // ends, so it outlives `'rcu`.
-        let pt_ref = unsafe { PageTableNodeRef::borrow_paddr(paddr) };
-        // SAFETY: The node is locked and there are no other guards.
-        Some(unsafe { pt_ref.make_guard_unchecked(guard) })
+        Some(pt_lock_guard)
     }

     /// Create a new entry at the node with guard.

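Both `alloc_if_none` and `split_if_mapped_huge` now lock the freshly allocated child before the `write_pte` that publishes it, and return that same guard instead of re-deriving one with `make_guard_unchecked`. A toy model of the ordering, using std types rather than the kernel's lock and `RcuDrop`:

```rust
use std::sync::{Arc, Mutex, MutexGuard};

/// A toy node guarded by a std `Mutex` (stand-in for the page table lock).
struct Node {
    lock: Mutex<()>,
}

/// Lock first, publish second: once the pointer is visible in `slot`, other
/// threads can find the node, but they can no longer be the first to lock it.
fn publish_locked<'a>(slot: &mut Option<Arc<Node>>, node: &'a Arc<Node>) -> MutexGuard<'a, ()> {
    let guard = node.lock.lock().unwrap(); // lock before anyone can see it
    *slot = Some(Arc::clone(node)); // now link it into the "parent"
    guard // hand the already-held lock back, like `Some(pt_lock_guard)`
}

fn main() {
    let node = Arc::new(Node { lock: Mutex::new(()) });
    let mut slot = None;
    let _guard = publish_locked(&mut slot, &node);
    assert!(slot.is_some()); // published, and still exclusively held
}
```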

@@ -225,15 +225,14 @@ impl<'rcu, C: PageTableConfig> PageTableGuard<'rcu, C> {
     ///
     /// This operation will leak the old child if the old PTE is present.
     ///
-    /// The child represented by the given PTE will handover the ownership to
-    /// the node. The PTE will be rendered invalid after this operation.
-    ///
     /// # Safety
     ///
     /// The caller must ensure that:
     /// 1. The index must be within the bound;
-    /// 2. The PTE must represent a valid [`Child`] whose level is compatible
-    ///    with the page table node.
+    /// 2. The PTE must represent a [`Child`] in the same [`PageTableConfig`]
+    ///    and at the right paging level (`self.level() - 1`);
+    /// 3. The page table node will have the ownership of the [`Child`]
+    ///    after this method.
     pub(super) unsafe fn write_pte(&mut self, idx: usize, pte: C::E) {
         debug_assert!(idx < nr_subpage_per_huge::<C>());
         let ptr = paddr_to_vaddr(self.start_paddr()) as *mut C::E;
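Point 3 of the rewritten contract makes the ownership handover explicit, and the doc above already warns that an overwritten child is leaked rather than dropped. A toy model of those two rules with std types (names hypothetical):

```rust
/// Toy slot write with `write_pte`'s ownership rules: the slot takes the new
/// child, and any previous occupant is leaked rather than dropped.
fn write_slot(slot: &mut Option<Box<[u8; 16]>>, new: Box<[u8; 16]>) {
    if let Some(old) = slot.replace(new) {
        std::mem::forget(old); // "will leak the old child if the old PTE is present"
    }
}

fn main() {
    let mut slot = None;
    write_slot(&mut slot, Box::new([0u8; 16])); // ownership moves into the slot
    write_slot(&mut slot, Box::new([1u8; 16])); // the first child is leaked
    assert_eq!(slot.as_deref(), Some(&[1u8; 16]));
}
```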
@@ -296,24 +295,25 @@ impl<C: PageTableConfig> PageTablePageMeta<C> {
     }
 }

-// SAFETY: We can read the page table node because the page table pages are
-// accessed as untyped memory.
+// FIXME: The safe APIs in the `page_table/node` module allow `Child::Frame`s with
+// arbitrary addresses to be stored in the page table nodes. Therefore, they may not
+// be valid `C::Item`s. The soundness of the following `on_drop` implementation must
+// be reasoned in conjunction with the `page_table/cursor` implementation.
 unsafe impl<C: PageTableConfig> AnyFrameMeta for PageTablePageMeta<C> {
     fn on_drop(&mut self, reader: &mut VmReader<Infallible>) {
         let nr_children = self.nr_children.get_mut();

         if *nr_children == 0 {
             return;
         }

         let level = self.level;

-        // Drop the children.
         let range = if level == C::NR_LEVELS {
             C::TOP_LEVEL_INDEX_RANGE.clone()
         } else {
             0..nr_subpage_per_huge::<C>()
         };

+        // Drop the children.
         reader.skip(range.start * size_of::<C::E>());
         for _ in range {
             // Non-atomic read is OK because we have mutable access.

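At the root of a user page table, only `TOP_LEVEL_INDEX_RANGE` belongs to the config; the kernel-shared entries outside that range (the ones installed in the `KernelPtConfig` hunk earlier) are not owned children, which is why `on_drop` skips ahead by `range.start` entries. A sketch of the index selection (all constants hypothetical):

```rust
use core::ops::Range;

const NR_ENTRIES: usize = 512; // entries per node (hypothetical)
const TOP_LEVEL_INDEX_RANGE: Range<usize> = 0..256; // user half of the root
const NR_LEVELS: u8 = 4;

/// Which slots `on_drop` may interpret as owned children of the config.
fn owned_child_indices(level: u8) -> Range<usize> {
    if level == NR_LEVELS {
        // Root node: the kernel-shared slots outside this range are not ours.
        TOP_LEVEL_INDEX_RANGE
    } else {
        0..NR_ENTRIES
    }
}

fn main() {
    assert_eq!(owned_child_indices(NR_LEVELS), 0..256);
    assert_eq!(owned_child_indices(1), 0..512);
}
```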

@@ -432,6 +432,41 @@ mod navigation {
         );
     }

+    #[ktest]
+    fn jump_from_end_and_query_huge_middle() {
+        let page_table = PageTable::<TestPtConfig>::empty();
+
+        const HUGE_PAGE_SIZE: usize = PAGE_SIZE * 512; // 2M
+
+        let virt_range = 0..HUGE_PAGE_SIZE * 2; // lock at level 2
+        let map_va = virt_range.end - HUGE_PAGE_SIZE;
+        let map_item = (
+            0,
+            2,
+            PageProperty::new_user(PageFlags::RW, CachePolicy::Writeback),
+        );
+
+        let preempt_guard = disable_preempt();
+        let mut cursor = page_table.cursor_mut(&preempt_guard, &virt_range).unwrap();
+        cursor.jump(map_va).unwrap();
+        unsafe { cursor.map(map_item).unwrap() };
+
+        // Now the cursor is at the end of the range with level 2.
+        assert!(cursor.query().is_err());
+
+        // Jump from the end.
+        cursor.jump(virt_range.start).unwrap();
+        assert!(cursor.query().unwrap().1.is_none());
+
+        // Query in the middle of the huge page.
+        cursor.jump(virt_range.end - HUGE_PAGE_SIZE / 2).unwrap();
+        assert_eq!(
+            cursor.query().unwrap().0,
+            virt_range.end - HUGE_PAGE_SIZE..virt_range.end
+        );
+    }
+
     #[ktest]
     fn find_next() {
         let (page_table, _, _) = setup_page_table_with_two_frames();