Implement the stray marking in PT to ensure serializability

This commit is contained in:
Zhang Junyang 2025-03-24 11:58:03 +08:00 committed by Tate, Hongliang Tian
parent 5b7637eac3
commit 54fbdcf059
10 changed files with 167 additions and 126 deletions

View File

@ -355,9 +355,16 @@ impl Vmar_ {
/// Clears all content of the root VMAR.
fn clear_root_vmar(&self) -> Result<()> {
self.vm_space.clear();
let mut inner = self.inner.write();
inner.vm_mappings.clear();
{
let full_range = 0..MAX_USERSPACE_VADDR;
let mut cursor = self.vm_space.cursor_mut(&full_range).unwrap();
cursor.unmap(full_range.len());
cursor.flusher().sync_tlb_flush();
}
{
let mut inner = self.inner.write();
inner.vm_mappings.clear();
}
Ok(())
}

View File

@ -25,9 +25,20 @@ pub(super) fn lock_range<'a, M: PageTableMode, E: PageTableEntryTrait, C: Paging
va: &Range<Vaddr>,
new_pt_is_tracked: MapTrackingStatus,
) -> Cursor<'a, M, E, C> {
// Start RCU read-side critical section.
let preempt_guard = disable_preempt();
let mut subtree_root = traverse_and_lock_subtree_root(pt, va, new_pt_is_tracked);
// The re-try loop of finding the sub-tree root.
//
// If we locked a stray node, we need to re-try. Otherwise, although
// there are no safety concerns, the operations of a cursor on a stray
// sub-tree will not see the current state and will not change the current
// state, breaking serializability.
let mut subtree_root = loop {
if let Some(subtree_root) = try_traverse_and_lock_subtree_root(pt, va, new_pt_is_tracked) {
break subtree_root;
}
};
// Once we have locked the sub-tree that is not stray, we won't read any
// stray nodes in the following traversal since we must lock before reading.
@ -70,7 +81,11 @@ pub(super) fn unlock_range<M: PageTableMode, E: PageTableEntryTrait, C: PagingCo
/// If that node (or any of its ancestors) does not exist, we need to lock
/// the parent and create it. After the creation the lock of the parent will
/// be released and the new node will be locked.
fn traverse_and_lock_subtree_root<
///
/// If this function finds that a locked node is stray (because of racing with
/// page table recycling), it will return `None`. The caller should retry in
/// this case to lock the proper node.
fn try_traverse_and_lock_subtree_root<
'a,
M: PageTableMode,
E: PageTableEntryTrait,
@ -79,7 +94,7 @@ fn traverse_and_lock_subtree_root<
pt: &'a PageTable<M, E, C>,
va: &Range<Vaddr>,
new_pt_is_tracked: MapTrackingStatus,
) -> PageTableGuard<'a, E, C> {
) -> Option<PageTableGuard<'a, E, C>> {
// # Safety
// Must be called with `cur_pt_addr` and `'a`, `E`, `E` of the residing function.
unsafe fn lock_cur_pt<'a, E: PageTableEntryTrait, C: PagingConstsTrait>(
@ -130,10 +145,13 @@ fn traverse_and_lock_subtree_root<
let mut guard = cur_node_guard
.take()
.unwrap_or_else(|| unsafe { lock_cur_pt::<'a, E, C>(cur_pt_addr) });
if *guard.stray_mut() {
return None;
}
let mut cur_entry = guard.entry(start_idx);
if cur_entry.is_none() {
let allocated_guard = cur_entry.alloc_if_none(new_pt_is_tracked).unwrap();
cur_pt_addr = allocated_guard.start_paddr();
cur_node_guard = Some(allocated_guard);
} else if cur_entry.is_node() {
@ -148,7 +166,13 @@ fn traverse_and_lock_subtree_root<
}
// SAFETY: It is called with the required parameters.
cur_node_guard.unwrap_or_else(|| unsafe { lock_cur_pt::<'a, E, C>(cur_pt_addr) })
let mut guard =
cur_node_guard.unwrap_or_else(|| unsafe { lock_cur_pt::<'a, E, C>(cur_pt_addr) });
if *guard.stray_mut() {
return None;
}
Some(guard)
}
/// Acquires the locks for the given range in the sub-tree rooted at the node.
@ -163,6 +187,7 @@ fn dfs_acquire_lock<E: PageTableEntryTrait, C: PagingConstsTrait>(
cur_node_va: Vaddr,
va_range: Range<Vaddr>,
) {
debug_assert!(!*cur_node.stray_mut());
let cur_level = cur_node.level();
if cur_level <= 1 {
return;
@ -222,6 +247,41 @@ unsafe fn dfs_release_lock<E: PageTableEntryTrait, C: PagingConstsTrait>(
}
}
/// Marks all the nodes in the sub-tree rooted at the node as stray.
///
/// This function must be called upon the node after the node is removed
/// from the parent page table.
///
/// This function also unlocks the nodes in the sub-tree.
///
/// # Safety
///
/// The caller must ensure that all the nodes in the sub-tree are locked.
///
/// This function must not be called upon a shared node. E.g., the second-
/// top level nodes that the kernel space and user space share.
pub(super) unsafe fn dfs_mark_stray_and_unlock<E: PageTableEntryTrait, C: PagingConstsTrait>(
mut sub_tree: PageTableGuard<E, C>,
) {
*sub_tree.stray_mut() = true;
if sub_tree.level() <= 1 {
return;
}
for i in (0..nr_subpage_per_huge::<C>()).rev() {
let child = sub_tree.entry(i);
match child.to_ref() {
Child::PageTableRef(pt) => {
// SAFETY: The caller ensures that the node is locked.
let locked_pt = unsafe { PageTableGuard::<E, C>::from_raw_paddr(pt.start_paddr()) };
dfs_mark_stray_and_unlock(locked_pt);
}
Child::None | Child::Frame(_, _) | Child::Untracked(_, _, _) | Child::PageTable(_) => {}
}
}
}
fn dfs_get_idx_range<C: PagingConstsTrait>(
cur_node_level: PagingLevel,
cur_node_va: Vaddr,

View File

@ -10,6 +10,12 @@
//! [`CursorMut::new`] will lock a range in the virtual space and all the
//! operations on the range with the cursor will be atomic as a transaction.
//!
//! The guarantee of the lock protocol is that, if two cursors' ranges overlap,
//! all of one's operation must be finished before any of the other's
//! operation. The order depends on the scheduling of the threads. If a cursor
//! is ordered after another cursor, it will see all the changes made by the
//! previous cursor.
//!
//! The implementation of the lock protocol resembles two-phase locking (2PL).
//! [`CursorMut::new`] accepts an address range, which indicates the page table
//! entries that may be visited by this cursor. Then, [`CursorMut::new`] finds
@ -23,7 +29,11 @@
mod locking;
use core::{any::TypeId, marker::PhantomData, ops::Range};
use core::{
any::TypeId,
marker::PhantomData,
ops::{Deref, Range},
};
use align_ext::AlignExt;
@ -64,6 +74,8 @@ pub struct Cursor<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsT
va: Vaddr,
/// The virtual address range that is locked.
barrier_va: Range<Vaddr>,
/// This also makes all the operations in `Cursor::new` be performed in an
/// RCU read-side critical section.
#[expect(dead_code)]
preempt_guard: DisabledPreemptGuard,
_phantom: PhantomData<&'a PageTable<M, E, C>>,
@ -89,6 +101,14 @@ pub enum PageTableItem {
len: usize,
prop: PageProperty,
},
/// This item can only show up as a return value of `take_next`. The caller
/// is responsible for freeing the page table node after TLB coherence.
/// FIXME: Separate into another type rather than `PageTableItem`?
StrayPageTable {
pt: Frame<dyn AnyFrameMeta>,
va: Vaddr,
len: usize,
},
}
impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> Cursor<'a, M, E, C> {
@ -509,10 +529,7 @@ impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> CursorM
}
// Go down if not applicable.
if cur_entry.is_node()
|| cur_va % page_size::<C>(cur_level) != 0
|| cur_va + page_size::<C>(cur_level) > end
{
if cur_va % page_size::<C>(cur_level) != 0 || cur_va + page_size::<C>(cur_level) > end {
let child = cur_entry.to_ref();
match child {
Child::PageTableRef(pt) => {
@ -567,7 +584,28 @@ impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> CursorM
prop,
}
}
Child::PageTable(_) | Child::None | Child::PageTableRef(_) => unreachable!(),
Child::PageTable(pt) => {
assert!(
!(TypeId::of::<M>() == TypeId::of::<KernelMode>()
&& self.0.level == C::NR_LEVELS),
"Unmapping shared kernel page table nodes"
);
// SAFETY: We must have locked this node.
let locked_pt =
unsafe { PageTableGuard::<E, C>::from_raw_paddr(pt.start_paddr()) };
// SAFETY:
// - We checked that we are not unmapping shared kernel page table nodes.
// - We must have locked the entire sub-tree since the range is locked.
unsafe { locking::dfs_mark_stray_and_unlock(locked_pt) };
PageTableItem::StrayPageTable {
pt: pt.deref().clone().into(),
va: self.0.va,
len: page_size::<C>(self.0.level),
}
}
Child::None | Child::PageTableRef(_) => unreachable!(),
};
self.0.move_forward();

View File

@ -14,8 +14,6 @@ use super::{
};
use crate::{
arch::mm::{PageTableEntry, PagingConsts},
cpu::PinCurrentCpu,
task::disable_preempt,
util::marker::SameSizeAs,
Pod,
};
@ -23,7 +21,8 @@ use crate::{
mod node;
use node::*;
pub mod cursor;
pub use cursor::{Cursor, CursorMut, PageTableItem};
pub(crate) use cursor::PageTableItem;
pub use cursor::{Cursor, CursorMut};
#[cfg(ktest)]
mod test;
@ -99,25 +98,6 @@ impl PageTable<UserMode> {
self.root.activate();
}
}
/// Clear the page table.
pub(super) fn clear<G: PinCurrentCpu>(&self, flusher: &mut super::tlb::TlbFlusher<'_, G>) {
let _guard = disable_preempt();
let mut root_node = self.root.lock();
const NR_PTES_PER_NODE: usize = nr_subpage_per_huge::<PagingConsts>();
for i in 0..NR_PTES_PER_NODE / 2 {
let mut root_entry = root_node.entry(i);
if !root_entry.is_none() {
let old = root_entry.replace(Child::None);
if let Child::PageTable(pt) = old {
flusher
.issue_tlb_flush_with(crate::mm::tlb::TlbFlushOp::All, pt.clone().into());
// There may be other cursors accessing the old child, delay it with RCU.
let _ = crate::sync::RcuDrop::new(pt);
}
}
}
}
}
impl PageTable<KernelMode> {
@ -171,7 +151,9 @@ impl PageTable<KernelMode> {
};
let pt_cloned = pt.clone();
let _ = new_node.entry(i).replace(Child::PageTable(pt_cloned));
let _ = new_node
.entry(i)
.replace(Child::PageTable(crate::sync::RcuDrop::new(pt_cloned)));
}
drop(new_node);

View File

@ -5,10 +5,13 @@
use core::{mem::ManuallyDrop, panic};
use super::{MapTrackingStatus, PageTableEntryTrait, PageTableNode, PageTableNodeRef};
use crate::mm::{
frame::{inc_frame_ref_count, meta::AnyFrameMeta, Frame},
page_prop::PageProperty,
Paddr, PagingConstsTrait, PagingLevel,
use crate::{
mm::{
frame::{inc_frame_ref_count, meta::AnyFrameMeta, Frame},
page_prop::PageProperty,
Paddr, PagingConstsTrait, PagingLevel,
},
sync::RcuDrop,
};
/// A child of a page table node.
@ -16,7 +19,7 @@ use crate::mm::{
#[derive(Debug)]
pub(in crate::mm) enum Child<'a, E: PageTableEntryTrait, C: PagingConstsTrait> {
/// A owning handle to a raw page table node.
PageTable(PageTableNode<E, C>),
PageTable(RcuDrop<PageTableNode<E, C>>),
/// A reference of a child page table node.
PageTableRef(PageTableNodeRef<'a, E, C>),
/// A mapped frame.
@ -109,7 +112,7 @@ impl<E: PageTableEntryTrait, C: PagingConstsTrait> Child<'_, E, C> {
// at the given level.
let pt = unsafe { PageTableNode::from_raw(paddr) };
debug_assert_eq!(pt.level(), level - 1);
return Child::PageTable(pt);
return Child::PageTable(RcuDrop::new(pt));
}
match is_tracked {

View File

@ -3,7 +3,10 @@
//! This module provides accessors to the page table entries in a node.
use super::{Child, MapTrackingStatus, PageTableEntryTrait, PageTableGuard, PageTableNode};
use crate::mm::{nr_subpage_per_huge, page_prop::PageProperty, page_size, PagingConstsTrait};
use crate::{
mm::{nr_subpage_per_huge, page_prop::PageProperty, page_size, PagingConstsTrait},
sync::RcuDrop,
};
/// A view of an entry in a page table node.
///
@ -127,8 +130,10 @@ impl<'guard, 'pt, E: PageTableEntryTrait, C: PagingConstsTrait> Entry<'guard, 'p
// 1. The index is within the bounds.
// 2. The new PTE is compatible with the page table node.
unsafe {
self.node
.write_pte(self.idx, Child::PageTable(new_page).into_pte())
self.node.write_pte(
self.idx,
Child::PageTable(RcuDrop::new(new_page)).into_pte(),
)
};
*self.node.nr_children_mut() += 1;
@ -178,8 +183,10 @@ impl<'guard, 'pt, E: PageTableEntryTrait, C: PagingConstsTrait> Entry<'guard, 'p
// 1. The index is within the bounds.
// 2. The new PTE is compatible with the page table node.
unsafe {
self.node
.write_pte(self.idx, Child::PageTable(new_page).into_pte())
self.node.write_pte(
self.idx,
Child::PageTable(RcuDrop::new(new_page)).into_pte(),
)
};
// SAFETY: The resulting guard lifetime (`'a`) is no shorter than the

View File

@ -191,6 +191,12 @@ impl<'a, E: PageTableEntryTrait, C: PagingConstsTrait> PageTableGuard<'a, E, C>
unsafe { *self.meta().nr_children.get() }
}
/// Returns whether the page table node is detached from its parent.
pub(super) fn stray_mut(&mut self) -> &mut bool {
// SAFETY: The lock is held so we have an exclusive access.
unsafe { &mut *self.meta().stray.get() }
}
/// Reads a non-owning PTE at the given index.
///
/// A non-owning PTE means that it does not account for a reference count
@ -258,6 +264,12 @@ impl<E: PageTableEntryTrait, C: PagingConstsTrait> Drop for PageTableGuard<'_, E
pub(in crate::mm) struct PageTablePageMeta<E: PageTableEntryTrait, C: PagingConstsTrait> {
/// The number of valid PTEs. It is mutable if the lock is held.
pub nr_children: SyncUnsafeCell<u16>,
/// If the page table is detached from its parent.
///
/// A page table can be detached from its parent while still being accessed,
/// since we use a RCU scheme to recycle page tables. If this flag is set,
/// it means that the parent is recycling the page table.
pub stray: SyncUnsafeCell<bool>,
/// The level of the page table page. A page table page cannot be
/// referenced by page tables of different levels.
pub level: PagingLevel,
@ -288,6 +300,7 @@ impl<E: PageTableEntryTrait, C: PagingConstsTrait> PageTablePageMeta<E, C> {
pub fn new(level: PagingLevel, is_tracked: MapTrackingStatus) -> Self {
Self {
nr_children: SyncUnsafeCell::new(0),
stray: SyncUnsafeCell::new(false),
level,
lock: AtomicU8::new(0),
is_tracked,
@ -296,8 +309,8 @@ impl<E: PageTableEntryTrait, C: PagingConstsTrait> PageTablePageMeta<E, C> {
}
}
// SAFETY: The layout of the `PageTablePageMeta` is ensured to be the same for
// all possible generic parameters. And the layout fits the requirements.
// SAFETY: We can read the page table node because the page table pages are
// accessed as untyped memory.
unsafe impl<E: PageTableEntryTrait, C: PagingConstsTrait> AnyFrameMeta for PageTablePageMeta<E, C> {
fn on_drop(&mut self, reader: &mut VmReader<Infallible>) {
let nr_children = self.nr_children.get_mut();

View File

@ -5,7 +5,6 @@ use crate::{
mm::{
kspace::LINEAR_MAPPING_BASE_VADDR,
page_prop::{CachePolicy, PageFlags},
tlb::TlbFlusher,
FrameAllocOptions, MAX_USERSPACE_VADDR, PAGE_SIZE,
},
prelude::*,
@ -130,7 +129,6 @@ mod test_utils {
mod create_page_table {
use super::{test_utils::*, *};
use crate::cpu::{AtomicCpuSet, CpuSet};
#[ktest]
fn init_user_page_table() {
@ -167,42 +165,6 @@ mod create_page_table {
assert_eq!(kernel_node.start_paddr(), user_node.start_paddr());
}
}
#[ktest]
fn clear_user_page_table() {
// Creates a kernel page table.
let kernel_pt = PageTable::<KernelMode>::new_kernel_page_table();
// Creates a user page table.
let user_pt = kernel_pt.create_user_page_table();
// Defines a virtual address range.
let range = PAGE_SIZE..(PAGE_SIZE * 2);
// Allocates a physical frame and sets page properties.
let frame = FrameAllocOptions::default().alloc_frame().unwrap();
let page_property = PageProperty::new(PageFlags::RW, CachePolicy::Writeback);
// Maps the virtual range to the physical frame.
unsafe {
user_pt
.cursor_mut(&range)
.unwrap()
.map(frame.into(), page_property);
}
// Confirms that the mapping exists.
assert!(user_pt.query(PAGE_SIZE + 10).is_some());
let set = AtomicCpuSet::new(CpuSet::new_empty());
let mut tlb_flusher = TlbFlusher::new(&set, disable_preempt());
// Clears the page table.
user_pt.clear(&mut tlb_flusher);
// Confirms that the mapping is cleared.
assert!(user_pt.query(PAGE_SIZE + 10).is_none());
}
}
mod range_checks {

View File

@ -674,34 +674,6 @@ mod vmspace {
);
}
/// Clears the `VmSpace`.
#[ktest]
fn vmspace_clear() {
let vmspace = VmSpace::new();
let range = 0x2000..0x3000;
{
let mut cursor_mut = vmspace
.cursor_mut(&range)
.expect("Failed to create mutable cursor");
let frame = create_dummy_frame();
let prop = PageProperty::new(PageFlags::R, CachePolicy::Writeback);
cursor_mut.map(frame, prop);
}
// Clears the VmSpace.
vmspace.clear();
// Verifies that the mapping is cleared.
let mut cursor = vmspace.cursor(&range).expect("Failed to create cursor");
assert_eq!(
cursor.next(),
Some(VmItem::NotMapped {
va: range.start,
len: range.start + 0x1000
})
);
}
/// Activates and deactivates the `VmSpace` in single-CPU scenarios.
#[ktest]
fn vmspace_activate() {

View File

@ -77,14 +77,6 @@ impl VmSpace {
}
}
/// Clears the user space mappings in the page table.
pub fn clear(&self) {
let mut flusher = TlbFlusher::new(&self.cpus, disable_preempt());
self.pt.clear(&mut flusher);
flusher.dispatch_tlb_flush();
flusher.sync_tlb_flush();
}
/// Gets an immutable cursor in the virtual address range.
///
/// The cursor behaves like a lock guard, exclusively owning a sub-tree of
@ -323,6 +315,10 @@ impl<'b> CursorMut<'_, 'b> {
PageTableItem::MappedUntracked { .. } => {
panic!("found untracked memory mapped into `VmSpace`");
}
PageTableItem::StrayPageTable { pt, va, len } => {
self.flusher
.issue_tlb_flush_with(TlbFlushOp::Range(va..va + len), pt);
}
}
}
@ -470,12 +466,13 @@ impl TryFrom<PageTableItem> for VmItem {
va,
frame: page
.try_into()
.map_err(|_| "found typed memory mapped into `VmSpace`")?,
.map_err(|_| "Found typed memory mapped into `VmSpace`")?,
prop,
}),
PageTableItem::MappedUntracked { .. } => {
Err("found untracked memory mapped into `VmSpace`")
Err("Found untracked memory mapped into `VmSpace`")
}
PageTableItem::StrayPageTable { .. } => Err("Stray page table cannot be query results"),
}
}
}