diff --git a/kernel/src/vm/vmar/mod.rs b/kernel/src/vm/vmar/mod.rs index 7c666bb1..0b59cf5a 100644 --- a/kernel/src/vm/vmar/mod.rs +++ b/kernel/src/vm/vmar/mod.rs @@ -355,9 +355,16 @@ impl Vmar_ { /// Clears all content of the root VMAR. fn clear_root_vmar(&self) -> Result<()> { - self.vm_space.clear(); - let mut inner = self.inner.write(); - inner.vm_mappings.clear(); + { + let full_range = 0..MAX_USERSPACE_VADDR; + let mut cursor = self.vm_space.cursor_mut(&full_range).unwrap(); + cursor.unmap(full_range.len()); + cursor.flusher().sync_tlb_flush(); + } + { + let mut inner = self.inner.write(); + inner.vm_mappings.clear(); + } Ok(()) } diff --git a/ostd/src/mm/page_table/cursor/locking.rs b/ostd/src/mm/page_table/cursor/locking.rs index a613ece2..fc734596 100644 --- a/ostd/src/mm/page_table/cursor/locking.rs +++ b/ostd/src/mm/page_table/cursor/locking.rs @@ -25,9 +25,20 @@ pub(super) fn lock_range<'a, M: PageTableMode, E: PageTableEntryTrait, C: Paging va: &Range, new_pt_is_tracked: MapTrackingStatus, ) -> Cursor<'a, M, E, C> { + // Start RCU read-side critical section. let preempt_guard = disable_preempt(); - let mut subtree_root = traverse_and_lock_subtree_root(pt, va, new_pt_is_tracked); + // The re-try loop of finding the sub-tree root. + // + // If we locked a stray node, we need to re-try. Otherwise, although + // there are no safety concerns, the operations of a cursor on a stray + // sub-tree will not see the current state and will not change the current + // state, breaking serializability. + let mut subtree_root = loop { + if let Some(subtree_root) = try_traverse_and_lock_subtree_root(pt, va, new_pt_is_tracked) { + break subtree_root; + } + }; // Once we have locked the sub-tree that is not stray, we won't read any // stray nodes in the following traversal since we must lock before reading. 
@@ -70,7 +81,11 @@ pub(super) fn unlock_range, va: &Range, new_pt_is_tracked: MapTrackingStatus, -) -> PageTableGuard<'a, E, C> { +) -> Option> { // # Safety // Must be called with `cur_pt_addr` and `'a`, `E`, `E` of the residing function. unsafe fn lock_cur_pt<'a, E: PageTableEntryTrait, C: PagingConstsTrait>( @@ -130,10 +145,13 @@ fn traverse_and_lock_subtree_root< let mut guard = cur_node_guard .take() .unwrap_or_else(|| unsafe { lock_cur_pt::<'a, E, C>(cur_pt_addr) }); + if *guard.stray_mut() { + return None; + } + let mut cur_entry = guard.entry(start_idx); if cur_entry.is_none() { let allocated_guard = cur_entry.alloc_if_none(new_pt_is_tracked).unwrap(); - cur_pt_addr = allocated_guard.start_paddr(); cur_node_guard = Some(allocated_guard); } else if cur_entry.is_node() { @@ -148,7 +166,13 @@ fn traverse_and_lock_subtree_root< } // SAFETY: It is called with the required parameters. - cur_node_guard.unwrap_or_else(|| unsafe { lock_cur_pt::<'a, E, C>(cur_pt_addr) }) + let mut guard = + cur_node_guard.unwrap_or_else(|| unsafe { lock_cur_pt::<'a, E, C>(cur_pt_addr) }); + if *guard.stray_mut() { + return None; + } + + Some(guard) } /// Acquires the locks for the given range in the sub-tree rooted at the node. @@ -163,6 +187,7 @@ fn dfs_acquire_lock( cur_node_va: Vaddr, va_range: Range, ) { + debug_assert!(!*cur_node.stray_mut()); let cur_level = cur_node.level(); if cur_level <= 1 { return; @@ -222,6 +247,41 @@ unsafe fn dfs_release_lock( } } +/// Marks all the nodes in the sub-tree rooted at the node as stray. +/// +/// This function must be called upon the node after the node is removed +/// from the parent page table. +/// +/// This function also unlocks the nodes in the sub-tree. +/// +/// # Safety +/// +/// The caller must ensure that all the nodes in the sub-tree are locked. +/// +/// This function must not be called upon a shared node. E.g., the second- +/// top level nodes that the kernel space and user space share. 
+pub(super) unsafe fn dfs_mark_stray_and_unlock( + mut sub_tree: PageTableGuard, +) { + *sub_tree.stray_mut() = true; + + if sub_tree.level() <= 1 { + return; + } + + for i in (0..nr_subpage_per_huge::()).rev() { + let child = sub_tree.entry(i); + match child.to_ref() { + Child::PageTableRef(pt) => { + // SAFETY: The caller ensures that the node is locked. + let locked_pt = unsafe { PageTableGuard::::from_raw_paddr(pt.start_paddr()) }; + dfs_mark_stray_and_unlock(locked_pt); + } + Child::None | Child::Frame(_, _) | Child::Untracked(_, _, _) | Child::PageTable(_) => {} + } + } +} + fn dfs_get_idx_range( cur_node_level: PagingLevel, cur_node_va: Vaddr, diff --git a/ostd/src/mm/page_table/cursor/mod.rs b/ostd/src/mm/page_table/cursor/mod.rs index bf09817c..928c2bb2 100644 --- a/ostd/src/mm/page_table/cursor/mod.rs +++ b/ostd/src/mm/page_table/cursor/mod.rs @@ -10,6 +10,12 @@ //! [`CursorMut::new`] will lock a range in the virtual space and all the //! operations on the range with the cursor will be atomic as a transaction. //! +//! The guarantee of the lock protocol is that, if two cursors' ranges overlap, +//! all of one's operation must be finished before any of the other's +//! operation. The order depends on the scheduling of the threads. If a cursor +//! is ordered after another cursor, it will see all the changes made by the +//! previous cursor. +//! //! The implementation of the lock protocol resembles two-phase locking (2PL). //! [`CursorMut::new`] accepts an address range, which indicates the page table //! entries that may be visited by this cursor. Then, [`CursorMut::new`] finds @@ -23,7 +29,11 @@ mod locking; -use core::{any::TypeId, marker::PhantomData, ops::Range}; +use core::{ + any::TypeId, + marker::PhantomData, + ops::{Deref, Range}, +}; use align_ext::AlignExt; @@ -64,6 +74,8 @@ pub struct Cursor<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsT va: Vaddr, /// The virtual address range that is locked. 
barrier_va: Range, + /// This also makes all the operations in `Cursor::new` be performed in an + /// RCU read-side critical section. #[expect(dead_code)] preempt_guard: DisabledPreemptGuard, _phantom: PhantomData<&'a PageTable>, @@ -89,6 +101,14 @@ pub enum PageTableItem { len: usize, prop: PageProperty, }, + /// This item can only show up as a return value of `take_next`. The caller + /// is responsible for freeing the page table node after TLB coherence. + /// FIXME: Separate into another type rather than `PageTableItem`? + StrayPageTable { + pt: Frame, + va: Vaddr, + len: usize, + }, } impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> Cursor<'a, M, E, C> { @@ -509,10 +529,7 @@ impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> CursorM } // Go down if not applicable. - if cur_entry.is_node() - || cur_va % page_size::(cur_level) != 0 - || cur_va + page_size::(cur_level) > end - { + if cur_va % page_size::(cur_level) != 0 || cur_va + page_size::(cur_level) > end { let child = cur_entry.to_ref(); match child { Child::PageTableRef(pt) => { @@ -567,7 +584,28 @@ impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> CursorM prop, } } - Child::PageTable(_) | Child::None | Child::PageTableRef(_) => unreachable!(), + Child::PageTable(pt) => { + assert!( + !(TypeId::of::() == TypeId::of::() + && self.0.level == C::NR_LEVELS), + "Unmapping shared kernel page table nodes" + ); + + // SAFETY: We must have locked this node. + let locked_pt = + unsafe { PageTableGuard::::from_raw_paddr(pt.start_paddr()) }; + // SAFETY: + // - We checked that we are not unmapping shared kernel page table nodes. + // - We must have locked the entire sub-tree since the range is locked. 
+ unsafe { locking::dfs_mark_stray_and_unlock(locked_pt) }; + + PageTableItem::StrayPageTable { + pt: pt.deref().clone().into(), + va: self.0.va, + len: page_size::(self.0.level), + } + } + Child::None | Child::PageTableRef(_) => unreachable!(), }; self.0.move_forward(); diff --git a/ostd/src/mm/page_table/mod.rs b/ostd/src/mm/page_table/mod.rs index 8547de71..372ec5f4 100644 --- a/ostd/src/mm/page_table/mod.rs +++ b/ostd/src/mm/page_table/mod.rs @@ -14,8 +14,6 @@ use super::{ }; use crate::{ arch::mm::{PageTableEntry, PagingConsts}, - cpu::PinCurrentCpu, - task::disable_preempt, util::marker::SameSizeAs, Pod, }; @@ -23,7 +21,8 @@ use crate::{ mod node; use node::*; pub mod cursor; -pub use cursor::{Cursor, CursorMut, PageTableItem}; +pub(crate) use cursor::PageTableItem; +pub use cursor::{Cursor, CursorMut}; #[cfg(ktest)] mod test; @@ -99,25 +98,6 @@ impl PageTable { self.root.activate(); } } - - /// Clear the page table. - pub(super) fn clear(&self, flusher: &mut super::tlb::TlbFlusher<'_, G>) { - let _guard = disable_preempt(); - let mut root_node = self.root.lock(); - const NR_PTES_PER_NODE: usize = nr_subpage_per_huge::(); - for i in 0..NR_PTES_PER_NODE / 2 { - let mut root_entry = root_node.entry(i); - if !root_entry.is_none() { - let old = root_entry.replace(Child::None); - if let Child::PageTable(pt) = old { - flusher - .issue_tlb_flush_with(crate::mm::tlb::TlbFlushOp::All, pt.clone().into()); - // There may be other cursors accessing the old child, delay it with RCU. 
- let _ = crate::sync::RcuDrop::new(pt); - } - } - } - } } impl PageTable { @@ -171,7 +151,9 @@ impl PageTable { }; let pt_cloned = pt.clone(); - let _ = new_node.entry(i).replace(Child::PageTable(pt_cloned)); + let _ = new_node + .entry(i) + .replace(Child::PageTable(crate::sync::RcuDrop::new(pt_cloned))); } drop(new_node); diff --git a/ostd/src/mm/page_table/node/child.rs b/ostd/src/mm/page_table/node/child.rs index ebd86cd9..fc3e918d 100644 --- a/ostd/src/mm/page_table/node/child.rs +++ b/ostd/src/mm/page_table/node/child.rs @@ -5,10 +5,13 @@ use core::{mem::ManuallyDrop, panic}; use super::{MapTrackingStatus, PageTableEntryTrait, PageTableNode, PageTableNodeRef}; -use crate::mm::{ - frame::{inc_frame_ref_count, meta::AnyFrameMeta, Frame}, - page_prop::PageProperty, - Paddr, PagingConstsTrait, PagingLevel, +use crate::{ + mm::{ + frame::{inc_frame_ref_count, meta::AnyFrameMeta, Frame}, + page_prop::PageProperty, + Paddr, PagingConstsTrait, PagingLevel, + }, + sync::RcuDrop, }; /// A child of a page table node. @@ -16,7 +19,7 @@ use crate::mm::{ #[derive(Debug)] pub(in crate::mm) enum Child<'a, E: PageTableEntryTrait, C: PagingConstsTrait> { /// A owning handle to a raw page table node. - PageTable(PageTableNode), + PageTable(RcuDrop>), /// A reference of a child page table node. PageTableRef(PageTableNodeRef<'a, E, C>), /// A mapped frame. @@ -109,7 +112,7 @@ impl Child<'_, E, C> { // at the given level. let pt = unsafe { PageTableNode::from_raw(paddr) }; debug_assert_eq!(pt.level(), level - 1); - return Child::PageTable(pt); + return Child::PageTable(RcuDrop::new(pt)); } match is_tracked { diff --git a/ostd/src/mm/page_table/node/entry.rs b/ostd/src/mm/page_table/node/entry.rs index 3c890313..e65bd0ea 100644 --- a/ostd/src/mm/page_table/node/entry.rs +++ b/ostd/src/mm/page_table/node/entry.rs @@ -3,7 +3,10 @@ //! This module provides accessors to the page table entries in a node. 
use super::{Child, MapTrackingStatus, PageTableEntryTrait, PageTableGuard, PageTableNode}; -use crate::mm::{nr_subpage_per_huge, page_prop::PageProperty, page_size, PagingConstsTrait}; +use crate::{ + mm::{nr_subpage_per_huge, page_prop::PageProperty, page_size, PagingConstsTrait}, + sync::RcuDrop, +}; /// A view of an entry in a page table node. /// @@ -127,8 +130,10 @@ impl<'guard, 'pt, E: PageTableEntryTrait, C: PagingConstsTrait> Entry<'guard, 'p // 1. The index is within the bounds. // 2. The new PTE is compatible with the page table node. unsafe { - self.node - .write_pte(self.idx, Child::PageTable(new_page).into_pte()) + self.node.write_pte( + self.idx, + Child::PageTable(RcuDrop::new(new_page)).into_pte(), + ) }; *self.node.nr_children_mut() += 1; @@ -178,8 +183,10 @@ impl<'guard, 'pt, E: PageTableEntryTrait, C: PagingConstsTrait> Entry<'guard, 'p // 1. The index is within the bounds. // 2. The new PTE is compatible with the page table node. unsafe { - self.node - .write_pte(self.idx, Child::PageTable(new_page).into_pte()) + self.node.write_pte( + self.idx, + Child::PageTable(RcuDrop::new(new_page)).into_pte(), + ) }; // SAFETY: The resulting guard lifetime (`'a`) is no shorter than the diff --git a/ostd/src/mm/page_table/node/mod.rs b/ostd/src/mm/page_table/node/mod.rs index b793b837..1997cb30 100644 --- a/ostd/src/mm/page_table/node/mod.rs +++ b/ostd/src/mm/page_table/node/mod.rs @@ -191,6 +191,12 @@ impl<'a, E: PageTableEntryTrait, C: PagingConstsTrait> PageTableGuard<'a, E, C> unsafe { *self.meta().nr_children.get() } } + /// Returns if the page table node is detached from its parent. + pub(super) fn stray_mut(&mut self) -> &mut bool { + // SAFETY: The lock is held so we have an exclusive access. + unsafe { &mut *self.meta().stray.get() } + } + /// Reads a non-owning PTE at the given index. 
/// /// A non-owning PTE means that it does not account for a reference count @@ -258,6 +264,12 @@ impl Drop for PageTableGuard<'_, E pub(in crate::mm) struct PageTablePageMeta { /// The number of valid PTEs. It is mutable if the lock is held. pub nr_children: SyncUnsafeCell, + /// If the page table is detached from its parent. + /// + /// A page table can be detached from its parent while still being accessed, + /// since we use an RCU scheme to recycle page tables. If this flag is set, + /// it means that the parent is recycling the page table. + pub stray: SyncUnsafeCell, /// The level of the page table page. A page table page cannot be /// referenced by page tables of different levels. pub level: PagingLevel, @@ -288,6 +300,7 @@ impl PageTablePageMeta { pub fn new(level: PagingLevel, is_tracked: MapTrackingStatus) -> Self { Self { nr_children: SyncUnsafeCell::new(0), + stray: SyncUnsafeCell::new(false), level, lock: AtomicU8::new(0), is_tracked, @@ -296,8 +309,8 @@ impl PageTablePageMeta { } } -// SAFETY: The layout of the `PageTablePageMeta` is ensured to be the same for -// all possible generic parameters. And the layout fits the requirements. +// SAFETY: We can read the page table node because the page table pages are +// accessed as untyped memory. 
unsafe impl AnyFrameMeta for PageTablePageMeta { fn on_drop(&mut self, reader: &mut VmReader) { let nr_children = self.nr_children.get_mut(); diff --git a/ostd/src/mm/page_table/test.rs b/ostd/src/mm/page_table/test.rs index 3dedf135..4a720bc8 100644 --- a/ostd/src/mm/page_table/test.rs +++ b/ostd/src/mm/page_table/test.rs @@ -5,7 +5,6 @@ use crate::{ mm::{ kspace::LINEAR_MAPPING_BASE_VADDR, page_prop::{CachePolicy, PageFlags}, - tlb::TlbFlusher, FrameAllocOptions, MAX_USERSPACE_VADDR, PAGE_SIZE, }, prelude::*, @@ -130,7 +129,6 @@ mod test_utils { mod create_page_table { use super::{test_utils::*, *}; - use crate::cpu::{AtomicCpuSet, CpuSet}; #[ktest] fn init_user_page_table() { @@ -167,42 +165,6 @@ mod create_page_table { assert_eq!(kernel_node.start_paddr(), user_node.start_paddr()); } } - - #[ktest] - fn clear_user_page_table() { - // Creates a kernel page table. - let kernel_pt = PageTable::::new_kernel_page_table(); - - // Creates a user page table. - let user_pt = kernel_pt.create_user_page_table(); - - // Defines a virtual address range. - let range = PAGE_SIZE..(PAGE_SIZE * 2); - - // Allocates a physical frame and sets page properties. - let frame = FrameAllocOptions::default().alloc_frame().unwrap(); - let page_property = PageProperty::new(PageFlags::RW, CachePolicy::Writeback); - - // Maps the virtual range to the physical frame. - unsafe { - user_pt - .cursor_mut(&range) - .unwrap() - .map(frame.into(), page_property); - } - - // Confirms that the mapping exists. - assert!(user_pt.query(PAGE_SIZE + 10).is_some()); - - let set = AtomicCpuSet::new(CpuSet::new_empty()); - let mut tlb_flusher = TlbFlusher::new(&set, disable_preempt()); - - // Clears the page table. - user_pt.clear(&mut tlb_flusher); - - // Confirms that the mapping is cleared. 
- assert!(user_pt.query(PAGE_SIZE + 10).is_none()); - } } mod range_checks { diff --git a/ostd/src/mm/test.rs b/ostd/src/mm/test.rs index 1e2ffbae..efc80a26 100644 --- a/ostd/src/mm/test.rs +++ b/ostd/src/mm/test.rs @@ -674,34 +674,6 @@ mod vmspace { ); } - /// Clears the `VmSpace`. - #[ktest] - fn vmspace_clear() { - let vmspace = VmSpace::new(); - let range = 0x2000..0x3000; - { - let mut cursor_mut = vmspace - .cursor_mut(&range) - .expect("Failed to create mutable cursor"); - let frame = create_dummy_frame(); - let prop = PageProperty::new(PageFlags::R, CachePolicy::Writeback); - cursor_mut.map(frame, prop); - } - - // Clears the VmSpace. - vmspace.clear(); - - // Verifies that the mapping is cleared. - let mut cursor = vmspace.cursor(&range).expect("Failed to create cursor"); - assert_eq!( - cursor.next(), - Some(VmItem::NotMapped { - va: range.start, - len: range.start + 0x1000 - }) - ); - } - /// Activates and deactivates the `VmSpace` in single-CPU scenarios. #[ktest] fn vmspace_activate() { diff --git a/ostd/src/mm/vm_space.rs b/ostd/src/mm/vm_space.rs index a6338704..3ff87e59 100644 --- a/ostd/src/mm/vm_space.rs +++ b/ostd/src/mm/vm_space.rs @@ -77,14 +77,6 @@ impl VmSpace { } } - /// Clears the user space mappings in the page table. - pub fn clear(&self) { - let mut flusher = TlbFlusher::new(&self.cpus, disable_preempt()); - self.pt.clear(&mut flusher); - flusher.dispatch_tlb_flush(); - flusher.sync_tlb_flush(); - } - /// Gets an immutable cursor in the virtual address range. /// /// The cursor behaves like a lock guard, exclusively owning a sub-tree of @@ -323,6 +315,10 @@ impl<'b> CursorMut<'_, 'b> { PageTableItem::MappedUntracked { .. 
} => { panic!("found untracked memory mapped into `VmSpace`"); } + PageTableItem::StrayPageTable { pt, va, len } => { + self.flusher + .issue_tlb_flush_with(TlbFlushOp::Range(va..va + len), pt); + } } } @@ -470,12 +466,13 @@ impl TryFrom for VmItem { va, frame: page .try_into() - .map_err(|_| "found typed memory mapped into `VmSpace`")?, + .map_err(|_| "Found typed memory mapped into `VmSpace`")?, prop, }), PageTableItem::MappedUntracked { .. } => { - Err("found untracked memory mapped into `VmSpace`") + Err("Found untracked memory mapped into `VmSpace`") } + PageTableItem::StrayPageTable { .. } => Err("Stray page table cannot be query results"), } } }