// SPDX-License-Identifier: MPL-2.0

//! The page table cursor for mapping and querying over the page table.
//!
//! ## The page table lock protocol
//!
//! We provide a fine-grained lock protocol to allow concurrent accesses to
//! the page table. The protocol was originally proposed by Ruihan Li.
//!
//! [`CursorMut::new`] accepts an address range, which indicates the page table
//! entries that may be visited by this cursor.
//!
//! Then, [`CursorMut::new`] finds an intermediate page table (not necessarily
//! the last-level or the top-level) which represents an address range that
//! contains the whole specified address range. It acquires all locks from the
//! root page table down to the intermediate page table, and then releases all
//! of them except the one for the intermediate page table. `CursorMut` then
//! maintains the lock guards, from the one for the intermediate page table
//! down to the one for the leaf node that the cursor is currently
//! manipulating.
//!
//! For example, if we're going to map the address range shown below:
//!
//! ```plain
//! Top-level page table node             A
//!                                      /
//!                                     B
//!                                    / \
//! Last-level page table nodes      C     D
//! Last-level PTEs               ---**...**---
//!                                 \__ __/
//!                                    V
//!        Address range that we're going to map
//! ```
//!
//! When calling [`CursorMut::new`], it will:
//!  1. `lock(A)`, `lock(B)`, `unlock(A)`;
//!  2. `guards = [ locked(B) ]`.
//!
//! When calling [`CursorMut::map`], it will:
//!  1. `lock(C)`, `guards = [ locked(B), locked(C) ]`;
//!  2. Map some pages in `C`;
//!  3. `unlock(C)`, `guards = [ locked(B) ]`;
//!  4. `lock(D)`, `guards = [ locked(B), locked(D) ]`;
//!  5. Map some pages in `D`;
//!  6. `unlock(D)`, `guards = [ locked(B) ]`.
//!
//! ## Validity
//!
//! The page table cursor API guarantees that the page table, as a data
//! structure, will not suffer from data races in the memory it occupies. This
//! is ensured by the page table lock protocol. In other words, any operation
//! provided by the API (as long as its safety requirements are met) will not
//! break the page table data structure (or other memory).
//!
//! However, the page table cursor creation APIs, [`CursorMut::new`] and
//! [`Cursor::new`], do not guarantee exclusive access to the virtual address
//! area you claim. From the lock protocol, you can see that it is possible to
//! create two cursors that claim the same virtual address range (one covering
//! the other). In this case, the greater cursor may block if it wants to
//! modify the page table entries covered by the smaller cursor. Also, if the
//! greater cursor destructs the smaller cursor's parent page table node, it
//! won't block, and the smaller cursor's changes will not be visible. The
//! user of the page table cursor should add additional entry-point checks to
//! prevent these defined behaviors if they are not wanted.

use core::{any::TypeId, marker::PhantomData, ops::Range};

use align_ext::AlignExt;

use super::{
    page_size, pte_index, Child, KernelMode, PageTable, PageTableEntryTrait, PageTableError,
    PageTableMode, PageTableNode, PagingConstsTrait, PagingLevel, UserMode,
};
use crate::mm::{page::DynPage, Paddr, PageProperty, Vaddr};

/// An item that is mapped into, or queried from, a slot of the page table.
#[derive(Clone, Debug)]
pub enum PageTableItem {
    /// The slot is not mapped.
    NotMapped {
        va: Vaddr,
        len: usize,
    },
    /// The slot is mapped to a page tracked by the metadata system.
    Mapped {
        va: Vaddr,
        page: DynPage,
        prop: PageProperty,
    },
    /// The slot is mapped to a plain physical address range.
    #[allow(dead_code)]
    MappedUntracked {
        va: Vaddr,
        pa: Paddr,
        len: usize,
        prop: PageProperty,
    },
}
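// A minimal sketch (not part of the page table API): how a caller might
// measure the virtual range covered by an item yielded from a cursor. The
// helper `item_len` is hypothetical and for illustration only; `DynPage::size`
// is the size of the mapped page in bytes.
#[allow(dead_code)]
fn item_len(item: &PageTableItem) -> usize {
    match item {
        // An unmapped hole covers `len` bytes.
        PageTableItem::NotMapped { len, .. } => *len,
        // A tracked mapping covers exactly one page handle.
        PageTableItem::Mapped { page, .. } => page.size(),
        // An untracked mapping records its length explicitly.
        PageTableItem::MappedUntracked { len, .. } => *len,
    }
}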
/// The cursor for traversal over the page table.
///
/// A slot is a PTE at any level, which corresponds to a certain virtual
/// memory range sized by the "page size" of the current level.
///
/// A cursor is able to move to the next slot, to read page properties, and
/// even to jump to a virtual address directly. We use a guard stack to
/// simulate the recursion, and adopt a page table locking protocol to provide
/// concurrency.
#[derive(Debug)]
pub struct Cursor<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait>
where
    [(); C::NR_LEVELS as usize]:,
{
    /// The lock guards of the cursor. The level 1 page table lock guard is at
    /// index 0, and the level N page table lock guard is at index N - 1.
    ///
    /// When destructing the cursor, the locks will be released in the order
    /// from low to high, exactly the reverse order of the acquisition.
    /// This behavior is ensured by the default drop implementation of Rust:
    /// <https://doc.rust-lang.org/reference/destructors.html>.
    guards: [Option<PageTableNode<E, C>>; C::NR_LEVELS as usize],
    /// The level of the page table that the cursor points to.
    level: PagingLevel,
    /// From `guard_level` to `level`, the locks are held in `guards`.
    guard_level: PagingLevel,
    /// The current virtual address that the cursor points to.
    va: Vaddr,
    /// The virtual address range that is locked.
    barrier_va: Range<Vaddr>,
    phantom: PhantomData<&'a PageTable<M, E, C>>,
}

impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> Cursor<'a, M, E, C>
where
    [(); C::NR_LEVELS as usize]:,
{
    /// Creates a cursor claiming read access to the given range.
    ///
    /// The created cursor will only be able to query or jump within the given
    /// range. Out-of-bound accesses will result in panics or errors as return
    /// values, depending on the access method.
    ///
    /// Note that this function does not ensure exclusive access to the claimed
    /// virtual address range. Accesses using this cursor may block or fail.
    pub fn new(pt: &'a PageTable<M, E, C>, va: &Range<Vaddr>) -> Result<Self, PageTableError> {
        if !M::covers(va) {
            return Err(PageTableError::InvalidVaddrRange(va.start, va.end));
        }
        if va.start % C::BASE_PAGE_SIZE != 0 || va.end % C::BASE_PAGE_SIZE != 0 {
            return Err(PageTableError::UnalignedVaddr);
        }

        // Create a guard array that only holds the root node lock.
        let guards = core::array::from_fn(|i| {
            if i == (C::NR_LEVELS - 1) as usize {
                Some(pt.root.clone_shallow().lock())
            } else {
                None
            }
        });
        let mut cursor = Self {
            guards,
            level: C::NR_LEVELS,
            guard_level: C::NR_LEVELS,
            va: va.start,
            barrier_va: va.clone(),
            phantom: PhantomData,
        };

        // Go down and get proper locks. The cursor should hold a lock of a
        // page table node containing the virtual address range.
        //
        // While going down, previous guards of too-high levels will be released.
        loop {
            let level_too_high = {
                let start_idx = pte_index::<C>(va.start, cursor.level);
                let end_idx = pte_index::<C>(va.end - 1, cursor.level);
                start_idx == end_idx
            };
            if !level_too_high {
                break;
            }
            let cur_pte = cursor.read_cur_pte();
            if !cur_pte.is_present() || cur_pte.is_last(cursor.level) {
                break;
            }
            cursor.level_down();

            // Release the guard of the previous (upper) level.
            cursor.guards[cursor.level as usize] = None;
            cursor.guard_level -= 1;
        }

        Ok(cursor)
    }
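    // A minimal sketch (hypothetical helper, not in the original API): the
    // number of page table levels this cursor currently holds locked. Per the
    // lock protocol, the guards from `level` up to `guard_level` (both ends
    // inclusive) are held in `guards`.
    #[allow(dead_code)]
    fn nr_locked_levels(&self) -> usize {
        (self.guard_level - self.level) as usize + 1
    }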
    /// Gets the information of the current slot.
    pub fn query(&mut self) -> Result<PageTableItem, PageTableError> {
        if self.va >= self.barrier_va.end {
            return Err(PageTableError::InvalidVaddr(self.va));
        }
        loop {
            let level = self.level;
            let va = self.va;
            let pte = self.read_cur_pte();

            if !pte.is_present() {
                return Ok(PageTableItem::NotMapped {
                    va,
                    len: page_size::<C>(level),
                });
            }
            if !pte.is_last(level) {
                self.level_down();
                continue;
            }

            match self.cur_child() {
                Child::Page(page) => {
                    return Ok(PageTableItem::Mapped {
                        va,
                        page,
                        prop: pte.prop(),
                    });
                }
                Child::Untracked(pa) => {
                    return Ok(PageTableItem::MappedUntracked {
                        va,
                        pa,
                        len: page_size::<C>(level),
                        prop: pte.prop(),
                    });
                }
                Child::None | Child::PageTable(_) => {
                    unreachable!(); // Already checked with the PTE.
                }
            }
        }
    }

    /// Traverses forward in the current level to the next PTE.
    ///
    /// If the cursor reaches the end of a page table node, it goes up to the
    /// parent node's next slot if possible.
    pub(in crate::mm) fn move_forward(&mut self) {
        let page_size = page_size::<C>(self.level);
        let next_va = self.va.align_down(page_size) + page_size;
        while self.level < self.guard_level && pte_index::<C>(next_va, self.level) == 0 {
            self.level_up();
        }
        self.va = next_va;
    }

    /// Jumps to the given virtual address.
    ///
    /// If the target address is out of the range, this method returns `Err`.
    ///
    /// # Panics
    ///
    /// This method panics if the address has bad alignment.
    pub fn jump(&mut self, va: Vaddr) -> Result<(), PageTableError> {
        assert!(va % C::BASE_PAGE_SIZE == 0);
        if !self.barrier_va.contains(&va) {
            return Err(PageTableError::InvalidVaddr(va));
        }
        loop {
            let cur_node_start = self.va & !(page_size::<C>(self.level + 1) - 1);
            let cur_node_end = cur_node_start + page_size::<C>(self.level + 1);
            // If the address is within the current node, we can jump directly.
            if cur_node_start <= va && va < cur_node_end {
                self.va = va;
                return Ok(());
            }

            // There is a corner case: the cursor is depleted, sitting at the
            // start of the next node, but the next node is not locked because
            // its parent is not locked.
            if self.va >= self.barrier_va.end && self.level == self.guard_level {
                self.va = va;
                return Ok(());
            }

            debug_assert!(self.level < self.guard_level);
            self.level_up();
        }
    }

    /// Gets the current virtual address.
    pub fn virt_addr(&self) -> Vaddr {
        self.va
    }

    /// Goes up a level.
    ///
    /// We release the current page table node if it has no mappings, since
    /// the cursor only moves forward. If needed, we do the final cleanup
    /// using this method after a re-walk when the cursor is dropped.
    ///
    /// This method requires locks acquired before calling it. The discarded
    /// level will be unlocked.
    fn level_up(&mut self) {
        self.guards[(self.level - 1) as usize] = None;
        self.level += 1;

        // TODO: Drop page tables if they become empty.
    }

    /// Goes down a level, assuming a child page table exists.
    fn level_down(&mut self) {
        debug_assert!(self.level > 1);

        let Child::PageTable(nxt_lvl_ptn) = self.cur_child() else {
            panic!("Trying to level down when it is not mapped to a page table");
        };
        let nxt_lvl_ptn_locked = nxt_lvl_ptn.lock();
        self.level -= 1;
        debug_assert_eq!(self.level, nxt_lvl_ptn_locked.level());
        self.guards[(self.level - 1) as usize] = Some(nxt_lvl_ptn_locked);
    }

    fn cur_node(&self) -> &PageTableNode<E, C> {
        self.guards[(self.level - 1) as usize].as_ref().unwrap()
    }

    fn cur_idx(&self) -> usize {
        pte_index::<C>(self.va, self.level)
    }

    fn cur_child(&self) -> Child<E, C> {
        self.cur_node()
            .child(self.cur_idx(), self.in_tracked_range())
    }

    fn read_cur_pte(&self) -> E {
        self.cur_node().read_pte(self.cur_idx())
    }
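    // A minimal sketch (hypothetical helper, not in the original API): the
    // virtual address of the slot following the current one, i.e. the address
    // that `move_forward` advances to before any `level_up`.
    #[allow(dead_code)]
    fn next_slot_va(&self) -> Vaddr {
        let page_size = page_size::<C>(self.level);
        // Align down to the current slot's boundary, then step one slot.
        self.va.align_down(page_size) + page_size
    }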
    /// Tells if the current virtual range must contain untracked mappings.
    ///
    /// _Tracked mappings_ are mappings whose mapped physical addresses (in
    /// PTEs) point to pages tracked by the metadata system. Tracked mappings
    /// must be created with page handles, while _untracked mappings_ solely
    /// map to plain physical addresses.
    ///
    /// In the kernel mode, this is aligned with the definition in
    /// [`crate::mm::kspace`]: only linear mappings in the kernel are
    /// considered untracked mappings.
    ///
    /// All mappings in the user mode are tracked. And all mappings in the
    /// IOMMU page table are untracked.
    fn in_tracked_range(&self) -> bool {
        TypeId::of::<M>() == TypeId::of::<UserMode>()
            || TypeId::of::<M>() == TypeId::of::<KernelMode>()
                && !crate::mm::kspace::LINEAR_MAPPING_VADDR_RANGE.contains(&self.va)
    }
}

impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> Iterator
    for Cursor<'a, M, E, C>
where
    [(); C::NR_LEVELS as usize]:,
{
    type Item = PageTableItem;

    fn next(&mut self) -> Option<Self::Item> {
        let result = self.query();
        if result.is_ok() {
            self.move_forward();
        }
        result.ok()
    }
}
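// A minimal sketch (not part of the API): counting the mapped pages in a
// range by driving a read-only cursor through its `Iterator` implementation.
// The function `count_mapped` is hypothetical and for illustration only.
#[allow(dead_code)]
fn count_mapped<M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait>(
    pt: &PageTable<M, E, C>,
    va: &Range<Vaddr>,
) -> Result<usize, PageTableError>
where
    [(); C::NR_LEVELS as usize]:,
{
    let cursor = Cursor::new(pt, va)?;
    // The iterator yields one `PageTableItem` per slot and stops at the end
    // of the queried range.
    Ok(cursor
        .filter(|item| matches!(item, PageTableItem::Mapped { .. }))
        .count())
}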
/// The cursor of a page table that is capable of mapping, unmapping, or
/// protecting pages.
///
/// It also has all the capabilities of a [`Cursor`]. A virtual address range
/// in a page table can only be accessed by one cursor, whether it is mutable
/// or not.
#[derive(Debug)]
pub struct CursorMut<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait>(
    Cursor<'a, M, E, C>,
)
where
    [(); C::NR_LEVELS as usize]:;

impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> CursorMut<'a, M, E, C>
where
    [(); C::NR_LEVELS as usize]:,
{
    /// Creates a cursor claiming write access to the given range.
    ///
    /// The created cursor will only be able to map, query, or jump within the
    /// given range. Out-of-bound accesses will result in panics or errors as
    /// return values, depending on the access method.
    ///
    /// Note that this function, the same as [`Cursor::new`], does not ensure
    /// exclusive access to the claimed virtual address range. Accesses using
    /// this cursor may block or fail.
    pub(super) fn new(
        pt: &'a PageTable<M, E, C>,
        va: &Range<Vaddr>,
    ) -> Result<Self, PageTableError> {
        Cursor::new(pt, va).map(|inner| Self(inner))
    }

    /// Jumps to the given virtual address.
    ///
    /// This is the same as [`Cursor::jump`]: if the target address is out of
    /// the range, this method returns `Err`.
    ///
    /// # Panics
    ///
    /// This method panics if the address has bad alignment.
    pub fn jump(&mut self, va: Vaddr) -> Result<(), PageTableError> {
        self.0.jump(va)
    }

    /// Gets the current virtual address.
    pub fn virt_addr(&self) -> Vaddr {
        self.0.virt_addr()
    }

    /// Gets the information of the current slot.
    pub fn query(&mut self) -> Result<PageTableItem, PageTableError> {
        self.0.query()
    }

    /// Maps the range starting from the current address to a [`DynPage`].
    ///
    /// # Panics
    ///
    /// This function will panic if
    ///  - the virtual address range to be mapped is out of the range;
    ///  - the alignment of the page is not satisfied by the virtual address;
    ///  - it is already mapped to a huge page while the caller wants to map a
    ///    smaller one.
    ///
    /// # Safety
    ///
    /// The caller should ensure that the virtual range being mapped does not
    /// affect the kernel's memory safety.
    pub unsafe fn map(&mut self, page: DynPage, prop: PageProperty) {
        let end = self.0.va + page.size();
        assert!(end <= self.0.barrier_va.end);
        debug_assert!(self.0.in_tracked_range());

        // Go down if the page does not fit in the current slot.
        while self.0.level > C::HIGHEST_TRANSLATION_LEVEL
            || self.0.va % page_size::<C>(self.0.level) != 0
            || self.0.va + page_size::<C>(self.0.level) > end
        {
            let pte = self.0.read_cur_pte();
            if pte.is_present() && !pte.is_last(self.0.level) {
                self.0.level_down();
            } else if !pte.is_present() {
                self.level_down_create();
            } else {
                panic!("Mapping a smaller page in an already mapped huge page");
            }
        }
        debug_assert_eq!(self.0.level, page.level());

        // Map the current page.
        let idx = self.0.cur_idx();
        self.cur_node_mut().set_child_page(idx, page, prop);
        self.0.move_forward();
    }

    /// Maps the range starting from the current address to a physical
    /// address range.
    ///
    /// The function maps as many huge pages as possible, and it will split
    /// the huge pages into smaller pages if necessary. If the input range is
    /// large, the resulting mappings may look like this (if very huge pages
    /// are supported):
    ///
    /// ```text
    /// start                                                             end
    ///   |----|----------------|--------------------------------|----|----|
    ///    base       huge                    very huge            base base
    ///    4KiB       2MiB                      1GiB               4KiB 4KiB
    /// ```
    ///
    /// In practice, using this method is not recommended, for the sake of
    /// safety and conciseness.
    ///
    /// # Panics
    ///
    /// This function will panic if
    ///  - the virtual address range to be mapped is out of the range.
    ///
    /// # Safety
    ///
    /// The caller should ensure that
    ///  - the range being mapped does not affect the kernel's memory safety;
    ///  - the physical address to be mapped is valid and safe to use;
    ///  - it is allowed to map untracked pages in this virtual address range.
    pub unsafe fn map_pa(&mut self, pa: &Range<Paddr>, prop: PageProperty) {
        let end = self.0.va + pa.len();
        let mut pa = pa.start;
        assert!(end <= self.0.barrier_va.end);

        while self.0.va < end {
            // We ensure not mapping in the reserved kernel shared tables or
            // releasing them. This may be an invariant for all architectures,
            // and the check will be optimized out by the compiler, since
            // `C::NR_LEVELS - 1 > C::HIGHEST_TRANSLATION_LEVEL`.
            let is_kernel_shared_node = TypeId::of::<M>() == TypeId::of::<KernelMode>()
                && self.0.level >= C::NR_LEVELS - 1;
            if self.0.level > C::HIGHEST_TRANSLATION_LEVEL
                || is_kernel_shared_node
                || self.0.va % page_size::<C>(self.0.level) != 0
                || self.0.va + page_size::<C>(self.0.level) > end
                || pa % page_size::<C>(self.0.level) != 0
            {
                let pte = self.0.read_cur_pte();
                if pte.is_present() && !pte.is_last(self.0.level) {
                    self.0.level_down();
                } else if !pte.is_present() {
                    self.level_down_create();
                } else {
                    self.level_down_split();
                }
                continue;
            }

            // Map the current page.
            debug_assert!(!self.0.in_tracked_range());
            let idx = self.0.cur_idx();
            self.cur_node_mut().set_child_untracked(idx, pa, prop);

            let level = self.0.level;
            pa += page_size::<C>(level);
            self.0.move_forward();
        }
    }
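    // A minimal sketch (hypothetical helper, not in the original API): the
    // eligibility test that `map_pa` applies (negated) in its loop condition.
    // A slot can hold an untracked mapping only if the level is translatable,
    // the slot is not in the reserved kernel shared tables, and both the
    // virtual and physical addresses are aligned and fit before `end`.
    #[allow(dead_code)]
    fn can_map_untracked_here(&self, pa: Paddr, end: Vaddr) -> bool {
        let size = page_size::<C>(self.0.level);
        let is_kernel_shared_node = TypeId::of::<M>() == TypeId::of::<KernelMode>()
            && self.0.level >= C::NR_LEVELS - 1;
        self.0.level <= C::HIGHEST_TRANSLATION_LEVEL
            && !is_kernel_shared_node
            && self.0.va % size == 0
            && self.0.va + size <= end
            && pa % size == 0
    }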
    /// Finds and removes the first page in the cursor's following range.
    ///
    /// The range to be searched is the current virtual address with the
    /// provided length.
    ///
    /// The function stops and yields the page if it has actually removed a
    /// page, no matter if the following pages are also required to be
    /// unmapped. The returned page is the virtual page that existed before
    /// the removal but has just been unmapped.
    ///
    /// It also makes the cursor move forward to the next page after the
    /// removed one, when an actual page is removed. If no mapped pages exist
    /// in the following range, the cursor will stop at the end of the range
    /// and return [`PageTableItem::NotMapped`].
    ///
    /// # Safety
    ///
    /// The caller should ensure that the range being unmapped does not affect
    /// the kernel's memory safety.
    ///
    /// # Panics
    ///
    /// This function will panic if the end of the range covers only a part of
    /// a huge page and the next page to be removed is that huge page.
    pub unsafe fn take_next(&mut self, len: usize) -> PageTableItem {
        let start = self.0.va;
        assert!(len % page_size::<C>(1) == 0);
        let end = start + len;
        assert!(end <= self.0.barrier_va.end);

        while self.0.va < end {
            let cur_pte = self.0.read_cur_pte();
            let is_tracked = self.0.in_tracked_range();

            // Skip if it is already absent.
            if !cur_pte.is_present() {
                if self.0.va + page_size::<C>(self.0.level) > end {
                    self.0.va = end;
                    break;
                }
                self.0.move_forward();
                continue;
            }

            // Go down if the current PTE points to a page table.
            if !cur_pte.is_last(self.0.level) {
                self.0.level_down();

                // We have gone down a level. If there are no mapped PTEs in
                // the current node, we can go back and skip it to save time.
                if self.0.guards[(self.0.level - 1) as usize]
                    .as_ref()
                    .unwrap()
                    .nr_children()
                    == 0
                {
                    self.0.level_up();
                    self.0.move_forward();
                }

                continue;
            }

            // Go down if we are removing part of a huge untracked page.
            if self.0.va % page_size::<C>(self.0.level) != 0
                || self.0.va + page_size::<C>(self.0.level) > end
            {
                if !is_tracked {
                    self.level_down_split();
                    continue;
                } else {
                    panic!("removing part of a huge page");
                }
            }

            // Unmap the current page and return it.
            let idx = self.0.cur_idx();
            let ret = self.cur_node_mut().take_child(idx, is_tracked);
            let ret_page_va = self.0.va;
            let ret_page_size = page_size::<C>(self.0.level);

            self.0.move_forward();

            return match ret {
                Child::Page(page) => PageTableItem::Mapped {
                    va: ret_page_va,
                    page,
                    prop: cur_pte.prop(),
                },
                Child::Untracked(pa) => PageTableItem::MappedUntracked {
                    va: ret_page_va,
                    pa,
                    len: ret_page_size,
                    prop: cur_pte.prop(),
                },
                Child::None | Child::PageTable(_) => unreachable!(),
            };
        }

        // If the loop exits, we did not find any mapped pages in the range.
        PageTableItem::NotMapped { va: start, len }
    }
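    // A minimal sketch (hypothetical helper, not in the original API): drain
    // every mapping in the next `len` bytes by calling `take_next` until it
    // reports that the whole remaining range is not mapped. As `take_next`
    // asserts, `len` must be a multiple of the base page size.
    //
    // Safety: same contract as `Self::take_next`.
    #[allow(dead_code)]
    unsafe fn take_all(&mut self, len: usize) {
        let end = self.0.va + len;
        while self.0.va < end {
            match self.take_next(end - self.0.va) {
                // No mapped pages remain; the cursor now sits at `end`.
                PageTableItem::NotMapped { .. } => break,
                // The removed page (or untracked range) is dropped here.
                _ => continue,
            }
        }
    }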
    /// Applies the operation to the next slot of mapping within the range.
    ///
    /// The range to be searched is the current virtual address with the
    /// provided length.
    ///
    /// The function stops and yields the actually protected range if it has
    /// actually protected a page, no matter if the following pages are also
    /// required to be protected.
    ///
    /// It also makes the cursor move forward to the next page after the
    /// protected one. If no mapped pages exist in the following range, the
    /// cursor will stop at the end of the range and return [`None`].
    ///
    /// # Safety
    ///
    /// The caller should ensure that the range being protected with the
    /// operation does not affect the kernel's memory safety.
    ///
    /// # Panics
    ///
    /// This function will panic if:
    ///  - the range to be protected is out of the range where the cursor is
    ///    required to operate;
    ///  - the specified virtual address range only covers a part of a page.
    pub unsafe fn protect_next(
        &mut self,
        len: usize,
        op: &mut impl FnMut(&mut PageProperty),
    ) -> Option<Range<Vaddr>> {
        let end = self.0.va + len;
        assert!(end <= self.0.barrier_va.end);

        while self.0.va < end {
            let cur_pte = self.0.read_cur_pte();
            if !cur_pte.is_present() {
                self.0.move_forward();
                continue;
            }

            // Go down if it's not a last-level PTE.
            if !cur_pte.is_last(self.0.level) {
                self.0.level_down();

                // We have gone down a level. If there are no mapped PTEs in
                // the current node, we can go back and skip it to save time.
                if self.0.guards[(self.0.level - 1) as usize]
                    .as_ref()
                    .unwrap()
                    .nr_children()
                    == 0
                {
                    self.0.level_up();
                    self.0.move_forward();
                }

                continue;
            }

            // Go down if the page size is too big and we are protecting part
            // of an untracked huge page.
            if self.0.va % page_size::<C>(self.0.level) != 0
                || self.0.va + page_size::<C>(self.0.level) > end
            {
                if self.0.in_tracked_range() {
                    panic!("protecting part of a huge page");
                } else {
                    self.level_down_split();
                    continue;
                }
            }

            let mut pte_prop = cur_pte.prop();
            op(&mut pte_prop);

            let idx = self.0.cur_idx();
            self.cur_node_mut().protect(idx, pte_prop);

            let protected_va = self.0.va..self.0.va + page_size::<C>(self.0.level);
            self.0.move_forward();

            return Some(protected_va);
        }

        None
    }

    /// Consumes the cursor and leaks the root guard to the caller if the
    /// cursor locked the root level.
    ///
    /// It is useful when the caller wants to keep the root guard while the
    /// cursor should be dropped.
    pub(super) fn leak_root_guard(mut self) -> Option<PageTableNode<E, C>> {
        if self.0.guard_level != C::NR_LEVELS {
            return None;
        }
        while self.0.level < C::NR_LEVELS {
            self.0.level_up();
        }
        self.0.guards[(C::NR_LEVELS - 1) as usize].take()
        // It is OK to drop the cursor here because we ensure not to access
        // the page table if the current level is the root level when running
        // the dropping method.
    }

    /// Goes down a level, assuming the current slot is absent.
    ///
    /// This method will create a new child page table node and go down to it.
    fn level_down_create(&mut self) {
        debug_assert!(self.0.level > 1);
        let new_node = PageTableNode::<E, C>::alloc(self.0.level - 1);
        let idx = self.0.cur_idx();
        let is_tracked = self.0.in_tracked_range();
        self.cur_node_mut()
            .set_child_pt(idx, new_node.clone_raw(), is_tracked);
        self.0.level -= 1;
        self.0.guards[(self.0.level - 1) as usize] = Some(new_node);
    }

    /// Goes down a level, assuming the current slot is an untracked huge
    /// page.
    ///
    /// This method will split the huge page and go down to the next level.
    fn level_down_split(&mut self) {
        debug_assert!(self.0.level > 1);
        debug_assert!(!self.0.in_tracked_range());
        let idx = self.0.cur_idx();
        self.cur_node_mut().split_untracked_huge(idx);

        let Child::PageTable(new_node) = self.0.cur_child() else {
            unreachable!();
        };
        self.0.level -= 1;
        self.0.guards[(self.0.level - 1) as usize] = Some(new_node.lock());
    }

    fn cur_node_mut(&mut self) -> &mut PageTableNode<E, C> {
        self.0.guards[(self.0.level - 1) as usize].as_mut().unwrap()
    }
}
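// A minimal sketch (not part of the API): applying a property transform to
// every mapped page in the next `len` bytes, following the intended
// `protect_next` loop pattern. The function `protect_range` is hypothetical
// and for illustration only.
//
// Safety: the caller upholds the contract of `CursorMut::protect_next`.
#[allow(dead_code)]
unsafe fn protect_range<M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait>(
    cursor: &mut CursorMut<'_, M, E, C>,
    len: usize,
    op: &mut impl FnMut(&mut PageProperty),
) where
    [(); C::NR_LEVELS as usize]:,
{
    let end = cursor.virt_addr() + len;
    // Each call protects at most one slot and advances the cursor; `None`
    // means no mapped pages remain before `end`.
    while cursor.virt_addr() < end {
        if cursor
            .protect_next(end - cursor.virt_addr(), &mut *op)
            .is_none()
        {
            break;
        }
    }
}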