From d9eccdcfbe1db71587f5f14b3d45ef337203ad51 Mon Sep 17 00:00:00 2001 From: Zhang Junyang Date: Fri, 30 Aug 2024 17:49:18 +0800 Subject: [PATCH] Implement remote TLB flush on `VmSpace` --- kernel/src/sched/priority_scheduler.rs | 4 +- .../src/thread/work_queue/simple_scheduler.rs | 10 +- kernel/src/thread/work_queue/worker_pool.rs | 5 +- kernel/src/vm/vmar/mod.rs | 9 +- .../lib.rs | 1 + ostd/src/cpu/mod.rs | 25 +- ostd/src/mm/kspace.rs | 4 +- ostd/src/mm/page_table/cursor.rs | 93 +++-- ostd/src/mm/page_table/mod.rs | 6 +- ostd/src/mm/page_table/node.rs | 228 +++++------- ostd/src/mm/vm_space.rs | 337 +++++++++++------- ostd/src/smp.rs | 6 +- ostd/src/task/preempt/guard.rs | 1 + 13 files changed, 391 insertions(+), 338 deletions(-) diff --git a/kernel/src/sched/priority_scheduler.rs b/kernel/src/sched/priority_scheduler.rs index 15a602dce..47c82d718 100644 --- a/kernel/src/sched/priority_scheduler.rs +++ b/kernel/src/sched/priority_scheduler.rs @@ -50,14 +50,14 @@ impl PreemptScheduler { let mut minimum_load = usize::MAX; for candidate in runnable.cpu_affinity().iter() { - let rq = self.rq[candidate].lock(); + let rq = self.rq[candidate as usize].lock(); // A wild guess measuring the load of a runqueue. We assume that // real-time tasks are 4-times as important as normal tasks. let load = rq.real_time_entities.len() * 8 + rq.normal_entities.len() * 2 + rq.lowest_entities.len(); if load < minimum_load { - selected = candidate as u32; + selected = candidate; minimum_load = load; } } diff --git a/kernel/src/thread/work_queue/simple_scheduler.rs b/kernel/src/thread/work_queue/simple_scheduler.rs index 963b44dd9..4cc8bb99b 100644 --- a/kernel/src/thread/work_queue/simple_scheduler.rs +++ b/kernel/src/thread/work_queue/simple_scheduler.rs @@ -24,12 +24,12 @@ impl WorkerScheduler for SimpleScheduler { fn schedule(&self) { let worker_pool = self.worker_pool.upgrade().unwrap(); for cpu_id in worker_pool.cpu_set().iter() { - if !worker_pool.heartbeat(cpu_id as u32) - && worker_pool.has_pending_work_items(cpu_id as u32) - && !worker_pool.wake_worker(cpu_id as u32) - && worker_pool.num_workers(cpu_id as u32) < WORKER_LIMIT + if !worker_pool.heartbeat(cpu_id) + && worker_pool.has_pending_work_items(cpu_id) + && !worker_pool.wake_worker(cpu_id) + && worker_pool.num_workers(cpu_id) < WORKER_LIMIT { - worker_pool.add_worker(cpu_id as u32); + worker_pool.add_worker(cpu_id); } } } diff --git a/kernel/src/thread/work_queue/worker_pool.rs b/kernel/src/thread/work_queue/worker_pool.rs index c3fd6dd0b..c12167b0e 100644 --- a/kernel/src/thread/work_queue/worker_pool.rs +++ b/kernel/src/thread/work_queue/worker_pool.rs @@ -128,10 +128,7 @@ impl WorkerPool { Arc::new_cyclic(|pool_ref| { let mut local_pools = Vec::new(); for cpu_id in cpu_set.iter() { - local_pools.push(Arc::new(LocalWorkerPool::new( - pool_ref.clone(), - cpu_id as u32, - ))); + local_pools.push(Arc::new(LocalWorkerPool::new(pool_ref.clone(), cpu_id))); } WorkerPool { local_pools, diff --git a/kernel/src/vm/vmar/mod.rs b/kernel/src/vm/vmar/mod.rs index c3f857020..c2089611e 100644 --- a/kernel/src/vm/vmar/mod.rs +++ b/kernel/src/vm/vmar/mod.rs @@ -336,7 +336,7 @@ impl Vmar_ { if !self.is_root_vmar() { return_errno_with_message!(Errno::EACCES, "The vmar is not root vmar"); } - self.vm_space.clear(); + self.clear_vm_space(); let mut inner = self.inner.lock(); inner.child_vmar_s.clear(); inner.vm_mappings.clear(); @@ -346,6 +346,13 @@ impl Vmar_ { Ok(()) } + fn clear_vm_space(&self) { + let start = ROOT_VMAR_LOWEST_ADDR; + let end = ROOT_VMAR_CAP_ADDR; 
+ let mut cursor = self.vm_space.cursor_mut(&(start..end)).unwrap(); + cursor.unmap(end - start); + } + pub fn destroy(&self, range: Range) -> Result<()> { self.check_destroy_range(&range)?; let mut inner = self.inner.lock(); diff --git a/osdk/tests/examples_in_book/write_a_kernel_in_100_lines_templates/lib.rs b/osdk/tests/examples_in_book/write_a_kernel_in_100_lines_templates/lib.rs index fcb2feb3b..23618110b 100644 --- a/osdk/tests/examples_in_book/write_a_kernel_in_100_lines_templates/lib.rs +++ b/osdk/tests/examples_in_book/write_a_kernel_in_100_lines_templates/lib.rs @@ -57,6 +57,7 @@ fn create_user_space(program: &[u8]) -> UserSpace { for frame in user_pages { cursor.map(frame, map_prop); } + drop(cursor); Arc::new(vm_space) }; let user_cpu_state = { diff --git a/ostd/src/cpu/mod.rs b/ostd/src/cpu/mod.rs index bd98f8d7c..309ec68d0 100644 --- a/ostd/src/cpu/mod.rs +++ b/ostd/src/cpu/mod.rs @@ -10,12 +10,7 @@ cfg_if::cfg_if! { } } -use alloc::vec::Vec; - -use bitvec::{ - prelude::{BitVec, Lsb0}, - slice::IterOnes, -}; +use bitvec::prelude::BitVec; use local::cpu_local_cell; use spin::Once; @@ -122,13 +117,6 @@ impl CpuSet { self.bitset.set(cpu_id as usize, true); } - /// Adds a list of CPUs to the set. - pub fn add_from_vec(&mut self, cpu_ids: Vec) { - for cpu_id in cpu_ids { - self.add(cpu_id) - } - } - /// Adds all CPUs to the set. pub fn add_all(&mut self) { self.bitset.fill(true); @@ -139,13 +127,6 @@ impl CpuSet { self.bitset.set(cpu_id as usize, false); } - /// Removes a list of CPUs from the set. - pub fn remove_from_vec(&mut self, cpu_ids: Vec) { - for cpu_id in cpu_ids { - self.remove(cpu_id); - } - } - /// Removes all CPUs from the set. pub fn clear(&mut self) { self.bitset.fill(false); @@ -162,8 +143,8 @@ impl CpuSet { } /// Iterates over the CPUs in the set. - pub fn iter(&self) -> IterOnes<'_, usize, Lsb0> { - self.bitset.iter_ones() + pub fn iter(&self) -> impl Iterator + '_ { + self.bitset.iter_ones().map(|idx| idx as u32) } } diff --git a/ostd/src/mm/kspace.rs b/ostd/src/mm/kspace.rs index a9075d17e..111b434a6 100644 --- a/ostd/src/mm/kspace.rs +++ b/ostd/src/mm/kspace.rs @@ -156,7 +156,7 @@ pub fn init_kernel_page_table(meta_pages: Vec>) { for meta_page in meta_pages { // SAFETY: we are doing the metadata mappings for the kernel. unsafe { - cursor.map(meta_page.into(), prop); + let _old = cursor.map(meta_page.into(), prop); } } } @@ -199,7 +199,7 @@ pub fn init_kernel_page_table(meta_pages: Vec>) { let page = Page::::from_unused(frame_paddr, KernelMeta::default()); // SAFETY: we are doing mappings for the kernel. unsafe { - cursor.map(page.into(), prop); + let _old = cursor.map(page.into(), prop); } } } diff --git a/ostd/src/mm/page_table/cursor.rs b/ostd/src/mm/page_table/cursor.rs index ee0d3d927..a02f490df 100644 --- a/ostd/src/mm/page_table/cursor.rs +++ b/ostd/src/mm/page_table/cursor.rs @@ -73,7 +73,10 @@ use super::{ page_size, pte_index, Child, KernelMode, PageTable, PageTableEntryTrait, PageTableError, PageTableMode, PageTableNode, PagingConstsTrait, PagingLevel, UserMode, }; -use crate::mm::{page::DynPage, Paddr, PageProperty, Vaddr}; +use crate::{ + mm::{page::DynPage, Paddr, PageProperty, Vaddr}, + task::{disable_preempt, DisabledPreemptGuard}, +}; #[derive(Clone, Debug)] pub enum PageTableItem { @@ -125,7 +128,8 @@ where va: Vaddr, /// The virtual address range that is locked. 
barrier_va: Range, - phantom: PhantomData<&'a PageTable>, + preempt_guard: DisabledPreemptGuard, + _phantom: PhantomData<&'a PageTable>, } impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> Cursor<'a, M, E, C> @@ -162,7 +166,8 @@ where guard_level: C::NR_LEVELS, va: va.start, barrier_va: va.clone(), - phantom: PhantomData, + preempt_guard: disable_preempt(), + _phantom: PhantomData, }; // Go down and get proper locks. The cursor should hold a lock of a @@ -204,37 +209,28 @@ where let level = self.level; let va = self.va; - let pte = self.read_cur_pte(); - if !pte.is_present() { - return Ok(PageTableItem::NotMapped { - va, - len: page_size::(level), - }); - } - if !pte.is_last(level) { - self.level_down(); - continue; - } - match self.cur_child() { - Child::Page(page) => { - return Ok(PageTableItem::Mapped { + Child::PageTable(_) => { + self.level_down(); + continue; + } + Child::None => { + return Ok(PageTableItem::NotMapped { va, - page, - prop: pte.prop(), + len: page_size::(level), }); } - Child::Untracked(pa) => { + Child::Page(page, prop) => { + return Ok(PageTableItem::Mapped { va, page, prop }); + } + Child::Untracked(pa, prop) => { return Ok(PageTableItem::MappedUntracked { va, pa, len: page_size::(level), - prop: pte.prop(), + prop, }); } - Child::None | Child::PageTable(_) => { - unreachable!(); // Already checked with the PTE. - } } } } @@ -289,6 +285,10 @@ where self.va } + pub fn preempt_guard(&self) -> &DisabledPreemptGuard { + &self.preempt_guard + } + /// Goes up a level. We release the current page if it has no mappings since the cursor only moves /// forward. And if needed we will do the final cleanup using this method after re-walk when the /// cursor is dropped. @@ -423,6 +423,8 @@ where /// Maps the range starting from the current address to a [`DynPage`]. /// + /// It returns the previously mapped [`DynPage`] if that exists. + /// /// # Panics /// /// This function will panic if @@ -434,7 +436,7 @@ where /// /// The caller should ensure that the virtual range being mapped does /// not affect kernel's memory safety. - pub unsafe fn map(&mut self, page: DynPage, prop: PageProperty) { + pub unsafe fn map(&mut self, page: DynPage, prop: PageProperty) -> Option { let end = self.0.va + page.size(); assert!(end <= self.0.barrier_va.end); debug_assert!(self.0.in_tracked_range()); @@ -458,8 +460,19 @@ where // Map the current page. let idx = self.0.cur_idx(); - self.cur_node_mut().set_child_page(idx, page, prop); + let old = self + .cur_node_mut() + .replace_child(idx, Child::Page(page, prop), true); self.0.move_forward(); + + match old { + Child::Page(old_page, _) => Some(old_page), + Child::None => None, + Child::PageTable(_) => { + todo!("Dropping page table nodes while mapping requires TLB flush") + } + Child::Untracked(_, _) => panic!("Mapping a tracked page in an untracked range"), + } } /// Maps the range starting from the current address to a physical address range. @@ -520,7 +533,9 @@ where // Map the current page. debug_assert!(!self.0.in_tracked_range()); let idx = self.0.cur_idx(); - self.cur_node_mut().set_child_untracked(idx, pa, prop); + let _ = self + .cur_node_mut() + .replace_child(idx, Child::Untracked(pa, prop), false); let level = self.0.level; pa += page_size::(level); @@ -605,23 +620,25 @@ where // Unmap the current page and return it. 
let idx = self.0.cur_idx(); - let ret = self.cur_node_mut().take_child(idx, is_tracked); + let ret = self + .cur_node_mut() + .replace_child(idx, Child::None, is_tracked); let ret_page_va = self.0.va; let ret_page_size = page_size::(self.0.level); self.0.move_forward(); return match ret { - Child::Page(page) => PageTableItem::Mapped { + Child::Page(page, prop) => PageTableItem::Mapped { va: ret_page_va, page, - prop: cur_pte.prop(), + prop, }, - Child::Untracked(pa) => PageTableItem::MappedUntracked { + Child::Untracked(pa, prop) => PageTableItem::MappedUntracked { va: ret_page_va, pa, len: ret_page_size, - prop: cur_pte.prop(), + prop, }, Child::None | Child::PageTable(_) => unreachable!(), }; @@ -717,6 +734,10 @@ where None } + pub fn preempt_guard(&self) -> &DisabledPreemptGuard { + &self.0.preempt_guard + } + /// Consumes itself and leak the root guard for the caller if it locked the root level. /// /// It is useful when the caller wants to keep the root guard while the cursor should be dropped. @@ -743,8 +764,12 @@ where let new_node = PageTableNode::::alloc(self.0.level - 1); let idx = self.0.cur_idx(); let is_tracked = self.0.in_tracked_range(); - self.cur_node_mut() - .set_child_pt(idx, new_node.clone_raw(), is_tracked); + let old = self.cur_node_mut().replace_child( + idx, + Child::PageTable(new_node.clone_raw()), + is_tracked, + ); + debug_assert!(old.is_none()); self.0.level -= 1; self.0.guards[(self.0.level - 1) as usize] = Some(new_node); } diff --git a/ostd/src/mm/page_table/mod.rs b/ostd/src/mm/page_table/mod.rs index e54afc420..9bb1e2cc6 100644 --- a/ostd/src/mm/page_table/mod.rs +++ b/ostd/src/mm/page_table/mod.rs @@ -162,7 +162,11 @@ impl PageTable { for i in start..end { if !root_node.read_pte(i).is_present() { let node = PageTableNode::alloc(PagingConsts::NR_LEVELS - 1); - root_node.set_child_pt(i, node.into_raw(), i < NR_PTES_PER_NODE * 3 / 4); + let _ = root_node.replace_child( + i, + Child::PageTable(node.into_raw()), + i < NR_PTES_PER_NODE * 3 / 4, + ); } } } diff --git a/ostd/src/mm/page_table/node.rs b/ostd/src/mm/page_table/node.rs index 37e54ca74..134cd112a 100644 --- a/ostd/src/mm/page_table/node.rs +++ b/ostd/src/mm/page_table/node.rs @@ -42,7 +42,6 @@ use crate::{ page_prop::PageProperty, Paddr, PagingConstsTrait, PagingLevel, PAGE_SIZE, }, - task::{disable_preempt, DisabledPreemptGuard}, }; /// The raw handle to a page table node. @@ -80,8 +79,6 @@ where // count is needed. let page = unsafe { Page::>::from_raw(this.paddr()) }; - let disable_preempt = disable_preempt(); - // Acquire the lock. while page .meta() @@ -92,10 +89,7 @@ where core::hint::spin_loop(); } - PageTableNode:: { - page, - preempt_guard: disable_preempt, - } + PageTableNode:: { page, _private: () } } /// Creates a copy of the handle. @@ -190,7 +184,7 @@ pub(super) struct PageTableNode< [(); C::NR_LEVELS as usize]:, { pub(super) page: Page>, - preempt_guard: DisabledPreemptGuard, + _private: (), } // FIXME: We cannot `#[derive(Debug)]` here due to `DisabledPreemptGuard`. Should we skip @@ -215,12 +209,21 @@ where [(); C::NR_LEVELS as usize]:, { PageTable(RawPageTableNode), - Page(DynPage), + Page(DynPage, PageProperty), /// Pages not tracked by handles. 
- Untracked(Paddr), + Untracked(Paddr, PageProperty), None, } +impl Child +where + [(); C::NR_LEVELS as usize]:, +{ + pub(super) fn is_none(&self) -> bool { + matches!(self, Child::None) + } +} + impl PageTableNode where [(); C::NR_LEVELS as usize]:, @@ -241,10 +244,7 @@ where unsafe { core::ptr::write_bytes(ptr, 0, PAGE_SIZE) }; debug_assert!(E::new_absent().as_bytes().iter().all(|&b| b == 0)); - Self { - page, - preempt_guard: disable_preempt(), - } + Self { page, _private: () } } pub fn level(&self) -> PagingLevel { @@ -253,16 +253,11 @@ where /// Converts the handle into a raw handle to be stored in a PTE or CPU. pub(super) fn into_raw(self) -> RawPageTableNode { - let mut this = ManuallyDrop::new(self); + let this = ManuallyDrop::new(self); let raw = this.page.paddr(); this.page.meta().lock.store(0, Ordering::Release); - // SAFETY: The field will no longer be accessed and we need to drop the field to release - // the preempt count. - unsafe { - core::ptr::drop_in_place(&mut this.preempt_guard); - } RawPageTableNode { raw, @@ -300,40 +295,82 @@ where _phantom: PhantomData, }) } else if in_tracked_range { - // SAFETY: We have a reference count to the page and can safely increase the reference - // count by one more. + // SAFETY: We have a reference count to the page and can safely + // increase the reference count by one more. unsafe { DynPage::inc_ref_count(paddr); } - Child::Page(unsafe { DynPage::from_raw(paddr) }) + // SAFETY: The physical address of the PTE points to a forgotten + // page. It is reclaimed only once. + Child::Page(unsafe { DynPage::from_raw(paddr) }, pte.prop()) } else { - Child::Untracked(paddr) + Child::Untracked(paddr, pte.prop()) } } } - /// Remove the child at the given index and return it. - pub(super) fn take_child(&mut self, idx: usize, in_tracked_range: bool) -> Child { + /// Replace the child at the given index with a new child. + /// + /// The old child is returned. + pub(super) fn replace_child( + &mut self, + idx: usize, + new_child: Child, + in_tracked_range: bool, + ) -> Child { debug_assert!(idx < nr_subpage_per_huge::()); - let pte = self.read_pte(idx); - if !pte.is_present() { - Child::None - } else { - let paddr = pte.paddr(); - let is_last = pte.is_last(self.level()); - *self.nr_children_mut() -= 1; - self.write_pte(idx, E::new_absent()); - if !is_last { + let old_pte = self.read_pte(idx); + + let new_child_is_none = match new_child { + Child::None => { + if old_pte.is_present() { + self.write_pte(idx, E::new_absent()); + } + true + } + Child::PageTable(pt) => { + let pt = ManuallyDrop::new(pt); + let new_pte = E::new_pt(pt.paddr()); + self.write_pte(idx, new_pte); + false + } + Child::Page(page, prop) => { + debug_assert!(in_tracked_range); + let new_pte = E::new_page(page.into_raw(), self.level(), prop); + self.write_pte(idx, new_pte); + false + } + Child::Untracked(pa, prop) => { + debug_assert!(!in_tracked_range); + let new_pte = E::new_page(pa, self.level(), prop); + self.write_pte(idx, new_pte); + false + } + }; + + if old_pte.is_present() { + if new_child_is_none { + *self.nr_children_mut() -= 1; + } + let paddr = old_pte.paddr(); + if !old_pte.is_last(self.level()) { Child::PageTable(RawPageTableNode { raw: paddr, _phantom: PhantomData, }) } else if in_tracked_range { - Child::Page(unsafe { DynPage::from_raw(paddr) }) + // SAFETY: The physical address of the old PTE points to a + // forgotten page. It is reclaimed only once. 
+ Child::Page(unsafe { DynPage::from_raw(paddr) }, old_pte.prop()) } else { - Child::Untracked(paddr) + Child::Untracked(paddr, old_pte.prop()) } + } else { + if !new_child_is_none { + *self.nr_children_mut() += 1; + } + Child::None } } @@ -364,16 +401,17 @@ where Child::PageTable(pt) => { let guard = pt.clone_shallow().lock(); let new_child = guard.make_copy(0..nr_subpage_per_huge::(), 0..0); - new_pt.set_child_pt(i, new_child.into_raw(), true); + let old = new_pt.replace_child(i, Child::PageTable(new_child.into_raw()), true); + debug_assert!(old.is_none()); copied_child_count -= 1; } - Child::Page(page) => { - let prop = self.read_pte_prop(i); - new_pt.set_child_page(i, page.clone(), prop); + Child::Page(page, prop) => { + let old = new_pt.replace_child(i, Child::Page(page.clone(), prop), true); + debug_assert!(old.is_none()); copied_child_count -= 1; } Child::None => {} - Child::Untracked(_) => { + Child::Untracked(_, _) => { unreachable!(); } } @@ -386,11 +424,16 @@ where debug_assert_eq!(self.level(), C::NR_LEVELS); match self.child(i, /*meaningless*/ true) { Child::PageTable(pt) => { - new_pt.set_child_pt(i, pt.clone_shallow(), /*meaningless*/ true); + let old = new_pt.replace_child( + i, + Child::PageTable(pt.clone_shallow()), + /*meaningless*/ true, + ); + debug_assert!(old.is_none()); copied_child_count -= 1; } Child::None => {} - Child::Page(_) | Child::Untracked(_) => { + Child::Page(_, _) | Child::Untracked(_, _) => { unreachable!(); } } @@ -399,73 +442,23 @@ where new_pt } - /// Sets a child page table at a given index. - pub(super) fn set_child_pt( - &mut self, - idx: usize, - pt: RawPageTableNode, - in_tracked_range: bool, - ) { - // They should be ensured by the cursor. - debug_assert!(idx < nr_subpage_per_huge::()); - - // The ownership is transferred to a raw PTE. Don't drop the handle. - let pt = ManuallyDrop::new(pt); - - let pte = Some(E::new_pt(pt.paddr())); - self.overwrite_pte(idx, pte, in_tracked_range); - } - - /// Map a page at a given index. - pub(super) fn set_child_page(&mut self, idx: usize, page: DynPage, prop: PageProperty) { - // They should be ensured by the cursor. - debug_assert!(idx < nr_subpage_per_huge::()); - debug_assert_eq!(page.level(), self.level()); - - // Use the physical address rather than the page handle to track - // the page, and record the physical address in the PTE. - let pte = Some(E::new_page(page.into_raw(), self.level(), prop)); - self.overwrite_pte(idx, pte, true); - } - - /// Sets an untracked child page at a given index. - /// - /// # Safety - /// - /// The caller must ensure that the physical address is valid and safe to map. - pub(super) unsafe fn set_child_untracked(&mut self, idx: usize, pa: Paddr, prop: PageProperty) { - // It should be ensured by the cursor. - debug_assert!(idx < nr_subpage_per_huge::()); - - let pte = Some(E::new_page(pa, self.level(), prop)); - self.overwrite_pte(idx, pte, false); - } - - /// Reads the info from a page table entry at a given index. - pub(super) fn read_pte_prop(&self, idx: usize) -> PageProperty { - self.read_pte(idx).prop() - } - /// Splits the untracked huge page mapped at `idx` to smaller pages. pub(super) fn split_untracked_huge(&mut self, idx: usize) { // These should be ensured by the cursor. 
debug_assert!(idx < nr_subpage_per_huge::()); debug_assert!(self.level() > 1); - let Child::Untracked(pa) = self.child(idx, false) else { + let Child::Untracked(pa, prop) = self.child(idx, false) else { panic!("`split_untracked_huge` not called on an untracked huge page"); }; - let prop = self.read_pte_prop(idx); let mut new_page = PageTableNode::::alloc(self.level() - 1); for i in 0..nr_subpage_per_huge::() { let small_pa = pa + i * page_size::(self.level() - 1); - // SAFETY: the index is within the bound and either physical address and - // the property are valid. - unsafe { new_page.set_child_untracked(i, small_pa, prop) }; + new_page.replace_child(i, Child::Untracked(small_pa, prop), false); } - self.set_child_pt(idx, new_page.into_raw(), false); + self.replace_child(idx, Child::PageTable(new_page.into_raw()), false); } /// Protects an already mapped child at a given index. @@ -512,47 +505,6 @@ where unsafe { &mut *self.meta().nr_children.get() } } - /// Replaces a page table entry at a given index. - /// - /// This method will ensure that the child presented by the overwritten - /// PTE is dropped, and the child count is updated. - /// - /// The caller in this module will ensure that the PTE points to initialized - /// memory if the child is a page table. - fn overwrite_pte(&mut self, idx: usize, pte: Option, in_tracked_range: bool) { - let existing_pte = self.read_pte(idx); - - if existing_pte.is_present() { - self.write_pte(idx, pte.unwrap_or(E::new_absent())); - - // Drop the child. We must set the PTE before dropping the child. - // Just restore the handle and drop the handle. - - let paddr = existing_pte.paddr(); - // SAFETY: Both the `from_raw` operations here are safe as the physical - // address is valid and casted from a handle. - unsafe { - if !existing_pte.is_last(self.level()) { - // This is a page table. - drop(Page::>::from_raw(paddr)); - } else if in_tracked_range { - // This is a frame. - drop(DynPage::from_raw(paddr)); - } - } - - // Update the child count. - if pte.is_none() { - *self.nr_children_mut() -= 1; - } - } else if let Some(e) = pte { - // SAFETY: This is safe as described in the above branch. - unsafe { (self.as_ptr() as *mut E).add(idx).write(e) }; - - *self.nr_children_mut() += 1; - } - } - fn as_ptr(&self) -> *const E { paddr_to_vaddr(self.start_paddr()) as *const E } diff --git a/ostd/src/mm/vm_space.rs b/ostd/src/mm/vm_space.rs index ae0c6b95c..ce94cf082 100644 --- a/ostd/src/mm/vm_space.rs +++ b/ostd/src/mm/vm_space.rs @@ -9,29 +9,31 @@ //! powerful concurrent accesses to the page table, and suffers from the same //! validity concerns as described in [`super::page_table::cursor`]. 
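+//!
+//! # Example
+//!
+//! An illustrative, minimal sketch of driving a [`CursorMut`]; the `frame`
+//! and `prop` values are assumed to be prepared by the caller and error
+//! handling is omitted:
+//!
+//! ```ignore
+//! fn remap_one_page(vm_space: &VmSpace, va: Vaddr, frame: Frame, prop: PageProperty) {
+//!     let mut cursor = vm_space.cursor_mut(&(va..va + PAGE_SIZE)).unwrap();
+//!     // `map` replaces a previously mapped page (if any) and issues the TLB
+//!     // flushes needed on every CPU that has this `VmSpace` activated.
+//!     cursor.map(frame, prop);
+//!     // Dropping the cursor releases the locked sub-tree and the read side
+//!     // of the activation lock.
+//!     drop(cursor);
+//! }
+//! ```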
-use core::ops::Range; +use alloc::collections::vec_deque::VecDeque; +use core::{ + ops::Range, + sync::atomic::{AtomicPtr, Ordering}, +}; use spin::Once; use super::{ io::Fallible, kspace::KERNEL_PAGE_TABLE, + page::DynPage, page_table::{PageTable, UserMode}, PageFlags, PageProperty, VmReader, VmWriter, PAGE_SIZE, }; use crate::{ - arch::mm::{ - current_page_table_paddr, tlb_flush_addr, tlb_flush_addr_range, - tlb_flush_all_excluding_global, PageTableEntry, PagingConsts, - }, - cpu::{CpuExceptionInfo, CpuSet, PinCurrentCpu}, - cpu_local_cell, + arch::mm::{current_page_table_paddr, PageTableEntry, PagingConsts}, + cpu::{num_cpus, CpuExceptionInfo, CpuSet, PinCurrentCpu}, + cpu_local, mm::{ page_table::{self, PageTableItem}, Frame, MAX_USERSPACE_VADDR, }, prelude::*, - sync::SpinLock, + sync::{RwLock, RwLockReadGuard, SpinLock}, task::disable_preempt, Error, }; @@ -55,28 +57,18 @@ use crate::{ pub struct VmSpace { pt: PageTable, page_fault_handler: Once core::result::Result<(), ()>>, - /// The CPUs that the `VmSpace` is activated on. - /// - /// TODO: implement an atomic bitset to optimize the performance in cases - /// that the number of CPUs is not large. - activated_cpus: SpinLock, + /// A CPU can only activate a `VmSpace` when no mutable cursors are alive. + /// Cursors hold read locks and activation require a write lock. + activation_lock: RwLock<()>, } -// Notes on TLB flushing: -// -// We currently assume that: -// 1. `VmSpace` _might_ be activated on the current CPU and the user memory _might_ be used -// immediately after we make changes to the page table entries. So we must invalidate the -// corresponding TLB caches accordingly. -// 2. `VmSpace` must _not_ be activated on another CPU. This assumption is trivial, since SMP -// support is not yet available. But we need to consider this situation in the future (TODO). impl VmSpace { /// Creates a new VM address space. pub fn new() -> Self { Self { pt: KERNEL_PAGE_TABLE.get().unwrap().create_user_page_table(), page_fault_handler: Once::new(), - activated_cpus: SpinLock::new(CpuSet::new_empty()), + activation_lock: RwLock::new(()), } } @@ -102,72 +94,62 @@ impl VmSpace { /// The creation of the cursor may block if another cursor having an /// overlapping range is alive. The modification to the mapping by the /// cursor may also block or be overridden the mapping of another cursor. - pub fn cursor_mut(&self, va: &Range) -> Result> { - Ok(self.pt.cursor_mut(va).map(CursorMut)?) + pub fn cursor_mut(&self, va: &Range) -> Result> { + Ok(self.pt.cursor_mut(va).map(|pt_cursor| { + let activation_lock = self.activation_lock.read(); + + let cur_cpu = pt_cursor.preempt_guard().current_cpu(); + + let mut activated_cpus = CpuSet::new_empty(); + let mut need_self_flush = false; + let mut need_remote_flush = false; + + for cpu in 0..num_cpus() { + // The activation lock is held; other CPUs cannot activate this `VmSpace`. + let ptr = + ACTIVATED_VM_SPACE.get_on_cpu(cpu).load(Ordering::Relaxed) as *const VmSpace; + if ptr == self as *const VmSpace { + activated_cpus.add(cpu); + if cpu == cur_cpu { + need_self_flush = true; + } else { + need_remote_flush = true; + } + } + } + + CursorMut { + pt_cursor, + activation_lock, + activated_cpus, + need_remote_flush, + need_self_flush, + } + })?) } /// Activates the page table on the current CPU. pub(crate) fn activate(self: &Arc) { - cpu_local_cell! { - /// The `Arc` pointer to the last activated VM space on this CPU. 
If the - /// pointer is NULL, it means that the last activated page table is merely - /// the kernel page table. - static LAST_ACTIVATED_VM_SPACE: *const VmSpace = core::ptr::null(); - } - let preempt_guard = disable_preempt(); - let mut activated_cpus = self.activated_cpus.lock(); + // Ensure no mutable cursors (which holds read locks) are alive. + let _activation_lock = self.activation_lock.write(); + let cpu = preempt_guard.current_cpu(); + let activated_vm_space = ACTIVATED_VM_SPACE.get_on_cpu(cpu); - if !activated_cpus.contains(cpu) { - activated_cpus.add(cpu); + let last_ptr = activated_vm_space.load(Ordering::Relaxed) as *const VmSpace; + + if last_ptr != Arc::as_ptr(self) { self.pt.activate(); - - let last_ptr = LAST_ACTIVATED_VM_SPACE.load(); - + let ptr = Arc::into_raw(Arc::clone(self)) as *mut VmSpace; + activated_vm_space.store(ptr, Ordering::Relaxed); if !last_ptr.is_null() { - // SAFETY: If the pointer is not NULL, it must be a valid - // pointer casted with `Arc::into_raw` on the last activated - // `Arc`. - let last = unsafe { Arc::from_raw(last_ptr) }; - debug_assert!(!Arc::ptr_eq(self, &last)); - let mut last_cpus = last.activated_cpus.lock(); - debug_assert!(last_cpus.contains(cpu)); - last_cpus.remove(cpu); - } - - LAST_ACTIVATED_VM_SPACE.store(Arc::into_raw(Arc::clone(self))); - } - - if activated_cpus.count() > 1 { - // We don't support remote TLB flushing yet. It is less desirable - // to activate a `VmSpace` on more than one CPU. - log::warn!("A `VmSpace` is activated on more than one CPU"); - } - } - - /// Clears all mappings. - pub fn clear(&self) { - let mut cursor = self.pt.cursor_mut(&(0..MAX_USERSPACE_VADDR)).unwrap(); - loop { - // SAFETY: It is safe to un-map memory in the userspace. - let result = unsafe { cursor.take_next(MAX_USERSPACE_VADDR - cursor.virt_addr()) }; - match result { - PageTableItem::Mapped { page, .. } => { - drop(page); - } - PageTableItem::NotMapped { .. } => { - break; - } - PageTableItem::MappedUntracked { .. } => { - panic!("found untracked memory mapped into `VmSpace`"); - } + // SAFETY: The pointer is cast from an `Arc` when it's activated + // the last time, so it can be restored and only restored once. + drop(unsafe { Arc::from_raw(last_ptr) }); } } - // TODO: currently this method calls x86_64::flush_all(), which rewrite the Cr3 register. - // We should replace it with x86_64::flush_pcid(InvPicdCommand::AllExceptGlobal) after enabling PCID. - tlb_flush_all_excluding_global(); } pub(crate) fn handle_page_fault( @@ -199,25 +181,12 @@ impl VmSpace { pub fn fork_copy_on_write(&self) -> Self { // Protect the parent VM space as read-only. let end = MAX_USERSPACE_VADDR; - let mut cursor = self.pt.cursor_mut(&(0..end)).unwrap(); + let mut cursor = self.cursor_mut(&(0..end)).unwrap(); let mut op = |prop: &mut PageProperty| { prop.flags -= PageFlags::W; }; - loop { - // SAFETY: It is safe to protect memory in the userspace. - unsafe { - if cursor - .protect_next(end - cursor.virt_addr(), &mut op) - .is_none() - { - break; - } - }; - } - // TODO: currently this method calls x86_64::flush_all(), which rewrite the Cr3 register. - // We should replace it with x86_64::flush_pcid(InvPicdCommand::AllExceptGlobal) after enabling PCID. - tlb_flush_all_excluding_global(); + cursor.protect(end, &mut op); let page_fault_handler = { let new_handler = Once::new(); @@ -227,10 +196,22 @@ impl VmSpace { new_handler }; + let CursorMut { + pt_cursor, + activation_lock, + .. 
+ } = cursor; + + let new_pt = self.pt.clone_with(pt_cursor); + + // Release the activation lock after the page table is cloned to + // prevent modification to the parent page table while cloning. + drop(activation_lock); + Self { - pt: self.pt.clone_with(cursor), + pt: new_pt, page_fault_handler, - activated_cpus: SpinLock::new(CpuSet::new_empty()), + activation_lock: RwLock::new(()), } } @@ -326,35 +307,42 @@ impl Cursor<'_> { /// /// It exclusively owns a sub-tree of the page table, preventing others from /// reading or modifying the same sub-tree. -pub struct CursorMut<'a>(page_table::CursorMut<'a, UserMode, PageTableEntry, PagingConsts>); - -impl CursorMut<'_> { - /// The threshold used to determine whether need to flush TLB all - /// when flushing a range of TLB addresses. If the range of TLB entries - /// to be flushed exceeds this threshold, the overhead incurred by - /// flushing pages individually would surpass the overhead of flushing all entries at once. - const TLB_FLUSH_THRESHOLD: usize = 32 * PAGE_SIZE; +pub struct CursorMut<'a, 'b> { + pt_cursor: page_table::CursorMut<'a, UserMode, PageTableEntry, PagingConsts>, + #[allow(dead_code)] + activation_lock: RwLockReadGuard<'b, ()>, + // Better to store them here since loading and counting them from the CPUs + // list brings non-trivial overhead. We have a read lock so the stored set + // is always a superset of actual activated CPUs. + activated_cpus: CpuSet, + need_remote_flush: bool, + need_self_flush: bool, +} +impl CursorMut<'_, '_> { /// Query about the current slot. /// /// This is the same as [`Cursor::query`]. /// /// This function won't bring the cursor to the next slot. pub fn query(&mut self) -> Result { - Ok(self.0.query().map(|item| item.try_into().unwrap())?) + Ok(self + .pt_cursor + .query() + .map(|item| item.try_into().unwrap())?) } /// Jump to the virtual address. /// /// This is the same as [`Cursor::jump`]. pub fn jump(&mut self, va: Vaddr) -> Result<()> { - self.0.jump(va)?; + self.pt_cursor.jump(va)?; Ok(()) } /// Get the virtual address of the current slot. pub fn virt_addr(&self) -> Vaddr { - self.0.virt_addr() + self.pt_cursor.virt_addr() } /// Map a frame into the current slot. @@ -367,11 +355,10 @@ impl CursorMut<'_> { // When this bit is truly enabled, it needs to be set at a more appropriate location. prop.flags |= PageFlags::ACCESSED; // SAFETY: It is safe to map untyped memory into the userspace. - unsafe { - self.0.map(frame.into(), prop); - } + let old = unsafe { self.pt_cursor.map(frame.into(), prop) }; - tlb_flush_addr_range(&(start_va..end_va)); + self.issue_tlb_flush(TlbFlushOp::Range(start_va..end_va), old); + self.dispatch_tlb_flush(); } /// Clear the mapping starting from the current slot. @@ -388,18 +375,13 @@ impl CursorMut<'_> { pub fn unmap(&mut self, len: usize) { assert!(len % super::PAGE_SIZE == 0); let end_va = self.virt_addr() + len; - let need_flush_all = len >= Self::TLB_FLUSH_THRESHOLD; + loop { // SAFETY: It is safe to un-map memory in the userspace. - let result = unsafe { self.0.take_next(end_va - self.virt_addr()) }; + let result = unsafe { self.pt_cursor.take_next(end_va - self.virt_addr()) }; match result { PageTableItem::Mapped { va, page, .. } => { - if !need_flush_all { - // TODO: Ask other processors to flush the TLB before we - // release the page back to the allocator. - tlb_flush_addr(va); - } - drop(page); + self.issue_tlb_flush(TlbFlushOp::Address(va), Some(page)); } PageTableItem::NotMapped { .. 
} => { break; @@ -409,9 +391,8 @@ impl CursorMut<'_> { } } } - if need_flush_all { - tlb_flush_all_excluding_global(); - } + + self.dispatch_tlb_flush(); } /// Change the mapping property starting from the current slot. @@ -426,17 +407,121 @@ impl CursorMut<'_> { /// This method will panic if `len` is not page-aligned. pub fn protect(&mut self, len: usize, mut op: impl FnMut(&mut PageProperty)) { assert!(len % super::PAGE_SIZE == 0); - let end = self.0.virt_addr() + len; - let need_flush_all = len >= Self::TLB_FLUSH_THRESHOLD; + let end = self.virt_addr() + len; + let tlb_prefer_flush_all = len > TLB_FLUSH_ALL_THRESHOLD * PAGE_SIZE; + // SAFETY: It is safe to protect memory in the userspace. - while let Some(range) = unsafe { self.0.protect_next(end - self.0.virt_addr(), &mut op) } { - if !need_flush_all { - tlb_flush_addr(range.start); + while let Some(range) = + unsafe { self.pt_cursor.protect_next(end - self.virt_addr(), &mut op) } + { + if !tlb_prefer_flush_all { + self.issue_tlb_flush(TlbFlushOp::Range(range), None); } } - if need_flush_all { - tlb_flush_all_excluding_global(); + if tlb_prefer_flush_all { + self.issue_tlb_flush(TlbFlushOp::All, None); + } + self.dispatch_tlb_flush(); + } + + fn issue_tlb_flush(&self, op: TlbFlushOp, drop_after_flush: Option) { + let request = TlbFlushRequest { + op, + drop_after_flush, + }; + + // Fast path for single CPU cases. + if !self.need_remote_flush { + if self.need_self_flush { + request.do_flush(); + } + return; + } + + // Slow path for multi-CPU cases. + for cpu in self.activated_cpus.iter() { + let mut queue = TLB_FLUSH_REQUESTS.get_on_cpu(cpu).lock(); + queue.push_back(request.clone()); + } + } + + fn dispatch_tlb_flush(&self) { + if !self.need_remote_flush { + return; + } + + fn do_remote_flush() { + let preempt_guard = disable_preempt(); + let mut requests = TLB_FLUSH_REQUESTS + .get_on_cpu(preempt_guard.current_cpu()) + .lock(); + if requests.len() > TLB_FLUSH_ALL_THRESHOLD { + // TODO: in most cases, we need only to flush all the TLB entries + // for an ASID if it is enabled. + crate::arch::mm::tlb_flush_all_excluding_global(); + requests.clear(); + } else { + while let Some(request) = requests.pop_front() { + request.do_flush(); + if matches!(request.op, TlbFlushOp::All) { + requests.clear(); + break; + } + } + } + } + + crate::smp::inter_processor_call(&self.activated_cpus.clone(), do_remote_flush); + } +} + +/// The threshold used to determine whether we need to flush all TLB entries +/// when handling a bunch of TLB flush requests. If the number of requests +/// exceeds this threshold, the overhead incurred by flushing pages +/// individually would surpass the overhead of flushing all entries at once. +const TLB_FLUSH_ALL_THRESHOLD: usize = 32; + +cpu_local! { + /// The queue of pending requests. + static TLB_FLUSH_REQUESTS: SpinLock> = SpinLock::new(VecDeque::new()); + /// The `Arc` pointer to the activated VM space on this CPU. If the pointer + /// is NULL, it means that the activated page table is merely the kernel + /// page table. + // TODO: If we are enabling ASID, we need to maintain the TLB state of each + // CPU, rather than merely the activated `VmSpace`. When ASID is enabled, + // the non-active `VmSpace`s can still have their TLB entries in the CPU! 
+ static ACTIVATED_VM_SPACE: AtomicPtr = AtomicPtr::new(core::ptr::null_mut()); +} + +#[derive(Debug, Clone)] +struct TlbFlushRequest { + op: TlbFlushOp, + // If we need to remove a mapped page from the page table, we can only + // recycle the page after all the relevant TLB entries in all CPUs are + // flushed. Otherwise if the page is recycled for other purposes, the user + // space program can still access the page through the TLB entries. + #[allow(dead_code)] + drop_after_flush: Option, +} + +#[derive(Debug, Clone)] +enum TlbFlushOp { + All, + Address(Vaddr), + Range(Range), +} + +impl TlbFlushRequest { + /// Perform the TLB flush operation on the current CPU. + fn do_flush(&self) { + use crate::arch::mm::{ + tlb_flush_addr, tlb_flush_addr_range, tlb_flush_all_excluding_global, + }; + match &self.op { + TlbFlushOp::All => tlb_flush_all_excluding_global(), + TlbFlushOp::Address(addr) => tlb_flush_addr(*addr), + TlbFlushOp::Range(range) => tlb_flush_addr_range(range), } } } diff --git a/ostd/src/smp.rs b/ostd/src/smp.rs index 72cf55ff4..910bb564a 100644 --- a/ostd/src/smp.rs +++ b/ostd/src/smp.rs @@ -32,7 +32,7 @@ use crate::{ /// However if called on the current processor, it will be synchronous. pub fn inter_processor_call(targets: &CpuSet, f: fn()) { let irq_guard = trap::disable_local(); - let this_cpu_id = irq_guard.current_cpu() as usize; + let this_cpu_id = irq_guard.current_cpu(); let irq_num = INTER_PROCESSOR_CALL_IRQ.get().unwrap().num(); let mut call_on_self = false; @@ -41,7 +41,7 @@ pub fn inter_processor_call(targets: &CpuSet, f: fn()) { call_on_self = true; continue; } - CALL_QUEUES.get_on_cpu(cpu_id as u32).lock().push_back(f); + CALL_QUEUES.get_on_cpu(cpu_id).lock().push_back(f); } for cpu_id in targets.iter() { if cpu_id == this_cpu_id { @@ -49,7 +49,7 @@ pub fn inter_processor_call(targets: &CpuSet, f: fn()) { } // SAFETY: It is safe to send inter processor call IPI to other CPUs. unsafe { - crate::arch::irq::send_ipi(cpu_id as u32, irq_num); + crate::arch::irq::send_ipi(cpu_id, irq_num); } } if call_on_self { diff --git a/ostd/src/task/preempt/guard.rs b/ostd/src/task/preempt/guard.rs index a8e9bd8d9..1aa88c6e1 100644 --- a/ostd/src/task/preempt/guard.rs +++ b/ostd/src/task/preempt/guard.rs @@ -3,6 +3,7 @@ /// A guard for disable preempt. #[clippy::has_significant_drop] #[must_use] +#[derive(Debug)] pub struct DisabledPreemptGuard { // This private field prevents user from constructing values of this type directly. _private: (),
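
Notes on the mechanism (the sketches below are illustrative only, not part of
the diff; the helper names are invented):

Remote shootdown rides on the pre-existing `inter_processor_call` primitive:
the initiating CPU pushes `TlbFlushRequest`s onto the per-CPU queues of every
CPU that has the `VmSpace` activated, then sends the IPI so each target drains
its own queue. Stripped down to the IPI part, the pattern is:

    use crate::{cpu::CpuSet, smp::inter_processor_call};

    fn broadcast_full_flush(targets: &CpuSet) {
        // The callback must be a plain `fn` (no captures), because it is
        // queued per CPU before the IPI is delivered.
        fn do_flush() {
            crate::arch::mm::tlb_flush_all_excluding_global();
        }
        inter_processor_call(targets, do_flush);
    }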
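The `drop_after_flush` field couples page reclamation to TLB invalidation: a
page taken out of the page table moves into the request, and every target
CPU's queue holds a clone of it, so the page can only go back to the allocator
after each of those CPUs has flushed. From the caller's side, a rough sketch
(assuming the range is page-aligned):

    fn unmap_region(vm_space: &VmSpace, range: core::ops::Range<Vaddr>) {
        let mut cursor = vm_space.cursor_mut(&range).unwrap();
        // Frames unmapped here are kept alive inside the per-CPU flush
        // requests and are recycled only after the owning CPUs have flushed.
        cursor.unmap(range.end - range.start);
    }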
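On the page-table side, the former `set_child_pt`/`set_child_page`/
`set_child_untracked`/`take_child`/`overwrite_pte` family collapses into the
single `replace_child`, which hands the previous child back so that the caller
decides whether a TLB flush (and a deferred page drop) is required. The
call-site shape, adapted from the cursor and assuming a locked leaf-level
`node` with a valid `idx`, `page`, and `prop` that previously held either a
page or nothing:

    let old = node.replace_child(idx, Child::Page(page, prop), /* tracked */ true);
    match old {
        // Nothing was mapped before; the node's child count has been bumped.
        Child::None => {}
        // The caller must flush the stale TLB entries before `old_page` is
        // dropped and its frame recycled.
        Child::Page(old_page, _old_prop) => { /* queue a flush, then drop */ }
        // Other variants do not occur under the assumption stated above.
        _ => unreachable!(),
    }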
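Finally, `CpuSet::iter` now yields CPU IDs as `u32` directly rather than as
`usize` bit indices, which is why most call sites in the scheduler, the work
queue, and `smp.rs` drop their `as u32` casts (the scheduler now casts only
for `Vec` indexing). A hypothetical helper built on it:

    fn remote_targets(set: &CpuSet, this_cpu: u32) -> impl Iterator<Item = u32> + '_ {
        set.iter().filter(move |&cpu| cpu != this_cpu)
    }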