Implement remote TLB flush on VmSpace

Zhang Junyang
2024-08-30 17:49:18 +08:00
committed by Tate, Hongliang Tian
parent 326ec09169
commit d9eccdcfbe
13 changed files with 391 additions and 338 deletions
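A minimal usage sketch of the flow this change introduces, written as if inside the kernel crate (activation is crate-internal). The import paths and the demo function name are assumptions; every other identifier appears in the diff below.

use alloc::sync::Arc;
use crate::mm::{VmSpace, PAGE_SIZE}; // assumed re-export paths

fn demo_remote_tlb_flush() {
    // Create an address space and activate it on the current CPU. Activation
    // now records the space in the per-CPU `ACTIVATED_VM_SPACE` pointer.
    let vm_space = Arc::new(VmSpace::new());
    vm_space.activate();

    // A mutable cursor snapshots the CPUs this space is activated on (under
    // the `activation_lock` read lock), so the unmap below can flush stale
    // TLB entries locally and, via IPIs, on any other activated CPU before
    // the unmapped pages are recycled.
    let mut cursor = vm_space.cursor_mut(&(0..16 * PAGE_SIZE)).unwrap();
    cursor.unmap(16 * PAGE_SIZE);
    drop(cursor); // releases the activation read lock
}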

View File

@ -50,14 +50,14 @@ impl<T: PreemptSchedInfo> PreemptScheduler<T> {
let mut minimum_load = usize::MAX;
for candidate in runnable.cpu_affinity().iter() {
let rq = self.rq[candidate].lock();
let rq = self.rq[candidate as usize].lock();
// A wild guess measuring the load of a runqueue. We assume that
// real-time tasks are four times as important as normal tasks.
let load = rq.real_time_entities.len() * 8
+ rq.normal_entities.len() * 2
+ rq.lowest_entities.len();
if load < minimum_load {
selected = candidate as u32;
selected = candidate;
minimum_load = load;
}
}

View File

@ -24,12 +24,12 @@ impl WorkerScheduler for SimpleScheduler {
fn schedule(&self) {
let worker_pool = self.worker_pool.upgrade().unwrap();
for cpu_id in worker_pool.cpu_set().iter() {
if !worker_pool.heartbeat(cpu_id as u32)
&& worker_pool.has_pending_work_items(cpu_id as u32)
&& !worker_pool.wake_worker(cpu_id as u32)
&& worker_pool.num_workers(cpu_id as u32) < WORKER_LIMIT
if !worker_pool.heartbeat(cpu_id)
&& worker_pool.has_pending_work_items(cpu_id)
&& !worker_pool.wake_worker(cpu_id)
&& worker_pool.num_workers(cpu_id) < WORKER_LIMIT
{
worker_pool.add_worker(cpu_id as u32);
worker_pool.add_worker(cpu_id);
}
}
}

View File

@ -128,10 +128,7 @@ impl WorkerPool {
Arc::new_cyclic(|pool_ref| {
let mut local_pools = Vec::new();
for cpu_id in cpu_set.iter() {
local_pools.push(Arc::new(LocalWorkerPool::new(
pool_ref.clone(),
cpu_id as u32,
)));
local_pools.push(Arc::new(LocalWorkerPool::new(pool_ref.clone(), cpu_id)));
}
WorkerPool {
local_pools,

View File

@ -336,7 +336,7 @@ impl Vmar_ {
if !self.is_root_vmar() {
return_errno_with_message!(Errno::EACCES, "The vmar is not root vmar");
}
self.vm_space.clear();
self.clear_vm_space();
let mut inner = self.inner.lock();
inner.child_vmar_s.clear();
inner.vm_mappings.clear();
@ -346,6 +346,13 @@ impl Vmar_ {
Ok(())
}
fn clear_vm_space(&self) {
let start = ROOT_VMAR_LOWEST_ADDR;
let end = ROOT_VMAR_CAP_ADDR;
let mut cursor = self.vm_space.cursor_mut(&(start..end)).unwrap();
cursor.unmap(end - start);
}
pub fn destroy(&self, range: Range<usize>) -> Result<()> {
self.check_destroy_range(&range)?;
let mut inner = self.inner.lock();

View File

@ -57,6 +57,7 @@ fn create_user_space(program: &[u8]) -> UserSpace {
for frame in user_pages {
cursor.map(frame, map_prop);
}
drop(cursor);
Arc::new(vm_space)
};
let user_cpu_state = {

View File

@ -10,12 +10,7 @@ cfg_if::cfg_if! {
}
}
use alloc::vec::Vec;
use bitvec::{
prelude::{BitVec, Lsb0},
slice::IterOnes,
};
use bitvec::prelude::BitVec;
use local::cpu_local_cell;
use spin::Once;
@ -122,13 +117,6 @@ impl CpuSet {
self.bitset.set(cpu_id as usize, true);
}
/// Adds a list of CPUs to the set.
pub fn add_from_vec(&mut self, cpu_ids: Vec<u32>) {
for cpu_id in cpu_ids {
self.add(cpu_id)
}
}
/// Adds all CPUs to the set.
pub fn add_all(&mut self) {
self.bitset.fill(true);
@ -139,13 +127,6 @@ impl CpuSet {
self.bitset.set(cpu_id as usize, false);
}
/// Removes a list of CPUs from the set.
pub fn remove_from_vec(&mut self, cpu_ids: Vec<u32>) {
for cpu_id in cpu_ids {
self.remove(cpu_id);
}
}
/// Removes all CPUs from the set.
pub fn clear(&mut self) {
self.bitset.fill(false);
@ -162,8 +143,8 @@ impl CpuSet {
}
/// Iterates over the CPUs in the set.
pub fn iter(&self) -> IterOnes<'_, usize, Lsb0> {
self.bitset.iter_ones()
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
self.bitset.iter_ones().map(|idx| idx as u32)
}
}
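A small call-site illustration of the simplified `iter` signature, assuming `CpuSet::new_empty` and `add` as used elsewhere in this diff; the function is a made-up example, not code from the commit.

use alloc::vec::Vec;

fn collect_cpu_ids() {
    let mut set = CpuSet::new_empty();
    set.add(0);
    set.add(2);
    // `iter` now yields `u32` CPU IDs directly, so call sites drop their `as u32` casts.
    let ids: Vec<u32> = set.iter().collect();
    assert_eq!(ids, [0, 2]);
}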

View File

@ -156,7 +156,7 @@ pub fn init_kernel_page_table(meta_pages: Vec<Page<MetaPageMeta>>) {
for meta_page in meta_pages {
// SAFETY: we are doing the metadata mappings for the kernel.
unsafe {
cursor.map(meta_page.into(), prop);
let _old = cursor.map(meta_page.into(), prop);
}
}
}
@ -199,7 +199,7 @@ pub fn init_kernel_page_table(meta_pages: Vec<Page<MetaPageMeta>>) {
let page = Page::<KernelMeta>::from_unused(frame_paddr, KernelMeta::default());
// SAFETY: we are doing mappings for the kernel.
unsafe {
cursor.map(page.into(), prop);
let _old = cursor.map(page.into(), prop);
}
}
}

View File

@ -73,7 +73,10 @@ use super::{
page_size, pte_index, Child, KernelMode, PageTable, PageTableEntryTrait, PageTableError,
PageTableMode, PageTableNode, PagingConstsTrait, PagingLevel, UserMode,
};
use crate::mm::{page::DynPage, Paddr, PageProperty, Vaddr};
use crate::{
mm::{page::DynPage, Paddr, PageProperty, Vaddr},
task::{disable_preempt, DisabledPreemptGuard},
};
#[derive(Clone, Debug)]
pub enum PageTableItem {
@ -125,7 +128,8 @@ where
va: Vaddr,
/// The virtual address range that is locked.
barrier_va: Range<Vaddr>,
phantom: PhantomData<&'a PageTable<M, E, C>>,
preempt_guard: DisabledPreemptGuard,
_phantom: PhantomData<&'a PageTable<M, E, C>>,
}
impl<'a, M: PageTableMode, E: PageTableEntryTrait, C: PagingConstsTrait> Cursor<'a, M, E, C>
@ -162,7 +166,8 @@ where
guard_level: C::NR_LEVELS,
va: va.start,
barrier_va: va.clone(),
phantom: PhantomData,
preempt_guard: disable_preempt(),
_phantom: PhantomData,
};
// Go down and get proper locks. The cursor should hold a lock of a
@ -204,37 +209,28 @@ where
let level = self.level;
let va = self.va;
let pte = self.read_cur_pte();
if !pte.is_present() {
match self.cur_child() {
Child::PageTable(_) => {
self.level_down();
continue;
}
Child::None => {
return Ok(PageTableItem::NotMapped {
va,
len: page_size::<C>(level),
});
}
if !pte.is_last(level) {
self.level_down();
continue;
Child::Page(page, prop) => {
return Ok(PageTableItem::Mapped { va, page, prop });
}
match self.cur_child() {
Child::Page(page) => {
return Ok(PageTableItem::Mapped {
va,
page,
prop: pte.prop(),
});
}
Child::Untracked(pa) => {
Child::Untracked(pa, prop) => {
return Ok(PageTableItem::MappedUntracked {
va,
pa,
len: page_size::<C>(level),
prop: pte.prop(),
prop,
});
}
Child::None | Child::PageTable(_) => {
unreachable!(); // Already checked with the PTE.
}
}
}
}
@ -289,6 +285,10 @@ where
self.va
}
pub fn preempt_guard(&self) -> &DisabledPreemptGuard {
&self.preempt_guard
}
/// Goes up a level. We release the current page if it has no mappings, since the cursor only moves
/// forward. If needed, the final cleanup is done using this method after a re-walk when the
/// cursor is dropped.
@ -423,6 +423,8 @@ where
/// Maps the range starting from the current address to a [`DynPage`].
///
/// It returns the previously mapped [`DynPage`] if that exists.
///
/// # Panics
///
/// This function will panic if
@ -434,7 +436,7 @@ where
///
/// The caller should ensure that the virtual range being mapped does
/// not affect kernel's memory safety.
pub unsafe fn map(&mut self, page: DynPage, prop: PageProperty) {
pub unsafe fn map(&mut self, page: DynPage, prop: PageProperty) -> Option<DynPage> {
let end = self.0.va + page.size();
assert!(end <= self.0.barrier_va.end);
debug_assert!(self.0.in_tracked_range());
@ -458,8 +460,19 @@ where
// Map the current page.
let idx = self.0.cur_idx();
self.cur_node_mut().set_child_page(idx, page, prop);
let old = self
.cur_node_mut()
.replace_child(idx, Child::Page(page, prop), true);
self.0.move_forward();
match old {
Child::Page(old_page, _) => Some(old_page),
Child::None => None,
Child::PageTable(_) => {
todo!("Dropping page table nodes while mapping requires TLB flush")
}
Child::Untracked(_, _) => panic!("Mapping a tracked page in an untracked range"),
}
}
/// Maps the range starting from the current address to a physical address range.
@ -520,7 +533,9 @@ where
// Map the current page.
debug_assert!(!self.0.in_tracked_range());
let idx = self.0.cur_idx();
self.cur_node_mut().set_child_untracked(idx, pa, prop);
let _ = self
.cur_node_mut()
.replace_child(idx, Child::Untracked(pa, prop), false);
let level = self.0.level;
pa += page_size::<C>(level);
@ -605,23 +620,25 @@ where
// Unmap the current page and return it.
let idx = self.0.cur_idx();
let ret = self.cur_node_mut().take_child(idx, is_tracked);
let ret = self
.cur_node_mut()
.replace_child(idx, Child::None, is_tracked);
let ret_page_va = self.0.va;
let ret_page_size = page_size::<C>(self.0.level);
self.0.move_forward();
return match ret {
Child::Page(page) => PageTableItem::Mapped {
Child::Page(page, prop) => PageTableItem::Mapped {
va: ret_page_va,
page,
prop: cur_pte.prop(),
prop,
},
Child::Untracked(pa) => PageTableItem::MappedUntracked {
Child::Untracked(pa, prop) => PageTableItem::MappedUntracked {
va: ret_page_va,
pa,
len: ret_page_size,
prop: cur_pte.prop(),
prop,
},
Child::None | Child::PageTable(_) => unreachable!(),
};
@ -717,6 +734,10 @@ where
None
}
pub fn preempt_guard(&self) -> &DisabledPreemptGuard {
&self.0.preempt_guard
}
/// Consumes itself and leaks the root guard for the caller if it locked the root level.
///
/// It is useful when the caller wants to keep the root guard while the cursor should be dropped.
@ -743,8 +764,12 @@ where
let new_node = PageTableNode::<E, C>::alloc(self.0.level - 1);
let idx = self.0.cur_idx();
let is_tracked = self.0.in_tracked_range();
self.cur_node_mut()
.set_child_pt(idx, new_node.clone_raw(), is_tracked);
let old = self.cur_node_mut().replace_child(
idx,
Child::PageTable(new_node.clone_raw()),
is_tracked,
);
debug_assert!(old.is_none());
self.0.level -= 1;
self.0.guards[(self.0.level - 1) as usize] = Some(new_node);
}

View File

@ -162,7 +162,11 @@ impl PageTable<KernelMode> {
for i in start..end {
if !root_node.read_pte(i).is_present() {
let node = PageTableNode::alloc(PagingConsts::NR_LEVELS - 1);
root_node.set_child_pt(i, node.into_raw(), i < NR_PTES_PER_NODE * 3 / 4);
let _ = root_node.replace_child(
i,
Child::PageTable(node.into_raw()),
i < NR_PTES_PER_NODE * 3 / 4,
);
}
}
}

View File

@ -42,7 +42,6 @@ use crate::{
page_prop::PageProperty,
Paddr, PagingConstsTrait, PagingLevel, PAGE_SIZE,
},
task::{disable_preempt, DisabledPreemptGuard},
};
/// The raw handle to a page table node.
@ -80,8 +79,6 @@ where
// count is needed.
let page = unsafe { Page::<PageTablePageMeta<E, C>>::from_raw(this.paddr()) };
let disable_preempt = disable_preempt();
// Acquire the lock.
while page
.meta()
@ -92,10 +89,7 @@ where
core::hint::spin_loop();
}
PageTableNode::<E, C> {
page,
preempt_guard: disable_preempt,
}
PageTableNode::<E, C> { page, _private: () }
}
/// Creates a copy of the handle.
@ -190,7 +184,7 @@ pub(super) struct PageTableNode<
[(); C::NR_LEVELS as usize]:,
{
pub(super) page: Page<PageTablePageMeta<E, C>>,
preempt_guard: DisabledPreemptGuard,
_private: (),
}
// FIXME: We cannot `#[derive(Debug)]` here due to `DisabledPreemptGuard`. Should we skip
@ -215,12 +209,21 @@ where
[(); C::NR_LEVELS as usize]:,
{
PageTable(RawPageTableNode<E, C>),
Page(DynPage),
Page(DynPage, PageProperty),
/// Pages not tracked by handles.
Untracked(Paddr),
Untracked(Paddr, PageProperty),
None,
}
impl<E: PageTableEntryTrait, C: PagingConstsTrait> Child<E, C>
where
[(); C::NR_LEVELS as usize]:,
{
pub(super) fn is_none(&self) -> bool {
matches!(self, Child::None)
}
}
impl<E: PageTableEntryTrait, C: PagingConstsTrait> PageTableNode<E, C>
where
[(); C::NR_LEVELS as usize]:,
@ -241,10 +244,7 @@ where
unsafe { core::ptr::write_bytes(ptr, 0, PAGE_SIZE) };
debug_assert!(E::new_absent().as_bytes().iter().all(|&b| b == 0));
Self {
page,
preempt_guard: disable_preempt(),
}
Self { page, _private: () }
}
pub fn level(&self) -> PagingLevel {
@ -253,16 +253,11 @@ where
/// Converts the handle into a raw handle to be stored in a PTE or CPU.
pub(super) fn into_raw(self) -> RawPageTableNode<E, C> {
let mut this = ManuallyDrop::new(self);
let this = ManuallyDrop::new(self);
let raw = this.page.paddr();
this.page.meta().lock.store(0, Ordering::Release);
// SAFETY: The field will no longer be accessed and we need to drop the field to release
// the preempt count.
unsafe {
core::ptr::drop_in_place(&mut this.preempt_guard);
}
RawPageTableNode {
raw,
@ -300,40 +295,82 @@ where
_phantom: PhantomData,
})
} else if in_tracked_range {
// SAFETY: We have a reference count to the page and can safely increase the reference
// count by one more.
// SAFETY: We have a reference count to the page and can safely
// increase the reference count by one more.
unsafe {
DynPage::inc_ref_count(paddr);
}
Child::Page(unsafe { DynPage::from_raw(paddr) })
// SAFETY: The physical address of the PTE points to a forgotten
// page. It is reclaimed only once.
Child::Page(unsafe { DynPage::from_raw(paddr) }, pte.prop())
} else {
Child::Untracked(paddr)
Child::Untracked(paddr, pte.prop())
}
}
}
/// Remove the child at the given index and return it.
pub(super) fn take_child(&mut self, idx: usize, in_tracked_range: bool) -> Child<E, C> {
/// Replace the child at the given index with a new child.
///
/// The old child is returned.
pub(super) fn replace_child(
&mut self,
idx: usize,
new_child: Child<E, C>,
in_tracked_range: bool,
) -> Child<E, C> {
debug_assert!(idx < nr_subpage_per_huge::<C>());
let pte = self.read_pte(idx);
if !pte.is_present() {
Child::None
} else {
let paddr = pte.paddr();
let is_last = pte.is_last(self.level());
*self.nr_children_mut() -= 1;
let old_pte = self.read_pte(idx);
let new_child_is_none = match new_child {
Child::None => {
if old_pte.is_present() {
self.write_pte(idx, E::new_absent());
if !is_last {
}
true
}
Child::PageTable(pt) => {
let pt = ManuallyDrop::new(pt);
let new_pte = E::new_pt(pt.paddr());
self.write_pte(idx, new_pte);
false
}
Child::Page(page, prop) => {
debug_assert!(in_tracked_range);
let new_pte = E::new_page(page.into_raw(), self.level(), prop);
self.write_pte(idx, new_pte);
false
}
Child::Untracked(pa, prop) => {
debug_assert!(!in_tracked_range);
let new_pte = E::new_page(pa, self.level(), prop);
self.write_pte(idx, new_pte);
false
}
};
if old_pte.is_present() {
if new_child_is_none {
*self.nr_children_mut() -= 1;
}
let paddr = old_pte.paddr();
if !old_pte.is_last(self.level()) {
Child::PageTable(RawPageTableNode {
raw: paddr,
_phantom: PhantomData,
})
} else if in_tracked_range {
Child::Page(unsafe { DynPage::from_raw(paddr) })
// SAFETY: The physical address of the old PTE points to a
// forgotten page. It is reclaimed only once.
Child::Page(unsafe { DynPage::from_raw(paddr) }, old_pte.prop())
} else {
Child::Untracked(paddr)
Child::Untracked(paddr, old_pte.prop())
}
} else {
if !new_child_is_none {
*self.nr_children_mut() += 1;
}
Child::None
}
}
@ -364,16 +401,17 @@ where
Child::PageTable(pt) => {
let guard = pt.clone_shallow().lock();
let new_child = guard.make_copy(0..nr_subpage_per_huge::<C>(), 0..0);
new_pt.set_child_pt(i, new_child.into_raw(), true);
let old = new_pt.replace_child(i, Child::PageTable(new_child.into_raw()), true);
debug_assert!(old.is_none());
copied_child_count -= 1;
}
Child::Page(page) => {
let prop = self.read_pte_prop(i);
new_pt.set_child_page(i, page.clone(), prop);
Child::Page(page, prop) => {
let old = new_pt.replace_child(i, Child::Page(page.clone(), prop), true);
debug_assert!(old.is_none());
copied_child_count -= 1;
}
Child::None => {}
Child::Untracked(_) => {
Child::Untracked(_, _) => {
unreachable!();
}
}
@ -386,11 +424,16 @@ where
debug_assert_eq!(self.level(), C::NR_LEVELS);
match self.child(i, /*meaningless*/ true) {
Child::PageTable(pt) => {
new_pt.set_child_pt(i, pt.clone_shallow(), /*meaningless*/ true);
let old = new_pt.replace_child(
i,
Child::PageTable(pt.clone_shallow()),
/*meaningless*/ true,
);
debug_assert!(old.is_none());
copied_child_count -= 1;
}
Child::None => {}
Child::Page(_) | Child::Untracked(_) => {
Child::Page(_, _) | Child::Untracked(_, _) => {
unreachable!();
}
}
@ -399,73 +442,23 @@ where
new_pt
}
/// Sets a child page table at a given index.
pub(super) fn set_child_pt(
&mut self,
idx: usize,
pt: RawPageTableNode<E, C>,
in_tracked_range: bool,
) {
// They should be ensured by the cursor.
debug_assert!(idx < nr_subpage_per_huge::<C>());
// The ownership is transferred to a raw PTE. Don't drop the handle.
let pt = ManuallyDrop::new(pt);
let pte = Some(E::new_pt(pt.paddr()));
self.overwrite_pte(idx, pte, in_tracked_range);
}
/// Map a page at a given index.
pub(super) fn set_child_page(&mut self, idx: usize, page: DynPage, prop: PageProperty) {
// They should be ensured by the cursor.
debug_assert!(idx < nr_subpage_per_huge::<C>());
debug_assert_eq!(page.level(), self.level());
// Use the physical address rather than the page handle to track
// the page, and record the physical address in the PTE.
let pte = Some(E::new_page(page.into_raw(), self.level(), prop));
self.overwrite_pte(idx, pte, true);
}
/// Sets an untracked child page at a given index.
///
/// # Safety
///
/// The caller must ensure that the physical address is valid and safe to map.
pub(super) unsafe fn set_child_untracked(&mut self, idx: usize, pa: Paddr, prop: PageProperty) {
// It should be ensured by the cursor.
debug_assert!(idx < nr_subpage_per_huge::<C>());
let pte = Some(E::new_page(pa, self.level(), prop));
self.overwrite_pte(idx, pte, false);
}
/// Reads the info from a page table entry at a given index.
pub(super) fn read_pte_prop(&self, idx: usize) -> PageProperty {
self.read_pte(idx).prop()
}
/// Splits the untracked huge page mapped at `idx` to smaller pages.
pub(super) fn split_untracked_huge(&mut self, idx: usize) {
// These should be ensured by the cursor.
debug_assert!(idx < nr_subpage_per_huge::<C>());
debug_assert!(self.level() > 1);
let Child::Untracked(pa) = self.child(idx, false) else {
let Child::Untracked(pa, prop) = self.child(idx, false) else {
panic!("`split_untracked_huge` not called on an untracked huge page");
};
let prop = self.read_pte_prop(idx);
let mut new_page = PageTableNode::<E, C>::alloc(self.level() - 1);
for i in 0..nr_subpage_per_huge::<C>() {
let small_pa = pa + i * page_size::<C>(self.level() - 1);
// SAFETY: the index is within the bound and either physical address and
// the property are valid.
unsafe { new_page.set_child_untracked(i, small_pa, prop) };
new_page.replace_child(i, Child::Untracked(small_pa, prop), false);
}
self.set_child_pt(idx, new_page.into_raw(), false);
self.replace_child(idx, Child::PageTable(new_page.into_raw()), false);
}
/// Protects an already mapped child at a given index.
@ -512,47 +505,6 @@ where
unsafe { &mut *self.meta().nr_children.get() }
}
/// Replaces a page table entry at a given index.
///
/// This method will ensure that the child presented by the overwritten
/// PTE is dropped, and the child count is updated.
///
/// The caller in this module will ensure that the PTE points to initialized
/// memory if the child is a page table.
fn overwrite_pte(&mut self, idx: usize, pte: Option<E>, in_tracked_range: bool) {
let existing_pte = self.read_pte(idx);
if existing_pte.is_present() {
self.write_pte(idx, pte.unwrap_or(E::new_absent()));
// Drop the child. We must set the PTE before dropping the child.
// Just restore the handle and drop the handle.
let paddr = existing_pte.paddr();
// SAFETY: Both the `from_raw` operations here are safe as the physical
// address is valid and casted from a handle.
unsafe {
if !existing_pte.is_last(self.level()) {
// This is a page table.
drop(Page::<PageTablePageMeta<E, C>>::from_raw(paddr));
} else if in_tracked_range {
// This is a frame.
drop(DynPage::from_raw(paddr));
}
}
// Update the child count.
if pte.is_none() {
*self.nr_children_mut() -= 1;
}
} else if let Some(e) = pte {
// SAFETY: This is safe as described in the above branch.
unsafe { (self.as_ptr() as *mut E).add(idx).write(e) };
*self.nr_children_mut() += 1;
}
}
fn as_ptr(&self) -> *const E {
paddr_to_vaddr(self.start_paddr()) as *const E
}

View File

@ -9,29 +9,31 @@
//! powerful concurrent accesses to the page table, and suffers from the same
//! validity concerns as described in [`super::page_table::cursor`].
use core::ops::Range;
use alloc::collections::vec_deque::VecDeque;
use core::{
ops::Range,
sync::atomic::{AtomicPtr, Ordering},
};
use spin::Once;
use super::{
io::Fallible,
kspace::KERNEL_PAGE_TABLE,
page::DynPage,
page_table::{PageTable, UserMode},
PageFlags, PageProperty, VmReader, VmWriter, PAGE_SIZE,
};
use crate::{
arch::mm::{
current_page_table_paddr, tlb_flush_addr, tlb_flush_addr_range,
tlb_flush_all_excluding_global, PageTableEntry, PagingConsts,
},
cpu::{CpuExceptionInfo, CpuSet, PinCurrentCpu},
cpu_local_cell,
arch::mm::{current_page_table_paddr, PageTableEntry, PagingConsts},
cpu::{num_cpus, CpuExceptionInfo, CpuSet, PinCurrentCpu},
cpu_local,
mm::{
page_table::{self, PageTableItem},
Frame, MAX_USERSPACE_VADDR,
},
prelude::*,
sync::SpinLock,
sync::{RwLock, RwLockReadGuard, SpinLock},
task::disable_preempt,
Error,
};
@ -55,28 +57,18 @@ use crate::{
pub struct VmSpace {
pt: PageTable<UserMode>,
page_fault_handler: Once<fn(&VmSpace, &CpuExceptionInfo) -> core::result::Result<(), ()>>,
/// The CPUs that the `VmSpace` is activated on.
///
/// TODO: implement an atomic bitset to optimize the performance in cases
/// that the number of CPUs is not large.
activated_cpus: SpinLock<CpuSet>,
/// A CPU can only activate a `VmSpace` when no mutable cursors are alive.
/// Cursors hold read locks and activation requires a write lock.
activation_lock: RwLock<()>,
}
// Notes on TLB flushing:
//
// We currently assume that:
// 1. `VmSpace` _might_ be activated on the current CPU and the user memory _might_ be used
// immediately after we make changes to the page table entries. So we must invalidate the
// corresponding TLB caches accordingly.
// 2. `VmSpace` must _not_ be activated on another CPU. This assumption is trivial, since SMP
// support is not yet available. But we need to consider this situation in the future (TODO).
impl VmSpace {
/// Creates a new VM address space.
pub fn new() -> Self {
Self {
pt: KERNEL_PAGE_TABLE.get().unwrap().create_user_page_table(),
page_fault_handler: Once::new(),
activated_cpus: SpinLock::new(CpuSet::new_empty()),
activation_lock: RwLock::new(()),
}
}
@ -102,72 +94,62 @@ impl VmSpace {
/// The creation of the cursor may block if another cursor having an
/// overlapping range is alive. The modification to the mapping by the
/// cursor may also block or be overridden by the mapping of another cursor.
pub fn cursor_mut(&self, va: &Range<Vaddr>) -> Result<CursorMut<'_>> {
Ok(self.pt.cursor_mut(va).map(CursorMut)?)
pub fn cursor_mut(&self, va: &Range<Vaddr>) -> Result<CursorMut<'_, '_>> {
Ok(self.pt.cursor_mut(va).map(|pt_cursor| {
let activation_lock = self.activation_lock.read();
let cur_cpu = pt_cursor.preempt_guard().current_cpu();
let mut activated_cpus = CpuSet::new_empty();
let mut need_self_flush = false;
let mut need_remote_flush = false;
for cpu in 0..num_cpus() {
// The activation lock is held; other CPUs cannot activate this `VmSpace`.
let ptr =
ACTIVATED_VM_SPACE.get_on_cpu(cpu).load(Ordering::Relaxed) as *const VmSpace;
if ptr == self as *const VmSpace {
activated_cpus.add(cpu);
if cpu == cur_cpu {
need_self_flush = true;
} else {
need_remote_flush = true;
}
}
}
CursorMut {
pt_cursor,
activation_lock,
activated_cpus,
need_remote_flush,
need_self_flush,
}
})?)
}
/// Activates the page table on the current CPU.
pub(crate) fn activate(self: &Arc<Self>) {
cpu_local_cell! {
/// The `Arc` pointer to the last activated VM space on this CPU. If the
/// pointer is NULL, it means that the last activated page table is merely
/// the kernel page table.
static LAST_ACTIVATED_VM_SPACE: *const VmSpace = core::ptr::null();
}
let preempt_guard = disable_preempt();
let mut activated_cpus = self.activated_cpus.lock();
// Ensure no mutable cursors (which holds read locks) are alive.
let _activation_lock = self.activation_lock.write();
let cpu = preempt_guard.current_cpu();
let activated_vm_space = ACTIVATED_VM_SPACE.get_on_cpu(cpu);
if !activated_cpus.contains(cpu) {
activated_cpus.add(cpu);
let last_ptr = activated_vm_space.load(Ordering::Relaxed) as *const VmSpace;
if last_ptr != Arc::as_ptr(self) {
self.pt.activate();
let last_ptr = LAST_ACTIVATED_VM_SPACE.load();
let ptr = Arc::into_raw(Arc::clone(self)) as *mut VmSpace;
activated_vm_space.store(ptr, Ordering::Relaxed);
if !last_ptr.is_null() {
// SAFETY: If the pointer is not NULL, it must be a valid
// pointer casted with `Arc::into_raw` on the last activated
// `Arc<VmSpace>`.
let last = unsafe { Arc::from_raw(last_ptr) };
debug_assert!(!Arc::ptr_eq(self, &last));
let mut last_cpus = last.activated_cpus.lock();
debug_assert!(last_cpus.contains(cpu));
last_cpus.remove(cpu);
}
LAST_ACTIVATED_VM_SPACE.store(Arc::into_raw(Arc::clone(self)));
}
if activated_cpus.count() > 1 {
// We don't support remote TLB flushing yet. It is less desirable
// to activate a `VmSpace` on more than one CPU.
log::warn!("A `VmSpace` is activated on more than one CPU");
// SAFETY: The pointer is cast from an `Arc` when it's activated
// the last time, so it can be restored and only restored once.
drop(unsafe { Arc::from_raw(last_ptr) });
}
}
/// Clears all mappings.
pub fn clear(&self) {
let mut cursor = self.pt.cursor_mut(&(0..MAX_USERSPACE_VADDR)).unwrap();
loop {
// SAFETY: It is safe to un-map memory in the userspace.
let result = unsafe { cursor.take_next(MAX_USERSPACE_VADDR - cursor.virt_addr()) };
match result {
PageTableItem::Mapped { page, .. } => {
drop(page);
}
PageTableItem::NotMapped { .. } => {
break;
}
PageTableItem::MappedUntracked { .. } => {
panic!("found untracked memory mapped into `VmSpace`");
}
}
}
// TODO: currently this method calls x86_64::flush_all(), which rewrite the Cr3 register.
// We should replace it with x86_64::flush_pcid(InvPicdCommand::AllExceptGlobal) after enabling PCID.
tlb_flush_all_excluding_global();
}
pub(crate) fn handle_page_fault(
@ -199,25 +181,12 @@ impl VmSpace {
pub fn fork_copy_on_write(&self) -> Self {
// Protect the parent VM space as read-only.
let end = MAX_USERSPACE_VADDR;
let mut cursor = self.pt.cursor_mut(&(0..end)).unwrap();
let mut cursor = self.cursor_mut(&(0..end)).unwrap();
let mut op = |prop: &mut PageProperty| {
prop.flags -= PageFlags::W;
};
loop {
// SAFETY: It is safe to protect memory in the userspace.
unsafe {
if cursor
.protect_next(end - cursor.virt_addr(), &mut op)
.is_none()
{
break;
}
};
}
// TODO: currently this method calls x86_64::flush_all(), which rewrite the Cr3 register.
// We should replace it with x86_64::flush_pcid(InvPicdCommand::AllExceptGlobal) after enabling PCID.
tlb_flush_all_excluding_global();
cursor.protect(end, &mut op);
let page_fault_handler = {
let new_handler = Once::new();
@ -227,10 +196,22 @@ impl VmSpace {
new_handler
};
let CursorMut {
pt_cursor,
activation_lock,
..
} = cursor;
let new_pt = self.pt.clone_with(pt_cursor);
// Release the activation lock after the page table is cloned to
// prevent modification to the parent page table while cloning.
drop(activation_lock);
Self {
pt: self.pt.clone_with(cursor),
pt: new_pt,
page_fault_handler,
activated_cpus: SpinLock::new(CpuSet::new_empty()),
activation_lock: RwLock::new(()),
}
}
@ -326,35 +307,42 @@ impl Cursor<'_> {
///
/// It exclusively owns a sub-tree of the page table, preventing others from
/// reading or modifying the same sub-tree.
pub struct CursorMut<'a>(page_table::CursorMut<'a, UserMode, PageTableEntry, PagingConsts>);
impl CursorMut<'_> {
/// The threshold used to determine whether need to flush TLB all
/// when flushing a range of TLB addresses. If the range of TLB entries
/// to be flushed exceeds this threshold, the overhead incurred by
/// flushing pages individually would surpass the overhead of flushing all entries at once.
const TLB_FLUSH_THRESHOLD: usize = 32 * PAGE_SIZE;
pub struct CursorMut<'a, 'b> {
pt_cursor: page_table::CursorMut<'a, UserMode, PageTableEntry, PagingConsts>,
#[allow(dead_code)]
activation_lock: RwLockReadGuard<'b, ()>,
// Better to store them here, since loading and counting them from the CPU
// list brings non-trivial overhead. We hold the read lock, so the stored set
// is always a superset of the CPUs the `VmSpace` is actually activated on.
activated_cpus: CpuSet,
need_remote_flush: bool,
need_self_flush: bool,
}
impl CursorMut<'_, '_> {
/// Query about the current slot.
///
/// This is the same as [`Cursor::query`].
///
/// This function won't bring the cursor to the next slot.
pub fn query(&mut self) -> Result<VmItem> {
Ok(self.0.query().map(|item| item.try_into().unwrap())?)
Ok(self
.pt_cursor
.query()
.map(|item| item.try_into().unwrap())?)
}
/// Jump to the virtual address.
///
/// This is the same as [`Cursor::jump`].
pub fn jump(&mut self, va: Vaddr) -> Result<()> {
self.0.jump(va)?;
self.pt_cursor.jump(va)?;
Ok(())
}
/// Get the virtual address of the current slot.
pub fn virt_addr(&self) -> Vaddr {
self.0.virt_addr()
self.pt_cursor.virt_addr()
}
/// Map a frame into the current slot.
@ -367,11 +355,10 @@ impl CursorMut<'_> {
// When this bit is truly enabled, it needs to be set at a more appropriate location.
prop.flags |= PageFlags::ACCESSED;
// SAFETY: It is safe to map untyped memory into the userspace.
unsafe {
self.0.map(frame.into(), prop);
}
let old = unsafe { self.pt_cursor.map(frame.into(), prop) };
tlb_flush_addr_range(&(start_va..end_va));
self.issue_tlb_flush(TlbFlushOp::Range(start_va..end_va), old);
self.dispatch_tlb_flush();
}
/// Clear the mapping starting from the current slot.
@ -388,18 +375,13 @@ impl CursorMut<'_> {
pub fn unmap(&mut self, len: usize) {
assert!(len % super::PAGE_SIZE == 0);
let end_va = self.virt_addr() + len;
let need_flush_all = len >= Self::TLB_FLUSH_THRESHOLD;
loop {
// SAFETY: It is safe to un-map memory in the userspace.
let result = unsafe { self.0.take_next(end_va - self.virt_addr()) };
let result = unsafe { self.pt_cursor.take_next(end_va - self.virt_addr()) };
match result {
PageTableItem::Mapped { va, page, .. } => {
if !need_flush_all {
// TODO: Ask other processors to flush the TLB before we
// release the page back to the allocator.
tlb_flush_addr(va);
}
drop(page);
self.issue_tlb_flush(TlbFlushOp::Address(va), Some(page));
}
PageTableItem::NotMapped { .. } => {
break;
@ -409,9 +391,8 @@ impl CursorMut<'_> {
}
}
}
if need_flush_all {
tlb_flush_all_excluding_global();
}
self.dispatch_tlb_flush();
}
/// Change the mapping property starting from the current slot.
@ -426,17 +407,121 @@ impl CursorMut<'_> {
/// This method will panic if `len` is not page-aligned.
pub fn protect(&mut self, len: usize, mut op: impl FnMut(&mut PageProperty)) {
assert!(len % super::PAGE_SIZE == 0);
let end = self.0.virt_addr() + len;
let need_flush_all = len >= Self::TLB_FLUSH_THRESHOLD;
let end = self.virt_addr() + len;
let tlb_prefer_flush_all = len > TLB_FLUSH_ALL_THRESHOLD * PAGE_SIZE;
// SAFETY: It is safe to protect memory in the userspace.
while let Some(range) = unsafe { self.0.protect_next(end - self.0.virt_addr(), &mut op) } {
if !need_flush_all {
tlb_flush_addr(range.start);
while let Some(range) =
unsafe { self.pt_cursor.protect_next(end - self.virt_addr(), &mut op) }
{
if !tlb_prefer_flush_all {
self.issue_tlb_flush(TlbFlushOp::Range(range), None);
}
}
if need_flush_all {
tlb_flush_all_excluding_global();
if tlb_prefer_flush_all {
self.issue_tlb_flush(TlbFlushOp::All, None);
}
self.dispatch_tlb_flush();
}
fn issue_tlb_flush(&self, op: TlbFlushOp, drop_after_flush: Option<DynPage>) {
let request = TlbFlushRequest {
op,
drop_after_flush,
};
// Fast path for single CPU cases.
if !self.need_remote_flush {
if self.need_self_flush {
request.do_flush();
}
return;
}
// Slow path for multi-CPU cases.
for cpu in self.activated_cpus.iter() {
let mut queue = TLB_FLUSH_REQUESTS.get_on_cpu(cpu).lock();
queue.push_back(request.clone());
}
}
fn dispatch_tlb_flush(&self) {
if !self.need_remote_flush {
return;
}
fn do_remote_flush() {
let preempt_guard = disable_preempt();
let mut requests = TLB_FLUSH_REQUESTS
.get_on_cpu(preempt_guard.current_cpu())
.lock();
if requests.len() > TLB_FLUSH_ALL_THRESHOLD {
// TODO: in most cases, we need only to flush all the TLB entries
// for an ASID if it is enabled.
crate::arch::mm::tlb_flush_all_excluding_global();
requests.clear();
} else {
while let Some(request) = requests.pop_front() {
request.do_flush();
if matches!(request.op, TlbFlushOp::All) {
requests.clear();
break;
}
}
}
}
crate::smp::inter_processor_call(&self.activated_cpus.clone(), do_remote_flush);
}
}
/// The threshold used to determine whether we need to flush all TLB entries
/// when handling a batch of TLB flush requests. If the number of requests
/// exceeds this threshold, the overhead incurred by flushing pages
/// individually would surpass the overhead of flushing all entries at once.
const TLB_FLUSH_ALL_THRESHOLD: usize = 32;
cpu_local! {
/// The queue of pending requests.
static TLB_FLUSH_REQUESTS: SpinLock<VecDeque<TlbFlushRequest>> = SpinLock::new(VecDeque::new());
/// The `Arc` pointer to the activated VM space on this CPU. If the pointer
/// is NULL, it means that the activated page table is merely the kernel
/// page table.
// TODO: If we are enabling ASID, we need to maintain the TLB state of each
// CPU, rather than merely the activated `VmSpace`. When ASID is enabled,
// the non-active `VmSpace`s can still have their TLB entries in the CPU!
static ACTIVATED_VM_SPACE: AtomicPtr<VmSpace> = AtomicPtr::new(core::ptr::null_mut());
}
#[derive(Debug, Clone)]
struct TlbFlushRequest {
op: TlbFlushOp,
// If we need to remove a mapped page from the page table, we can only
// recycle the page after all the relevant TLB entries in all CPUs are
// flushed. Otherwise, if the page is recycled for other purposes, a user
// space program could still access the page through stale TLB entries.
#[allow(dead_code)]
drop_after_flush: Option<DynPage>,
}
#[derive(Debug, Clone)]
enum TlbFlushOp {
All,
Address(Vaddr),
Range(Range<Vaddr>),
}
impl TlbFlushRequest {
/// Perform the TLB flush operation on the current CPU.
fn do_flush(&self) {
use crate::arch::mm::{
tlb_flush_addr, tlb_flush_addr_range, tlb_flush_all_excluding_global,
};
match &self.op {
TlbFlushOp::All => tlb_flush_all_excluding_global(),
TlbFlushOp::Address(addr) => tlb_flush_addr(*addr),
TlbFlushOp::Range(range) => tlb_flush_addr_range(range),
}
}
}
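To make the `TLB_FLUSH_ALL_THRESHOLD` heuristic above concrete, a standalone sketch assuming 4-KiB pages (the helper name is made up):

const PAGE_SIZE: usize = 4096; // assumption for this example
const TLB_FLUSH_ALL_THRESHOLD: usize = 32;

// Covering more than 32 pages (128 KiB here) makes `protect` issue a single
// `TlbFlushOp::All` instead of one request per protected range; a queue of
// more than 32 pending requests is collapsed the same way in `do_remote_flush`.
fn prefer_flush_all(len: usize) -> bool {
    len > TLB_FLUSH_ALL_THRESHOLD * PAGE_SIZE
}
// prefer_flush_all(16 * PAGE_SIZE) == false; prefer_flush_all(64 * PAGE_SIZE) == true.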

View File

@ -32,7 +32,7 @@ use crate::{
/// However, if called on the current processor, it will be synchronous.
pub fn inter_processor_call(targets: &CpuSet, f: fn()) {
let irq_guard = trap::disable_local();
let this_cpu_id = irq_guard.current_cpu() as usize;
let this_cpu_id = irq_guard.current_cpu();
let irq_num = INTER_PROCESSOR_CALL_IRQ.get().unwrap().num();
let mut call_on_self = false;
@ -41,7 +41,7 @@ pub fn inter_processor_call(targets: &CpuSet, f: fn()) {
call_on_self = true;
continue;
}
CALL_QUEUES.get_on_cpu(cpu_id as u32).lock().push_back(f);
CALL_QUEUES.get_on_cpu(cpu_id).lock().push_back(f);
}
for cpu_id in targets.iter() {
if cpu_id == this_cpu_id {
@ -49,7 +49,7 @@ pub fn inter_processor_call(targets: &CpuSet, f: fn()) {
}
// SAFETY: It is safe to send inter processor call IPI to other CPUs.
unsafe {
crate::arch::irq::send_ipi(cpu_id as u32, irq_num);
crate::arch::irq::send_ipi(cpu_id, irq_num);
}
}
if call_on_self {
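A hypothetical caller, shown only to illustrate the contract in the doc comment above; `drain_tlb_queue` and `flush_everywhere` are made-up names, while `CpuSet` and `inter_processor_call` are from this diff.

fn drain_tlb_queue() {
    // Placeholder body; the real callback used by `dispatch_tlb_flush`
    // drains the per-CPU `TLB_FLUSH_REQUESTS` queue.
}

fn flush_everywhere() {
    let mut targets = CpuSet::new_empty();
    targets.add_all();
    // Runs synchronously on the current CPU (if included) and via IPI on the rest.
    inter_processor_call(&targets, drain_tlb_queue);
}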

View File

@ -3,6 +3,7 @@
/// A guard for disabling preemption.
#[clippy::has_significant_drop]
#[must_use]
#[derive(Debug)]
pub struct DisabledPreemptGuard {
// This private field prevents user from constructing values of this type directly.
_private: (),