From 525085ab865b3850c0de2d704bab1bba6976a980 Mon Sep 17 00:00:00 2001 From: Zhang Junyang Date: Tue, 10 Jun 2025 20:45:52 +0800 Subject: [PATCH] Implement system call `msync` --- docs/src/kernel/linux-compatibility.md | 4 +- kernel/src/syscall/mmap.rs | 48 ++++----- kernel/src/syscall/msync.rs | 98 ++++++++++++++++++- kernel/src/vm/vmar/mod.rs | 94 +++++++++++++++--- kernel/src/vm/vmar/static_cap.rs | 7 +- kernel/src/vm/vmar/vm_mapping.rs | 22 ++++- test/syscall_test/gvisor/Makefile | 3 +- .../gvisor/blocklists.exfat/msync_test | 1 + test/syscall_test/ltp/testcases/all.txt | 4 +- 9 files changed, 227 insertions(+), 54 deletions(-) create mode 100644 test/syscall_test/gvisor/blocklists.exfat/msync_test diff --git a/docs/src/kernel/linux-compatibility.md b/docs/src/kernel/linux-compatibility.md index c7f927b54..1d738d98a 100644 --- a/docs/src/kernel/linux-compatibility.md +++ b/docs/src/kernel/linux-compatibility.md @@ -15,7 +15,7 @@ support the loading of Linux kernel modules. ## System Calls At the time of writing, -Asterinas implements 204 out of the 336 system calls +Asterinas implements 213 out of the 336 system calls provided by Linux on x86-64 architecture. | Numbers | Names | Is Implemented | @@ -46,7 +46,7 @@ provided by Linux on x86-64 architecture. | 23 | select | ✅ | | 24 | sched_yield | ✅ | | 25 | mremap | ❌ | -| 26 | msync | ❌ | +| 26 | msync | ✅ | | 27 | mincore | ❌ | | 28 | madvise | ✅ | | 29 | shmget | ❌ | diff --git a/kernel/src/syscall/mmap.rs b/kernel/src/syscall/mmap.rs index 84b8e722d..35536068b 100644 --- a/kernel/src/syscall/mmap.rs +++ b/kernel/src/syscall/mmap.rs @@ -12,11 +12,7 @@ use crate::{ file_table::{get_file_fast, FileDesc}, }, prelude::*, - vm::{ - perms::VmPerms, - vmar::is_userspace_vaddr, - vmo::{VmoOptions, VmoRightsOp}, - }, + vm::{perms::VmPerms, vmar::is_userspace_vaddr, vmo::VmoOptions}, }; pub fn sys_mmap( @@ -124,34 +120,28 @@ fn do_sys_mmap( options = options.vmo(shared_vmo); } } else { - let vmo = { - let mut file_table = ctx.thread_local.borrow_file_table_mut(); - let file = get_file_fast!(&mut file_table, fd); - let inode_handle = file.as_inode_or_err()?; + let mut file_table = ctx.thread_local.borrow_file_table_mut(); + let file = get_file_fast!(&mut file_table, fd); + let inode_handle = file.as_inode_or_err()?; - let access_mode = inode_handle.access_mode(); - if vm_perms.contains(VmPerms::READ) && !access_mode.is_readable() { - return_errno!(Errno::EACCES); - } - if option.typ() == MMapType::Shared - && vm_perms.contains(VmPerms::WRITE) - && !access_mode.is_writable() - { - return_errno!(Errno::EACCES); - } + let access_mode = inode_handle.access_mode(); + if vm_perms.contains(VmPerms::READ) && !access_mode.is_readable() { + return_errno!(Errno::EACCES); + } + if option.typ() == MMapType::Shared + && vm_perms.contains(VmPerms::WRITE) + && !access_mode.is_writable() + { + return_errno!(Errno::EACCES); + } - let inode = inode_handle.dentry().inode(); - inode - .page_cache() - .ok_or(Error::with_message( - Errno::EBADF, - "File does not have page cache", - ))? - .to_dyn() - }; + let inode = inode_handle.dentry().inode(); + if inode.page_cache().is_none() { + return_errno_with_message!(Errno::EBADF, "File does not have page cache"); + } options = options - .vmo(vmo) + .inode(inode.clone()) .vmo_offset(offset) .handle_page_faults_around(); } diff --git a/kernel/src/syscall/msync.rs b/kernel/src/syscall/msync.rs index b0ed49179..44494093d 100644 --- a/kernel/src/syscall/msync.rs +++ b/kernel/src/syscall/msync.rs @@ -1,9 +1,99 @@ // SPDX-License-Identifier: MPL-2.0 -use super::SyscallReturn; -use crate::prelude::*; +use align_ext::AlignExt; + +use super::SyscallReturn; +use crate::{prelude::*, thread::kernel_thread::ThreadOptions}; + +bitflags! { + /// Flags for `msync`. + /// + /// See . + pub struct MsyncFlags: i32 { + /// Performs `msync` asynchronously. + const MS_ASYNC = 0x01; + /// Invalidates cache so that other processes mapping the same file + /// will immediately see the changes before this `msync` call. + /// + /// Should be a no-op since we use the same page cache for all processes. + const MS_INVALIDATE = 0x02; + /// Performs `msync` synchronously. + const MS_SYNC = 0x04; + } +} + +macro_rules! return_partially_mapped { + () => { + return_errno_with_message!(Errno::ENOMEM, "`msync` called on a partially mapped range") + }; +} + +pub fn sys_msync(start: Vaddr, size: usize, flag: i32, ctx: &Context) -> Result { + let flags = MsyncFlags::from_bits(flag).ok_or_else(|| Error::new(Errno::EINVAL))?; + + debug!("msync: start = {start:#x}, size = {size}, flags = {flags:?}"); + + if start % PAGE_SIZE != 0 || flags.contains(MsyncFlags::MS_ASYNC | MsyncFlags::MS_SYNC) { + return_errno!(Errno::EINVAL); + } + + if size == 0 { + return Ok(SyscallReturn::Return(0)); + } + + let range = { + let end = start + .checked_add(size) + .ok_or(Error::with_message( + Errno::EINVAL, + "`msync` `size` overflows", + ))? + .align_up(PAGE_SIZE); + start..end + }; + + let user_space = ctx.user_space(); + let root_vmar = user_space.root_vmar().dup()?; + let guard = root_vmar.query(range.clone()); + let mut mappings_iter = guard.iter(); + + // Check if the range is fully mapped. + let Some(first) = mappings_iter.next() else { + return_errno_with_message!(Errno::ENOMEM, "`msync` called on a not mapped range"); + }; + if first.map_to_addr() > range.start { + return_partially_mapped!(); + } + let mut last_end = first.map_end(); + for mapping in mappings_iter { + let start = mapping.map_to_addr(); + if start != last_end { + return_partially_mapped!(); + } + last_end = mapping.map_end(); + } + if last_end < range.end { + return_partially_mapped!(); + } + + // Do nothing if not file-backed, as says. + let inodes = guard + .iter() + .filter_map(|m| m.inode().cloned()) + .collect::>(); + + let task_fn = move || { + for inode in inodes { + // TODO: Sync a necessary range instead of syncing the whole inode. + let _ = inode.sync_all(); + } + }; + + if flags.contains(MsyncFlags::MS_ASYNC) { + ThreadOptions::new(task_fn).spawn(); + } else { + task_fn(); + } -pub fn sys_msync(_start: Vaddr, _size: usize, _flag: i32, _ctx: &Context) -> Result { - // TODO: implement real `msync`. Ok(SyscallReturn::Return(0)) } diff --git a/kernel/src/vm/vmar/mod.rs b/kernel/src/vm/vmar/mod.rs index 11e7e18f8..bf2afb3fc 100644 --- a/kernel/src/vm/vmar/mod.rs +++ b/kernel/src/vm/vmar/mod.rs @@ -16,6 +16,7 @@ use ostd::{ mm::{ tlb::TlbFlushOp, vm_space::CursorMut, PageFlags, PageProperty, VmSpace, MAX_USERSPACE_VADDR, }, + sync::RwMutexReadGuard, task::disable_preempt, }; @@ -24,6 +25,7 @@ use self::{ vm_mapping::{MappedVmo, VmMapping}, }; use crate::{ + fs::utils::Inode, prelude::*, process::{Process, ResourceType}, thread::exception::PageFaultInfo, @@ -164,6 +166,11 @@ impl VmarInner { Some(vm_mapping) } + /// Finds a set of [`VmMapping`]s that intersect with the provided range. + fn query(&self, range: &Range) -> impl Iterator { + self.vm_mappings.find(range) + } + /// Calculates the total amount of overlap between `VmMapping`s /// and the provided range. fn count_overlap_size(&self, range: Range) -> usize { @@ -315,6 +322,13 @@ impl Vmar_ { ) } + fn query(&self, range: Range) -> VmarQueryGuard<'_> { + VmarQueryGuard { + vmar: self.inner.read(), + range, + } + } + fn protect(&self, perms: VmPerms, range: Range) -> Result<()> { assert!(range.start % PAGE_SIZE == 0); assert!(range.end % PAGE_SIZE == 0); @@ -586,6 +600,7 @@ impl Vmar { pub struct VmarMapOptions<'a, R1, R2> { parent: &'a Vmar, vmo: Option>, + inode: Option>, perms: VmPerms, vmo_offset: usize, vmo_limit: usize, @@ -610,6 +625,7 @@ impl<'a, R1, R2> VmarMapOptions<'a, R1, R2> { Self { parent, vmo: None, + inode: None, perms, vmo_offset: 0, vmo_limit: usize::MAX, @@ -622,22 +638,30 @@ impl<'a, R1, R2> VmarMapOptions<'a, R1, R2> { } } - /// Binds a VMO to the mapping. + /// Binds a [`Vmo`] to the mapping. /// - /// If the mapping is a private mapping, its size may not be equal to that of the VMO. - /// For example, it is ok to create a mapping whose size is larger than - /// that of the VMO, although one cannot read from or write to the - /// part of the mapping that is not backed by the VMO. + /// If the mapping is a private mapping, its size may not be equal to that + /// of the [`Vmo`]. For example, it is OK to create a mapping whose size is + /// larger than that of the [`Vmo`], although one cannot read from or write + /// to the part of the mapping that is not backed by the [`Vmo`]. /// - /// So you may wonder: what is the point of supporting such _oversized_ - /// mappings? The reason is two-fold. - /// 1. VMOs are resizable. So even if a mapping is backed by a VMO whose - /// size is equal to that of the mapping initially, we cannot prevent - /// the VMO from shrinking. + /// Such _oversized_ mappings are useful for two reasons: + /// 1. [`Vmo`]s are resizable. So even if a mapping is backed by a VMO + /// whose size is equal to that of the mapping initially, we cannot + /// prevent the VMO from shrinking. /// 2. Mappings are not allowed to overlap by default. As a result, - /// oversized mappings can serve as a placeholder to prevent future - /// mappings from occupying some particular address ranges accidentally. + /// oversized mappings can reserve space for future expansions. + /// + /// The [`Vmo`] of a mapping will be implicitly set if [`Self::inode`] is + /// set. + /// + /// # Panics + /// + /// This function panics if an [`Inode`] is already provided. pub fn vmo(mut self, vmo: Vmo) -> Self { + if self.inode.is_some() { + panic!("Cannot set `vmo` when `inode` is already set"); + } self.vmo = Some(vmo); self @@ -712,6 +736,36 @@ impl<'a, R1, R2> VmarMapOptions<'a, R1, R2> { } } +impl VmarMapOptions<'_, R1, Rights> { + /// Binds an [`Inode`] to the mapping. + /// + /// This is used for file-backed mappings. The provided file inode will be + /// mapped. See [`Self::vmo`] for details on the map size. + /// + /// If an [`Inode`] is provided, the [`Self::vmo`] must not be provided + /// again. The actually mapped [`Vmo`] will be the [`Inode`]'s page cache. + /// + /// # Panics + /// + /// This function panics if: + /// - a [`Vmo`] or [`Inode`] is already provided; + /// - the provided [`Inode`] does not have a page cache. + pub fn inode(mut self, inode: Arc) -> Self { + if self.vmo.is_some() { + panic!("Cannot set `inode` when `vmo` is already set"); + } + self.vmo = Some( + inode + .page_cache() + .expect("Map an inode without page cache") + .to_dyn(), + ); + self.inode = Some(inode); + + self + } +} + impl VmarMapOptions<'_, R1, R2> where Vmo: VmoRightsOp, @@ -726,6 +780,7 @@ where let Self { parent, vmo, + inode, perms, vmo_offset, vmo_limit, @@ -784,6 +839,7 @@ where NonZeroUsize::new(map_size).unwrap(), map_to_addr, vmo, + inode, is_shared, handle_page_faults_around, perms, @@ -832,6 +888,20 @@ where } } +/// A guard that allows querying a [`Vmar`] for its mappings. +pub struct VmarQueryGuard<'a> { + vmar: RwMutexReadGuard<'a, VmarInner>, + range: Range, +} + +impl VmarQueryGuard<'_> { + /// Returns an iterator over the [`VmMapping`]s that intersect with the + /// provided range when calling [`Vmar::query`]. + pub fn iter(&self) -> impl Iterator { + self.vmar.query(&self.range) + } +} + /// Determines whether two ranges are intersected. /// returns false if one of the ranges has a length of 0 pub fn is_intersected(range1: &Range, range2: &Range) -> bool { diff --git a/kernel/src/vm/vmar/static_cap.rs b/kernel/src/vm/vmar/static_cap.rs index 1e7234041..901192567 100644 --- a/kernel/src/vm/vmar/static_cap.rs +++ b/kernel/src/vm/vmar/static_cap.rs @@ -5,7 +5,7 @@ use core::ops::Range; use aster_rights::{Dup, Read, Rights, TRightSet, TRights, Write}; use aster_rights_proc::require; -use super::{VmPerms, Vmar, VmarMapOptions, VmarRightsOp, Vmar_}; +use super::{VmPerms, Vmar, VmarMapOptions, VmarQueryGuard, VmarRightsOp, Vmar_}; use crate::{ prelude::*, thread::exception::PageFaultInfo, vm::page_fault_handler::PageFaultHandler, }; @@ -85,6 +85,11 @@ impl Vmar> { self.0.protect(perms, range) } + /// Finds all the mapped regions that intersect with the specified range. + pub fn query(&self, range: Range) -> VmarQueryGuard<'_> { + self.0.query(range) + } + /// Clears all mappings. /// /// After being cleared, this vmar will become an empty vmar diff --git a/kernel/src/vm/vmar/vm_mapping.rs b/kernel/src/vm/vmar/vm_mapping.rs index c25d34559..327b72b6c 100644 --- a/kernel/src/vm/vmar/vm_mapping.rs +++ b/kernel/src/vm/vmar/vm_mapping.rs @@ -16,6 +16,7 @@ use ostd::{ use super::{interval_set::Interval, RssType}; use crate::{ + fs::utils::Inode, prelude::*, thread::exception::PageFaultInfo, vm::{ @@ -42,7 +43,7 @@ use crate::{ /// This type controls the actual mapping in the [`VmSpace`]. It is a linear /// type and cannot be [`Drop`]. To remove a mapping, use [`Self::unmap`]. #[derive(Debug)] -pub(super) struct VmMapping { +pub struct VmMapping { /// The size of mapping, in bytes. The map size can even be larger than the /// size of VMO. Those pages outside VMO range cannot be read or write. /// @@ -56,6 +57,11 @@ pub(super) struct VmMapping { /// The start of the virtual address maps to the start of the range /// specified in [`MappedVmo`]. vmo: Option, + /// The inode of the file that backs the mapping. + /// + /// If the inode is `Some`, it means that the mapping is file-backed. + /// And the `vmo` field must be the page cache of the inode. + inode: Option>, /// Whether the mapping is shared. /// /// The updates to a shared mapping are visible among processes, or carried @@ -83,6 +89,7 @@ impl VmMapping { map_size: NonZeroUsize, map_to_addr: Vaddr, vmo: Option, + inode: Option>, is_shared: bool, handle_page_faults_around: bool, perms: VmPerms, @@ -91,6 +98,7 @@ impl VmMapping { map_size, map_to_addr, vmo, + inode, is_shared, handle_page_faults_around, perms, @@ -100,6 +108,7 @@ impl VmMapping { pub(super) fn new_fork(&self) -> Result { Ok(VmMapping { vmo: self.vmo.as_ref().map(|vmo| vmo.dup()).transpose()?, + inode: self.inode.clone(), ..*self }) } @@ -119,12 +128,17 @@ impl VmMapping { self.map_size.get() } - // Returns the permissions of pages in the mapping. + /// Returns the permissions of pages in the mapping. pub fn perms(&self) -> VmPerms { self.perms } - // Returns the mapping's RSS type. + /// Returns the inode of the file that backs the mapping. + pub fn inode(&self) -> Option<&Arc> { + self.inode.as_ref() + } + + /// Returns the mapping's RSS type. pub fn rss_type(&self) -> RssType { if self.vmo.is_none() { RssType::RSS_ANONPAGES @@ -407,12 +421,14 @@ impl VmMapping { map_to_addr: self.map_to_addr, map_size: NonZeroUsize::new(left_size).unwrap(), vmo: l_vmo, + inode: self.inode.clone(), ..self }; let right = Self { map_to_addr: at, map_size: NonZeroUsize::new(right_size).unwrap(), vmo: r_vmo, + inode: self.inode, ..self }; diff --git a/test/syscall_test/gvisor/Makefile b/test/syscall_test/gvisor/Makefile index 583d55beb..03d0e64b0 100644 --- a/test/syscall_test/gvisor/Makefile +++ b/test/syscall_test/gvisor/Makefile @@ -25,6 +25,7 @@ TESTS ?= \ mknod_test \ mmap_test \ mount_test \ + msync_test \ open_create_test \ open_test \ ppoll_test \ @@ -33,9 +34,9 @@ TESTS ?= \ preadv2_test \ proc_test \ pselect_test \ + pty_test \ pwrite64_test \ pwritev2_test \ - pty_test \ read_test \ readv_test \ rename_test \ diff --git a/test/syscall_test/gvisor/blocklists.exfat/msync_test b/test/syscall_test/gvisor/blocklists.exfat/msync_test new file mode 100644 index 000000000..3a7449026 --- /dev/null +++ b/test/syscall_test/gvisor/blocklists.exfat/msync_test @@ -0,0 +1 @@ +All/MsyncFullParamTest.InvalidateUnlockedSucceeds/* \ No newline at end of file diff --git a/test/syscall_test/ltp/testcases/all.txt b/test/syscall_test/ltp/testcases/all.txt index 0120e02d5..78a5fc5d9 100644 --- a/test/syscall_test/ltp/testcases/all.txt +++ b/test/syscall_test/ltp/testcases/all.txt @@ -924,8 +924,8 @@ mprotect05 msync01 msync02 -# msync03 -# msync04 +# msync03 # FIXME: This fails because the heap place holder is treated as mapped. Fix after https://github.com/asterinas/asterinas/pull/1999. +# msync04 # TODO: support `/proc/self/pagemap` and `/proc/kpageflags`. # munlock01 # munlock02