diff --git a/docs/src/kernel/linux-compatibility.md b/docs/src/kernel/linux-compatibility.md
index 3d7d4c135..ae215fba8 100644
--- a/docs/src/kernel/linux-compatibility.md
+++ b/docs/src/kernel/linux-compatibility.md
@@ -304,13 +304,13 @@ provided by Linux on x86-64 architecture.
 | 281 | epoll_pwait | ✅ |
 | 282 | signalfd | ❌ |
 | 283 | timerfd_create | ❌ |
-| 284 | eventfd | ❌ |
+| 284 | eventfd | ✅ |
 | 285 | fallocate | ❌ |
 | 286 | timerfd_settime | ❌ |
 | 287 | timerfd_gettime | ❌ |
 | 288 | accept4 | ❌ |
 | 289 | signalfd4 | ❌ |
-| 290 | eventfd2 | ❌ |
+| 290 | eventfd2 | ✅ |
 | 291 | epoll_create1 | ✅ |
 | 292 | dup3 | ❌ |
 | 293 | pipe2 | ✅ |
diff --git a/kernel/aster-nix/src/fs/file_handle.rs b/kernel/aster-nix/src/fs/file_handle.rs
index 8a8ca02d6..5393983b7 100644
--- a/kernel/aster-nix/src/fs/file_handle.rs
+++ b/kernel/aster-nix/src/fs/file_handle.rs
@@ -80,7 +80,7 @@ pub trait FileLike: Send + Sync + Any {
     }
 
     fn seek(&self, seek_from: SeekFrom) -> Result<usize> {
-        return_errno_with_message!(Errno::EINVAL, "seek is not supported");
+        return_errno_with_message!(Errno::ESPIPE, "seek is not supported");
     }
 
     fn clean_for_close(&self) -> Result<()> {
diff --git a/kernel/aster-nix/src/syscall/eventfd.rs b/kernel/aster-nix/src/syscall/eventfd.rs
new file mode 100644
index 000000000..fd81f4343
--- /dev/null
+++ b/kernel/aster-nix/src/syscall/eventfd.rs
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: MPL-2.0
+
+//! `eventfd()` creates an "eventfd object" (we name it as `EventFile`)
+//! which serves as a mechanism for event wait/notify.
+//!
+//! `EventFile` holds a u64 integer counter.
+//! Writing to `EventFile` increments the counter by the written value.
+//! Reading from `EventFile` returns the current counter value and resets it to zero
+//! (or, if the `EFD_SEMAPHORE` flag is set, returns 1 and decrements the counter by 1).
+//! The read/write operations may be blocked based on file flags.
+//!
+//! For more detailed information about this syscall,
+//! refer to the man 2 eventfd documentation.
+//!
+
+use super::{SyscallReturn, SYS_EVENTFD, SYS_EVENTFD2};
+use crate::{
+    events::{IoEvents, Observer},
+    fs::{
+        file_handle::FileLike,
+        file_table::{FdFlags, FileDesc},
+        utils::{CreationFlags, StatusFlags},
+    },
+    log_syscall_entry,
+    prelude::*,
+    process::signal::{Pauser, Pollee, Poller},
+};
+
+/// Handles the `eventfd` syscall: creates an `EventFile` with no flags set.
+pub fn sys_eventfd(init_val: u64) -> Result<SyscallReturn> {
+    log_syscall_entry!(SYS_EVENTFD);
+    debug!("init_val = 0x{:x}", init_val);
+
+    let fd = do_sys_eventfd2(init_val, Flags::empty());
+
+    Ok(SyscallReturn::Return(fd as _))
+}
+
+/// Handles the `eventfd2` syscall: creates an `EventFile` with the given flags.
+///
+/// Returns `EINVAL` if `flags` contains bits that `Flags` does not define.
+pub fn sys_eventfd2(init_val: u64, flags: u32) -> Result<SyscallReturn> {
+    log_syscall_entry!(SYS_EVENTFD2);
+    trace!("raw flags = {}", flags);
+    let flags = Flags::from_bits(flags)
+        .ok_or_else(|| Error::with_message(Errno::EINVAL, "unknown flags"))?;
+    debug!("init_val = 0x{:x}, flags = {:?}", init_val, flags);
+
+    let fd = do_sys_eventfd2(init_val, flags);
+
+    Ok(SyscallReturn::Return(fd as _))
+}
+
+/// Creates an `EventFile` and inserts it into the file table of `current!()`.
+fn do_sys_eventfd2(init_val: u64, flags: Flags) -> FileDesc {
+    let event_file = EventFile::new(init_val, flags);
+    let fd_flags = if flags.contains(Flags::EFD_CLOEXEC) {
+        FdFlags::CLOEXEC
+    } else {
+        FdFlags::empty()
+    };
+
+    let current = current!();
+    let mut file_table = current.file_table().lock();
+    file_table.insert(Arc::new(event_file), fd_flags)
+}
+
+bitflags! {
+    /// Flags accepted by `eventfd2`.
+    struct Flags: u32 {
+        const EFD_SEMAPHORE = 1;
+        const EFD_CLOEXEC = CreationFlags::O_CLOEXEC.bits();
+        const EFD_NONBLOCK = StatusFlags::O_NONBLOCK.bits();
+    }
+}
+
+/// The file object behind an eventfd descriptor.
+struct EventFile {
+    /// The current value of the event counter.
+    counter: Mutex<u64>,
+    pollee: Pollee,
+    flags: Mutex<Flags>,
+    /// Blocking writers wait here until the counter has room for their value.
+    write_pauser: Arc<Pauser>,
+}
+
+impl EventFile {
+    /// The largest value the counter is allowed to hold.
+    const MAX_COUNTER_VALUE: u64 = u64::MAX - 1;
+
+    /// Creates an event file whose counter starts at `init_val`.
+    fn new(init_val: u64, flags: Flags) -> Self {
+        // The initial events must reflect `init_val`; otherwise a file created with a
+        // non-zero counter is not reported as readable until its first read or write.
+        let init_events = match (init_val != 0, init_val < Self::MAX_COUNTER_VALUE) {
+            (true, true) => IoEvents::IN | IoEvents::OUT,
+            (true, false) => IoEvents::IN,
+            (false, _) => IoEvents::OUT,
+        };
+
+        Self {
+            counter: Mutex::new(init_val),
+            pollee: Pollee::new(init_events),
+            flags: Mutex::new(flags),
+            write_pauser: Pauser::new(),
+        }
+    }
+
+    /// Returns whether `EFD_NONBLOCK` is currently set.
+    fn is_nonblocking(&self) -> bool {
+        self.flags.lock().contains(Flags::EFD_NONBLOCK)
+    }
+
+    /// Updates the pollee's `IN`/`OUT` events to match `counter`,
+    /// and resumes paused writers whenever the file is writable.
+    fn update_io_state(&self, counter: &MutexGuard<u64>) {
+        let is_readable = **counter != 0;
+
+        // if it is possible to write a value of at least "1"
+        // without blocking, the file is writable
+        let is_writable = **counter < Self::MAX_COUNTER_VALUE;
+
+        if is_writable {
+            if is_readable {
+                self.pollee.add_events(IoEvents::IN | IoEvents::OUT);
+            } else {
+                self.pollee.add_events(IoEvents::OUT);
+                self.pollee.del_events(IoEvents::IN);
+            }
+
+            self.write_pauser.resume_all();
+
+            return;
+        }
+
+        if is_readable {
+            self.pollee.add_events(IoEvents::IN);
+            self.pollee.del_events(IoEvents::OUT);
+            return;
+        }
+
+        self.pollee.del_events(IoEvents::IN | IoEvents::OUT);
+
+        // TODO: deal with overflow logic
+    }
+
+    /// Adds val to the counter.
+    ///
+    /// If the new_value is overflowed or exceeds MAX_COUNTER_VALUE, the counter value
+    /// will not be modified, and this method returns `Err(EINVAL)`.
+    fn add_counter_val(&self, val: u64) -> Result<()> {
+        let mut counter = self.counter.lock();
+
+        let new_value = (*counter)
+            .checked_add(val)
+            .ok_or_else(|| Error::with_message(Errno::EINVAL, "arithmetic overflow"))?;
+
+        if new_value <= Self::MAX_COUNTER_VALUE {
+            *counter = new_value;
+            self.update_io_state(&counter);
+            return Ok(());
+        }
+
+        return_errno_with_message!(Errno::EINVAL, "new value exceeds MAX_COUNTER_VALUE");
+    }
+}
+
+impl FileLike for EventFile {
+    /// Reads the counter into the first 8 bytes of `buf`.
+    ///
+    /// Without `EFD_SEMAPHORE` the whole counter is returned and reset to zero; with it,
+    /// 1 is returned and the counter is decremented by 1. While the counter is zero, the
+    /// call fails with `EAGAIN` in non-blocking mode and otherwise waits for `IoEvents::IN`.
+    fn read(&self, buf: &mut [u8]) -> Result<usize> {
+        let read_len = core::mem::size_of::<u64>();
+        if buf.len() < read_len {
+            return_errno_with_message!(Errno::EINVAL, "buf len is less than the size of u64");
+        }
+
+        loop {
+            let mut counter = self.counter.lock();
+
+            // Wait until the counter becomes non-zero
+            if *counter == 0 {
+                if self.is_nonblocking() {
+                    return_errno_with_message!(Errno::EAGAIN, "try reading event file again");
+                }
+
+                self.update_io_state(&counter);
+                drop(counter);
+
+                let poller = Poller::new();
+                if self.pollee.poll(IoEvents::IN, Some(&poller)).is_empty() {
+                    poller.wait()?;
+                }
+                continue;
+            }
+
+            // Copy value from counter, and set the new counter value
+            if self.flags.lock().contains(Flags::EFD_SEMAPHORE) {
+                buf[..read_len].copy_from_slice(1u64.as_bytes());
+                *counter -= 1;
+            } else {
+                buf[..read_len].copy_from_slice((*counter).as_bytes());
+                *counter = 0;
+            }
+
+            self.update_io_state(&counter);
+            break;
+        }
+
+        Ok(read_len)
+    }
+
+    /// Adds the u64 stored in the first 8 bytes of `buf` to the counter.
+    ///
+    /// Fails with `EINVAL` if `buf` is shorter than 8 bytes or the value exceeds
+    /// `MAX_COUNTER_VALUE`. If the addition does not fit, the call fails with `EAGAIN`
+    /// in non-blocking mode and otherwise pauses until the addition succeeds.
+    fn write(&self, buf: &[u8]) -> Result<usize> {
+        let write_len = core::mem::size_of::<u64>();
+        if buf.len() < write_len {
+            return_errno_with_message!(Errno::EINVAL, "buf len is less than the size of u64");
+        }
+
+        let supplied_value = u64::from_bytes(buf);
+
+        // A value above MAX_COUNTER_VALUE can never be added, no matter how far the counter
+        // is drained, so reject it now rather than returning EAGAIN or pausing forever.
+        if supplied_value > Self::MAX_COUNTER_VALUE {
+            return_errno_with_message!(Errno::EINVAL, "supplied value exceeds MAX_COUNTER_VALUE");
+        }
+
+        // Try to add counter val at first
+        if self.add_counter_val(supplied_value).is_ok() {
+            return Ok(write_len);
+        }
+
+        if self.is_nonblocking() {
+            return_errno_with_message!(Errno::EAGAIN, "try writing to event file again");
+        }
+
+        // Wait until counter can be added val to
+        self.write_pauser
+            .pause_until(|| self.add_counter_val(supplied_value).ok())?;
+
+        Ok(write_len)
+    }
+
+    fn poll(&self, mask: IoEvents, poller: Option<&Poller>) -> IoEvents {
+        self.pollee.poll(mask, poller)
+    }
+
+    fn status_flags(&self) -> StatusFlags {
+        if self.is_nonblocking() {
+            StatusFlags::O_NONBLOCK
+        } else {
+            StatusFlags::empty()
+        }
+    }
+
+    /// Sets or clears `EFD_NONBLOCK` according to `new_flags`; other flags are ignored.
+    fn set_status_flags(&self, new_flags: StatusFlags) -> Result<()> {
+        let mut flags = self.flags.lock();
+
+        if new_flags.contains(StatusFlags::O_NONBLOCK) {
+            *flags |= Flags::EFD_NONBLOCK;
+        } else {
+            *flags &= !Flags::EFD_NONBLOCK;
+        }
+
+        // TODO: deal with other flags
+
+        Ok(())
+    }
+
+    fn register_observer(
+        &self,
+        observer: Weak<dyn Observer<IoEvents>>,
+        mask: IoEvents,
+    ) -> Result<()> {
+        self.pollee.register_observer(observer, mask);
+        Ok(())
+    }
+
+    fn unregister_observer(
+        &self,
+        observer: &Weak<dyn Observer<IoEvents>>,
+    ) -> Option<Weak<dyn Observer<IoEvents>>> {
+        self.pollee.unregister_observer(observer)
+    }
+}
diff --git a/kernel/aster-nix/src/syscall/mod.rs b/kernel/aster-nix/src/syscall/mod.rs
index c429f6ce2..82932211e 100644
--- a/kernel/aster-nix/src/syscall/mod.rs
+++ b/kernel/aster-nix/src/syscall/mod.rs
@@ -5,15 +5,38 @@
 use aster_frame::cpu::UserContext;
 
 use self::{
-    accept::sys_accept, alarm::sys_alarm, bind::sys_bind, connect::sys_connect,
-    execve::sys_execveat, getgroups::sys_getgroups, getpeername::sys_getpeername,
-    getrandom::sys_getrandom, getresgid::sys_getresgid, getresuid::sys_getresuid,
-    getsid::sys_getsid, getsockname::sys_getsockname, getsockopt::sys_getsockopt,
-    listen::sys_listen, pread64::sys_pread64, recvfrom::sys_recvfrom, sendto::sys_sendto,
-    setfsgid::sys_setfsgid, setfsuid::sys_setfsuid, setgid::sys_setgid, setgroups::sys_setgroups,
-    setregid::sys_setregid, setresgid::sys_setresgid, setresuid::sys_setresuid,
-    setreuid::sys_setreuid, setsid::sys_setsid, setsockopt::sys_setsockopt, setuid::sys_setuid,
-    shutdown::sys_shutdown, sigaltstack::sys_sigaltstack, socket::sys_socket,
+    accept::sys_accept,
+    alarm::sys_alarm,
+    bind::sys_bind,
+    
connect::sys_connect, + eventfd::{sys_eventfd, sys_eventfd2}, + execve::sys_execveat, + getgroups::sys_getgroups, + getpeername::sys_getpeername, + getrandom::sys_getrandom, + getresgid::sys_getresgid, + getresuid::sys_getresuid, + getsid::sys_getsid, + getsockname::sys_getsockname, + getsockopt::sys_getsockopt, + listen::sys_listen, + pread64::sys_pread64, + recvfrom::sys_recvfrom, + sendto::sys_sendto, + setfsgid::sys_setfsgid, + setfsuid::sys_setfsuid, + setgid::sys_setgid, + setgroups::sys_setgroups, + setregid::sys_setregid, + setresgid::sys_setresgid, + setresuid::sys_setresuid, + setreuid::sys_setreuid, + setsid::sys_setsid, + setsockopt::sys_setsockopt, + setuid::sys_setuid, + shutdown::sys_shutdown, + sigaltstack::sys_sigaltstack, + socket::sys_socket, socketpair::sys_socketpair, }; use crate::{ @@ -115,6 +138,7 @@ mod connect; mod constants; mod dup; mod epoll; +mod eventfd; mod execve; mod exit; mod exit_group; @@ -363,6 +387,8 @@ define_syscall_nums!( SYS_SET_ROBUST_LIST = 273, SYS_UTIMENSAT = 280, SYS_EPOLL_PWAIT = 281, + SYS_EVENTFD = 284, + SYS_EVENTFD2 = 290, SYS_EPOLL_CREATE1 = 291, SYS_PIPE2 = 293, SYS_PRLIMIT64 = 302, @@ -553,6 +579,8 @@ pub fn syscall_dispatch( SYS_SET_ROBUST_LIST => syscall_handler!(2, sys_set_robust_list, args), SYS_UTIMENSAT => syscall_handler!(4, sys_utimensat, args), SYS_EPOLL_PWAIT => syscall_handler!(5, sys_epoll_pwait, args), + SYS_EVENTFD => syscall_handler!(1, sys_eventfd, args), + SYS_EVENTFD2 => syscall_handler!(2, sys_eventfd2, args), SYS_EPOLL_CREATE1 => syscall_handler!(1, sys_epoll_create1, args), SYS_PIPE2 => syscall_handler!(2, sys_pipe2, args), SYS_PRLIMIT64 => syscall_handler!(4, sys_prlimit64, args), diff --git a/kernel/aster-nix/src/syscall/read.rs b/kernel/aster-nix/src/syscall/read.rs index ef8a93489..2b6f4b0d1 100644 --- a/kernel/aster-nix/src/syscall/read.rs +++ b/kernel/aster-nix/src/syscall/read.rs @@ -9,9 +9,13 @@ pub fn sys_read(fd: FileDesc, user_buf_addr: Vaddr, buf_len: usize) -> Result Resu fd, 
user_buf_ptr, user_buf_len
     );
-    let current = current!();
-    let file_table = current.file_table().lock();
-    let file = file_table.get_file(fd)?;
+    let file = {
+        let current = current!();
+        let file_table = current.file_table().lock();
+        file_table.get_file(fd)?.clone()
+    };
+
     if user_buf_len == 0 {
         return Ok(SyscallReturn::Return(0));
     }
diff --git a/regression/apps/Makefile b/regression/apps/Makefile
index 4f7e67421..85852d35e 100644
--- a/regression/apps/Makefile
+++ b/regression/apps/Makefile
@@ -11,6 +11,7 @@ REGRESSION_BUILD_DIR ?= $(INITRAMFS)/regression
 # These test apps are sorted by name
 TEST_APPS := \
+	eventfd2 \
 	execve \
 	fork \
 	fork_c \
 	getpid \
diff --git a/regression/apps/eventfd2/Makefile b/regression/apps/eventfd2/Makefile
new file mode 100644
index 000000000..ce42e33b0
--- /dev/null
+++ b/regression/apps/eventfd2/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: MPL-2.0
+
+include ../test_common.mk
+
+EXTRA_C_FLAGS :=
diff --git a/regression/apps/eventfd2/eventfd2.c b/regression/apps/eventfd2/eventfd2.c
new file mode 100644
index 000000000..aab32d70d
--- /dev/null
+++ b/regression/apps/eventfd2/eventfd2.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MPL-2.0
+
+#include <err.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+// Forks a child that writes several values to an eventfd, then checks that
+// the parent reads back their sum in a single read.
+int main()
+{
+	int efd;
+	int status;
+	uint64_t u;
+	uint64_t expected = 0;
+	ssize_t s;
+
+	uint64_t values[] = { 11, 222, 3333 };
+	size_t length = sizeof(values) / sizeof(values[0]);
+
+	efd = eventfd(0, 0);
+	if (efd == -1)
+		err(EXIT_FAILURE, "eventfd");
+
+	switch (fork()) {
+	case 0:
+		for (size_t j = 0; j < length; j++) {
+			printf("Child writing %" PRIu64 " to efd\n", values[j]);
+			u = values[j];
+			s = write(efd, &u, sizeof(uint64_t));
+			if (s != sizeof(uint64_t))
+				err(EXIT_FAILURE, "write");
+		}
+
+		printf("Child completed write loop\n");
+
+		exit(EXIT_SUCCESS);
+
+	default:
+		// Wait for the child so that every write has landed before reading.
+		if (wait(&status) == -1)
+			err(EXIT_FAILURE, "wait");
+		if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS)
+			errx(EXIT_FAILURE, "child failed");
+
+		for (size_t j = 0; j < length; j++)
+			expected += values[j];
+
+		printf("Parent about to read\n");
+		s = read(efd, &u, sizeof(uint64_t));
+		if (s != sizeof(uint64_t))
+			err(EXIT_FAILURE, "read");
+		printf("Parent read %" PRIu64 " (%#" PRIx64 ") from efd\n", u,
+		       u);
+		// errno is meaningless for a value mismatch, so use errx().
+		if (u != expected)
+			errx(EXIT_FAILURE, "read eventfd: unexpected value");
+		exit(EXIT_SUCCESS);
+
+	case -1:
+		err(EXIT_FAILURE, "fork");
+	}
+}
diff --git a/regression/apps/scripts/process.sh b/regression/apps/scripts/process.sh
index 84087fa6c..b23241390 100755
--- a/regression/apps/scripts/process.sh
+++ b/regression/apps/scripts/process.sh
@@ -8,7 +8,7 @@ SCRIPT_DIR=/regression
 cd ${SCRIPT_DIR}/..
 
 echo "Start process test......"
-tests="hello_world/hello_world fork/fork execve/execve fork_c/fork signal_c/signal_test pthread/pthread_test hello_pie/hello pty/open_pty getpid/getpid"
+tests="hello_world/hello_world fork/fork execve/execve fork_c/fork signal_c/signal_test pthread/pthread_test hello_pie/hello pty/open_pty getpid/getpid eventfd2/eventfd2"
 for testcase in ${tests}
 do
     echo "Running test ${testcase}......"
diff --git a/regression/syscall_test/Makefile b/regression/syscall_test/Makefile
index c878332c1..317b994cd 100644
--- a/regression/syscall_test/Makefile
+++ b/regression/syscall_test/Makefile
@@ -11,6 +11,7 @@ TESTS ?= \
 	chown_test \
 	chroot_test \
 	epoll_test \
+	eventfd_test \
 	fsync_test \
 	getdents_test \
 	link_test \
diff --git a/regression/syscall_test/blocklists/eventfd_test b/regression/syscall_test/blocklists/eventfd_test
new file mode 100644
index 000000000..7a5da35b9
--- /dev/null
+++ b/regression/syscall_test/blocklists/eventfd_test
@@ -0,0 +1,3 @@
+EventfdTest.IllegalPwrite
+EventfdTest.SpliceFromPipePartialSucceeds
+EventfdTest.NotifyNonZero_NoRandomSave