From 657d6719c7e28cbfba459c7b5fef305d95e21e0e Mon Sep 17 00:00:00 2001
From: LI Qing
Date: Wed, 15 May 2024 15:56:10 +0800
Subject: [PATCH] Add the `fast_copy` and `fast_copy_nonoverlapping` in frame

---
 framework/aster-frame/src/arch/x86/mm/mod.rs  |   3 +
 framework/aster-frame/src/arch/x86/mm/util.rs | 178 ++++++++++++++++++
 framework/aster-frame/src/io_mem.rs           |   4 +-
 framework/aster-frame/src/vm/frame.rs         |   6 +-
 4 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 framework/aster-frame/src/arch/x86/mm/util.rs

diff --git a/framework/aster-frame/src/arch/x86/mm/mod.rs b/framework/aster-frame/src/arch/x86/mm/mod.rs
index f1d611fc4..e247aabc6 100644
--- a/framework/aster-frame/src/arch/x86/mm/mod.rs
+++ b/framework/aster-frame/src/arch/x86/mm/mod.rs
@@ -1,8 +1,11 @@
 // SPDX-License-Identifier: MPL-2.0
 
+mod util;
+
 use alloc::fmt;
 
 use pod::Pod;
+pub use util::{fast_copy, fast_copy_nonoverlapping};
 use x86_64::{instructions::tlb, structures::paging::PhysFrame, VirtAddr};
 
 use crate::vm::{
diff --git a/framework/aster-frame/src/arch/x86/mm/util.rs b/framework/aster-frame/src/arch/x86/mm/util.rs
new file mode 100644
index 000000000..e83159acd
--- /dev/null
+++ b/framework/aster-frame/src/arch/x86/mm/util.rs
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: MPL-2.0
+
+/// Copies `count * size_of::<T>()` bytes from `src` to `dst`.
+/// The source and destination may overlap.
+///
+/// If the source and destination will never overlap, `fast_copy_nonoverlapping` can be used instead.
+///
+/// # Performance
+///
+/// This function is provided as a fast alternative to `core::ptr::copy` by
+/// utilizing the CPU's `rep movsq` and `rep movsb` instructions for bulk memory copying.
+/// These instructions can result in more efficient data transfers by moving larger blocks
+/// of memory in a single operation, leading to fewer CPU cycles and better performance
+/// in certain scenarios.
+///
+/// # Safety
+///
+/// The safety requirements of this function are consistent with `core::ptr::copy`.
+#[inline]
+pub unsafe fn fast_copy<T>(src: *const T, dst: *mut T, count: usize) {
+    if src == dst || count == 0 {
+        return;
+    }
+
+    if src < dst && src.add(count) > dst {
+        // Overlap and src is before dst
+        backward_copy(src, dst, count);
+    } else {
+        // No overlap, or src is after dst
+        forward_copy(src, dst, count);
+    }
+}
+
+/// Copies `count * size_of::<T>()` bytes from `src` to `dst`.
+/// The source and destination must not overlap.
+///
+/// For regions of memory which might overlap, use `fast_copy` instead.
+///
+/// # Performance
+///
+/// This function is provided as a fast alternative to `core::ptr::copy_nonoverlapping` by
+/// utilizing the CPU's `rep movsq` and `rep movsb` instructions for bulk memory copying.
+/// These instructions can result in more efficient data transfers by moving larger blocks
+/// of memory in a single operation, leading to fewer CPU cycles and better performance
+/// in certain scenarios.
+///
+/// # Safety
+///
+/// The safety requirements of this function are consistent with `core::ptr::copy_nonoverlapping`.
+#[inline]
+pub unsafe fn fast_copy_nonoverlapping<T>(src: *const T, dst: *mut T, count: usize) {
+    if count == 0 {
+        return;
+    }
+
+    forward_copy(src, dst, count);
+}
+
+/// Copies `count * size_of::<T>()` bytes from `src` to `dst`, from lower to higher addresses.
+///
+/// # Safety
+///
+/// The `src` and `dst` must point to valid memory regions.
+/// If the memory regions of `src` and `dst` overlap, `src` must be higher than `dst`.
+#[inline]
+unsafe fn forward_copy<T>(src: *const T, dst: *mut T, count: usize) {
+    let bytes_count = count * core::mem::size_of::<T>();
+
+    // The direction of string copy instructions such as `rep movsb` is controlled by DF flag.
+    // If `DF = 0`, then data copy is repeated from lower addresses to higher ones;
+    // Otherwise, the data copy will be done in the reversed direction.
+    // The System V ABI manual requires `DF = 0` on function entry
+    // and all code before the `rep movsb` instruction in this function do not change DF flag.
+    // Thus, we can safely assume `DF = 0`, which is exactly what we want.
+    if bytes_count % 8 == 0 {
+        // In most cases, `movsq` is faster than `movsb`
+        // because it transfers larger chunks of data in a single operation.
+        core::arch::asm!(
+            "rep movsq",
+            in("rcx") bytes_count / 8,
+            in("rsi") src,
+            in("rdi") dst,
+            lateout("rcx") _,
+            lateout("rsi") _,
+            lateout("rdi") _
+        );
+    } else {
+        core::arch::asm!(
+            "rep movsb",
+            in("rcx") bytes_count,
+            in("rsi") src,
+            in("rdi") dst,
+            lateout("rcx") _,
+            lateout("rsi") _,
+            lateout("rdi") _
+        );
+    }
+}
+
+/// Copies `count * size_of::<T>()` bytes from `src` to `dst`, from higher to lower addresses.
+///
+/// # Safety
+///
+/// The `src` and `dst` must point to valid memory regions.
+/// If the memory regions of `src` and `dst` overlap, `src` must be lower than `dst`.
+#[inline]
+unsafe fn backward_copy<T>(src: *const T, dst: *mut T, count: usize) {
+    let bytes_count = count * core::mem::size_of::<T>();
+    let last_src = (src as *const u8).add(bytes_count).offset(-1);
+    let last_dst = (dst as *mut u8).add(bytes_count).offset(-1);
+
+    // Rust's inline assembly rules require the direction flag (DF) to be clear when an
+    // `asm!` block exits, so `std` and `cld` must be issued within the same block.
+    core::arch::asm!(
+        "std", // Set the direction flag (DF)
+        "rep movsb",
+        "cld", // Clear the direction flag (DF) again
+        in("rcx") bytes_count,
+        in("rsi") last_src,
+        in("rdi") last_dst,
+        lateout("rcx") _,
+        lateout("rsi") _,
+        lateout("rdi") _
+    );
+}
+
+#[cfg(ktest)]
+mod test {
+    use alloc::vec;
+
+    use super::*;
+    #[ktest]
+    fn test_fast_copy_nonoverlapping() {
+        let src = vec![0u8; 8];
+        let mut dst = vec![1u8; 8];
+
+        unsafe {
+            fast_copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr(), 8);
+        }
+        assert_eq!(src, dst);
+    }
+
+    #[ktest]
+    fn test_fast_copy_src_after_dst() {
+        let mut src = vec![0u8; 8];
+        src.extend(vec![1u8; 8]);
+
+        unsafe {
+            fast_copy(src.as_ptr().add(4), src.as_mut_ptr(), 8);
+        }
+
+        let expected_left = {
+            let mut vec = vec![0u8; 4];
+            vec.extend(vec![1u8; 4]);
+            vec
+        };
+
+        assert_eq!(expected_left, src[0..8]);
+    }
+
+    #[ktest]
+    fn test_fast_copy_src_before_dst() {
+        let mut src = vec![0u8; 8];
+        src.extend(vec![1u8; 8]);
+
+        unsafe {
+            fast_copy(src.as_ptr().add(4), src.as_mut_ptr().add(8), 8);
+        }
+
+        let expected_right = {
+            let mut vec = vec![0u8; 4];
+            vec.extend(vec![1u8; 4]);
+            vec
+        };
+
+        assert_eq!(expected_right, src[8..]);
+    }
+}
diff --git a/framework/aster-frame/src/io_mem.rs b/framework/aster-frame/src/io_mem.rs
index 2c5b3b035..5b7f3e734 100644
--- a/framework/aster-frame/src/io_mem.rs
+++ b/framework/aster-frame/src/io_mem.rs
@@ -19,7 +19,7 @@ impl VmIo for IoMem {
     fn read_bytes(&self, offset: usize, buf: &mut [u8]) -> crate::Result<()> {
         self.check_range(offset, buf.len())?;
         unsafe {
-            core::ptr::copy(
+            crate::arch::mm::fast_copy(
                 (self.virtual_address + offset) as *const u8,
                 buf.as_mut_ptr(),
                 buf.len(),
@@ -31,7 +31,7 @@ impl VmIo for IoMem {
     fn write_bytes(&self, offset: usize, buf: &[u8]) -> crate::Result<()> {
         self.check_range(offset, buf.len())?;
         unsafe {
-            core::ptr::copy(
+            crate::arch::mm::fast_copy(
                 buf.as_ptr(),
                 (self.virtual_address + offset) as *mut u8,
                 buf.len(),
diff --git a/framework/aster-frame/src/vm/frame.rs b/framework/aster-frame/src/vm/frame.rs
index 8ceb113eb..eec25e5bb 100644
--- a/framework/aster-frame/src/vm/frame.rs
+++ b/framework/aster-frame/src/vm/frame.rs
@@ -249,7 +249,7 @@ impl VmFrame {
 
         // Safety: src and dst is not overlapped.
         unsafe {
-            core::ptr::copy_nonoverlapping(src.as_ptr(), self.as_mut_ptr(), PAGE_SIZE);
+            crate::arch::mm::fast_copy_nonoverlapping(src.as_ptr(), self.as_mut_ptr(), PAGE_SIZE);
         }
     }
 }
@@ -589,7 +589,7 @@ impl<'a> VmReader<'a> {
         // Safety: the memory range is valid since `copy_len` is the minimum
         // of the reader's remaining data and the writer's available space.
         unsafe {
-            core::ptr::copy(self.cursor, writer.cursor, copy_len);
+            crate::arch::mm::fast_copy(self.cursor, writer.cursor, copy_len);
             self.cursor = self.cursor.add(copy_len);
             writer.cursor = writer.cursor.add(copy_len);
         }
@@ -714,7 +714,7 @@ impl<'a> VmWriter<'a> {
         // Safety: the memory range is valid since `copy_len` is the minimum
         // of the reader's remaining data and the writer's available space.
         unsafe {
-            core::ptr::copy(reader.cursor, self.cursor, copy_len);
+            crate::arch::mm::fast_copy(reader.cursor, self.cursor, copy_len);
             self.cursor = self.cursor.add(copy_len);
             reader.cursor = reader.cursor.add(copy_len);
         }