Add dynamically-allocated CPU-local objects

2025-06-08 04:55:03 +00:00 · 2025-06-03 14:00:47 +00:00 · 2025-06-03 14:00:47 +00:00 · dfd3042276
commit dfd3042276
parent f24bc718fa
12 changed files with 698 additions and 240 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1316,6 +1316,7 @@ dependencies = [
 "align_ext",
 "bit_field",
 "bitflags 1.3.2",
+ "bitvec",
 "buddy_system_allocator",
 "cfg-if",
 "fdt",
--- a/kernel/comps/softirq/src/taskless.rs
+++ b/kernel/comps/softirq/src/taskless.rs
@ -8,7 +8,7 @@ use core::{
 };

 use intrusive_collections::{intrusive_adapter, LinkedList, LinkedListAtomicLink};
-use ostd::{cpu::local::CpuLocal, cpu_local, trap};
+use ostd::{cpu::local::StaticCpuLocal, cpu_local, trap};

 use super::{
    softirq_id::{TASKLESS_SOFTIRQ_ID, TASKLESS_URGENT_SOFTIRQ_ID},
@ -123,7 +123,7 @@ impl Taskless {

 fn do_schedule(
    taskless: &Arc<Taskless>,
-    taskless_list: &'static CpuLocal<RefCell<LinkedList<TasklessAdapter>>>,
+    taskless_list: &'static StaticCpuLocal<RefCell<LinkedList<TasklessAdapter>>>,
 ) {
    if taskless.is_disabled.load(Ordering::Acquire) {
        return;
@ -158,7 +158,7 @@ pub(super) fn init() {
 /// If the `Taskless` is ready to be executed, it will be set to not scheduled
 /// and can be scheduled again.
 fn taskless_softirq_handler(
-    taskless_list: &'static CpuLocal<RefCell<LinkedList<TasklessAdapter>>>,
+    taskless_list: &'static StaticCpuLocal<RefCell<LinkedList<TasklessAdapter>>>,
    softirq_id: u8,
 ) {
    let mut processing_list = {
--- a/osdk/deps/frame-allocator/src/smp_counter.rs
+++ b/osdk/deps/frame-allocator/src/smp_counter.rs
@ -2,7 +2,7 @@

 //! A fast and scalable SMP counter.

-use ostd::cpu::{all_cpus, local::CpuLocal, CpuId};
+use ostd::cpu::{all_cpus, local::StaticCpuLocal, CpuId};

 use core::sync::atomic::{AtomicIsize, Ordering};

@ -43,7 +43,7 @@ macro_rules! fast_smp_counter {
 /// Nevertheless, if the sum of added value exceeds [`usize::MAX`] the counter
 /// will wrap on overflow.
 pub struct FastSmpCounter {
-    per_cpu_counter: &'static CpuLocal<AtomicIsize>,
+    per_cpu_counter: &'static StaticCpuLocal<AtomicIsize>,
 }

 impl FastSmpCounter {
@ -51,7 +51,7 @@ impl FastSmpCounter {
    ///
    /// This function should only be used by the [`fast_smp_counter!`] macro.
    #[doc(hidden)]
-    pub const fn new(per_cpu_counter: &'static CpuLocal<AtomicIsize>) -> Self {
+    pub const fn new(per_cpu_counter: &'static StaticCpuLocal<AtomicIsize>) -> Self {
        Self { per_cpu_counter }
    }

--- a/osdk/deps/heap-allocator/src/allocator.rs
+++ b/osdk/deps/heap-allocator/src/allocator.rs
@ -21,7 +21,7 @@ use crate::slab_cache::SlabCache;

 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 #[repr(usize)]
-enum CommonSizeClass {
+pub(crate) enum CommonSizeClass {
    Bytes8 = 8,
    Bytes16 = 16,
    Bytes32 = 32,
@ -34,7 +34,7 @@ enum CommonSizeClass {
 }

 impl CommonSizeClass {
-    const fn from_layout(layout: Layout) -> Option<Self> {
+    pub(crate) const fn from_layout(layout: Layout) -> Option<Self> {
        let size_class = match layout.size() {
            0..=8 => CommonSizeClass::Bytes8,
            9..=16 => CommonSizeClass::Bytes16,
@ -67,7 +67,7 @@ impl CommonSizeClass {
        })
    }

-    fn from_size(size: usize) -> Option<Self> {
+    pub(crate) const fn from_size(size: usize) -> Option<Self> {
        match size {
            8 => Some(CommonSizeClass::Bytes8),
            16 => Some(CommonSizeClass::Bytes16),
--- a/osdk/deps/heap-allocator/src/cpu_local_allocator.rs
+++ b/osdk/deps/heap-allocator/src/cpu_local_allocator.rs
@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MPL-2.0
+
+use crate::allocator::CommonSizeClass;
+use alloc::vec::Vec;
+use core::ops::Deref;
+use ostd::{
+    cpu::{
+        local::{DynCpuLocalChunk, DynamicCpuLocal},
+        CpuId,
+    },
+    prelude::*,
+    sync::SpinLock,
+    Error,
+};
+
+/// Allocator for dynamically-allocated CPU-local objects.
+///
+/// Every object handed out by this allocator occupies one `ITEM_SIZE`-byte
+/// slot of a [`DynCpuLocalChunk`].
+struct CpuLocalAllocator<const ITEM_SIZE: usize> {
+    // The chunks created so far. Both `alloc` and `dealloc` mutate this list,
+    // so it is kept behind a spin lock.
+    chunks: SpinLock<Vec<DynCpuLocalChunk<ITEM_SIZE>>>,
+}
+
+impl<const ITEM_SIZE: usize> CpuLocalAllocator<ITEM_SIZE> {
+    /// Creates a new allocator for dynamically-allocated CPU-local objects.
+    ///
+    /// The allocator starts without any chunk; chunks are created on demand
+    /// by `alloc`.
+    pub(self) const fn new() -> Self {
+        Self {
+            chunks: SpinLock::new(Vec::new()),
+        }
+    }
+
+    /// Allocates a CPU-local object and initializes it with `init_values`.
+    ///
+    /// `init_values` is forwarded to `DynCpuLocalChunk::alloc`, which uses it
+    /// to produce the initial value for each CPU.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if all existing chunks are full and creating a new
+    /// chunk fails.
+    pub(self) fn alloc<T>(
+        &'static self,
+        init_values: &mut impl FnMut(CpuId) -> T,
+    ) -> Result<DynamicCpuLocal<T>> {
+        let mut chunks = self.chunks.lock();
+
+        // Reuse the first chunk that still has a free slot.
+        for chunk in chunks.iter_mut() {
+            if !chunk.is_full() {
+                // `DynCpuLocalChunk::alloc` only returns `None` for a full
+                // chunk, so the `unwrap` cannot fail here.
+                let cpu_local = chunk.alloc::<T>(init_values).unwrap();
+                return Ok(cpu_local);
+            }
+        }
+
+        // Every chunk is full (or there is none yet): create a fresh chunk.
+        // A new chunk is empty, so the allocation from it cannot fail.
+        let mut new_chunk = DynCpuLocalChunk::<ITEM_SIZE>::new()?;
+        let cpu_local = new_chunk.alloc::<T>(init_values).unwrap();
+        chunks.push(new_chunk);
+
+        Ok(cpu_local)
+    }
+
+    /// Deallocates a CPU-local object.
+    ///
+    /// # Panics
+    ///
+    /// Panics if none of the chunks of this allocator owns `cpu_local`.
+    pub(self) fn dealloc<T>(&self, cpu_local: DynamicCpuLocal<T>) {
+        let mut cpu_local = cpu_local;
+        let mut chunks = self.chunks.lock();
+
+        // Offer the object to each chunk in turn. A chunk that does not own
+        // the object hands it back in `Err`, so it can be offered to the next.
+        let mut chunk_index = None;
+        for (i, chunk) in chunks.iter_mut().enumerate() {
+            match chunk.try_dealloc(cpu_local) {
+                Ok(()) => {
+                    chunk_index = Some(i);
+                    break;
+                }
+                Err(returned) => cpu_local = returned,
+            }
+        }
+        let chunk_index = chunk_index.unwrap();
+        // Release the now-empty chunk only if another empty chunk remains, so
+        // that one empty chunk is kept around for future allocations.
+        if chunks[chunk_index].is_empty() && chunks.iter().filter(|c| c.is_empty()).count() > 1 {
+            chunks.swap_remove(chunk_index);
+        }
+    }
+}
+
+/// A wrapper over [`DynamicCpuLocal<T>`] to deallocate CPU-local objects on
+/// drop automatically.
+//
+// The inner value is wrapped in an `Option` so that `drop` can move it out
+// with `take()` and pass ownership to `dealloc_cpu_local`.
+pub struct CpuLocalBox<T>(Option<DynamicCpuLocal<T>>);
+
+impl<T> Deref for CpuLocalBox<T> {
+    type Target = DynamicCpuLocal<T>;
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref().unwrap()
+    }
+}
+
+impl<T> Drop for CpuLocalBox<T> {
+    fn drop(&mut self) {
+        let cpu_local = self.0.take().unwrap();
+        dealloc_cpu_local(cpu_local);
+    }
+}
+
+/// Global allocators for dynamically-allocated CPU-local objects.
+static ALLOCATOR_8: CpuLocalAllocator<8> = CpuLocalAllocator::new();
+static ALLOCATOR_16: CpuLocalAllocator<16> = CpuLocalAllocator::new();
+static ALLOCATOR_32: CpuLocalAllocator<32> = CpuLocalAllocator::new();
+
+/// Allocates a dynamically-allocated CPU-local object of type `T` and
+/// initializes it with `init_values`.
+///
+/// Currently, the size of `T` must be no larger than 32 bytes.
+///
+/// # Errors
+///
+/// Returns [`Error::InvalidArgs`] if the size of `T` does not map to the
+/// 8-, 16- or 32-byte size class, and propagates any error reported by the
+/// underlying allocator.
+//
+// NOTE(review): `CommonSizeClass::from_size` appears to match exact class
+// sizes only (8, 16, ...). If so, a `T` of e.g. 4 or 12 bytes is rejected even
+// though it is "no larger than 32 bytes" — TODO confirm against `from_size`.
+pub fn alloc_cpu_local<T>(mut init_values: impl FnMut(CpuId) -> T) -> Result<CpuLocalBox<T>> {
+    let size = core::mem::size_of::<T>();
+    let class = CommonSizeClass::from_size(size).ok_or(Error::InvalidArgs)?;
+    // Dispatch to the global allocator whose item size matches the class.
+    let cpu_local = match class {
+        CommonSizeClass::Bytes8 => ALLOCATOR_8.alloc::<T>(&mut init_values),
+        CommonSizeClass::Bytes16 => ALLOCATOR_16.alloc::<T>(&mut init_values),
+        CommonSizeClass::Bytes32 => ALLOCATOR_32.alloc::<T>(&mut init_values),
+        // TODO: Support contiguous allocations for larger sizes.
+        // Since cache lines are normally 64 bytes, when allocating CPU-local
+        // objects with larger sizes, we should allocate a `Vec` with size
+        // `num_cpus()` instead.
+        _ => Err(Error::InvalidArgs),
+    }?;
+    Ok(CpuLocalBox(Some(cpu_local)))
+}
+
+/// Deallocates a dynamically-allocated CPU-local object of type `T`.
+///
+/// The object is returned to the global allocator that matches the size
+/// class of `T`, mirroring the dispatch done by `alloc_cpu_local`.
+///
+/// # Panics
+///
+/// Panics if the size of `T` does not map to the 8-, 16- or 32-byte size
+/// class. This cannot happen for objects created by `alloc_cpu_local`,
+/// which rejects every other size.
+fn dealloc_cpu_local<T>(cpu_local: DynamicCpuLocal<T>) {
+    let size = core::mem::size_of::<T>();
+    match CommonSizeClass::from_size(size) {
+        Some(CommonSizeClass::Bytes8) => ALLOCATOR_8.dealloc(cpu_local),
+        Some(CommonSizeClass::Bytes16) => ALLOCATOR_16.dealloc(cpu_local),
+        Some(CommonSizeClass::Bytes32) => ALLOCATOR_32.dealloc(cpu_local),
+        // This is an invariant violation rather than missing functionality:
+        // `alloc_cpu_local` never creates an object of any other size class.
+        _ => unreachable!("CPU-local objects are only allocated in the 8/16/32-byte classes"),
+    }
+}
--- a/osdk/deps/heap-allocator/src/lib.rs
+++ b/osdk/deps/heap-allocator/src/lib.rs
@ -4,7 +4,11 @@
 #![no_std]
 #![deny(unsafe_code)]

+extern crate alloc;
+
 mod allocator;
+mod cpu_local_allocator;
 mod slab_cache;

 pub use allocator::{type_from_layout, HeapAllocator};
+pub use cpu_local_allocator::{alloc_cpu_local, CpuLocalBox};
--- a/ostd/Cargo.toml
+++ b/ostd/Cargo.toml
@ -37,6 +37,7 @@ spin = "0.9.4"
 smallvec = "1.13.2"
 unwinding = { version = "=0.2.5", default-features = false, features = ["fde-gnu-eh-frame-hdr", "hide-trace", "panic", "personality", "unwinder"] }
 volatile = "0.6.1"
+bitvec = { version = "1.0", default-features = false, features = ["alloc"] }

 [target.x86_64-unknown-none.dependencies]
 x86_64 = "0.14.13"
--- a/ostd/src/arch/x86/trap/gdt.rs
+++ b/ostd/src/arch/x86/trap/gdt.rs
@ -18,7 +18,7 @@ use x86_64::{
    PrivilegeLevel, VirtAddr,
 };

-use crate::cpu::local::CpuLocal;
+use crate::cpu::local::{CpuLocal, StaticCpuLocal};

 /// Initializes and loads the GDT and TSS.
 ///
@ -95,10 +95,10 @@ pub(super) unsafe fn init() {
 // No other special initialization is required because the kernel stack information is stored in
 // the TSS when we start the userspace program. See `syscall.S` for details.
 #[link_section = ".cpu_local_tss"]
-static LOCAL_TSS: CpuLocal<TaskStateSegment> = {
+static LOCAL_TSS: StaticCpuLocal<TaskStateSegment> = {
    let tss = TaskStateSegment::new();
    // SAFETY: The `.cpu_local_tss` section is part of the CPU-local area.
-    unsafe { CpuLocal::__new(tss) }
+    unsafe { CpuLocal::__new_static(tss) }
 };

 // Kernel code and data descriptors.
--- a/ostd/src/cpu/local/cpu_local.rs
+++ b/ostd/src/cpu/local/cpu_local.rs
@ -1,201 +0,0 @@
-// SPDX-License-Identifier: MPL-2.0
-
-//! The CPU-local variable implementation.
-
-use core::{marker::Sync, ops::Deref};
-
-use super::{__cpu_local_end, __cpu_local_start};
-use crate::{arch, cpu::CpuId, trap::DisabledLocalIrqGuard};
-
-/// Defines a CPU-local variable.
-///
-/// The accessors of the CPU-local variables are defined with [`CpuLocal`].
-///
-/// You can get the reference to the inner object on one CPU by calling
-/// [`CpuLocal::get_on_cpu`]. Also if you intend to access the inner object
-/// on the current CPU, you can use [`CpuLocal::get_with`]. The latter
-/// accessors can be used even if the inner object is not `Sync`.
-///
-/// # Example
-///
-/// ```rust
-/// use ostd::{cpu_local, cpu::PinCurrentCpu, task::disable_preempt, trap};
-/// use core::{sync::atomic::{AtomicU32, Ordering}, cell::Cell};
-///
-/// cpu_local! {
-///     static FOO: AtomicU32 = AtomicU32::new(1);
-///     pub static BAR: Cell<usize> = Cell::new(2);
-/// }
-///
-/// fn not_an_atomic_function() {
-///     let preempt_guard = disable_preempt();
-///     let ref_of_foo = FOO.get_on_cpu(preempt_guard.current_cpu());
-///     let val_of_foo = ref_of_foo.load(Ordering::Relaxed);
-///     println!("FOO VAL: {}", val_of_foo);
-///
-///     let irq_guard = trap::disable_local();
-///     let bar_guard = BAR.get_with(&irq_guard);
-///     let val_of_bar = bar_guard.get();
-///     println!("BAR VAL: {}", val_of_bar);
-/// }
-/// ```
-#[macro_export]
-macro_rules! cpu_local {
-    ($( $(#[$attr:meta])* $vis:vis static $name:ident: $t:ty = $init:expr; )*) => {
-        $(
-            #[link_section = ".cpu_local"]
-            $(#[$attr])* $vis static $name: $crate::cpu::local::CpuLocal<$t> = {
-                let val = $init;
-                // SAFETY: The per-CPU variable instantiated is statically
-                // stored in the special `.cpu_local` section.
-                unsafe {
-                    $crate::cpu::local::CpuLocal::__new(val)
-                }
-            };
-        )*
-    };
-}
-
-/// CPU-local objects.
-///
-/// CPU-local objects are instantiated once per CPU core. They can be shared to
-/// other cores. In the context of a preemptible kernel task, when holding the
-/// reference to the inner object, the object is always the one in the original
-/// core (when the reference is created), no matter which core the code is
-/// currently running on.
-///
-/// For the difference between [`CpuLocal`] and [`super::CpuLocalCell`], see
-/// [`super`].
-pub struct CpuLocal<T: 'static>(T);
-
-impl<T: 'static> CpuLocal<T> {
-    /// Creates a new CPU-local object.
-    ///
-    /// Please do not call this function directly. Instead, use the
-    /// `cpu_local!` macro.
-    ///
-    /// # Safety
-    ///
-    /// The caller should ensure that the object initialized by this
-    /// function resides in the `.cpu_local` section. Otherwise the
-    /// behavior is undefined.
-    #[doc(hidden)]
-    pub const unsafe fn __new(val: T) -> Self {
-        Self(val)
-    }
-
-    /// Gets access to the underlying value on the current CPU with a
-    /// provided IRQ guard.
-    ///
-    /// By this method, you can borrow a reference to the underlying value
-    /// even if `T` is not `Sync`. Because that it is per-CPU and IRQs are
-    /// disabled, no other running tasks can access it.
-    pub fn get_with<'a>(
-        &'static self,
-        guard: &'a DisabledLocalIrqGuard,
-    ) -> CpuLocalDerefGuard<'a, T> {
-        CpuLocalDerefGuard {
-            cpu_local: self,
-            guard,
-        }
-    }
-
-    /// Gets access to the underlying value through a raw pointer.
-    ///
-    /// This method is safe, but using the returned pointer will be unsafe.
-    pub(crate) fn as_ptr(&'static self) -> *const T {
-        super::is_used::debug_set_true();
-
-        let offset = self.get_offset();
-
-        let local_base = arch::cpu::local::get_base() as usize;
-        let local_va = local_base + offset;
-
-        // A sanity check about the alignment.
-        debug_assert_eq!(local_va % core::mem::align_of::<T>(), 0);
-
-        local_va as *mut T
-    }
-
-    /// Gets the offset of the CPU-local object in the CPU-local area.
-    fn get_offset(&'static self) -> usize {
-        let bsp_va = self as *const _ as usize;
-        let bsp_base = __cpu_local_start as usize;
-        // The implementation should ensure that the CPU-local object resides in the `.cpu_local`.
-        debug_assert!(bsp_va + core::mem::size_of::<T>() <= __cpu_local_end as usize);
-
-        bsp_va - bsp_base
-    }
-}
-
-impl<T: 'static + Sync> CpuLocal<T> {
-    /// Gets access to the CPU-local value on a specific CPU.
-    ///
-    /// This allows the caller to access CPU-local data from a remote CPU,
-    /// so the data type must be `Sync`.
-    pub fn get_on_cpu(&'static self, cpu_id: CpuId) -> &'static T {
-        super::is_used::debug_set_true();
-
-        let cpu_id = cpu_id.as_usize();
-
-        // If on the BSP, just use the statically linked storage.
-        if cpu_id == 0 {
-            return &self.0;
-        }
-
-        // SAFETY: At this time we have a non-BSP `CpuId`, which means that
-        // `init_cpu_nums` must have been called, so `copy_bsp_for_ap` must
-        // also have been called (see the implementation of `cpu::init_on_bsp`),
-        // so `CPU_LOCAL_STORAGES` must already be initialized.
-        let storages = unsafe { super::CPU_LOCAL_STORAGES.get_unchecked() };
-        // SAFETY: `cpu_id` is guaranteed to be in range because the type
-        // invariant of `CpuId`.
-        let storage = unsafe { *storages.get_unchecked(cpu_id - 1) };
-        let base = crate::mm::paddr_to_vaddr(storage);
-
-        let offset = self.get_offset();
-        let ptr = (base + offset) as *const T;
-
-        // SAFETY: `ptr` represents CPU-local data on a remote CPU. It
-        // contains valid data, the type is `Sync`, and no one will mutably
-        // borrow it, so creating an immutable borrow here is valid.
-        unsafe { &*ptr }
-    }
-}
-
-// SAFETY: At any given time, only one task can access the inner value `T` of a
-// CPU-local variable if `T` is not `Sync`. We guarantee it by disabling the
-// reference to the inner value, or turning off preemptions when creating
-// the reference.
-unsafe impl<T: 'static> Sync for CpuLocal<T> {}
-
-// Prevent valid instances of `CpuLocal` from being copied to any memory areas
-// outside the `.cpu_local` section.
-impl<T: 'static> !Copy for CpuLocal<T> {}
-impl<T: 'static> !Clone for CpuLocal<T> {}
-
-// In general, it does not make any sense to send instances of `CpuLocal` to
-// other tasks as they should live on other CPUs to make sending useful.
-impl<T: 'static> !Send for CpuLocal<T> {}
-
-/// A guard for accessing the CPU-local object.
-///
-/// It ensures that the CPU-local object is accessed with IRQs disabled.
-/// It is created by [`CpuLocal::borrow_with`].
-#[must_use]
-pub struct CpuLocalDerefGuard<'a, T: 'static> {
-    cpu_local: &'static CpuLocal<T>,
-    #[expect(dead_code)]
-    guard: &'a DisabledLocalIrqGuard,
-}
-
-impl<T: 'static> Deref for CpuLocalDerefGuard<'_, T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        // SAFETY: it should be properly initialized before accesses.
-        // And we do not create a mutable reference over it. The IRQs
-        // are disabled so it can only be referenced from this task.
-        unsafe { &*self.cpu_local.as_ptr() }
-    }
-}
--- a/ostd/src/cpu/local/dyn_cpu_local.rs
+++ b/ostd/src/cpu/local/dyn_cpu_local.rs
@ -0,0 +1,238 @@
+// SPDX-License-Identifier: MPL-2.0
+
+//! Dynamically-allocated CPU-local objects.
+
+use core::{marker::PhantomData, mem::ManuallyDrop, ptr::NonNull};
+
+use bitvec::prelude::{bitvec, BitVec};
+
+use super::{AnyStorage, CpuLocal};
+use crate::{
+    cpu::{all_cpus, num_cpus, CpuId, PinCurrentCpu},
+    mm::{paddr_to_vaddr, FrameAllocOptions, Segment, Vaddr, PAGE_SIZE},
+    trap::DisabledLocalIrqGuard,
+    Result,
+};
+
+/// A dynamically-allocated storage for a CPU-local variable of type `T`.
+///
+/// Such a CPU-local storage should be allocated and deallocated by
+/// [`DynCpuLocalChunk`], not directly. Dropping it without deallocation
+/// will cause panic.
+///
+/// When dropping a `CpuLocal<T, DynamicStorage<T>>`, we have no way to know
+/// which `DynCpuLocalChunk` the CPU-local object was originally allocated
+/// from. Therefore, we rely on the user to correctly manage the corresponding
+/// `DynCpuLocalChunk`, ensuring that both allocation and deallocation of
+/// `CpuLocal<T, DynamicStorage<T>>` occur within the same chunk.
+///
+/// To properly deallocate the CPU-local object, the user must explicitly call
+/// the appropriate `DynCpuLocalChunk`'s `try_dealloc<T>()`. Otherwise,
+/// dropping it directly will cause a panic.
+pub struct DynamicStorage<T>(NonNull<T>);
+
+// SAFETY: NOTE(review): the pointer for CPU `n` is computed as the BSP pointer
+// advanced by `n * CHUNK_SIZE`. This is only correct if the per-CPU chunks are
+// laid out contiguously in the order of CPU IDs, which is what
+// `DynCpuLocalChunk` documents — confirm that every `DynamicStorage` points
+// into such a chunk.
+unsafe impl<T> AnyStorage<T> for DynamicStorage<T> {
+    fn get_ptr_on_current(&self, guard: &DisabledLocalIrqGuard) -> *const T {
+        // The current CPU is obtained from the IRQ guard.
+        self.get_ptr_on_target(guard.current_cpu())
+    }
+
+    fn get_ptr_on_target(&self, cpu_id: CpuId) -> *const T {
+        // The stored pointer refers to the instance on the BSP; the instance
+        // on `cpu_id` lies `cpu_id * CHUNK_SIZE` bytes further.
+        let bsp_va = self.0.as_ptr() as usize;
+        let va = bsp_va + cpu_id.as_usize() * CHUNK_SIZE;
+        va as *mut T
+    }
+
+    fn get_mut_ptr_on_target(&mut self, cpu: CpuId) -> *mut T {
+        self.get_ptr_on_target(cpu).cast_mut()
+    }
+}
+
+impl<T> Drop for DynamicStorage<T> {
+    fn drop(&mut self) {
+        panic!(
+            "Do not drop `DynamicStorage<T>` directly. \
+            Use `DynCpuLocalChunk::try_dealloc<T>` instead."
+        );
+    }
+}
+
+impl<T: Sync + alloc::fmt::Debug + 'static> alloc::fmt::Debug for CpuLocal<T, DynamicStorage<T>> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let mut list = f.debug_list();
+        for cpu in all_cpus() {
+            let val = self.get_on_cpu(cpu);
+            list.entry(&(&cpu, val));
+        }
+        list.finish()
+    }
+}
+
+impl<T> CpuLocal<T, DynamicStorage<T>> {
+    /// Creates a new dynamically-allocated CPU-local object, and
+    /// initializes it with `init_values`.
+    ///
+    /// The given `ptr` points to the variable located on the BSP.
+    /// `init_values` is called once for every CPU, and its result is written
+    /// to the instance that belongs to that CPU.
+    ///
+    /// Please do not call this function directly. Instead, use
+    /// `DynCpuLocalChunk::alloc`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `ptr` is null.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that the new per-CPU object belongs to an
+    /// existing [`DynCpuLocalChunk`], and does not overlap with any existing
+    /// CPU-local object.
+    unsafe fn __new_dynamic(ptr: *mut T, init_values: &mut impl FnMut(CpuId) -> T) -> Self {
+        let mut storage = DynamicStorage(NonNull::new(ptr).unwrap());
+        for cpu in all_cpus() {
+            let ptr = storage.get_mut_ptr_on_target(cpu);
+            // SAFETY: `ptr` points to valid, uninitialized per-CPU memory
+            // reserved for CPU-local storage. This initialization occurs
+            // before any other code can access the memory. References to
+            // the data may only be created after `Self` is created, ensuring
+            // exclusive access by the current task. Each per-CPU memory
+            // region is written exactly once using `ptr::write`, which is
+            // safe for uninitialized memory.
+            unsafe {
+                core::ptr::write(ptr, init_values(cpu));
+            }
+        }
+
+        Self {
+            storage,
+            phantom: PhantomData,
+        }
+    }
+}
+
+/// The size in bytes of the memory that each CPU owns within one
+/// `DynCpuLocalChunk`; currently one page.
+const CHUNK_SIZE: usize = PAGE_SIZE;
+
+/// Frame metadata for the segment that backs a `DynCpuLocalChunk`.
+#[derive(Debug, Clone, Copy)]
+struct DynCpuLocalMeta;
+crate::impl_frame_meta_for!(DynCpuLocalMeta);
+
+/// Manages dynamically-allocated CPU-local chunks.
+///
+/// Each CPU owns a chunk of size `CHUNK_SIZE`, and the chunks are laid
+/// out contiguously in the order of CPU IDs. Per-CPU variables lie within
+/// the chunks.
+pub struct DynCpuLocalChunk<const ITEM_SIZE: usize> {
+    // The backing memory. It is wrapped in `ManuallyDrop` so that the `Drop`
+    // implementation decides whether the memory is released or leaked.
+    segment: ManuallyDrop<Segment<DynCpuLocalMeta>>,
+    // One bit per `ITEM_SIZE`-byte slot; a set bit marks an allocated slot.
+    bitmap: BitVec,
+}
+
+impl<const ITEM_SIZE: usize> DynCpuLocalChunk<ITEM_SIZE> {
+    /// Creates a new dynamically-allocated CPU-local chunk.
+    ///
+    /// The backing segment provides `CHUNK_SIZE` bytes for each CPU and is
+    /// not zeroed. All slots of the new chunk are free.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the backing segment cannot be allocated.
+    pub fn new() -> Result<Self> {
+        let total_chunk_size = CHUNK_SIZE * num_cpus();
+        let segment = FrameAllocOptions::new()
+            .zeroed(false)
+            .alloc_segment_with(total_chunk_size.div_ceil(PAGE_SIZE), |_| DynCpuLocalMeta)?;
+
+        let num_items = CHUNK_SIZE / ITEM_SIZE;
+        const { assert!(CHUNK_SIZE % ITEM_SIZE == 0) };
+
+        Ok(Self {
+            segment: ManuallyDrop::new(segment),
+            bitmap: bitvec![0; num_items],
+        })
+    }
+
+    /// Returns a pointer to the local chunk owned by the BSP.
+    fn start_vaddr(&self) -> Vaddr {
+        paddr_to_vaddr(self.segment.start_paddr())
+    }
+
+    /// Allocates a CPU-local object from the chunk, and
+    /// initializes it with `init_values`.
+    ///
+    /// The size and the alignment of `T` must not exceed `ITEM_SIZE`; this is
+    /// checked at compile time.
+    ///
+    /// Returns `None` if the chunk is full.
+    pub fn alloc<T>(
+        &mut self,
+        init_values: &mut impl FnMut(CpuId) -> T,
+    ) -> Option<CpuLocal<T, DynamicStorage<T>>> {
+        const {
+            assert!(ITEM_SIZE.is_power_of_two());
+            assert!(core::mem::size_of::<T>() <= ITEM_SIZE);
+            assert!(core::mem::align_of::<T>() <= ITEM_SIZE);
+        }
+
+        // Claim the first free slot.
+        let index = self.bitmap.first_zero()?;
+        self.bitmap.set(index, true);
+        // SAFETY: `index` refers to an available position in the chunk
+        // for allocating a new CPU-local object.
+        unsafe {
+            let vaddr = self.start_vaddr() + index * ITEM_SIZE;
+            Some(CpuLocal::__new_dynamic(vaddr as *mut T, init_values))
+        }
+    }
+
+    /// Gets the index of a dynamically-allocated CPU-local object
+    /// within the chunk.
+    ///
+    /// Returns `None` if the object does not belong to the chunk.
+    fn get_item_index<T>(&mut self, cpu_local: &CpuLocal<T, DynamicStorage<T>>) -> Option<usize> {
+        let vaddr = cpu_local.storage.0.as_ptr() as Vaddr;
+        let start_vaddr = self.start_vaddr();
+
+        let offset = vaddr.checked_sub(start_vaddr)?;
+        // The stored pointer refers to the instance on the BSP, so objects of
+        // this chunk have offsets in `0..CHUNK_SIZE`. An offset equal to
+        // `CHUNK_SIZE` is already out of range: it would map to an index one
+        // past the last bit of `bitmap`.
+        if offset >= CHUNK_SIZE {
+            return None;
+        }
+
+        debug_assert_eq!(offset % ITEM_SIZE, 0);
+
+        Some(offset / ITEM_SIZE)
+    }
+
+    /// Attempts to deallocate a previously allocated CPU-local object.
+    ///
+    /// On success, the slot is marked as free and the instance on every CPU
+    /// is dropped in place.
+    ///
+    /// Returns `Err(cpu_local)` if the object does not belong to this chunk.
+    pub fn try_dealloc<T>(
+        &mut self,
+        mut cpu_local: CpuLocal<T, DynamicStorage<T>>,
+    ) -> core::result::Result<(), CpuLocal<T, DynamicStorage<T>>> {
+        let Some(index) = self.get_item_index(&cpu_local) else {
+            return Err(cpu_local);
+        };
+        self.bitmap.set(index, false);
+        for cpu in all_cpus() {
+            let ptr = cpu_local.storage.get_mut_ptr_on_target(cpu);
+            // SAFETY: `ptr` points to the valid CPU-local object. We can
+            // mutably borrow the CPU-local object on `cpu` because we have
+            // the exclusive access to `cpu_local`. Each CPU-local object
+            // is dropped exactly once. After the deallocation, no one will
+            // access the dropped CPU-local object, since we explicitly
+            // forget the `cpu_local`.
+            unsafe {
+                core::ptr::drop_in_place(ptr);
+            }
+        }
+        // Forget `cpu_local` instead of dropping it: the `Drop` implementation
+        // of `DynamicStorage` panics.
+        let _ = ManuallyDrop::new(cpu_local);
+        Ok(())
+    }
+
+    /// Checks whether the chunk is full.
+    pub fn is_full(&self) -> bool {
+        self.bitmap.all()
+    }
+
+    /// Checks whether the chunk is empty.
+    pub fn is_empty(&self) -> bool {
+        self.bitmap.not_any()
+    }
+}
+
+impl<const ITEM_SIZE: usize> Drop for DynCpuLocalChunk<ITEM_SIZE> {
+    /// Releases the backing segment of an empty chunk.
+    ///
+    /// # Panics
+    ///
+    /// Panics if some CPU-local objects allocated from the chunk have not
+    /// been deallocated. The backing segment is leaked in that case.
+    fn drop(&mut self) {
+        if self.is_empty() {
+            // SAFETY: The `segment` does not contain any CPU-local objects.
+            // It is the last time the `segment` is accessed, and it will be
+            // dropped only once.
+            unsafe { ManuallyDrop::drop(&mut self.segment) }
+        } else {
+            // Leak the `segment` and panic.
+            panic!("Dropping `DynCpuLocalChunk` while some CPU-local objects are still alive");
+        }
+    }
+}
--- a/ostd/src/cpu/local/mod.rs
+++ b/ostd/src/cpu/local/mod.rs
@ -2,47 +2,68 @@

 //! CPU local storage.
 //!
-//! This module provides a mechanism to define CPU-local objects, by the macro
-//! [`crate::cpu_local!`].
+//! This module provides a mechanism to define CPU-local objects. Users can
+//! define a statically-allocated CPU-local object by the macro
+//! [`crate::cpu_local!`], or allocate a dynamically-allocated CPU-local
+//! object with the function [`osdk_heap_allocator::alloc_cpu_local`].
 //!
-//! Such a mechanism exploits the fact that constant values of non-[`Copy`]
-//! types can be bitwise copied. For example, a [`Option<T>`] object, though
-//! being not [`Copy`], have a constant constructor [`Option::None`] that
-//! produces a value that can be bitwise copied to create a new instance.
-//! [`alloc::sync::Arc`] however, don't have such a constructor, and thus cannot
-//! be directly used as a CPU-local object. Wrapping it in a type that has a
-//! constant constructor, like [`Option<T>`], can make it CPU-local.
+//! The mechanism for statically-allocated CPU-local objects exploits the fact
+//! that constant values of non-[`Copy`] types can be bitwise copied. For
+//! example, an [`Option<T>`] object, though not [`Copy`], has a constant
+//! constructor [`Option::None`] that produces a value that can be bitwise
+//! copied to create a new instance. [`alloc::sync::Arc`], however, does not
+//! have such a constructor, and thus cannot be directly used as a statically-
+//! allocated CPU-local object. Wrapping it in a type that has a constant
+//! constructor, like [`Option<T>`], can make it statically-allocated CPU-local.
 //!
 //! # Implementation
 //!
-//! These APIs are implemented by placing the CPU-local objects in a special
-//! section `.cpu_local`. The bootstrap processor (BSP) uses the objects linked
-//! in this section, and these objects are copied to dynamically allocated
-//! local storage of each application processors (AP) during the initialization
-//! process.
+//! These APIs are implemented by the methods as follows:
+//! 1. For statically-allocated CPU-local objects, we place them in a special
+//!    section `.cpu_local`. The bootstrap processor (BSP) uses the objects
+//!    linked in this section, and these objects are copied to dynamically
+//!    allocated local storage of each application processor (AP) during the
+//!    initialization process.
+//! 2. For dynamically-allocated CPU-local objects, we prepare a fixed-size
+//!    chunk for each CPU. These per-CPU memory chunks are laid out contiguously
+//!    in memory in the order of the CPU IDs. A dynamically-allocated CPU-local
+//!    object can be allocated by occupying the same offset in each per-CPU
+//!    memory chunk.

 // This module also, provide CPU-local cell objects that have inner mutability.
 //
-// The difference between CPU-local objects (defined by [`crate::cpu_local!`])
-// and CPU-local cell objects (defined by [`crate::cpu_local_cell!`]) is that
-// the CPU-local objects can be shared across CPUs. While through a CPU-local
-// cell object you can only access the value on the current CPU, therefore
-// enabling inner mutability without locks.
+// The difference between statically-allocated CPU-local objects (defined by
+// [`crate::cpu_local!`]) and CPU-local cell objects (defined by
+// [`crate::cpu_local_cell!`]) is that the CPU-local objects can be shared
+// across CPUs. While through a CPU-local cell object you can only access the
+// value on the current CPU, therefore enabling inner mutability without locks.

 mod cell;
-mod cpu_local;
+mod dyn_cpu_local;
+mod static_cpu_local;

 pub(crate) mod single_instr;

-use core::alloc::Layout;
+use core::{alloc::Layout, marker::PhantomData, ops::Deref};

 use align_ext::AlignExt;
 pub use cell::CpuLocalCell;
-pub use cpu_local::{CpuLocal, CpuLocalDerefGuard};
+pub use dyn_cpu_local::DynCpuLocalChunk;
+use dyn_cpu_local::DynamicStorage;
 use spin::Once;
+use static_cpu_local::StaticStorage;

 use super::CpuId;
-use crate::mm::{frame::allocator, paddr_to_vaddr, Paddr, PAGE_SIZE};
+use crate::{
+    mm::{frame::allocator, paddr_to_vaddr, Paddr, PAGE_SIZE},
+    trap::DisabledLocalIrqGuard,
+};
+
+/// Dynamically-allocated CPU-local objects.
+pub type DynamicCpuLocal<T> = CpuLocal<T, DynamicStorage<T>>;
+
+/// Statically-allocated CPU-local objects.
+pub type StaticCpuLocal<T> = CpuLocal<T, static_cpu_local::StaticStorage<T>>;

 // These symbols are provided by the linker script.
 extern "C" {
@ -50,10 +71,120 @@ extern "C" {
    fn __cpu_local_end();
 }

-/// The CPU-local areas for APs.
+/// A trait to abstract any type that can be used as a slot for a CPU-local
+/// variable of type `T`.
+///
+/// Each slot provides the memory space for storing `num_cpus` instances
+/// of type `T`.
+///
+/// # Safety
+///
+/// The implementor must ensure that the returned pointer refers to the
+/// variable on the correct CPU.
+pub unsafe trait AnyStorage<T> {
+    /// Gets the `const` pointer for the object on the current CPU.
+    fn get_ptr_on_current(&self, guard: &DisabledLocalIrqGuard) -> *const T;
+
+    /// Gets the `const` pointer for the object on a target CPU.
+    fn get_ptr_on_target(&self, cpu: CpuId) -> *const T;
+
+    /// Gets the `mut` pointer for the object on a target CPU.
+    ///
+    /// This method is intended for use when initializing or dropping the storage.
+    fn get_mut_ptr_on_target(&mut self, cpu: CpuId) -> *mut T;
+}
+
+/// A CPU-local variable for type `T`, backed by a storage of type `S`.
+///
+/// CPU-local objects are instantiated once per CPU core. They can be shared
+/// with other cores. In the context of a preemptible kernel task, a reference
+/// to the inner object always refers to the object on the original core (the
+/// one on which the reference was created), no matter which core the code is
+/// currently running on.
+pub struct CpuLocal<T, S: AnyStorage<T>> {
+    storage: S,
+    phantom: PhantomData<T>,
+}
+
+impl<T: 'static, S: AnyStorage<T>> CpuLocal<T, S> {
+    /// Gets access to the underlying value on the current CPU with a
+    /// provided IRQ guard.
+    ///
+    /// With this method, you can borrow a reference to the underlying value
+    /// on the current CPU even if `T` is not `Sync`.
+    pub fn get_with<'a>(
+        &'a self,
+        guard: &'a DisabledLocalIrqGuard,
+    ) -> CpuLocalDerefGuard<'a, T, S> {
+        CpuLocalDerefGuard {
+            cpu_local: self,
+            guard,
+        }
+    }
+}
+
+impl<T: 'static + Sync, S: AnyStorage<T>> CpuLocal<T, S> {
+    /// Gets access to the CPU-local value on a specific CPU.
+    ///
+    /// This allows the caller to access CPU-local data from a remote CPU,
+    /// so the data type must be `Sync`.
+    pub fn get_on_cpu(&self, target_cpu_id: CpuId) -> &T {
+        let ptr = self.storage.get_ptr_on_target(target_cpu_id);
+        // SAFETY: `ptr` represents CPU-local data on a remote CPU. It
+        // contains valid data, the type is `Sync`, and no one will mutably
+        // borrow it, so creating an immutable borrow here is valid.
+        unsafe { &*ptr }
+    }
+}
+
+/// A guard for accessing the CPU-local object.
+///
+/// It ensures that the CPU-local object is accessed with IRQs disabled.
+/// It is created by [`CpuLocal::get_with`].
+#[must_use]
+pub struct CpuLocalDerefGuard<'a, T: 'static, S: AnyStorage<T>> {
+    cpu_local: &'a CpuLocal<T, S>,
+    guard: &'a DisabledLocalIrqGuard,
+}
+
+impl<'a, T: 'static, S: AnyStorage<T>> Deref for CpuLocalDerefGuard<'a, T, S> {
+    type Target = T;
+
+    fn deref(&self) -> &'a Self::Target {
+        is_used::debug_set_true();
+
+        let ptr = self.cpu_local.storage.get_ptr_on_current(self.guard);
+        // SAFETY: `ptr` represents CPU-local data on the current CPU. It
+        // contains valid data, only the current task can reference the data
+        // (due to `self.guard`), and no one will mutably borrow it, so
+        // creating an immutable borrow here is valid.
+        unsafe { &*ptr }
+    }
+}
+
+// SAFETY: At any given time, only one task can access the inner value `T` of a
+// CPU-local variable if `T` is not `Sync`. We guarantee it by requiring a
+// `DisabledLocalIrqGuard` to borrow the value on the current CPU (`get_with`);
+// access to other CPUs' values (`get_on_cpu`) is only offered for `T: Sync`.
+unsafe impl<T: 'static, S: AnyStorage<T>> Sync for CpuLocal<T, S> {}
+unsafe impl<T: 'static> Send for CpuLocal<T, DynamicStorage<T>> {}
+
+// Implement `!Copy` and `!Clone` for `CpuLocal` to ensure memory safety:
+// - Prevent valid instances of `CpuLocal<T, StaticStorage<T>>` from being copied
+// to any memory areas outside the `.cpu_local` section.
+// - Prevent multiple valid instances of `CpuLocal<T, DynamicStorage<T>>` from
+// referring to the same CPU-local object, avoiding double deallocation.
+impl<T: 'static, S: AnyStorage<T>> !Copy for CpuLocal<T, S> {}
+impl<T: 'static, S: AnyStorage<T>> !Clone for CpuLocal<T, S> {}
+
+// In general, it does not make any sense to send instances of static `CpuLocal`
+// to other tasks as they should live on other CPUs to make sending useful.
+impl<T: 'static> !Send for CpuLocal<T, StaticStorage<T>> {}
+
+/// The static CPU-local areas for APs.
 static CPU_LOCAL_STORAGES: Once<&'static [Paddr]> = Once::new();

-/// Copies the CPU-local data on the bootstrap processor (BSP)
+/// Copies the static CPU-local data on the bootstrap processor (BSP)
 /// for application processors (APs).
 ///
 /// # Safety
@ -123,7 +254,7 @@ pub(crate) unsafe fn copy_bsp_for_ap(num_cpus: usize) {
    CPU_LOCAL_STORAGES.call_once(|| res);
 }

-/// Gets the pointer to the CPU-local storage for the given AP.
+/// Gets the pointer to the static CPU-local storage for the given AP.
 ///
 /// # Panics
 ///
@ -148,7 +279,8 @@ pub(crate) fn get_ap(cpu_id: CpuId) -> Paddr {
 }

 mod is_used {
-    //! This module tracks whether any CPU-local variables are used.
+    //! This module tracks whether any statically-allocated CPU-local
+    //! variables are used.
    //!
    //! [`copy_bsp_for_ap`] copies the CPU local data from the BSP
    //! to the APs, so it requires as a safety condition that the
--- a/ostd/src/cpu/local/static_cpu_local.rs
+++ b/ostd/src/cpu/local/static_cpu_local.rs
@ -0,0 +1,157 @@
+// SPDX-License-Identifier: MPL-2.0
+
+//! Statically-allocated CPU-local objects.
+
+use core::marker::PhantomData;
+
+use super::{AnyStorage, CpuLocal, __cpu_local_end, __cpu_local_start};
+use crate::{arch, cpu::CpuId, trap::DisabledLocalIrqGuard};
+
+/// Defines a statically-allocated CPU-local variable.
+///
+/// The accessors of the CPU-local variables are defined with [`CpuLocal`].
+///
+/// You can get the reference to the inner object on one CPU by calling
+/// [`CpuLocal::get_on_cpu`]. Also if you intend to access the inner object
+/// on the current CPU, you can use [`CpuLocal::get_with`]. The latter
+/// accessor can be used even if the inner object is not `Sync`.
+///
+/// # Example
+///
+/// ```rust
+/// use ostd::{cpu_local, cpu::PinCurrentCpu, task::disable_preempt, trap};
+/// use core::{sync::atomic::{AtomicU32, Ordering}, cell::Cell};
+///
+/// cpu_local! {
+///     static FOO: AtomicU32 = AtomicU32::new(1);
+///     pub static BAR: Cell<usize> = Cell::new(2);
+/// }
+///
+/// fn not_an_atomic_function() {
+///     let preempt_guard = disable_preempt();
+///     let ref_of_foo = FOO.get_on_cpu(preempt_guard.current_cpu());
+///     let val_of_foo = ref_of_foo.load(Ordering::Relaxed);
+///     println!("FOO VAL: {}", val_of_foo);
+///
+///     let irq_guard = trap::disable_local();
+///     let bar_guard = BAR.get_with(&irq_guard);
+///     let val_of_bar = bar_guard.get();
+///     println!("BAR VAL: {}", val_of_bar);
+/// }
+/// ```
+#[macro_export]
+macro_rules! cpu_local {
+    ($( $(#[$attr:meta])* $vis:vis static $name:ident: $t:ty = $init:expr; )*) => {
+        $(
+            #[link_section = ".cpu_local"]
+            $(#[$attr])* $vis static $name: $crate::cpu::local::StaticCpuLocal<$t> = {
+                let val = $init;
+                // SAFETY: The per-CPU variable instantiated is statically
+                // stored in the special `.cpu_local` section.
+                unsafe {
+                    $crate::cpu::local::CpuLocal::__new_static(val)
+                }
+            };
+        )*
+    };
+}
+
+/// A static storage for a CPU-local variable of type `T`.
+///
+/// Such a CPU-local storage is not intended to be allocated directly.
+/// Use the `cpu_local` macro instead.
+pub struct StaticStorage<T: 'static>(T);
+
+impl<T: 'static> StaticStorage<T> {
+    /// Gets access to the underlying value through a raw pointer.
+    ///
+    /// This method is safe, but using the returned pointer will be unsafe.
+    fn as_ptr(&self) -> *const T {
+        super::is_used::debug_set_true();
+
+        let offset = self.get_offset();
+
+        let local_base = arch::cpu::local::get_base() as usize;
+        let local_va = local_base + offset;
+
+        // A sanity check about the alignment.
+        debug_assert_eq!(local_va % core::mem::align_of::<T>(), 0);
+
+        local_va as *const T
+    }
+
+    /// Gets the offset of the CPU-local object in the CPU-local area.
+    fn get_offset(&self) -> usize {
+        let bsp_va = self as *const _ as usize;
+        let bsp_base = __cpu_local_start as usize;
+        // The implementation should ensure that the CPU-local object resides in the `.cpu_local` section.
+        debug_assert!(bsp_va + core::mem::size_of::<T>() <= __cpu_local_end as usize);
+
+        bsp_va - bsp_base
+    }
+}
+
+unsafe impl<T: 'static> AnyStorage<T> for StaticStorage<T> {
+    fn get_ptr_on_current(&self, _guard: &DisabledLocalIrqGuard) -> *const T {
+        self.as_ptr()
+    }
+
+    fn get_ptr_on_target(&self, cpu_id: CpuId) -> *const T {
+        super::is_used::debug_set_true();
+
+        let cpu_id = cpu_id.as_usize();
+
+        // If on the BSP, just use the statically linked storage.
+        if cpu_id == 0 {
+            return &self.0 as *const T;
+        }
+
+        let base = {
+            // SAFETY: At this time we have a non-BSP `CpuId`, which means that
+            // `init_cpu_nums` must have been called, so `copy_bsp_for_ap` must
+            // also have been called (see the implementation of `cpu::init_on_bsp`),
+            // so `CPU_LOCAL_STORAGES` must already be initialized.
+            let storages = unsafe { super::CPU_LOCAL_STORAGES.get_unchecked() };
+            // SAFETY: `cpu_id` is guaranteed to be in range because of the type
+            // invariant of `CpuId`.
+            let storage = unsafe { *storages.get_unchecked(cpu_id - 1) };
+            crate::mm::paddr_to_vaddr(storage)
+        };
+
+        let offset = self.get_offset();
+        (base + offset) as *const T
+    }
+
+    fn get_mut_ptr_on_target(&mut self, _: CpuId) -> *mut T {
+        // `StaticStorage<T>` does not support `get_mut_ptr_on_target`, because
+        // statically-allocated CPU-local objects do not require per-CPU initialization.
+        panic!("Can't get the mutable pointer of StaticStorage<T> on a target CPU.");
+    }
+}
+
+impl<T: 'static> CpuLocal<T, StaticStorage<T>> {
+    /// Creates a new statically-allocated CPU-local object.
+    ///
+    /// Please do not call this function directly. Instead, use the
+    /// `cpu_local!` macro.
+    ///
+    /// # Safety
+    ///
+    /// The caller should ensure that the object initialized by this
+    /// function resides in the `.cpu_local` section. Otherwise the
+    /// behavior is undefined.
+    #[doc(hidden)]
+    pub const unsafe fn __new_static(val: T) -> Self {
+        Self {
+            storage: StaticStorage(val),
+            phantom: PhantomData,
+        }
+    }
+
+    /// Gets access to the underlying value through a raw pointer.
+    ///
+    /// This method is safe, but using the returned pointer will be unsafe.
+    pub(crate) fn as_ptr(&self) -> *const T {
+        self.storage.as_ptr()
+    }
+}