diff --git a/kernel/aster-nix/src/taskless.rs b/kernel/aster-nix/src/taskless.rs index 14bd44fa..64ac8f82 100644 --- a/kernel/aster-nix/src/taskless.rs +++ b/kernel/aster-nix/src/taskless.rs @@ -1,7 +1,5 @@ // SPDX-License-Identifier: MPL-2.0 -#![allow(dead_code)] - use alloc::{boxed::Box, sync::Arc}; use core::{ cell::RefCell, @@ -10,7 +8,7 @@ use core::{ }; use intrusive_collections::{intrusive_adapter, LinkedList, LinkedListAtomicLink}; -use ostd::{cpu_local, trap::SoftIrqLine, CpuLocal}; +use ostd::{cpu::local::CpuLocal, cpu_local, trap::SoftIrqLine}; use crate::softirq_id::{TASKLESS_SOFTIRQ_ID, TASKLESS_URGENT_SOFTIRQ_ID}; diff --git a/osdk/src/base_crate/x86_64.ld.template b/osdk/src/base_crate/x86_64.ld.template index 57087f86..2803d195 100644 --- a/osdk/src/base_crate/x86_64.ld.template +++ b/osdk/src/base_crate/x86_64.ld.template @@ -122,13 +122,7 @@ SECTIONS # These 4 bytes are used to store the CPU ID. . += 4; - - # These 4 bytes are used to store the number of preemption locks held. - # The reason is stated in the Rust documentation of - # [`ostd::task::processor::PreemptInfo`]. - __cpu_local_preempt_lock_count = . - __cpu_local_start; - . += 4; - + KEEP(*(SORT(.cpu_local))) __cpu_local_end = .; } diff --git a/ostd/src/arch/x86/cpu/local.rs b/ostd/src/arch/x86/cpu/local.rs index 325d692d..b5a64732 100644 --- a/ostd/src/arch/x86/cpu/local.rs +++ b/ostd/src/arch/x86/cpu/local.rs @@ -23,65 +23,205 @@ pub(crate) fn get_base() -> u64 { FS::read_base().as_u64() } -pub mod preempt_lock_count { - //! We need to increment/decrement the per-CPU preemption lock count using - //! a single instruction. This requirement is stated by - //! [`crate::task::processor::PreemptInfo`]. +use crate::cpu::local::single_instr::{ + SingleInstructionAddAssign, SingleInstructionBitAndAssign, SingleInstructionBitOrAssign, + SingleInstructionBitXorAssign, SingleInstructionLoad, SingleInstructionStore, + SingleInstructionSubAssign, +}; - /// The GDT ensures that the FS segment is initialized to zero on boot. - /// This assertion checks that the base address has been set. - macro_rules! debug_assert_initialized { - () => { - // The compiler may think that [`super::get_base`] has side effects - // so it may not be optimized out. We make sure that it will be - // conditionally compiled only in debug builds. - #[cfg(debug_assertions)] - debug_assert_ne!(super::get_base(), 0); - }; - } +/// The GDT ensures that the FS segment is initialized to zero on boot. +/// This assertion checks that the base address has been set. +macro_rules! debug_assert_initialized { + () => { + // The compiler may think that [`super::get_base`] has side effects + // so it may not be optimized out. We make sure that it will be + // conditionally compiled only in debug builds. + #[cfg(debug_assertions)] + debug_assert_ne!(get_base(), 0); + }; +} - /// Increments the per-CPU preemption lock count using one instruction. - pub(crate) fn inc() { +macro_rules! 
impl_numeric_single_instruction_for { + ($([$typ: ty, $inout_type: ident, $register_format: expr])*) => {$( + + impl SingleInstructionAddAssign<$typ> for $typ { + unsafe fn add_assign(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + core::arch::asm!( + concat!("add fs:[{0}], {1", $register_format, "}"), + in(reg) offset, + in($inout_type) val, + options(nostack), + ); + } + } + + impl SingleInstructionSubAssign<$typ> for $typ { + unsafe fn sub_assign(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + core::arch::asm!( + concat!("sub fs:[{0}], {1", $register_format, "}"), + in(reg) offset, + in($inout_type) val, + options(nostack), + ); + } + } + + impl SingleInstructionBitAndAssign<$typ> for $typ { + unsafe fn bitand_assign(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + core::arch::asm!( + concat!("and fs:[{0}], {1", $register_format, "}"), + in(reg) offset, + in($inout_type) val, + options(nostack), + ); + } + } + + impl SingleInstructionBitOrAssign<$typ> for $typ { + unsafe fn bitor_assign(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + core::arch::asm!( + concat!("or fs:[{0}], {1", $register_format, "}"), + in(reg) offset, + in($inout_type) val, + options(nostack), + ); + } + } + + impl SingleInstructionBitXorAssign<$typ> for $typ { + unsafe fn bitxor_assign(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + core::arch::asm!( + concat!("xor fs:[{0}], {1", $register_format, "}"), + in(reg) offset, + in($inout_type) val, + options(nostack), + ); + } + } + + impl SingleInstructionLoad for $typ { + unsafe fn load(offset: *const Self) -> Self { + debug_assert_initialized!(); + + let val: Self; + core::arch::asm!( + concat!("mov {0", $register_format, "}, fs:[{1}]"), + out($inout_type) val, + in(reg) offset, + options(nostack, readonly), + ); + val + } + } + + impl SingleInstructionStore for $typ { + unsafe fn store(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + core::arch::asm!( + concat!("mov fs:[{0}], {1", $register_format, "}"), + in(reg) offset, + in($inout_type) val, + options(nostack), + ); + } + } + + )*}; +} + +impl_numeric_single_instruction_for!( + [u64, reg, ":r"] + [usize, reg, ":r"] + [u32, reg, ":e"] + [u16, reg, ":x"] + [u8, reg_byte, ""] + [i64, reg, ":r"] + [isize, reg, ":r"] + [i32, reg, ":e"] + [i16, reg, ":x"] + [i8, reg_byte, ""] +); + +macro_rules! impl_generic_single_instruction_for { + ($([<$gen_type:ident $(, $more_gen_type:ident)*>, $typ:ty])*) => {$( + + impl<$gen_type $(, $more_gen_type)*> SingleInstructionLoad for $typ { + unsafe fn load(offset: *const Self) -> Self { + debug_assert_initialized!(); + + let val: Self; + core::arch::asm!( + concat!("mov {0}, fs:[{1}]"), + out(reg) val, + in(reg) offset, + options(nostack, readonly), + ); + val + } + } + + impl<$gen_type $(, $more_gen_type)*> SingleInstructionStore for $typ { + unsafe fn store(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + core::arch::asm!( + concat!("mov fs:[{0}], {1}"), + in(reg) offset, + in(reg) val, + options(nostack), + ); + } + } + )*} +} + +impl_generic_single_instruction_for!( + [, *const T] + [, *mut T] + [, fn(T) -> R] +); + +// In this module, booleans are represented by the least significant bit of a +// `u8` type. Other bits must be zero. This definition is compatible with the +// Rust reference: . 
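+// A `bool` therefore loads and stores through the same byte-sized,
+// `fs`-relative `mov` as the `u8` case above; the 0/1 `debug_assert` in
+// `load` is compiled only into debug builds.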
+ +impl SingleInstructionLoad for bool { + unsafe fn load(offset: *const Self) -> Self { debug_assert_initialized!(); - // SAFETY: The inline assembly increments the lock count in one - // instruction without side effects. - unsafe { - core::arch::asm!( - "add dword ptr fs:[__cpu_local_preempt_lock_count], 1", - options(nostack), - ); - } - } - - /// Decrements the per-CPU preemption lock count using one instruction. - pub(crate) fn dec() { - debug_assert_initialized!(); - - // SAFETY: The inline assembly decrements the lock count in one - // instruction without side effects. - unsafe { - core::arch::asm!( - "sub dword ptr fs:[__cpu_local_preempt_lock_count], 1", - options(nostack), - ); - } - } - - /// Gets the per-CPU preemption lock count using one instruction. - pub(crate) fn get() -> u32 { - debug_assert_initialized!(); - - let count: u32; - // SAFETY: The inline assembly reads the lock count in one instruction - // without side effects. - unsafe { - core::arch::asm!( - "mov {0:e}, fs:[__cpu_local_preempt_lock_count]", - out(reg) count, - options(nostack, readonly), - ); - } - count + let val: u8; + core::arch::asm!( + "mov {0}, fs:[{1}]", + out(reg_byte) val, + in(reg) offset, + options(nostack, readonly), + ); + debug_assert!(val == 1 || val == 0); + val == 1 + } +} + +impl SingleInstructionStore for bool { + unsafe fn store(offset: *mut Self, val: Self) { + debug_assert_initialized!(); + + let val: u8 = if val { 1 } else { 0 }; + core::arch::asm!( + "mov fs:[{0}], {1}", + in(reg) offset, + in(reg_byte) val, + options(nostack), + ); } } diff --git a/ostd/src/arch/x86/mod.rs b/ostd/src/arch/x86/mod.rs index 1693e0a8..3e9b3c36 100644 --- a/ostd/src/arch/x86/mod.rs +++ b/ostd/src/arch/x86/mod.rs @@ -73,7 +73,7 @@ pub(crate) fn init_on_bsp() { // SAFETY: no CPU local objects have been accessed by this far. And // we are on the BSP. - unsafe { crate::cpu::cpu_local::init_on_bsp() }; + unsafe { crate::cpu::local::init_on_bsp() }; crate::boot::smp::boot_all_aps(); diff --git a/ostd/src/arch/x86/trap.rs b/ostd/src/arch/x86/trap.rs index dc0ce895..a38b498e 100644 --- a/ostd/src/arch/x86/trap.rs +++ b/ostd/src/arch/x86/trap.rs @@ -2,8 +2,6 @@ //! Handles trap. -use core::sync::atomic::{AtomicBool, Ordering}; - use align_ext::AlignExt; use log::debug; #[cfg(feature = "intel_tdx")] @@ -15,7 +13,7 @@ use super::ex_table::ExTable; use crate::arch::{cpu::VIRTUALIZATION_EXCEPTION, tdx_guest::handle_virtual_exception}; use crate::{ cpu::{CpuException, CpuExceptionInfo, PageFaultErrorCode, PAGE_FAULT}, - cpu_local, + cpu_local_cell, mm::{ kspace::{KERNEL_PAGE_TABLE, LINEAR_MAPPING_BASE_VADDR, LINEAR_MAPPING_VADDR_RANGE}, page_prop::{CachePolicy, PageProperty}, @@ -25,15 +23,15 @@ use crate::{ trap::call_irq_callback_functions, }; -cpu_local! { - static IS_KERNEL_INTERRUPTED: AtomicBool = AtomicBool::new(false); +cpu_local_cell! { + static IS_KERNEL_INTERRUPTED: bool = false; } /// Returns true if this function is called within the context of an IRQ handler /// and the IRQ occurs while the CPU is executing in the kernel mode. /// Otherwise, it returns false. 
pub fn is_kernel_interrupted() -> bool { - IS_KERNEL_INTERRUPTED.load(Ordering::Acquire) + IS_KERNEL_INTERRUPTED.load() } /// Only from kernel @@ -64,9 +62,9 @@ extern "sysv64" fn trap_handler(f: &mut TrapFrame) { } } } else { - IS_KERNEL_INTERRUPTED.store(true, Ordering::Release); + IS_KERNEL_INTERRUPTED.store(true); call_irq_callback_functions(f, f.trap_num); - IS_KERNEL_INTERRUPTED.store(false, Ordering::Release); + IS_KERNEL_INTERRUPTED.store(false); } } diff --git a/ostd/src/boot/smp.rs b/ostd/src/boot/smp.rs index a40b417e..05b93845 100644 --- a/ostd/src/boot/smp.rs +++ b/ostd/src/boot/smp.rs @@ -115,7 +115,7 @@ fn ap_early_entry(local_apic_id: u32) -> ! { // SAFETY: we are on the AP. unsafe { - cpu::cpu_local::init_on_ap(local_apic_id); + cpu::local::init_on_ap(local_apic_id); } trap::init(); diff --git a/ostd/src/cpu/cpu_local.rs b/ostd/src/cpu/cpu_local.rs deleted file mode 100644 index e72ed1ff..00000000 --- a/ostd/src/cpu/cpu_local.rs +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-License-Identifier: MPL-2.0 - -//! CPU local storage. -//! -//! This module provides a mechanism to define CPU-local objects. -//! -//! This is acheived by placing the CPU-local objects in a special section -//! `.cpu_local`. The bootstrap processor (BSP) uses the objects linked in this -//! section, and these objects are copied to dynamically allocated local -//! storage of each application processors (AP) during the initialization -//! process. -//! -//! Such a mechanism exploits the fact that constant values of non-[`Copy`] -//! types can be bitwise copied. For example, a [`Option`] object, though -//! being not [`Copy`], have a constant constructor [`Option::None`] that -//! produces a value that can be bitwise copied to create a new instance. -//! [`alloc::sync::Arc`] however, don't have such a constructor, and thus cannot -//! be directly used as a CPU-local object. Wrapping it in a type that has a -//! constant constructor, like [`Option`], can make it CPU-local. - -use alloc::vec::Vec; -use core::ops::Deref; - -use align_ext::AlignExt; - -use crate::{ - arch, cpu, - mm::{ - paddr_to_vaddr, - page::{self, meta::KernelMeta, ContPages}, - PAGE_SIZE, - }, - trap::{disable_local, DisabledLocalIrqGuard}, -}; - -/// Defines a CPU-local variable. -/// -/// # Example -/// -/// ```rust -/// use crate::cpu_local; -/// use core::cell::RefCell; -/// -/// cpu_local! { -/// static FOO: RefCell = RefCell::new(1); -/// -/// #[allow(unused)] -/// pub static BAR: RefCell = RefCell::new(1.0); -/// } -/// -/// println!("FOO VAL: {:?}", *FOO.borrow()); -/// ``` -#[macro_export] -macro_rules! cpu_local { - ($( $(#[$attr:meta])* $vis:vis static $name:ident: $t:ty = $init:expr; )*) => { - $( - #[link_section = ".cpu_local"] - $(#[$attr])* $vis static $name: $crate::CpuLocal<$t> = { - let val = $init; - // SAFETY: The CPU local variable instantiated is statically - // stored in the special `.cpu_local` section. - unsafe { - $crate::CpuLocal::__new(val) - } - }; - )* - }; -} - -/// CPU-local objects. -/// -/// A CPU-local object only gives you immutable references to the underlying value. -/// To mutate the value, one can use atomic values (e.g., [`AtomicU32`]) or internally mutable -/// objects (e.g., [`RefCell`]). -/// -/// [`AtomicU32`]: core::sync::atomic::AtomicU32 -/// [`RefCell`]: core::cell::RefCell -pub struct CpuLocal(T); - -// SAFETY: At any given time, only one task can access the inner value T -// of a cpu-local variable even if `T` is not `Sync`. 
-unsafe impl Sync for CpuLocal {} - -// Prevent valid instances of CpuLocal from being copied to any memory -// area outside the .cpu_local section. -impl !Copy for CpuLocal {} -impl !Clone for CpuLocal {} - -// In general, it does not make any sense to send instances of CpuLocal to -// other tasks as they should live on other CPUs to make sending useful. -impl !Send for CpuLocal {} - -// A check to ensure that the CPU-local object is never accessed before the -// initialization for all CPUs. -#[cfg(debug_assertions)] -use core::sync::atomic::{AtomicBool, Ordering}; -#[cfg(debug_assertions)] -static IS_INITIALIZED: AtomicBool = AtomicBool::new(false); - -impl CpuLocal { - /// Initialize a CPU-local object. - /// - /// Please do not call this function directly. Instead, use the - /// `cpu_local!` macro. - /// - /// # Safety - /// - /// The caller should ensure that the object initialized by this - /// function resides in the `.cpu_local` section. Otherwise the - /// behavior is undefined. - #[doc(hidden)] - pub const unsafe fn __new(val: T) -> Self { - Self(val) - } - - /// Get access to the underlying value with IRQs disabled. - /// - /// By this method, you can borrow a reference to the underlying value - /// even if `T` is not `Sync`. Because that it is per-CPU and IRQs are - /// disabled, no other running task can access it. - pub fn borrow_irq_disabled(&self) -> CpuLocalDerefGuard<'_, T> { - CpuLocalDerefGuard { - cpu_local: self, - _guard: disable_local(), - } - } - - /// Get access to the underlying value through a raw pointer. - /// - /// This function calculates the virtual address of the CPU-local object based on the per- - /// cpu base address and the offset in the BSP. - fn get(&self) -> *const T { - // CPU-local objects should be initialized before being accessed. It should be ensured - // by the implementation of OSTD initialization. - #[cfg(debug_assertions)] - debug_assert!(IS_INITIALIZED.load(Ordering::Relaxed)); - - let offset = { - let bsp_va = self as *const _ as usize; - let bsp_base = __cpu_local_start as usize; - // The implementation should ensure that the CPU-local object resides in the `.cpu_local`. - debug_assert!(bsp_va + core::mem::size_of::() <= __cpu_local_end as usize); - - bsp_va - bsp_base as usize - }; - - let local_base = arch::cpu::local::get_base() as usize; - let local_va = local_base + offset; - - // A sanity check about the alignment. - debug_assert_eq!(local_va % core::mem::align_of::(), 0); - - local_va as *mut T - } -} - -// Considering a preemptive kernel, a CPU-local object may be dereferenced -// when another task tries to access it. So, we need to ensure that `T` is -// `Sync` before allowing it to be dereferenced. -impl Deref for CpuLocal { - type Target = T; - - fn deref(&self) -> &Self::Target { - // SAFETY: it should be properly initialized before accesses. - // And we do not create a mutable reference over it. It is - // `Sync` so it can be referenced from this task. - unsafe { &*self.get() } - } -} - -/// A guard for accessing the CPU-local object. -/// -/// It ensures that the CPU-local object is accessed with IRQs -/// disabled. It is created by [`CpuLocal::borrow_irq_disabled`]. -/// Do not hold this guard for a long time. -#[must_use] -pub struct CpuLocalDerefGuard<'a, T> { - cpu_local: &'a CpuLocal, - _guard: DisabledLocalIrqGuard, -} - -impl Deref for CpuLocalDerefGuard<'_, T> { - type Target = T; - - fn deref(&self) -> &Self::Target { - // SAFETY: it should be properly initialized before accesses. 
- // And we do not create a mutable reference over it. The IRQs - // are disabled so it can be referenced from this task. - unsafe { &*self.cpu_local.get() } - } -} - -/// Sets the base address of the CPU-local storage for the bootstrap processor. -/// -/// It should be called early to let [`crate::task::disable_preempt`] work, -/// which needs to update a CPU-local preempt lock count. Otherwise it may -/// panic when calling [`crate::task::disable_preempt`]. -/// -/// # Safety -/// -/// It should be called only once and only on the BSP. -pub(crate) unsafe fn early_init_bsp_local_base() { - let start_base_va = __cpu_local_start as usize as u64; - // SAFETY: The base to be set is the start of the `.cpu_local` section, - // where accessing the CPU-local objects have defined behaviors. - unsafe { - arch::cpu::local::set_base(start_base_va); - } -} - -/// The BSP initializes the CPU-local areas for APs. Here we use a -/// non-disabling preempt version of lock because the [`crate::sync`] -/// version needs `cpu_local` to work. Preemption and interrupts are -/// disabled in this phase so it is safe to use this lock. -static CPU_LOCAL_STORAGES: spin::RwLock>> = spin::RwLock::new(Vec::new()); - -/// Initializes the CPU local data for the bootstrap processor (BSP). -/// -/// # Safety -/// -/// This function can only called on the BSP, for once. -/// -/// It must be guaranteed that the BSP will not access local data before -/// this function being called, otherwise copying non-constant values -/// will result in pretty bad undefined behavior. -pub unsafe fn init_on_bsp() { - let bsp_base_va = __cpu_local_start as usize; - let bsp_end_va = __cpu_local_end as usize; - - let num_cpus = super::num_cpus(); - - let mut cpu_local_storages = CPU_LOCAL_STORAGES.write(); - for cpu_i in 1..num_cpus { - let ap_pages = { - let nbytes = (bsp_end_va - bsp_base_va).align_up(PAGE_SIZE); - page::allocator::alloc_contiguous(nbytes, |_| KernelMeta::default()).unwrap() - }; - let ap_pages_ptr = paddr_to_vaddr(ap_pages.start_paddr()) as *mut u8; - - // SAFETY: The BSP has not initialized the CPU-local area, so the objects in - // in the `.cpu_local` section can be bitwise bulk copied to the AP's local - // storage. The destination memory is allocated so it is valid to write to. - unsafe { - core::ptr::copy_nonoverlapping( - bsp_base_va as *const u8, - ap_pages_ptr, - bsp_end_va - bsp_base_va, - ); - } - - // SAFETY: the first 4 bytes is reserved for storing CPU ID. - unsafe { - (ap_pages_ptr as *mut u32).write(cpu_i); - } - - // SAFETY: the second 4 bytes is reserved for storing the preemt count. - unsafe { - (ap_pages_ptr as *mut u32).add(1).write(0); - } - - cpu_local_storages.push(ap_pages); - } - - // Write the CPU ID of BSP to the first 4 bytes of the CPU-local area. - let bsp_cpu_id_ptr = bsp_base_va as *mut u32; - // SAFETY: the first 4 bytes is reserved for storing CPU ID. - unsafe { - bsp_cpu_id_ptr.write(0); - } - - cpu::local::set_base(bsp_base_va as u64); - - #[cfg(debug_assertions)] - IS_INITIALIZED.store(true, Ordering::Relaxed); -} - -/// Initializes the CPU local data for the application processor (AP). -/// -/// # Safety -/// -/// This function can only called on the AP. -pub unsafe fn init_on_ap(cpu_id: u32) { - let rlock = CPU_LOCAL_STORAGES.read(); - let ap_pages = rlock.get(cpu_id as usize - 1).unwrap(); - - let ap_pages_ptr = paddr_to_vaddr(ap_pages.start_paddr()) as *mut u32; - - debug_assert_eq!( - cpu_id, - // SAFETY: the CPU ID is stored at the beginning of the CPU local area. 
- unsafe { ap_pages_ptr.read() } - ); - - // SAFETY: the memory will be dedicated to the AP. And we are on the AP. - unsafe { - cpu::local::set_base(ap_pages_ptr as u64); - } -} - -// These symbols are provided by the linker script. -extern "C" { - fn __cpu_local_start(); - fn __cpu_local_end(); -} - -#[cfg(ktest)] -mod test { - use core::{ - cell::RefCell, - sync::atomic::{AtomicU8, Ordering}, - }; - - use ostd_macros::ktest; - - use super::*; - - #[ktest] - fn test_cpu_local() { - cpu_local! { - static FOO: RefCell = RefCell::new(1); - static BAR: AtomicU8 = AtomicU8::new(3); - } - for _ in 0..10 { - let foo_guard = FOO.borrow_irq_disabled(); - assert_eq!(*foo_guard.borrow(), 1); - *foo_guard.borrow_mut() = 2; - drop(foo_guard); - for _ in 0..10 { - assert_eq!(BAR.load(Ordering::Relaxed), 3); - BAR.store(4, Ordering::Relaxed); - assert_eq!(BAR.load(Ordering::Relaxed), 4); - BAR.store(3, Ordering::Relaxed); - } - let foo_guard = FOO.borrow_irq_disabled(); - assert_eq!(*foo_guard.borrow(), 2); - *foo_guard.borrow_mut() = 1; - drop(foo_guard); - } - } -} diff --git a/ostd/src/cpu/local/cell.rs b/ostd/src/cpu/local/cell.rs new file mode 100644 index 00000000..97c6ceca --- /dev/null +++ b/ostd/src/cpu/local/cell.rs @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: MPL-2.0 + +//! The implementaion of CPU-local variables that have inner mutability. + +use core::cell::UnsafeCell; + +use super::{__cpu_local_end, __cpu_local_start, single_instr::*}; +use crate::arch; + +/// Defines an inner-mutable CPU-local variable. +/// +/// The accessors of the CPU-local variables are defined with [`CpuLocalCell`]. +/// +/// It should be noted that if the interrupts or preemption is enabled, two +/// operations on the same CPU-local cell variable may access different objects +/// since the task may live on different CPUs. +/// +/// # Example +/// +/// ```rust +/// use ostd::cpu_local_cell; +/// +/// cpu_local_cell! { +/// static FOO: u32 = 1; +/// pub static BAR: *const usize = core::ptr::null(); +/// } +/// +/// fn not_an_atomic_function() { +/// let bar_var: usize = 1; +/// BAR.store(&bar_var as *const _); +/// // Note that the value of `BAR` here doesn't nessarily equal to the address +/// // of `bar_var`, since the task may be preempted and moved to another CPU. +/// // You can avoid this by disabling interrupts (and preemption, if needed). +/// println!("BAR VAL: {:?}", BAR.load()); +/// +/// let _irq_guard = ostd::trap::disable_local_irq(); +/// println!("1st FOO VAL: {:?}", FOO.load()); +/// // No suprises here, the two accesses must result in the same value. +/// println!("2nd FOO VAL: {:?}", FOO.load()); +/// } +/// ``` +macro_rules! cpu_local_cell { + ($( $(#[$attr:meta])* $vis:vis static $name:ident: $t:ty = $init:expr; )*) => { + $( + #[link_section = ".cpu_local"] + $(#[$attr])* $vis static $name: $crate::cpu::local::CpuLocalCell<$t> = { + let val = $init; + // SAFETY: The CPU local variable instantiated is statically + // stored in the special `.cpu_local` section. + unsafe { + $crate::cpu::local::CpuLocalCell::__new(val) + } + }; + )* + }; +} + +pub(crate) use cpu_local_cell; + +/// Inner mutable CPU-local objects. +/// +/// CPU-local cell objects are only accessible from the current CPU. When +/// accessing an underlying object using the same `CpuLocalCell` instance, the +/// actually accessed object is always on the current CPU. So in a preemptive +/// kernel task, the operated object may change if interrupts are enabled. 
+
+/// The inner mutability is provided by single instruction operations, and the
+/// CPU-local cell objects will not ever be shared between CPUs. So it is safe
+/// to modify the inner value without any locks.
+///
+/// You should only create the CPU-local cell object using the macro
+/// [`cpu_local_cell!`].
+///
+/// For the difference between [`super::CpuLocal`] and [`CpuLocalCell`], see
+/// [`super`].
+pub struct CpuLocalCell<T: 'static>(UnsafeCell<T>);
+
+impl<T: 'static> CpuLocalCell<T> {
+    /// Initialize a CPU-local object.
+    ///
+    /// Please do not call this function directly. Instead, use the
+    /// `cpu_local_cell!` macro.
+    ///
+    /// # Safety
+    ///
+    /// The caller should ensure that the object initialized by this
+    /// function resides in the `.cpu_local` section. Otherwise the
+    /// behavior is undefined.
+    #[doc(hidden)]
+    pub const unsafe fn __new(val: T) -> Self {
+        Self(UnsafeCell::new(val))
+    }
+
+    /// Get access to the underlying value through a raw pointer.
+    ///
+    /// This function calculates the virtual address of the CPU-local object
+    /// based on the CPU-local base address and the offset in the BSP.
+    ///
+    /// # Safety
+    ///
+    /// The caller should ensure that within the entire execution of this
+    /// function, no interrupt or preemption can occur. Otherwise, the
+    /// returned pointer may point to the variable on another CPU.
+    pub unsafe fn as_ptr_mut(&'static self) -> *mut T {
+        super::has_init::assert_true();
+
+        let offset = {
+            let bsp_va = self as *const _ as usize;
+            let bsp_base = __cpu_local_start as usize;
+            // The implementation should ensure that the CPU-local object resides in the `.cpu_local`.
+            debug_assert!(bsp_va + core::mem::size_of::<T>() <= __cpu_local_end as usize);
+
+            bsp_va - bsp_base as usize
+        };
+
+        let local_base = arch::cpu::local::get_base() as usize;
+        let local_va = local_base + offset;
+
+        // A sanity check about the alignment.
+        debug_assert_eq!(local_va % core::mem::align_of::<T>(), 0);
+
+        local_va as *mut T
+    }
+}
+
+// SAFETY: At any given time, only one task can access the inner value T
+// of a cpu-local variable even if `T` is not `Sync`.
+unsafe impl<T: 'static> Sync for CpuLocalCell<T> {}
+
+// Prevent valid instances of CpuLocalCell from being copied to any memory
+// area outside the `.cpu_local` section.
+impl<T: 'static> !Copy for CpuLocalCell<T> {}
+impl<T: 'static> !Clone for CpuLocalCell<T> {}
+
+// In general, it does not make any sense to send instances of CpuLocalCell to
+// other tasks as they should live on other CPUs to make sending useful.
+impl<T: 'static> !Send for CpuLocalCell<T> {}
+
+// Accessors for the per-CPU objects whose type implements the single-
+// instruction operations.
+
+impl<T: 'static + SingleInstructionAddAssign<T>> CpuLocalCell<T> {
+    /// Adds a value to the per-CPU object in a single instruction.
+    ///
+    /// This operation wraps on overflow/underflow.
+    ///
+    /// Note that this memory operation will not be elided or reordered by the
+    /// compiler since it is a black-box.
+    pub fn add_assign(&'static self, rhs: T) {
+        let offset = self as *const _ as usize - __cpu_local_start as usize;
+        // SAFETY: The CPU-local object is defined in the `.cpu_local` section,
+        // so the pointer to the object is valid. And the reference is never shared.
+        unsafe {
+            T::add_assign(offset as *mut T, rhs);
+        }
+    }
+}
+
+impl<T: 'static + SingleInstructionSubAssign<T>> CpuLocalCell<T> {
+    /// Subtracts a value from the per-CPU object in a single instruction.
+    ///
+    /// This operation wraps on overflow/underflow.
+    ///
+    /// Note that this memory operation will not be elided or reordered by the
+    /// compiler since it is a black-box.
+ pub fn sub_assign(&'static self, rhs: T) { + let offset = self as *const _ as usize - __cpu_local_start as usize; + // SAFETY: The CPU-local object is defined in the `.cpu_local` section, + // so the pointer to the object is valid. And the reference is never shared. + unsafe { + T::sub_assign(offset as *mut T, rhs); + } + } +} + +impl> CpuLocalCell { + /// Bitwise ANDs a value to the per-CPU object in a single instruction. + /// + /// Note that this memory operation will not be elided or reordered by the + /// compiler since it is a black-box. + pub fn bitand_assign(&'static self, rhs: T) { + let offset = self as *const _ as usize - __cpu_local_start as usize; + // SAFETY: The CPU-local object is defined in the `.cpu_local` section, + // so the pointer to the object is valid. And the reference is never shared. + unsafe { + T::bitand_assign(offset as *mut T, rhs); + } + } +} + +impl> CpuLocalCell { + /// Bitwise ORs a value to the per-CPU object in a single instruction. + /// + /// Note that this memory operation will not be elided or reordered by the + /// compiler since it is a black-box. + pub fn bitor_assign(&'static self, rhs: T) { + let offset = self as *const _ as usize - __cpu_local_start as usize; + // SAFETY: The CPU-local object is defined in the `.cpu_local` section, + // so the pointer to the object is valid. And the reference is never shared. + unsafe { + T::bitor_assign(offset as *mut T, rhs); + } + } +} + +impl> CpuLocalCell { + /// Bitwise XORs a value to the per-CPU object in a single instruction. + /// + /// Note that this memory operation will not be elided or reordered by the + /// compiler since it is a black-box. + pub fn bitxor_assign(&'static self, rhs: T) { + let offset = self as *const _ as usize - __cpu_local_start as usize; + // SAFETY: The CPU-local object is defined in the `.cpu_local` section, + // so the pointer to the object is valid. And the reference is never shared. + unsafe { + T::bitxor_assign(offset as *mut T, rhs); + } + } +} + +impl CpuLocalCell { + /// Gets the value of the per-CPU object in a single instruction. + /// + /// Note that this memory operation will not be elided or reordered by the + /// compiler since it is a black-box. + pub fn load(&'static self) -> T { + let offset = self as *const _ as usize - __cpu_local_start as usize; + // SAFETY: The CPU-local object is defined in the `.cpu_local` section, + // so the pointer to the object is valid. + unsafe { T::load(offset as *const T) } + } +} + +impl CpuLocalCell { + /// Writes a value to the per-CPU object in a single instruction. + /// + /// Note that this memory operation will not be elided or reordered by the + /// compiler since it is a black-box. + pub fn store(&'static self, val: T) { + let offset = self as *const _ as usize - __cpu_local_start as usize; + // SAFETY: The CPU-local object is defined in the `.cpu_local` section, + // so the pointer to the object is valid. And the reference is never shared. + unsafe { + T::store(offset as *mut T, val); + } + } +} diff --git a/ostd/src/cpu/local/cpu_local.rs b/ostd/src/cpu/local/cpu_local.rs new file mode 100644 index 00000000..37724d27 --- /dev/null +++ b/ostd/src/cpu/local/cpu_local.rs @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: MPL-2.0 + +//! The CPU-local variable implementation. + +use core::{marker::Sync, ops::Deref}; + +use super::{__cpu_local_end, __cpu_local_start}; +use crate::{ + arch, + trap::{self, DisabledLocalIrqGuard}, +}; + +/// Defines a CPU-local variable. 
+///
+/// The accessors of the CPU-local variables are defined with [`CpuLocal`].
+///
+/// You can get the reference to the inner object by calling [`deref`]. But
+/// it is worth noting that the object is always the one in the original core
+/// when the reference is created. Use [`CpuLocal::borrow_irq_disabled`] if
+/// this is not expected, or if the inner type can't be shared across CPUs.
+///
+/// # Example
+///
+/// ```rust
+/// use ostd::{cpu_local, sync::SpinLock};
+/// use core::ops::Deref;
+/// use core::sync::atomic::{AtomicU32, Ordering};
+///
+/// cpu_local! {
+///     static FOO: AtomicU32 = AtomicU32::new(1);
+///     pub static BAR: SpinLock<usize> = SpinLock::new(2);
+/// }
+///
+/// fn not_an_atomic_function() {
+///     let ref_of_foo = FOO.deref();
+///     // Note that the value of `FOO` here doesn't necessarily equal the value
+///     // of `FOO` on exactly the __current__ CPU, because the task may have been
+///     // preempted and moved to another CPU after `ref_of_foo` was created.
+///     let val_of_foo = ref_of_foo.load(Ordering::Relaxed);
+///     println!("FOO VAL: {}", val_of_foo);
+///
+///     let bar_guard = BAR.lock_irq_disabled();
+///     // Here the value of `BAR` is always the one in the __current__ CPU since
+///     // interrupts are disabled and we do not explicitly yield execution here.
+///     let val_of_bar = *bar_guard;
+///     println!("BAR VAL: {}", val_of_bar);
+/// }
+/// ```
+#[macro_export]
+macro_rules! cpu_local {
+    ($( $(#[$attr:meta])* $vis:vis static $name:ident: $t:ty = $init:expr; )*) => {
+        $(
+            #[link_section = ".cpu_local"]
+            $(#[$attr])* $vis static $name: $crate::cpu::local::CpuLocal<$t> = {
+                let val = $init;
+                // SAFETY: The per-CPU variable instantiated is statically
+                // stored in the special `.cpu_local` section.
+                unsafe {
+                    $crate::cpu::local::CpuLocal::__new(val)
+                }
+            };
+        )*
+    };
+}
+
+/// CPU-local objects.
+///
+/// CPU-local objects are instantiated once per CPU core. They can be shared
+/// with other cores. In the context of a preemptible kernel task, when holding
+/// the reference to the inner object, the object is always the one in the
+/// original core (when the reference is created), no matter which core the
+/// code is currently running on.
+///
+/// For the difference between [`CpuLocal`] and [`super::CpuLocalCell`], see
+/// [`super`].
+pub struct CpuLocal<T: 'static>(T);
+
+impl<T: 'static> CpuLocal<T> {
+    /// Creates a new CPU-local object.
+    ///
+    /// Please do not call this function directly. Instead, use the
+    /// `cpu_local!` macro.
+    ///
+    /// # Safety
+    ///
+    /// The caller should ensure that the object initialized by this
+    /// function resides in the `.cpu_local` section. Otherwise the
+    /// behavior is undefined.
+    #[doc(hidden)]
+    pub const unsafe fn __new(val: T) -> Self {
+        Self(val)
+    }
+
+    /// Get access to the underlying value with IRQs disabled.
+    ///
+    /// By this method, you can borrow a reference to the underlying value
+    /// even if `T` is not `Sync`. Because it is per-CPU and IRQs are
+    /// disabled, no other running tasks can access it.
+    pub fn borrow_irq_disabled(&'static self) -> CpuLocalDerefGuard<'_, T> {
+        CpuLocalDerefGuard {
+            cpu_local: self,
+            _guard: InnerGuard::Created(trap::disable_local()),
+        }
+    }
+
+    /// Get access to the underlying value with a provided guard.
+    ///
+    /// Similar to [`CpuLocal::borrow_irq_disabled`], but you can provide
+    /// a guard to disable IRQs if you already have one.
+    pub fn borrow_with<'a>(
+        &'static self,
+        guard: &'a DisabledLocalIrqGuard,
+    ) -> CpuLocalDerefGuard<'a, T> {
+        CpuLocalDerefGuard {
+            cpu_local: self,
+            _guard: InnerGuard::Provided(guard),
+        }
+    }
+
+    /// Get access to the underlying value through a raw pointer.
+    ///
+    /// This function calculates the virtual address of the CPU-local object
+    /// based on the CPU-local base address and the offset in the BSP.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that the reference to `self` is static.
+    unsafe fn as_ptr(&self) -> *const T {
+        super::has_init::assert_true();
+
+        let offset = {
+            let bsp_va = self as *const _ as usize;
+            let bsp_base = __cpu_local_start as usize;
+            // The implementation should ensure that the CPU-local object resides in the `.cpu_local`.
+            debug_assert!(bsp_va + core::mem::size_of::<T>() <= __cpu_local_end as usize);
+
+            bsp_va - bsp_base as usize
+        };
+
+        let local_base = arch::cpu::local::get_base() as usize;
+        let local_va = local_base + offset;
+
+        // A sanity check about the alignment.
+        debug_assert_eq!(local_va % core::mem::align_of::<T>(), 0);
+
+        local_va as *mut T
+    }
+}
+
+// SAFETY: At any given time, only one task can access the inner value `T` of a
+// CPU-local variable if `T` is not `Sync`. We guarantee it by disabling the
+// reference to the inner value, or turning off preemption when creating
+// the reference.
+unsafe impl<T: 'static> Sync for CpuLocal<T> {}
+
+// Prevent valid instances of `CpuLocal` from being copied to any memory areas
+// outside the `.cpu_local` section.
+impl<T: 'static> !Copy for CpuLocal<T> {}
+impl<T: 'static> !Clone for CpuLocal<T> {}
+
+// In general, it does not make any sense to send instances of `CpuLocal` to
+// other tasks as they should live on other CPUs to make sending useful.
+impl<T: 'static> !Send for CpuLocal<T> {}
+
+// For `Sync` types, we can create a reference over the inner type and allow
+// it to be shared across CPUs. So it is sound to provide a `Deref`
+// implementation. However, it is up to the caller if sharing is desired.
+impl<T: 'static + Sync> Deref for CpuLocal<T> {
+    type Target = T;
+
+    /// Note that the reference to the inner object always refers to the object
+    /// on the original CPU where the reference was created. If this is not
+    /// expected, turn off preemption.
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: it should be properly initialized before accesses.
+        // And we do not create a mutable reference over it. It is
+        // `Sync` so it can be referenced from this task. Here dereferencing
+        // from non-static instances is not feasible since no one can create
+        // a non-static instance of `CpuLocal`.
+        unsafe { &*self.as_ptr() }
+    }
+}
+
+/// A guard for accessing the CPU-local object.
+///
+/// It ensures that the CPU-local object is accessed with IRQs disabled.
+/// It is created by [`CpuLocal::borrow_irq_disabled`] or
+/// [`CpuLocal::borrow_with`]. Do not hold this guard for a long time.
+#[must_use]
+pub struct CpuLocalDerefGuard<'a, T: 'static> {
+    cpu_local: &'static CpuLocal<T>,
+    _guard: InnerGuard<'a>,
+}
+
+enum InnerGuard<'a> {
+    #[allow(dead_code)]
+    Created(DisabledLocalIrqGuard),
+    #[allow(dead_code)]
+    Provided(&'a DisabledLocalIrqGuard),
+}
+
+impl<T: 'static> Deref for CpuLocalDerefGuard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: it should be properly initialized before accesses.
+        // And we do not create a mutable reference over it. The IRQs
+        // are disabled so it can only be referenced from this task.
+        unsafe { &*self.cpu_local.as_ptr() }
+    }
+}
diff --git a/ostd/src/cpu/local/mod.rs b/ostd/src/cpu/local/mod.rs
new file mode 100644
index 00000000..01467c9b
--- /dev/null
+++ b/ostd/src/cpu/local/mod.rs
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: MPL-2.0
+
+//! CPU local storage.
+//!
+//! This module provides a mechanism to define CPU-local objects, by the macro
+//! [`crate::cpu_local!`].
+//!
+//! Such a mechanism exploits the fact that constant values of non-[`Copy`]
+//! types can be bitwise copied. For example, a [`Option`] object, though
+//! being not [`Copy`], has a constant constructor [`Option::None`] that
+//! produces a value that can be bitwise copied to create a new instance.
+//! [`alloc::sync::Arc`], however, doesn't have such a constructor, and thus
+//! cannot be directly used as a CPU-local object. Wrapping it in a type that
+//! has a constant constructor, like [`Option`], can make it CPU-local.
+//!
+//! # Implementation
+//!
+//! These APIs are implemented by placing the CPU-local objects in a special
+//! section `.cpu_local`. The bootstrap processor (BSP) uses the objects linked
+//! in this section, and these objects are copied to dynamically allocated
+//! local storage of each application processor (AP) during the initialization
+//! process.

// This module also provides CPU-local cell objects that have inner mutability.
//
// The difference between CPU-local objects (defined by [`crate::cpu_local!`])
// and CPU-local cell objects (defined by [`crate::cpu_local_cell!`]) is that
// the CPU-local objects can be shared across CPUs, while through a CPU-local
// cell object you can only access the value on the current CPU, which enables
// inner mutability without locks.
//
// The cell variant is currently not a public API because it is rather hard
// to use without introducing races. But it is useful for OSTD's internal
// implementation.

+mod cell;
+mod cpu_local;
+
+pub(crate) mod single_instr;
+
+use alloc::vec::Vec;
+
+use align_ext::AlignExt;
+pub(crate) use cell::{cpu_local_cell, CpuLocalCell};
+pub use cpu_local::{CpuLocal, CpuLocalDerefGuard};
+
+use crate::{
+    arch,
+    mm::{
+        paddr_to_vaddr,
+        page::{self, meta::KernelMeta, ContPages},
+        PAGE_SIZE,
+    },
+};
+
+// These symbols are provided by the linker script.
+extern "C" {
+    fn __cpu_local_start();
+    fn __cpu_local_end();
+}
+
+cpu_local_cell! {
+    /// The count of the preempt lock.
+    ///
+    /// We need to access the preemption count before we can copy the section
+    /// for application processors. So, the preemption count is not copied from
+    /// the bootstrap processor's section during initialization. Instead, it is
+    /// initialized to zero for application processors.
+    pub(crate) static PREEMPT_LOCK_COUNT: u32 = 0;
+}
+
+/// Sets the base address of the CPU-local storage for the bootstrap processor.
+///
+/// It should be called early to let [`crate::task::disable_preempt`] work,
+/// which needs to update a CPU-local preempt lock count. Otherwise it may
+/// panic when calling [`crate::task::disable_preempt`].
+///
+/// # Safety
+///
+/// It should be called only once and only on the BSP.
+pub(crate) unsafe fn early_init_bsp_local_base() {
+    let start_base_va = __cpu_local_start as usize as u64;
+    // SAFETY: The base to be set is the start of the `.cpu_local` section,
+    // where accessing the CPU-local objects has defined behavior.
+    unsafe {
+        arch::cpu::local::set_base(start_base_va);
+    }
+}
+
+/// The BSP initializes the CPU-local areas for APs. Here we use a lock that
+/// does not disable preemption, because the [`crate::sync`] version needs
+/// `cpu_local` to work. Preemption and interrupts are disabled in this phase
+/// so it is safe to use this lock.
+static CPU_LOCAL_STORAGES: spin::RwLock<Vec<ContPages<KernelMeta>>> = spin::RwLock::new(Vec::new());
+
+/// Initializes the CPU local data for the bootstrap processor (BSP).
+///
+/// # Safety
+///
+/// This function can only be called on the BSP, and only once.
+///
+/// It must be guaranteed that the BSP will not access local data before
+/// this function is called, otherwise copying non-constant values
+/// will result in pretty bad undefined behavior.
+pub unsafe fn init_on_bsp() {
+    let bsp_base_va = __cpu_local_start as usize;
+    let bsp_end_va = __cpu_local_end as usize;
+
+    let num_cpus = super::num_cpus();
+
+    let mut cpu_local_storages = CPU_LOCAL_STORAGES.write();
+    for cpu_i in 1..num_cpus {
+        let ap_pages = {
+            let nbytes = (bsp_end_va - bsp_base_va).align_up(PAGE_SIZE);
+            page::allocator::alloc_contiguous(nbytes, |_| KernelMeta::default()).unwrap()
+        };
+        let ap_pages_ptr = paddr_to_vaddr(ap_pages.start_paddr()) as *mut u8;
+
+        // SAFETY: The BSP has not initialized the CPU-local area, so the objects
+        // in the `.cpu_local` section can be bitwise bulk copied to the AP's local
+        // storage. The destination memory is allocated so it is valid to write to.
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                bsp_base_va as *const u8,
+                ap_pages_ptr,
+                bsp_end_va - bsp_base_va,
+            );
+        }
+
+        // SAFETY: the first 4 bytes are reserved for storing the CPU ID.
+        unsafe {
+            (ap_pages_ptr as *mut u32).write(cpu_i);
+        }
+
+        // SAFETY: the `PREEMPT_LOCK_COUNT` may be dirty on the BSP, so we need
+        // to ensure that it is initialized to zero for APs. The safety
+        // requirements are met since the static is defined in the `.cpu_local`
+        // section and the pointer to that static is the offset in the CPU-
+        // local area. It is a `usize` so it is safe to be overwritten.
+        unsafe {
+            let preempt_count_offset = &PREEMPT_LOCK_COUNT as *const _ as usize;
+            let ap_preempt_count_ptr = ap_pages_ptr.add(preempt_count_offset) as *mut usize;
+            ap_preempt_count_ptr.write(0);
+        }
+
+        cpu_local_storages.push(ap_pages);
+    }
+
+    // Write the CPU ID of the BSP to the first 4 bytes of the CPU-local area.
+    let bsp_cpu_id_ptr = bsp_base_va as *mut u32;
+    // SAFETY: the first 4 bytes are reserved for storing the CPU ID.
+    unsafe {
+        bsp_cpu_id_ptr.write(0);
+    }
+
+    arch::cpu::local::set_base(bsp_base_va as u64);
+
+    has_init::set_true();
+}
+
+/// Initializes the CPU local data for the application processor (AP).
+///
+/// # Safety
+///
+/// This function can only be called on the AP.
+pub unsafe fn init_on_ap(cpu_id: u32) {
+    let rlock = CPU_LOCAL_STORAGES.read();
+    let ap_pages = rlock.get(cpu_id as usize - 1).unwrap();
+
+    let ap_pages_ptr = paddr_to_vaddr(ap_pages.start_paddr()) as *mut u32;
+
+    debug_assert_eq!(
+        cpu_id,
+        // SAFETY: the CPU ID is stored at the beginning of the CPU local area.
+        unsafe { ap_pages_ptr.read() }
+    );
+
+    // SAFETY: the memory will be dedicated to the AP. And we are on the AP.
+    unsafe {
+        arch::cpu::local::set_base(ap_pages_ptr as u64);
+    }
+}
+
+mod has_init {
+    //! This module is used to detect the programming error of using the CPU-local
+    //! mechanism before it is initialized. Such bugs have been found before and we
+    //! do not want to repeat this error again. This module only incurs runtime
+    //! overhead if debug assertions are enabled.
+    cfg_if::cfg_if!
{ + if #[cfg(debug_assertions)] { + use core::sync::atomic::{AtomicBool, Ordering}; + + static IS_INITIALIZED: AtomicBool = AtomicBool::new(false); + + pub fn assert_true() { + debug_assert!(IS_INITIALIZED.load(Ordering::Relaxed)); + } + + pub fn set_true() { + IS_INITIALIZED.store(true, Ordering::Relaxed); + } + } else { + pub fn assert_true() {} + + pub fn set_true() {} + } + } +} + +#[cfg(ktest)] +mod test { + use core::cell::RefCell; + + use ostd_macros::ktest; + + #[ktest] + fn test_cpu_local() { + crate::cpu_local! { + static FOO: RefCell = RefCell::new(1); + } + let foo_guard = FOO.borrow_irq_disabled(); + assert_eq!(*foo_guard.borrow(), 1); + *foo_guard.borrow_mut() = 2; + assert_eq!(*foo_guard.borrow(), 2); + drop(foo_guard); + } + + #[ktest] + fn test_cpu_local_cell() { + crate::cpu_local_cell! { + static BAR: usize = 3; + } + let _guard = crate::trap::disable_local(); + assert_eq!(BAR.load(), 3); + BAR.store(4); + assert_eq!(BAR.load(), 4); + } +} diff --git a/ostd/src/cpu/local/single_instr.rs b/ostd/src/cpu/local/single_instr.rs new file mode 100644 index 00000000..1ac436c0 --- /dev/null +++ b/ostd/src/cpu/local/single_instr.rs @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: MPL-2.0 + +//! Extensions for CPU-local types that allows single-instruction operations. +//! +//! For some per-CPU objects, fetching or modifying the values of them can be +//! done in a single instruction. Then we would avoid turning off interrupts +//! when accessing them, which incurs non-trivial overhead. +//! +//! These traits are the architecture-specific interface for single-instruction +//! operations. The architecture-specific module can implement these traits for +//! common integer types. For architectures that don't support such single- +//! instruction operations, we emulate a single-instruction implementation by +//! disabling interruptions and preemptions. +//! +//! Currently we implement some of the [`core::ops`] operations. Bitwise shift +//! implementations are missing. Also for less-fundamental types such as +//! enumerations or boolean types, the caller can cast it themselves to the +//! integer types, for which the operations are implemented. +//! +//! # Safety +//! +//! All operations in the provided traits are unsafe, and the caller should +//! ensure that the offset is a valid pointer to a static [`CpuLocalCell`] +//! object. The offset of the object is relative to the base address of the +//! CPU-local storage. These operations are not atomic. Accessing the same +//! address from multiple CPUs produces undefined behavior. +//! +//! [`CpuLocalCell`]: crate::cpu::local::CpuLocalCell + +/// An interface for architecture-specific single-instruction add operation. +pub trait SingleInstructionAddAssign { + /// Adds a value to the per-CPU object. + /// + /// This operation wraps on overflow. + /// + /// # Safety + /// + /// + unsafe fn add_assign(offset: *mut Self, rhs: Rhs); +} + +impl SingleInstructionAddAssign for T { + default unsafe fn add_assign(offset: *mut Self, rhs: T) { + let _guard = crate::trap::disable_local(); + let base = crate::arch::cpu::local::get_base() as usize; + let addr = (base + offset as usize) as *mut Self; + addr.write(addr.read().wrapping_add(&rhs)); + } +} + +/// An interface for architecture-specific single-instruction subtract operation. +pub trait SingleInstructionSubAssign { + /// Subtracts a value to the per-CPU object. + /// + /// This operation wraps on overflow. 
+ /// + /// # Safety + /// + /// Please refer to the module-level documentation of [`self`]. + unsafe fn sub_assign(offset: *mut Self, rhs: Rhs); +} + +impl SingleInstructionSubAssign for T { + default unsafe fn sub_assign(offset: *mut Self, rhs: T) { + let _guard = crate::trap::disable_local(); + let base = crate::arch::cpu::local::get_base() as usize; + let addr = (base + offset as usize) as *mut Self; + addr.write(addr.read().wrapping_sub(&rhs)); + } +} + +/// An interface for architecture-specific single-instruction bitwise OR. +pub trait SingleInstructionBitOrAssign { + /// Bitwise ORs a value to the per-CPU object. + /// + /// # Safety + /// + /// Please refer to the module-level documentation of [`self`]. + unsafe fn bitor_assign(offset: *mut Self, rhs: Rhs); +} + +impl + Copy> SingleInstructionBitOrAssign for T { + default unsafe fn bitor_assign(offset: *mut Self, rhs: T) { + let _guard = crate::trap::disable_local(); + let base = crate::arch::cpu::local::get_base() as usize; + let addr = (base + offset as usize) as *mut Self; + addr.write(addr.read() | rhs); + } +} + +/// An interface for architecture-specific single-instruction bitwise AND. +pub trait SingleInstructionBitAndAssign { + /// Bitwise ANDs a value to the per-CPU object. + /// + /// # Safety + /// + /// Please refer to the module-level documentation of [`self`]. + unsafe fn bitand_assign(offset: *mut Self, rhs: Rhs); +} + +impl + Copy> SingleInstructionBitAndAssign for T { + default unsafe fn bitand_assign(offset: *mut Self, rhs: T) { + let _guard = crate::trap::disable_local(); + let base = crate::arch::cpu::local::get_base() as usize; + let addr = (base + offset as usize) as *mut Self; + addr.write(addr.read() & rhs); + } +} + +/// An interface for architecture-specific single-instruction bitwise XOR. +pub trait SingleInstructionBitXorAssign { + /// Bitwise XORs a value to the per-CPU object. + /// + /// # Safety + /// + /// Please refer to the module-level documentation of [`self`]. + unsafe fn bitxor_assign(offset: *mut Self, rhs: Rhs); +} + +impl + Copy> SingleInstructionBitXorAssign for T { + default unsafe fn bitxor_assign(offset: *mut Self, rhs: T) { + let _guard = crate::trap::disable_local(); + let base = crate::arch::cpu::local::get_base() as usize; + let addr = (base + offset as usize) as *mut Self; + addr.write(addr.read() ^ rhs); + } +} + +/// An interface for architecture-specific single-instruction get operation. +pub trait SingleInstructionLoad { + /// Gets the value of the per-CPU object. + /// + /// # Safety + /// + /// Please refer to the module-level documentation of [`self`]. + unsafe fn load(offset: *const Self) -> Self; +} + +impl SingleInstructionLoad for T { + default unsafe fn load(offset: *const Self) -> Self { + let _guard = crate::trap::disable_local(); + let base = crate::arch::cpu::local::get_base() as usize; + let ptr = (base + offset as usize) as *const Self; + ptr.read() + } +} + +/// An interface for architecture-specific single-instruction set operation. +pub trait SingleInstructionStore { + /// Writes a value to the per-CPU object. + /// + /// # Safety + /// + /// Please refer to the module-level documentation of [`self`]. 
+ unsafe fn store(offset: *mut Self, val: Self); +} + +impl SingleInstructionStore for T { + default unsafe fn store(offset: *mut Self, val: Self) { + let _guard = crate::trap::disable_local(); + let base = crate::arch::cpu::local::get_base() as usize; + let ptr = (base + offset as usize) as *mut Self; + ptr.write(val); + } +} diff --git a/ostd/src/cpu/mod.rs b/ostd/src/cpu/mod.rs index fa73c96c..17289230 100644 --- a/ostd/src/cpu/mod.rs +++ b/ostd/src/cpu/mod.rs @@ -2,7 +2,7 @@ //! CPU-related definitions. -pub mod cpu_local; +pub mod local; cfg_if::cfg_if! { if #[cfg(target_arch = "x86_64")]{ @@ -18,7 +18,7 @@ use bitvec::{ slice::IterOnes, }; -use crate::{arch::boot::smp::get_num_processors, cpu}; +use crate::arch::{self, boot::smp::get_num_processors}; /// The number of CPUs. Zero means uninitialized. static NUM_CPUS: AtomicU32 = AtomicU32::new(0); @@ -47,7 +47,7 @@ pub fn num_cpus() -> u32 { pub fn this_cpu() -> u32 { // SAFETY: the cpu ID is stored at the beginning of the cpu local area, provided // by the linker script. - unsafe { (cpu::local::get_base() as usize as *mut u32).read() } + unsafe { (arch::cpu::local::get_base() as usize as *mut u32).read() } } /// A subset of all CPUs in the system. diff --git a/ostd/src/lib.rs b/ostd/src/lib.rs index eee80a8a..f99bb07e 100644 --- a/ostd/src/lib.rs +++ b/ostd/src/lib.rs @@ -11,6 +11,7 @@ #![feature(generic_const_exprs)] #![feature(iter_from_coroutine)] #![feature(let_chains)] +#![feature(min_specialization)] #![feature(negative_impls)] #![feature(new_uninit)] #![feature(panic_info_message)] @@ -46,7 +47,9 @@ pub mod user; pub use ostd_macros::main; pub use ostd_pod::Pod; -pub use self::{cpu::cpu_local::CpuLocal, error::Error, prelude::Result}; +pub use self::{error::Error, prelude::Result}; +// [`CpuLocalCell`] is easy to be mis-used, so we don't expose it to the users. +pub(crate) use crate::cpu::local::cpu_local_cell; /// Initializes OSTD. /// @@ -64,7 +67,7 @@ pub fn init() { arch::check_tdx_init(); // SAFETY: This function is called only once and only on the BSP. - unsafe { cpu::cpu_local::early_init_bsp_local_base() }; + unsafe { cpu::local::early_init_bsp_local_base() }; mm::heap_allocator::init(); diff --git a/ostd/src/task/processor.rs b/ostd/src/task/processor.rs index 71acd5ff..9dc72c2e 100644 --- a/ostd/src/task/processor.rs +++ b/ostd/src/task/processor.rs @@ -8,7 +8,7 @@ use super::{ task::{context_switch, TaskContext}, Task, TaskStatus, }; -use crate::{arch, cpu_local}; +use crate::{cpu::local::PREEMPT_LOCK_COUNT, cpu_local}; pub struct Processor { current: Option>, @@ -91,10 +91,11 @@ pub fn preempt(task: &Arc) { /// /// before context switch, current task will switch to the next task fn switch_to_task(next_task: Arc) { - if !PREEMPT_COUNT.is_preemptive() { + let preemt_lock_count = PREEMPT_LOCK_COUNT.load(); + if preemt_lock_count != 0 { panic!( "Calling schedule() while holding {} locks", - PREEMPT_COUNT.num_locks() + preemt_lock_count ); } @@ -151,53 +152,6 @@ fn switch_to_task(next_task: Arc) { // to the next task switching. } -static PREEMPT_COUNT: PreemptInfo = PreemptInfo::new(); - -/// Currently, it only holds the number of preemption locks held by the -/// current CPU. When it has a non-zero value, the CPU cannot call -/// [`schedule()`]. -/// -/// For per-CPU preemption lock count, we cannot afford two non-atomic -/// operations to increment and decrement the count. 
The [`crate::cpu_local`] -/// implementation is free to read the base register and then calculate the -/// address of the per-CPU variable using an additional instruction. Interrupts -/// can happen between the address calculation and modification to that -/// address. If the task is preempted to another CPU by this interrupt, the -/// count of the original CPU will be mistakenly modified. To avoid this, we -/// introduce [`crate::arch::cpu::local::preempt_lock_count`]. For x86_64 we -/// can implement this using one instruction. In other less expressive -/// architectures, we may need to disable interrupts. -/// -/// Also, the preemption count is reserved in the `.cpu_local` section -/// specified in the linker script. The reason is that we need to access the -/// preemption count before we can copy the section for application processors. -/// So, the preemption count is not copied from bootstrap processor's section -/// as the initialization. Instead it is initialized to zero for application -/// processors. -struct PreemptInfo {} - -impl PreemptInfo { - const fn new() -> Self { - Self {} - } - - fn increase_num_locks(&self) { - arch::cpu::local::preempt_lock_count::inc(); - } - - fn decrease_num_locks(&self) { - arch::cpu::local::preempt_lock_count::dec(); - } - - fn is_preemptive(&self) -> bool { - arch::cpu::local::preempt_lock_count::get() == 0 - } - - fn num_locks(&self) -> usize { - arch::cpu::local::preempt_lock_count::get() as usize - } -} - /// A guard for disable preempt. #[clippy::has_significant_drop] #[must_use] @@ -210,7 +164,7 @@ impl !Send for DisablePreemptGuard {} impl DisablePreemptGuard { fn new() -> Self { - PREEMPT_COUNT.increase_num_locks(); + PREEMPT_LOCK_COUNT.add_assign(1); Self { _private: () } } @@ -223,7 +177,7 @@ impl DisablePreemptGuard { impl Drop for DisablePreemptGuard { fn drop(&mut self) { - PREEMPT_COUNT.decrease_num_locks(); + PREEMPT_LOCK_COUNT.sub_assign(1); } } diff --git a/ostd/src/trap/handler.rs b/ostd/src/trap/handler.rs index d61bd2b2..3f359f70 100644 --- a/ostd/src/trap/handler.rs +++ b/ostd/src/trap/handler.rs @@ -1,17 +1,15 @@ // SPDX-License-Identifier: MPL-2.0 -use core::sync::atomic::{AtomicBool, Ordering}; - use trapframe::TrapFrame; -use crate::{arch::irq::IRQ_LIST, cpu_local}; +use crate::{arch::irq::IRQ_LIST, cpu_local_cell}; pub(crate) fn call_irq_callback_functions(trap_frame: &TrapFrame, irq_number: usize) { // For x86 CPUs, interrupts are not re-entrant. Local interrupts will be disabled when // an interrupt handler is called (Unless interrupts are re-enabled in an interrupt handler). // // FIXME: For arch that supports re-entrant interrupts, we may need to record nested level here. - IN_INTERRUPT_CONTEXT.store(true, Ordering::Release); + IN_INTERRUPT_CONTEXT.store(true); let irq_line = IRQ_LIST.get().unwrap().get(irq_number).unwrap(); let callback_functions = irq_line.callback_list(); @@ -22,20 +20,17 @@ pub(crate) fn call_irq_callback_functions(trap_frame: &TrapFrame, irq_number: us crate::arch::interrupts_ack(irq_number); - IN_INTERRUPT_CONTEXT.store(false, Ordering::Release); - crate::arch::irq::enable_local(); crate::trap::softirq::process_pending(); + + IN_INTERRUPT_CONTEXT.store(false); } -cpu_local! { - static IN_INTERRUPT_CONTEXT: AtomicBool = AtomicBool::new(false); +cpu_local_cell! { + static IN_INTERRUPT_CONTEXT: bool = false; } /// Returns whether we are in the interrupt context. -/// -/// FIXME: Here only hardware irq is taken into account. 
According to linux implementation, if -/// we are in softirq context, or bottom half is disabled, this function also returns true. pub fn in_interrupt_context() -> bool { - IN_INTERRUPT_CONTEXT.load(Ordering::Acquire) + IN_INTERRUPT_CONTEXT.load() } diff --git a/ostd/src/trap/softirq.rs b/ostd/src/trap/softirq.rs index df08a08d..3d9a136c 100644 --- a/ostd/src/trap/softirq.rs +++ b/ostd/src/trap/softirq.rs @@ -2,14 +2,12 @@ //! Software interrupt. -#![allow(unused_variables)] - use alloc::boxed::Box; -use core::sync::atomic::{AtomicBool, AtomicU8, Ordering}; +use core::sync::atomic::{AtomicU8, Ordering}; use spin::Once; -use crate::{cpu_local, task::disable_preempt}; +use crate::{cpu_local_cell, task::disable_preempt}; /// A representation of a software interrupt (softirq) line. /// @@ -70,7 +68,7 @@ impl SoftIrqLine { /// /// If this line is not enabled yet, the method has no effect. pub fn raise(&self) { - PENDING_MASK.fetch_or(1 << self.id, Ordering::Release); + PENDING_MASK.bitor_assign(1 << self.id); } /// Enables a softirq line by registering its callback. @@ -105,24 +103,24 @@ pub(super) fn init() { static ENABLED_MASK: AtomicU8 = AtomicU8::new(0); -cpu_local! { - static PENDING_MASK: AtomicU8 = AtomicU8::new(0); - static IS_ENABLED: AtomicBool = AtomicBool::new(true); +cpu_local_cell! { + static PENDING_MASK: u8 = 0; + static IS_ENABLED: bool = true; } /// Enables softirq in current processor. fn enable_softirq_local() { - IS_ENABLED.store(true, Ordering::Release); + IS_ENABLED.store(true); } /// Disables softirq in current processor. fn disable_softirq_local() { - IS_ENABLED.store(false, Ordering::Release); + IS_ENABLED.store(false); } /// Checks whether the softirq is enabled in current processor. fn is_softirq_enabled() -> bool { - IS_ENABLED.load(Ordering::Acquire) + IS_ENABLED.load() } /// Processes pending softirqs. @@ -136,12 +134,13 @@ pub(crate) fn process_pending() { return; } - let preempt_guard = disable_preempt(); + let _preempt_guard = disable_preempt(); disable_softirq_local(); - for i in 0..SOFTIRQ_RUN_TIMES { + for _i in 0..SOFTIRQ_RUN_TIMES { let mut action_mask = { - let pending_mask = PENDING_MASK.fetch_and(0, Ordering::Acquire); + let pending_mask = PENDING_MASK.load(); + PENDING_MASK.store(0); pending_mask & ENABLED_MASK.load(Ordering::Acquire) };
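
As a usage illustration (not part of the patch): the sketch below shows how the `cpu_local_cell!` accessors introduced above are meant to be used from OSTD-internal code. `IRQ_COUNT`, `on_irq`, and `irq_count_snapshot` are hypothetical names chosen for this example only. On x86-64 the increment lowers to the single `add fs:[offset], 1` instruction defined in `arch/x86/cpu/local.rs`; accesses that must observe the same CPU consistently still disable local IRQs, as the new docs recommend.

```rust
// A minimal sketch, assuming code inside the OSTD crate (the macro is
// `pub(crate)` and not exported to users). All names here are hypothetical.
use crate::cpu_local_cell;

cpu_local_cell! {
    /// A hypothetical per-CPU interrupt counter.
    static IRQ_COUNT: u32 = 0;
}

fn on_irq() {
    // The read-modify-write is a single instruction on x86-64, so it cannot
    // be split by an interrupt; whichever CPU executes this line updates its
    // own counter without an IRQ or preemption guard.
    IRQ_COUNT.add_assign(1);
}

fn irq_count_snapshot() -> u32 {
    // Two consecutive accesses are only guaranteed to hit the same CPU's
    // counter while local IRQs (and hence timer-driven preemption on this
    // CPU) are disabled.
    let _guard = crate::trap::disable_local();
    IRQ_COUNT.load()
}
```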