diff --git a/ostd/src/task/kernel_stack.rs b/ostd/src/task/kernel_stack.rs
index 4e6bc8dcb..ea2ea8097 100644
--- a/ostd/src/task/kernel_stack.rs
+++ b/ostd/src/task/kernel_stack.rs
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: MPL-2.0
 
+use core::sync::atomic::Ordering;
+
 use crate::{
+    arch::mm::tlb_flush_addr_range,
+    cpu::{AtomicCpuSet, CpuSet, PinCurrentCpu},
     impl_frame_meta_for,
     mm::{
         kspace::kvirt_area::{KVirtArea, Tracked},
@@ -8,6 +12,7 @@ use crate::{
         FrameAllocOptions, PAGE_SIZE,
     },
     prelude::*,
+    trap::DisabledLocalIrqGuard,
 };
 
 /// The kernel stack size of a task, specified in pages.
@@ -30,6 +35,7 @@ pub static KERNEL_STACK_SIZE: usize = STACK_SIZE_IN_PAGES as usize * PAGE_SIZE;
 #[expect(dead_code)]
 pub struct KernelStack {
     kvirt_area: KVirtArea<Tracked>,
+    tlb_coherent: AtomicCpuSet,
     end_vaddr: Vaddr,
     has_guard_page: bool,
 }
@@ -41,7 +47,13 @@ impl_frame_meta_for!(KernelStackMeta);
 
 impl KernelStack {
     /// Generates a kernel stack with guard pages.
-    /// 4 additional pages are allocated and regarded as guard pages, which should not be accessed.
+    ///
+    /// 4 additional pages are allocated and regarded as guard pages, which
+    /// should not be accessed.
+    //
+    // TODO: We map kernel stacks in the kernel virtual areas, which incurs
+    // non-negligible TLB and mapping overhead on task creation. This could
+    // be improved by caching/reusing kernel stacks with a pool.
     pub fn new_with_guard_page() -> Result<Self> {
         let mut new_kvirt_area = KVirtArea::<Tracked>::new(KERNEL_STACK_SIZE + 4 * PAGE_SIZE);
         let mapped_start = new_kvirt_area.range().start + 2 * PAGE_SIZE;
@@ -58,11 +70,21 @@ impl KernelStack {
 
         Ok(Self {
             kvirt_area: new_kvirt_area,
+            tlb_coherent: AtomicCpuSet::new(CpuSet::new_empty()),
             end_vaddr: mapped_end,
             has_guard_page: true,
         })
     }
 
+    /// Flushes the TLB for the current CPU if necessary.
+    pub(super) fn flush_tlb(&self, irq_guard: &DisabledLocalIrqGuard) {
+        let cur_cpu = irq_guard.current_cpu();
+        if !self.tlb_coherent.contains(cur_cpu, Ordering::Relaxed) {
+            tlb_flush_addr_range(&self.kvirt_area.range());
+            self.tlb_coherent.add(cur_cpu, Ordering::Relaxed);
+        }
+    }
+
     pub fn end_vaddr(&self) -> Vaddr {
         self.end_vaddr
     }
diff --git a/ostd/src/task/mod.rs b/ostd/src/task/mod.rs
index f8ef37066..4f210b5ce 100644
--- a/ostd/src/task/mod.rs
+++ b/ostd/src/task/mod.rs
@@ -52,7 +52,6 @@ pub struct Task {
     user_ctx: Option<Arc<UserContext>>,
     ctx: SyncUnsafeCell<TaskContext>,
     /// kernel stack, note that the top is SyscallFrame/TrapFrame
-    #[expect(dead_code)]
     kstack: KernelStack,
 
     schedule_info: TaskScheduleInfo,
diff --git a/ostd/src/task/processor.rs b/ostd/src/task/processor.rs
index e2452f498..fd974ba1f 100644
--- a/ostd/src/task/processor.rs
+++ b/ostd/src/task/processor.rs
@@ -67,6 +67,8 @@ pub(super) fn switch_to_task(next_task: Arc<Task>) {
     // may be unmapped, leading to instant failure.
     let old_prev = PREVIOUS_TASK_PTR.load();
     PREVIOUS_TASK_PTR.store(current_task_ptr);
+
+    next_task.kstack.flush_tlb(&irq_guard);
     CURRENT_TASK_PTR.store(Arc::into_raw(next_task));
 
     if let Some(handler) = POST_SCHEDULE_HANDLER.get() {
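
For reference, below is a minimal, standalone sketch of the lazy per-CPU TLB-flush pattern the patch introduces: each kernel stack records the set of CPUs whose TLBs are already coherent with its mapping, and a flush is performed only the first time the stack is switched to on a given CPU. This is not the OSTD code: AtomicCpuSet, tlb_flush_addr_range, and DisabledLocalIrqGuard are OSTD-internal, so the sketch substitutes a plain AtomicUsize bitmask (covering at most usize::BITS CPUs) and a stubbed flush; names such as LazyTlbRange and flush_tlb_stub are illustrative only.

// Standalone sketch of lazy per-CPU TLB coherence tracking (not OSTD code).
use std::ops::Range;
use std::sync::atomic::{AtomicUsize, Ordering};

/// A virtual-address range plus a bitmask of CPUs whose TLBs are known to be
/// coherent with the current mapping of that range.
struct LazyTlbRange {
    range: Range<usize>,
    tlb_coherent: AtomicUsize, // bit i set means CPU i has already flushed
}

impl LazyTlbRange {
    fn new(range: Range<usize>) -> Self {
        // Freshly mapped: no CPU is known to be coherent yet.
        Self { range, tlb_coherent: AtomicUsize::new(0) }
    }

    /// Called right before the range is used on `cur_cpu`; the caller is
    /// assumed to have IRQs disabled, mirroring DisabledLocalIrqGuard.
    fn flush_tlb_if_needed(&self, cur_cpu: usize) {
        let bit = 1 << cur_cpu;
        if self.tlb_coherent.load(Ordering::Relaxed) & bit == 0 {
            flush_tlb_stub(&self.range);
            // Remember that this CPU no longer needs a flush for this range.
            self.tlb_coherent.fetch_or(bit, Ordering::Relaxed);
        }
    }
}

/// Stand-in for an architecture-specific flush such as tlb_flush_addr_range.
fn flush_tlb_stub(range: &Range<usize>) {
    println!("flushing TLB for {:#x}..{:#x}", range.start, range.end);
}

fn main() {
    let stack = LazyTlbRange::new(0xffff_8000_0000_0000..0xffff_8000_0001_0000);
    stack.flush_tlb_if_needed(0); // first use on CPU 0: flushes
    stack.flush_tlb_if_needed(0); // already coherent on CPU 0: no flush
}

In the patch itself, a given CPU's bit is only checked and set on that CPU while local IRQs are disabled, which is presumably why Relaxed ordering is sufficient; the flush is a purely local operation, so no cross-CPU synchronization is needed.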