feat(ebpf):[WIP] add eBPF support (#948)

* feat(kprobe): Add basic kprobe support for x86_64

* feat: add ebpf support (#912)

- 实现bpf()一部分命令,包括几种基本map,相关的helper函数
- 实现部分perf相关的数据结构
- 暂时为文件实现简单mmap
- 实现一个使用kprobe统计syscall 调用次数的ebpf程序

对eBPF支持程度(基本):

- 简单的eBPF程序(没有指定特殊的Map)
- 使用内核已经实现的Map的eBPF程序
- 可以和kprobe配合使用
- 内核Map相关的接口定义已经实现,添加新的Map较为简单

不支持的功能:
- 区分不同的eBPF程序类型(Network/Cgroup)并限定可调用的helper函数集
- 与内核其它跟踪机制配合(tracepoint)
- 其它helper和Map


todo

- [ ]  修改mmap,需要讨论,因为这个和块缓存层相关
- [x]  添加文档
- [x]  修复可能的错误
- [x] 增加rbpf版本信息

* feat: add /sys/devices/system/cpu/possible file

* feat: add /sys/devices/system/cpu/online
This commit is contained in:
linfeng
2024-10-25 15:59:57 +08:00
committed by GitHub
parent 80c9e8f8f0
commit fae6e9ade4
126 changed files with 29529 additions and 62 deletions

View File

@ -3,12 +3,12 @@
//! 架构相关的处理逻辑参考: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/riscv/kernel/traps.c
use core::hint::spin_loop;
use log::error;
use log::{error, trace};
use system_error::SystemError;
use crate::{arch::syscall::syscall_handler, driver::irqchip::riscv_intc::riscv_intc_irq};
use super::TrapFrame;
use crate::exception::ebreak::EBreak;
use crate::{arch::syscall::syscall_handler, driver::irqchip::riscv_intc::riscv_intc_irq};
type ExceptionHandler = fn(&mut TrapFrame) -> Result<(), SystemError>;
@ -93,11 +93,10 @@ fn do_trap_insn_illegal(_trap_frame: &mut TrapFrame) -> Result<(), SystemError>
}
/// 处理断点异常 #3
fn do_trap_break(_trap_frame: &mut TrapFrame) -> Result<(), SystemError> {
error!("riscv64_do_irq: do_trap_break");
loop {
spin_loop();
}
fn do_trap_break(trap_frame: &mut TrapFrame) -> Result<(), SystemError> {
trace!("riscv64_do_irq: do_trap_break");
// handle breakpoint
EBreak::handle(trap_frame)
}
/// 处理加载地址不对齐异常 #4

View File

@ -1,3 +1,5 @@
use core::any::Any;
use kprobe::ProbeArgs;
use riscv::register::{scause::Scause, sstatus::Sstatus};
use system_error::SystemError;
@ -160,4 +162,21 @@ impl TrapFrame {
pub fn set_return_value(&mut self, value: usize) {
self.a0 = value;
}
/// 设置当前的程序计数器
pub fn set_pc(&mut self, pc: usize) {
self.epc = pc;
}
}
impl ProbeArgs for TrapFrame {
fn as_any(&self) -> &dyn Any {
self
}
fn break_address(&self) -> usize {
self.epc
}
fn debug_address(&self) -> usize {
self.epc
}
}

View File

@ -0,0 +1,85 @@
use crate::arch::interrupt::TrapFrame;
/// Redirect execution to `step_addr` as the "single-step" setup.
///
/// NOTE(review): unlike the x86 version there is no hardware trap flag set
/// here — presumably the riscv kprobe path single-steps by executing a copied
/// instruction followed by another ebreak; confirm against the kprobe crate.
pub fn setup_single_step(trap_frame: &mut TrapFrame, step_addr: usize) {
    trap_frame.set_pc(step_addr);
}

/// Finish the emulated single step by pointing the pc back at `return_addr`.
pub fn clear_single_step(trap_frame: &mut TrapFrame, return_addr: usize) {
    trap_frame.set_pc(return_addr);
}
/// Snapshot of the RISC-V 64 general-purpose register file at a probe hit.
///
/// `#[repr(C)]` keeps a stable, C-compatible layout so the struct can be
/// handed to eBPF programs as their probe context.
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct KProbeContext {
    pub pc: usize, // program counter (taken from TrapFrame::epc)
    pub ra: usize, // return address
    pub sp: usize, // stack pointer
    pub gp: usize, // global pointer
    pub tp: usize, // thread pointer
    // temporaries t0-t2
    pub t0: usize,
    pub t1: usize,
    pub t2: usize,
    // callee-saved s0-s1
    pub s0: usize,
    pub s1: usize,
    // argument / return registers a0-a7
    pub a0: usize,
    pub a1: usize,
    pub a2: usize,
    pub a3: usize,
    pub a4: usize,
    pub a5: usize,
    pub a6: usize,
    pub a7: usize,
    // callee-saved s2-s11
    pub s2: usize,
    pub s3: usize,
    pub s4: usize,
    pub s5: usize,
    pub s6: usize,
    pub s7: usize,
    pub s8: usize,
    pub s9: usize,
    pub s10: usize,
    pub s11: usize,
    // temporaries t3-t6
    pub t3: usize,
    pub t4: usize,
    pub t5: usize,
    pub t6: usize,
}
// Build a KProbeContext by copying every general-purpose register out of the
// trap frame; `pc` is sourced from `epc`, the address of the probed
// instruction.
impl From<&TrapFrame> for KProbeContext {
    fn from(trap_frame: &TrapFrame) -> Self {
        Self {
            pc: trap_frame.epc, // epc holds the pc at the trap site
            ra: trap_frame.ra,
            sp: trap_frame.sp,
            gp: trap_frame.gp,
            tp: trap_frame.tp,
            t0: trap_frame.t0,
            t1: trap_frame.t1,
            t2: trap_frame.t2,
            s0: trap_frame.s0,
            s1: trap_frame.s1,
            a0: trap_frame.a0,
            a1: trap_frame.a1,
            a2: trap_frame.a2,
            a3: trap_frame.a3,
            a4: trap_frame.a4,
            a5: trap_frame.a5,
            a6: trap_frame.a6,
            a7: trap_frame.a7,
            s2: trap_frame.s2,
            s3: trap_frame.s3,
            s4: trap_frame.s4,
            s5: trap_frame.s5,
            s6: trap_frame.s6,
            s7: trap_frame.s7,
            s8: trap_frame.s8,
            s9: trap_frame.s9,
            s10: trap_frame.s10,
            s11: trap_frame.s11,
            t3: trap_frame.t3,
            t4: trap_frame.t4,
            t5: trap_frame.t5,
            t6: trap_frame.t6,
        }
    }
}

View File

@ -5,6 +5,7 @@ pub mod elf;
pub mod init;
pub mod interrupt;
pub mod ipc;
pub mod kprobe;
mod kvm;
pub mod mm;
pub mod msi;

View File

@ -4,11 +4,12 @@ pub mod ipi;
pub mod msi;
pub mod trap;
use core::any::Any;
use core::{
arch::asm,
sync::atomic::{compiler_fence, Ordering},
};
use kprobe::ProbeArgs;
use log::error;
use system_error::SystemError;
@ -177,4 +178,21 @@ impl TrapFrame {
pub fn is_from_user(&self) -> bool {
return (self.cs & 0x3) != 0;
}
/// 设置当前的程序计数器
pub fn set_pc(&mut self, pc: usize) {
self.rip = pc as u64;
}
}
impl ProbeArgs for TrapFrame {
fn as_any(&self) -> &dyn Any {
self
}
fn break_address(&self) -> usize {
(self.rip - 1) as usize
}
fn debug_address(&self) -> usize {
self.rip as usize
}
}

View File

@ -1,6 +1,12 @@
use log::{error, warn};
use log::{error, trace, warn};
use system_error::SystemError;
use super::{
entry::{set_intr_gate, set_system_trap_gate},
TrapFrame,
};
use crate::exception::debug::DebugException;
use crate::exception::ebreak::EBreak;
use crate::{
arch::{CurrentIrqArch, MMArch},
exception::InterruptArch,
@ -9,11 +15,6 @@ use crate::{
smp::core::smp_get_processor_id,
};
use super::{
entry::{set_intr_gate, set_system_trap_gate},
TrapFrame,
};
extern "C" {
fn trap_divide_error();
fn trap_debug();
@ -125,8 +126,8 @@ unsafe extern "C" fn do_divide_error(regs: &'static TrapFrame, error_code: u64)
/// 处理调试异常 1 #DB
#[no_mangle]
unsafe extern "C" fn do_debug(regs: &'static TrapFrame, error_code: u64) {
error!(
unsafe extern "C" fn do_debug(regs: &'static mut TrapFrame, error_code: u64) {
trace!(
"do_debug(1), \tError code: {:#x},\trsp: {:#x},\trip: {:#x},\t CPU: {}, \tpid: {:?}",
error_code,
regs.rsp,
@ -134,7 +135,7 @@ unsafe extern "C" fn do_debug(regs: &'static TrapFrame, error_code: u64) {
smp_get_processor_id().data(),
ProcessManager::current_pid()
);
panic!("Debug Exception");
DebugException::handle(regs).unwrap();
}
/// 处理NMI中断 2 NMI
@ -153,8 +154,8 @@ unsafe extern "C" fn do_nmi(regs: &'static TrapFrame, error_code: u64) {
/// 处理断点异常 3 #BP
#[no_mangle]
unsafe extern "C" fn do_int3(regs: &'static TrapFrame, error_code: u64) {
error!(
unsafe extern "C" fn do_int3(regs: &'static mut TrapFrame, error_code: u64) {
trace!(
"do_int3(3), \tError code: {:#x},\trsp: {:#x},\trip: {:#x},\t CPU: {}, \tpid: {:?}",
error_code,
regs.rsp,
@ -162,7 +163,7 @@ unsafe extern "C" fn do_int3(regs: &'static TrapFrame, error_code: u64) {
smp_get_processor_id().data(),
ProcessManager::current_pid()
);
panic!("Int3");
EBreak::handle(regs).unwrap();
}
/// 处理溢出异常 4 #OF

View File

@ -0,0 +1,65 @@
use crate::arch::interrupt::TrapFrame;
/// x86_64 Trap Flag (TF) bit in RFLAGS; while set, the CPU raises a #DB
/// exception after every executed instruction.
const RFLAGS_TF: u64 = 0x100;

/// Arm hardware single-stepping and jump to `step_addr`.
///
/// Sets RFLAGS.TF so the very next instruction (the relocated probed
/// instruction at `step_addr`) traps into the debug handler.
pub fn setup_single_step(frame: &mut TrapFrame, step_addr: usize) {
    frame.rflags |= RFLAGS_TF;
    frame.set_pc(step_addr);
}

/// Disarm single-stepping and resume normal execution at `return_addr`.
pub fn clear_single_step(frame: &mut TrapFrame, return_addr: usize) {
    frame.rflags &= !RFLAGS_TF;
    frame.set_pc(return_addr);
}
/// Snapshot of the x86_64 register state at a probe hit, exposed to eBPF
/// programs as their probe context.
///
/// NOTE(review): field order appears to mirror the Linux x86_64 `pt_regs`
/// layout (r15 first, ss last) — confirm against the uapi bindings before
/// relying on byte-for-byte compatibility.
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct KProbeContext {
    pub r15: ::core::ffi::c_ulong,
    pub r14: ::core::ffi::c_ulong,
    pub r13: ::core::ffi::c_ulong,
    pub r12: ::core::ffi::c_ulong,
    pub rbp: ::core::ffi::c_ulong,
    pub rbx: ::core::ffi::c_ulong,
    pub r11: ::core::ffi::c_ulong,
    pub r10: ::core::ffi::c_ulong,
    pub r9: ::core::ffi::c_ulong,
    pub r8: ::core::ffi::c_ulong,
    pub rax: ::core::ffi::c_ulong,
    pub rcx: ::core::ffi::c_ulong,
    pub rdx: ::core::ffi::c_ulong,
    pub rsi: ::core::ffi::c_ulong,
    pub rdi: ::core::ffi::c_ulong,
    pub orig_rax: ::core::ffi::c_ulong, // syscall-entry rax; see From impl below
    pub rip: ::core::ffi::c_ulong,
    pub cs: ::core::ffi::c_ulong,
    pub eflags: ::core::ffi::c_ulong,
    pub rsp: ::core::ffi::c_ulong,
    pub ss: ::core::ffi::c_ulong,
}
// Copy the trap frame's registers into the probe context. `eflags` is filled
// from `rflags`; `orig_rax` is hard-coded to 0 because the trap frame does
// not carry the pre-syscall rax value.
impl From<&TrapFrame> for KProbeContext {
    fn from(trap_frame: &TrapFrame) -> Self {
        Self {
            r15: trap_frame.r15,
            r14: trap_frame.r14,
            r13: trap_frame.r13,
            r12: trap_frame.r12,
            rbp: trap_frame.rbp,
            rbx: trap_frame.rbx,
            r11: trap_frame.r11,
            r10: trap_frame.r10,
            r9: trap_frame.r9,
            r8: trap_frame.r8,
            rax: trap_frame.rax,
            rcx: trap_frame.rcx,
            rdx: trap_frame.rdx,
            rsi: trap_frame.rsi,
            rdi: trap_frame.rdi,
            orig_rax: 0, // not preserved by TrapFrame
            rip: trap_frame.rip,
            cs: trap_frame.cs,
            eflags: trap_frame.rflags,
            rsp: trap_frame.rsp,
            ss: trap_frame.ss,
        }
    }
}

View File

@ -8,6 +8,7 @@ pub mod fpu;
pub mod init;
pub mod interrupt;
pub mod ipc;
pub mod kprobe;
pub mod kvm;
pub mod libs;
pub mod mm;

View File

@ -0,0 +1,11 @@
// BPF helper-function IDs. The values match the Linux UAPI helper numbering
// (see bpf-helpers(7)); loaders emit calls by ID, so these must stay in sync
// with the toolchain.
pub const HELPER_MAP_LOOKUP_ELEM: u32 = 1; // bpf_map_lookup_elem
pub const HELPER_MAP_UPDATE_ELEM: u32 = 2; // bpf_map_update_elem
pub const HELPER_MAP_DELETE_ELEM: u32 = 3; // bpf_map_delete_elem
pub const HELPER_MAP_FOR_EACH_ELEM: u32 = 164; // bpf_for_each_map_elem
pub const HELPER_MAP_LOOKUP_PERCPU_ELEM: u32 = 195; // bpf_map_lookup_percpu_elem
pub const HELPER_PERF_EVENT_OUTPUT: u32 = 25; // bpf_perf_event_output
pub const HELPER_BPF_PROBE_READ: u32 = 4; // bpf_probe_read
pub const HELPER_TRACE_PRINTF: u32 = 6; // bpf_trace_printk
pub const HELPER_MAP_PUSH_ELEM: u32 = 87; // bpf_map_push_elem
pub const HELPER_MAP_POP_ELEM: u32 = 88; // bpf_map_pop_elem
pub const HELPER_MAP_PEEK_ELEM: u32 = 89; // bpf_map_peek_elem

View File

@ -0,0 +1,340 @@
mod consts;
mod print;
use crate::bpf::helper::print::trace_printf;
use crate::bpf::map::{BpfCallBackFn, BpfMap};
use crate::include::bindings::linux_bpf::BPF_F_CURRENT_CPU;
use crate::libs::lazy_init::Lazy;
use crate::smp::core::smp_get_processor_id;
use alloc::{collections::BTreeMap, sync::Arc};
use core::ffi::c_void;
use system_error::SystemError;
type RawBPFHelperFn = fn(u64, u64, u64, u64, u64) -> u64;
type Result<T> = core::result::Result<T, SystemError>;
/// Cast a helper function to the uniform `fn(u64,u64,u64,u64,u64) -> u64`
/// ABI stored in the helper table.
///
/// SAFETY (for users of this macro): the BPF runtime always calls the result
/// with five u64 arguments, so `$name` must accept at most five word-sized
/// parameters and return a word-sized value, or the call is UB.
macro_rules! define_func {
    ($name:ident) => {
        core::mem::transmute::<usize, RawBPFHelperFn>($name as usize)
    };
}
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_lookup_elem/
/// Raw `bpf_map_lookup_elem(map, key)` entry point: returns a value pointer
/// or NULL.
///
/// SAFETY: `map` must have been produced by `Arc::into_raw` on an
/// `Arc<BpfMap>`, and `key` must point to at least `key_size` readable bytes.
/// The Arc is rebuilt only to borrow the map; `Arc::into_raw` at the end
/// returns the refcount so the caller's reference stays alive.
unsafe fn raw_map_lookup_elem(map: *mut c_void, key: *const c_void) -> *const c_void {
    let map = Arc::from_raw(map as *const BpfMap);
    let key_size = map.key_size();
    let key = core::slice::from_raw_parts(key as *const u8, key_size);
    let value = map_lookup_elem(&map, key);
    // log::info!("<raw_map_lookup_elem>: {:x?}", value);
    // warning: We need to keep the map alive, so we don't drop it here.
    let _ = Arc::into_raw(map);
    // NOTE(review): the returned pointer refers to map-internal storage after
    // the inner lock is released; a concurrent update could invalidate it —
    // verify the locking strategy.
    match value {
        // Both a miss and an internal error surface as NULL, per the helper
        // contract.
        Ok(Some(value)) => value as *const c_void,
        _ => core::ptr::null_mut(),
    }
}
/// Look up `key` in `map`, returning a raw pointer to the stored value.
///
/// Internal errors are collapsed into `Ok(None)`, matching the
/// "NULL on miss" contract of the BPF helper.
pub fn map_lookup_elem(map: &Arc<BpfMap>, key: &[u8]) -> Result<Option<*const u8>> {
    let mut guard = map.inner_map().lock();
    let ptr = guard.lookup_elem(key).ok().flatten().map(|v| v.as_ptr());
    Ok(ptr)
}
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_perf_event_output/
///
/// See https://man7.org/linux/man-pages/man7/bpf-helpers.7.html
/// Raw `bpf_perf_event_output(ctx, map, flags, data, size)` entry point.
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>` and
/// `data` must point to `size` readable bytes. The Arc's refcount is given
/// back via `into_raw` before returning.
unsafe fn raw_perf_event_output(
    ctx: *mut c_void,
    map: *mut c_void,
    flags: u64,
    data: *mut c_void,
    size: u64,
) -> i64 {
    // log::info!("<raw_perf_event_output>: {:x?}", data);
    let map = Arc::from_raw(map as *const BpfMap);
    let data = core::slice::from_raw_parts(data as *const u8, size as usize);
    let res = perf_event_output(ctx, &map, flags, data);
    // warning: We need to keep the map alive, so we don't drop it here.
    let _ = Arc::into_raw(map);
    match res {
        Ok(_) => 0,
        // SystemError values are returned directly as the helper's error code.
        Err(e) => e as i64,
    }
}
/// Shared implementation of `bpf_perf_event_output`.
///
/// `flags` packs two fields: the low 32 bits select the perf-event-array
/// index (or `BPF_F_CURRENT_CPU` for "this CPU"), the high 32 bits are
/// forwarded as helper flags.
///
/// # Errors
/// `ENOENT` if the selected slot holds no perf-event fd; `EINVAL` if the
/// stored slot is not a 4-byte fd.
pub fn perf_event_output(
    ctx: *mut c_void,
    map: &Arc<BpfMap>,
    flags: u64,
    data: &[u8],
) -> Result<()> {
    let mut binding = map.inner_map().lock();
    let index = flags as u32;
    let flags = (flags >> 32) as u32;
    // Resolve the target ring: explicit index, or the current CPU id.
    let key = if index == BPF_F_CURRENT_CPU as u32 {
        smp_get_processor_id().data()
    } else {
        index
    };
    // The perf-event-array map stores one perf-event fd per slot.
    let fd = binding
        .lookup_elem(&key.to_ne_bytes())?
        .ok_or(SystemError::ENOENT)?;
    let fd = u32::from_ne_bytes(fd.try_into().map_err(|_| SystemError::EINVAL)?);
    crate::perf::perf_event_output(ctx, fd as usize, flags, data)?;
    Ok(())
}
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_probe_read/
/// Raw `bpf_probe_read(dst, size, unsafe_ptr)` entry point.
///
/// NOTE(review): this function dereferences both raw pointers but is not
/// marked `unsafe` and performs no address validation — an invalid
/// `unsafe_ptr` from a buggy/verifier-bypassing program will fault the
/// kernel. Consider validating the range before copying.
fn raw_bpf_probe_read(dst: *mut c_void, size: u32, unsafe_ptr: *const c_void) -> i64 {
    log::info!(
        "raw_bpf_probe_read, dst:{:x}, size:{}, unsafe_ptr: {:x}",
        dst as usize,
        size,
        unsafe_ptr as usize
    );
    // Both slices are built with the same `size`, so the copy below cannot
    // panic on a length mismatch.
    let (dst, src) = unsafe {
        let dst = core::slice::from_raw_parts_mut(dst as *mut u8, size as usize);
        let src = core::slice::from_raw_parts(unsafe_ptr as *const u8, size as usize);
        (dst, src)
    };
    let res = bpf_probe_read(dst, src);
    match res {
        Ok(_) => 0,
        Err(e) => e as i64,
    }
}
/// For tracing programs, safely attempt to read size
/// bytes from kernel space address unsafe_ptr and
/// store the data in dst.
pub fn bpf_probe_read(dst: &mut [u8], src: &[u8]) -> Result<()> {
    log::info!("bpf_probe_read: len: {}", dst.len());
    // Plain memcpy; panics if the slices differ in length (callers construct
    // both from the same `size`). No fault handling is performed here.
    dst.copy_from_slice(src);
    Ok(())
}
/// Raw `bpf_map_update_elem(map, key, value, flags)` entry point.
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>`; `key`
/// and `value` must point to at least `key_size` / `value_size` readable
/// bytes respectively. The refcount is handed back via `into_raw`.
unsafe fn raw_map_update_elem(
    map: *mut c_void,
    key: *const c_void,
    value: *const c_void,
    flags: u64,
) -> i64 {
    let map = Arc::from_raw(map as *const BpfMap);
    let key_size = map.key_size();
    let value_size = map.value_size();
    // log::info!("<raw_map_update_elem>: flags: {:x?}", flags);
    let key = core::slice::from_raw_parts(key as *const u8, key_size);
    let value = core::slice::from_raw_parts(value as *const u8, value_size);
    let res = map_update_elem(&map, key, value, flags);
    let _ = Arc::into_raw(map);
    match res {
        Ok(_) => 0,
        Err(e) => e as _,
    }
}
/// Insert or overwrite `value` for `key` in `map`, forwarding `flags` to the
/// concrete map implementation.
pub fn map_update_elem(map: &Arc<BpfMap>, key: &[u8], value: &[u8], flags: u64) -> Result<()> {
    map.inner_map().lock().update_elem(key, value, flags)
}
/// Delete entry with key from map.
///
/// The delete map element helper call is used to delete values from maps.
/// Raw `bpf_map_delete_elem(map, key)` entry point.
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>` and
/// `key` must point to `key_size` readable bytes; the refcount is handed
/// back via `into_raw` before returning.
unsafe fn raw_map_delete_elem(map: *mut c_void, key: *const c_void) -> i64 {
    let map = Arc::from_raw(map as *const BpfMap);
    let key_size = map.key_size();
    let key = core::slice::from_raw_parts(key as *const u8, key_size);
    let res = map_delete_elem(&map, key);
    let _ = Arc::into_raw(map);
    match res {
        Ok(_) => 0,
        Err(e) => e as i64,
    }
}
/// Remove the entry for `key` from `map`; the concrete map decides whether a
/// missing key is an error.
pub fn map_delete_elem(map: &Arc<BpfMap>, key: &[u8]) -> Result<()> {
    map.inner_map().lock().delete_elem(key)
}
/// For each element in map, call callback_fn function with map, callback_ctx and other map-specific
/// parameters. The callback_fn should be a static function and the callback_ctx should be a pointer
/// to the stack. The flags is used to control certain aspects of the helper. Currently, the flags must
/// be 0.
///
/// The following are a list of supported map types and their respective expected callback signatures:
/// - BPF_MAP_TYPE_HASH
/// - BPF_MAP_TYPE_PERCPU_HASH
/// - BPF_MAP_TYPE_LRU_HASH
/// - BPF_MAP_TYPE_LRU_PERCPU_HASH
/// - BPF_MAP_TYPE_ARRAY
/// - BPF_MAP_TYPE_PERCPU_ARRAY
///
/// `long (*callback_fn)(struct bpf_map *map, const void key, void *value, void *ctx);`
///
/// For per_cpu maps, the map_value is the value on the cpu where the bpf_prog is running.
/// Raw `bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags)` entry
/// point.
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>`; the
/// refcount is handed back via `into_raw` before returning.
///
/// NOTE(review): `cb` is transmuted to `*const BpfCallBackFn` and then
/// *dereferenced*, i.e. it is treated as a pointer to a function pointer. If
/// callers actually pass the callback's code address itself, this reads the
/// callback's machine code as a pointer — confirm the rbpf calling
/// convention before trusting this path.
unsafe fn raw_map_for_each_elem(
    map: *mut c_void,
    cb: *const c_void,
    ctx: *const c_void,
    flags: u64,
) -> i64 {
    let map = Arc::from_raw(map as *const BpfMap);
    let cb = *core::mem::transmute::<*const c_void, *const BpfCallBackFn>(cb);
    let res = map_for_each_elem(&map, cb, ctx as _, flags);
    let _ = Arc::into_raw(map);
    match res {
        Ok(v) => v as i64,
        Err(e) => e as i64,
    }
}
/// Run `cb` over every element of `map` with user context `ctx`; returns the
/// number of elements visited (semantics delegated to the concrete map).
pub fn map_for_each_elem(
    map: &Arc<BpfMap>,
    cb: BpfCallBackFn,
    ctx: *const u8,
    flags: u64,
) -> Result<u32> {
    map.inner_map().lock().for_each_elem(cb, ctx, flags)
}
/// Perform a lookup in percpu map for an entry associated to key on cpu.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_lookup_percpu_elem/
/// Raw `bpf_map_lookup_percpu_elem(map, key, cpu)` entry point.
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>` and
/// `key` must point to `key_size` readable bytes; the refcount is handed
/// back via `into_raw` before returning.
unsafe fn raw_map_lookup_percpu_elem(
    map: *mut c_void,
    key: *const c_void,
    cpu: u32,
) -> *const c_void {
    let map = Arc::from_raw(map as *const BpfMap);
    let key_size = map.key_size();
    let key = core::slice::from_raw_parts(key as *const u8, key_size);
    let value = map_lookup_percpu_elem(&map, key, cpu);
    // warning: We need to keep the map alive, so we don't drop it here.
    let _ = Arc::into_raw(map);
    match value {
        // Miss and error both become NULL, per the helper contract.
        Ok(Some(value)) => value as *const c_void,
        _ => core::ptr::null_mut(),
    }
}
/// Look up `key` in the per-CPU slice of `map` belonging to `cpu`.
///
/// Internal errors are collapsed into `Ok(None)`, mirroring
/// [`map_lookup_elem`].
pub fn map_lookup_percpu_elem(
    map: &Arc<BpfMap>,
    key: &[u8],
    cpu: u32,
) -> Result<Option<*const u8>> {
    let mut guard = map.inner_map().lock();
    let ptr = guard
        .lookup_percpu_elem(key, cpu)
        .ok()
        .flatten()
        .map(|v| v.as_ptr());
    Ok(ptr)
}
/// Push an element value in map.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_push_elem/
/// Raw `bpf_map_push_elem(map, value, flags)` entry point (queue/stack maps).
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>` and
/// `value` must point to `value_size` readable bytes; the refcount is handed
/// back via `into_raw` before returning.
unsafe fn raw_map_push_elem(map: *mut c_void, value: *const c_void, flags: u64) -> i64 {
    let map = Arc::from_raw(map as *const BpfMap);
    let value_size = map.value_size();
    let value = core::slice::from_raw_parts(value as *const u8, value_size);
    let res = map_push_elem(&map, value, flags);
    let _ = Arc::into_raw(map);
    match res {
        Ok(_) => 0,
        Err(e) => e as i64,
    }
}
/// Push `value` onto a queue/stack map, forwarding `flags` to the map.
pub fn map_push_elem(map: &Arc<BpfMap>, value: &[u8], flags: u64) -> Result<()> {
    map.inner_map().lock().push_elem(value, flags)
}
/// Pop an element from map.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_pop_elem/
/// Raw `bpf_map_pop_elem(map, value)` entry point (queue/stack maps).
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>` and
/// `value` must point to `value_size` writable bytes; the refcount is handed
/// back via `into_raw` before returning.
unsafe fn raw_map_pop_elem(map: *mut c_void, value: *mut c_void) -> i64 {
    let map = Arc::from_raw(map as *const BpfMap);
    let value_size = map.value_size();
    let value = core::slice::from_raw_parts_mut(value as *mut u8, value_size);
    let res = map_pop_elem(&map, value);
    let _ = Arc::into_raw(map);
    match res {
        Ok(_) => 0,
        Err(e) => e as i64,
    }
}
/// Pop an element from a queue/stack map into `value`.
pub fn map_pop_elem(map: &Arc<BpfMap>, value: &mut [u8]) -> Result<()> {
    map.inner_map().lock().pop_elem(value)
}
/// Get an element from map without removing it.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_peek_elem/
/// Raw `bpf_map_peek_elem(map, value)` entry point (queue/stack maps).
///
/// SAFETY: `map` must come from `Arc::into_raw` on an `Arc<BpfMap>` and
/// `value` must point to `value_size` writable bytes; the refcount is handed
/// back via `into_raw` before returning.
unsafe fn raw_map_peek_elem(map: *mut c_void, value: *mut c_void) -> i64 {
    let map = Arc::from_raw(map as *const BpfMap);
    let value_size = map.value_size();
    let value = core::slice::from_raw_parts_mut(value as *mut u8, value_size);
    let res = map_peek_elem(&map, value);
    let _ = Arc::into_raw(map);
    match res {
        Ok(_) => 0,
        Err(e) => e as i64,
    }
}
/// Copy the head element of a queue/stack map into `value` without removing
/// it.
pub fn map_peek_elem(map: &Arc<BpfMap>, value: &mut [u8]) -> Result<()> {
    map.inner_map().lock().peek_elem(value)
}
pub static BPF_HELPER_FUN_SET: Lazy<BTreeMap<u32, RawBPFHelperFn>> = Lazy::new();
/// Initialize the helper functions.
/// Populate [`BPF_HELPER_FUN_SET`] with every implemented helper.
///
/// Must run once during kernel init, before any eBPF program executes.
pub fn init_helper_functions() {
    use consts::*;
    let mut table = BTreeMap::new();
    unsafe {
        let entries: [(u32, RawBPFHelperFn); 11] = [
            // Map helpers: generic map access
            (HELPER_MAP_LOOKUP_ELEM, define_func!(raw_map_lookup_elem)),
            (HELPER_MAP_UPDATE_ELEM, define_func!(raw_map_update_elem)),
            (HELPER_MAP_DELETE_ELEM, define_func!(raw_map_delete_elem)),
            (HELPER_MAP_FOR_EACH_ELEM, define_func!(raw_map_for_each_elem)),
            (
                HELPER_MAP_LOOKUP_PERCPU_ELEM,
                define_func!(raw_map_lookup_percpu_elem),
            ),
            // Map helpers: perf event array
            (HELPER_PERF_EVENT_OUTPUT, define_func!(raw_perf_event_output)),
            // Probe and trace helpers: memory access
            (HELPER_BPF_PROBE_READ, define_func!(raw_bpf_probe_read)),
            // Print helpers
            (HELPER_TRACE_PRINTF, define_func!(trace_printf)),
            // Map helpers: queue and stack
            (HELPER_MAP_PUSH_ELEM, define_func!(raw_map_push_elem)),
            (HELPER_MAP_POP_ELEM, define_func!(raw_map_pop_elem)),
            (HELPER_MAP_PEEK_ELEM, define_func!(raw_map_peek_elem)),
        ];
        table.extend(entries);
    }
    BPF_HELPER_FUN_SET.init(table);
}

View File

@ -0,0 +1,25 @@
use core::{
ffi::{c_char, c_int},
fmt::Write,
};
use printf_compat::{format, output};
/// Printf according to the format string, function will return the number of bytes written(including '\0')
/// printf-style formatter backing `bpf_trace_printk`.
///
/// Returns the number of bytes written plus one (accounting for a trailing
/// '\0', as stated in the original comment above).
///
/// # Safety
/// `str` must point to a valid, NUL-terminated C format string and the
/// variadic arguments must match its conversion specifiers.
pub unsafe extern "C" fn printf(w: &mut impl Write, str: *const c_char, mut args: ...) -> c_int {
    let bytes_written = format(str as _, args.as_va_list(), output::fmt_write(w));
    bytes_written + 1
}
/// Zero-sized `fmt::Write` sink that forwards formatted text to the kernel
/// console through the crate-wide `print!` macro.
struct TerminalOut;
impl Write for TerminalOut {
    fn write_str(&mut self, s: &str) -> core::fmt::Result {
        print!("{}", s);
        Ok(())
    }
}
/// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_trace_printk/
/// Implementation of the `bpf_trace_printk` helper.
///
/// Forwards at most three format arguments (`arg3..arg5`) — the same limit
/// the Linux helper imposes. `_fmt_len` is accepted for ABI compatibility
/// but unused; the format string is read as NUL-terminated instead.
pub fn trace_printf(fmt_ptr: u64, _fmt_len: u64, arg3: u64, arg4: u64, arg5: u64) -> u64 {
    unsafe { printf(&mut TerminalOut, fmt_ptr as _, arg3, arg4, arg5) as u64 }
}

View File

@ -0,0 +1,283 @@
//! BPF_MAP_TYPE_ARRAY and BPF_MAP_TYPE_PERCPU_ARRAY
//!
//!
//! See https://docs.kernel.org/bpf/map_array.html
use super::super::Result;
use crate::bpf::map::util::round_up;
use crate::bpf::map::{BpfCallBackFn, BpfMapCommonOps, BpfMapMeta};
use crate::mm::percpu::{PerCpu, PerCpuVar};
use crate::smp::cpu::{smp_cpu_manager, ProcessorId};
use alloc::{vec, vec::Vec};
use core::{
fmt::{Debug, Formatter},
ops::{Index, IndexMut},
};
use log::info;
use system_error::SystemError;
/// The array map type is a generic map type with no restrictions on the structure of the value.
/// Like a normal array, the array map has a numeric key starting at 0 and incrementing.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_ARRAY/
#[derive(Debug)]
pub struct ArrayMap {
    /// Number of slots; keys are u32 indices in `0..max_entries`.
    max_entries: u32,
    /// Flat backing storage for all slots.
    data: ArrayMapData,
}

/// Contiguous backing store for array-style maps: fixed-size slots of
/// `elem_size` bytes each, packed into a single allocation.
struct ArrayMapData {
    elem_size: u32,
    /// The data is stored in a Vec<u8> with the size of elem_size * max_entries.
    data: Vec<u8>,
}
// Manual Debug impl: report sizes instead of dumping the raw byte buffer.
impl Debug for ArrayMapData {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("ArrayMapData")
            .field("elem_size", &self.elem_size)
            .field("data_len", &self.data.len())
            .finish()
    }
}
impl ArrayMapData {
    /// Allocate zero-initialised storage for `max_entries` slots of
    /// `elem_size` bytes each.
    ///
    /// `elem_size` must already be rounded up to a multiple of 8 by the
    /// caller (enforced in debug builds).
    pub fn new(elem_size: u32, max_entries: u32) -> Self {
        debug_assert!(elem_size % 8 == 0);
        // Multiply in usize: a u32 * u32 product can overflow for large
        // attribute values, which would wrap in release builds and allocate
        // a buffer that is too small for the indexing below.
        let total_size = elem_size as usize * max_entries as usize;
        let data = vec![0; total_size];
        ArrayMapData { elem_size, data }
    }
}
// Slot accessors: slice out the `index`-th element. No explicit bounds check
// is done here — an out-of-range index panics on the slice operation, so
// callers must validate `index < max_entries` first.
impl Index<u32> for ArrayMapData {
    type Output = [u8];
    fn index(&self, index: u32) -> &Self::Output {
        let start = index * self.elem_size;
        &self.data[start as usize..(start + self.elem_size) as usize]
    }
}
impl IndexMut<u32> for ArrayMapData {
    fn index_mut(&mut self, index: u32) -> &mut Self::Output {
        let start = index * self.elem_size;
        &mut self.data[start as usize..(start + self.elem_size) as usize]
    }
}
impl ArrayMap {
    /// Validate map attributes and allocate the backing storage.
    ///
    /// Keys must be 4-byte u32 indices; values are padded up to a multiple
    /// of 8 bytes.
    ///
    /// # Errors
    /// `EINVAL` for zero value size, zero capacity, or a key size other
    /// than 4.
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        if attr.value_size == 0 || attr.max_entries == 0 || attr.key_size != 4 {
            return Err(SystemError::EINVAL);
        }
        let elem_size = round_up(attr.value_size as usize, 8);
        let data = ArrayMapData::new(elem_size as u32, attr.max_entries);
        Ok(ArrayMap {
            max_entries: attr.max_entries,
            data,
        })
    }
}
impl BpfMapCommonOps for ArrayMap {
    /// Return a borrow of the slot at index `key` (a native-endian u32).
    ///
    /// # Errors
    /// `EINVAL` if the key is not 4 bytes or the index is out of range.
    fn lookup_elem(&mut self, key: &[u8]) -> Result<Option<&[u8]>> {
        if key.len() != 4 {
            return Err(SystemError::EINVAL);
        }
        let index = u32::from_ne_bytes(key.try_into().map_err(|_| SystemError::EINVAL)?);
        if index >= self.max_entries {
            return Err(SystemError::EINVAL);
        }
        let val = self.data.index(index);
        Ok(Some(val))
    }
    /// Overwrite the slot at index `key`. Values shorter than the (8-byte
    /// padded) slot only overwrite the slot prefix.
    fn update_elem(&mut self, key: &[u8], value: &[u8], _flags: u64) -> Result<()> {
        if key.len() != 4 {
            return Err(SystemError::EINVAL);
        }
        let index = u32::from_ne_bytes(key.try_into().map_err(|_| SystemError::EINVAL)?);
        if index >= self.max_entries {
            return Err(SystemError::EINVAL);
        }
        if value.len() > self.data.elem_size as usize {
            return Err(SystemError::EINVAL);
        }
        let old_value = self.data.index_mut(index);
        old_value[..value.len()].copy_from_slice(value);
        Ok(())
    }
    /// For ArrayMap, delete_elem is not supported (array slots always exist).
    fn delete_elem(&mut self, _key: &[u8]) -> Result<()> {
        Err(SystemError::EINVAL)
    }
    /// Call `cb` on every slot until it returns non-zero; returns the number
    /// of slots visited (including the one that stopped iteration).
    fn for_each_elem(&mut self, cb: BpfCallBackFn, ctx: *const u8, flags: u64) -> Result<u32> {
        // Per the helper contract, flags must currently be 0.
        if flags != 0 {
            return Err(SystemError::EINVAL);
        }
        let mut total_used = 0;
        for i in 0..self.max_entries {
            let key = i.to_ne_bytes();
            let value = self.data.index(i);
            total_used += 1;
            let res = cb(&key, value, ctx);
            // return value: 0 - continue, 1 - stop and return
            if res != 0 {
                break;
            }
        }
        Ok(total_used)
    }
    /// Not supported for arrays.
    fn lookup_and_delete_elem(&mut self, _key: &[u8], _value: &mut [u8]) -> Result<()> {
        Err(SystemError::EINVAL)
    }
    /// Write the key following `key` into `next_key`; `None` yields the
    /// first key (index 0).
    fn get_next_key(&self, key: Option<&[u8]>, next_key: &mut [u8]) -> Result<()> {
        if let Some(key) = key {
            if key.len() != 4 {
                return Err(SystemError::EINVAL);
            }
            let index = u32::from_ne_bytes(key.try_into().map_err(|_| SystemError::EINVAL)?);
            // The last valid index — and any out-of-range index — has no
            // successor. (The previous `==` check let indices beyond
            // max_entries produce a bogus out-of-range "next" key.)
            // NOTE(review): Linux instead returns the *first* key for an
            // invalid input key; switch to that if userspace relies on it.
            if index >= self.max_entries - 1 {
                return Err(SystemError::ENOENT);
            }
            let next_index = index + 1;
            next_key.copy_from_slice(&next_index.to_ne_bytes());
        } else {
            next_key.copy_from_slice(&0u32.to_ne_bytes());
        }
        Ok(())
    }
    /// No copy-on-write remapping yet; freezing is accepted but a no-op.
    fn freeze(&self) -> Result<()> {
        info!("fake freeze done for ArrayMap");
        Ok(())
    }
    /// Raw pointer to slot 0 of the backing storage.
    fn first_value_ptr(&self) -> Result<*const u8> {
        Ok(self.data.data.as_ptr())
    }
}
/// This is the per-CPU variant of the [ArrayMap] map type.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_PERCPU_ARRAY/
pub struct PerCpuArrayMap {
    // One independent ArrayMap per CPU; non-percpu trait methods operate on
    // the current CPU's copy.
    per_cpu_data: PerCpuVar<ArrayMap>,
}
impl Debug for PerCpuArrayMap {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("PerCpuArrayMap")
            .field("data", &self.per_cpu_data)
            .finish()
    }
}
impl PerCpuArrayMap {
    /// Allocate one [`ArrayMap`] per possible CPU from the same attributes.
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        let num_cpus = PerCpu::MAX_CPU_NUM;
        let mut data = Vec::with_capacity(num_cpus as usize);
        for _ in 0..num_cpus {
            let array_map = ArrayMap::new(attr)?;
            data.push(array_map);
        }
        let per_cpu_data = PerCpuVar::new(data).ok_or(SystemError::EINVAL)?;
        Ok(PerCpuArrayMap { per_cpu_data })
    }
}
impl BpfMapCommonOps for PerCpuArrayMap {
    // All single-CPU operations delegate to the calling CPU's ArrayMap.
    fn lookup_elem(&mut self, key: &[u8]) -> Result<Option<&[u8]>> {
        self.per_cpu_data.get_mut().lookup_elem(key)
    }
    fn update_elem(&mut self, key: &[u8], value: &[u8], flags: u64) -> Result<()> {
        self.per_cpu_data.get_mut().update_elem(key, value, flags)
    }
    fn delete_elem(&mut self, key: &[u8]) -> Result<()> {
        self.per_cpu_data.get_mut().delete_elem(key)
    }
    fn for_each_elem(&mut self, cb: BpfCallBackFn, ctx: *const u8, flags: u64) -> Result<u32> {
        self.per_cpu_data.get_mut().for_each_elem(cb, ctx, flags)
    }
    fn lookup_and_delete_elem(&mut self, _key: &[u8], _value: &mut [u8]) -> Result<()> {
        Err(SystemError::EINVAL)
    }
    /// Cross-CPU lookup: read `cpu`'s copy of the map.
    fn lookup_percpu_elem(&mut self, key: &[u8], cpu: u32) -> Result<Option<&[u8]>> {
        // SAFETY (as per force_get_mut's contract): reading another CPU's
        // slot without synchronization; callers accept the race.
        unsafe {
            self.per_cpu_data
                .force_get_mut(ProcessorId::new(cpu))
                .lookup_elem(key)
        }
    }
    fn get_next_key(&self, key: Option<&[u8]>, next_key: &mut [u8]) -> Result<()> {
        self.per_cpu_data.get_mut().get_next_key(key, next_key)
    }
    fn first_value_ptr(&self) -> Result<*const u8> {
        self.per_cpu_data.get_mut().first_value_ptr()
    }
}
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_PERF_EVENT_ARRAY/
pub struct PerfEventArrayMap {
    // The value is the file descriptor of the perf event.
    // One 4-byte fd slot per possible CPU.
    fds: ArrayMapData,
}
impl Debug for PerfEventArrayMap {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("PerfEventArrayMap")
            .field("fds", &self.fds)
            .finish()
    }
}
impl PerfEventArrayMap {
    /// Create the fd table, requiring exactly one 4-byte slot per possible
    /// CPU (key = CPU id, value = perf-event fd).
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        let num_cpus = smp_cpu_manager().possible_cpus_count();
        if attr.key_size != 4 || attr.value_size != 4 || attr.max_entries != num_cpus {
            return Err(SystemError::EINVAL);
        }
        let fds = ArrayMapData::new(4, num_cpus);
        Ok(PerfEventArrayMap { fds })
    }
}
impl BpfMapCommonOps for PerfEventArrayMap {
    /// Return the 4-byte perf-event fd slot for the CPU id encoded in `key`.
    fn lookup_elem(&mut self, key: &[u8]) -> Result<Option<&[u8]>> {
        let cpu_id = u32::from_ne_bytes(key.try_into().map_err(|_| SystemError::EINVAL)?);
        // Bounds-check the CPU id instead of panicking on the slice index.
        if (cpu_id as usize + 1) * self.fds.elem_size as usize > self.fds.data.len() {
            return Err(SystemError::EINVAL);
        }
        let value = self.fds.index(cpu_id);
        Ok(Some(value))
    }
    /// Store a perf-event fd (exactly 4 bytes) in the CPU's slot.
    fn update_elem(&mut self, key: &[u8], value: &[u8], _flags: u64) -> Result<()> {
        // A wrong value length is a caller error: report EINVAL instead of
        // asserting (the previous assert_eq! panicked the kernel).
        if value.len() != 4 {
            return Err(SystemError::EINVAL);
        }
        let cpu_id = u32::from_ne_bytes(key.try_into().map_err(|_| SystemError::EINVAL)?);
        if (cpu_id as usize + 1) * self.fds.elem_size as usize > self.fds.data.len() {
            return Err(SystemError::EINVAL);
        }
        let old_value = self.fds.index_mut(cpu_id);
        old_value.copy_from_slice(value);
        Ok(())
    }
    /// Zero the CPU's slot (fd 0 acts as the "empty" marker).
    fn delete_elem(&mut self, key: &[u8]) -> Result<()> {
        let cpu_id = u32::from_ne_bytes(key.try_into().map_err(|_| SystemError::EINVAL)?);
        if (cpu_id as usize + 1) * self.fds.elem_size as usize > self.fds.data.len() {
            return Err(SystemError::EINVAL);
        }
        self.fds.index_mut(cpu_id).copy_from_slice(&[0; 4]);
        Ok(())
    }
    /// Visit every CPU slot until `cb` returns non-zero; returns the number
    /// of slots visited (including the one that stopped iteration).
    fn for_each_elem(&mut self, cb: BpfCallBackFn, ctx: *const u8, _flags: u64) -> Result<u32> {
        let mut total_used = 0;
        let num_cpus = smp_cpu_manager().possible_cpus_count();
        for i in 0..num_cpus {
            let key = i.to_ne_bytes();
            let value = self.fds.index(i);
            total_used += 1;
            let res = cb(&key, value, ctx);
            if res != 0 {
                break;
            }
        }
        Ok(total_used)
    }
    /// Not supported for perf-event arrays.
    fn lookup_and_delete_elem(&mut self, _key: &[u8], _value: &mut [u8]) -> Result<()> {
        Err(SystemError::EINVAL)
    }
    /// Raw pointer to the start of the fd table.
    fn first_value_ptr(&self) -> Result<*const u8> {
        Ok(self.fds.data.as_ptr())
    }
}

View File

@ -0,0 +1,156 @@
use super::Result;
use crate::bpf::map::util::{round_up, BpfMapUpdateElemFlags};
use crate::bpf::map::{BpfCallBackFn, BpfMapCommonOps, BpfMapMeta};
use crate::mm::percpu::{PerCpu, PerCpuVar};
use crate::smp::cpu::ProcessorId;
use alloc::{collections::BTreeMap, vec::Vec};
use core::fmt::Debug;
use system_error::SystemError;
// Keys and values are stored as owned, length-preserving byte vectors.
type BpfHashMapKey = Vec<u8>;
type BpfHashMapValue = Vec<u8>;

/// The hash map type is a generic map type with no restrictions on the structure of the key and value.
/// Hash-maps are implemented using a hash table, allowing for lookups with arbitrary keys.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_HASH/
#[derive(Debug)]
pub struct BpfHashMap {
    // Capacity/size attributes kept for bookkeeping; the BTreeMap itself is
    // currently unbounded (max_entries is not enforced on insert).
    _max_entries: u32,
    _key_size: u32,
    _value_size: u32,
    // NOTE: a BTreeMap stands in for a real hash table; lookups are
    // O(log n) rather than O(1).
    data: BTreeMap<BpfHashMapKey, BpfHashMapValue>,
}
impl BpfHashMap {
    /// Create a hash map from user-supplied attributes.
    ///
    /// The stored value size is padded to a multiple of 8 bytes.
    ///
    /// # Errors
    /// `EINVAL` if the key size, value size, or capacity is zero (a hash map
    /// with zero-length keys is meaningless and matches the Linux
    /// requirement that key_size > 0).
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        if attr.key_size == 0 || attr.value_size == 0 || attr.max_entries == 0 {
            return Err(SystemError::EINVAL);
        }
        let value_size = round_up(attr.value_size as usize, 8);
        Ok(Self {
            _max_entries: attr.max_entries,
            _key_size: attr.key_size,
            _value_size: value_size as u32,
            data: BTreeMap::new(),
        })
    }
}
impl BpfMapCommonOps for BpfHashMap {
    /// Return a borrow of the stored value for `key`, or `None` on miss.
    fn lookup_elem(&mut self, key: &[u8]) -> Result<Option<&[u8]>> {
        let value = self.data.get(key).map(|v| v.as_slice());
        Ok(value)
    }
    /// Insert or overwrite the value for `key`.
    ///
    /// NOTE(review): the update flags (BPF_NOEXIST/BPF_EXIST) are parsed but
    /// not enforced — every call currently behaves like BPF_ANY.
    fn update_elem(&mut self, key: &[u8], value: &[u8], flags: u64) -> Result<()> {
        let _flags = BpfMapUpdateElemFlags::from_bits_truncate(flags);
        self.data.insert(key.to_vec(), value.to_vec());
        Ok(())
    }
    /// Remove `key`; deleting a missing key is not an error.
    fn delete_elem(&mut self, key: &[u8]) -> Result<()> {
        self.data.remove(key);
        Ok(())
    }
    /// Call `cb` for each (key, value) pair until it returns non-zero;
    /// returns the number of pairs for which the callback completed with 0.
    fn for_each_elem(&mut self, cb: BpfCallBackFn, ctx: *const u8, flags: u64) -> Result<u32> {
        if flags != 0 {
            return Err(SystemError::EINVAL);
        }
        let mut total_used = 0;
        for (key, value) in self.data.iter() {
            let res = cb(key, value, ctx);
            // return value: 0 - continue, 1 - stop and return
            if res != 0 {
                break;
            }
            total_used += 1;
        }
        Ok(total_used)
    }
    /// Copy the value for `key` into `value`, then remove the entry.
    fn lookup_and_delete_elem(&mut self, key: &[u8], value: &mut [u8]) -> Result<()> {
        let v = self
            .data
            .get(key)
            .map(|v| v.as_slice())
            .ok_or(SystemError::ENOENT)?;
        // Copy only as many bytes as both buffers can hold: stored values
        // keep their original (un-padded) length, so an unconditional
        // copy_from_slice could panic on a length mismatch.
        let n = v.len().min(value.len());
        value[..n].copy_from_slice(&v[..n]);
        self.data.remove(key);
        Ok(())
    }
    /// Iterate the key space: `None` yields the first key; otherwise the key
    /// stored after `key`. `ENOENT` once the end is reached.
    fn get_next_key(&self, key: Option<&[u8]>, next_key: &mut [u8]) -> Result<()> {
        let mut iter = self.data.iter();
        if let Some(key) = key {
            // Advance the iterator just past the supplied key; an unknown
            // key drains the iterator and reports ENOENT below.
            for (k, _) in iter.by_ref() {
                if k.as_slice() == key {
                    break;
                }
            }
        }
        match iter.next() {
            Some((k, _)) => {
                // Length-checked copy, for the same reason as in
                // lookup_and_delete_elem above.
                let n = k.len().min(next_key.len());
                next_key[..n].copy_from_slice(&k[..n]);
                Ok(())
            }
            None => Err(SystemError::ENOENT),
        }
    }
}
/// This is the per-CPU variant of the [BpfHashMap] map type.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_PERCPU_HASH/
pub struct PerCpuHashMap {
    // One independent BpfHashMap per CPU; non-percpu trait methods operate
    // on the current CPU's copy.
    per_cpu_maps: PerCpuVar<BpfHashMap>,
}
impl Debug for PerCpuHashMap {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("PerCpuHashMap")
            .field("maps", &self.per_cpu_maps)
            .finish()
    }
}
impl PerCpuHashMap {
    /// Allocate one [`BpfHashMap`] per possible CPU from the same attributes.
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        let num_cpus = PerCpu::MAX_CPU_NUM;
        let mut data = Vec::with_capacity(num_cpus as usize);
        for _ in 0..num_cpus {
            let array_map = BpfHashMap::new(attr)?;
            data.push(array_map);
        }
        let per_cpu_maps = PerCpuVar::new(data).ok_or(SystemError::EINVAL)?;
        Ok(PerCpuHashMap { per_cpu_maps })
    }
}
impl BpfMapCommonOps for PerCpuHashMap {
    // All single-CPU operations delegate to the calling CPU's map.
    fn lookup_elem(&mut self, key: &[u8]) -> Result<Option<&[u8]>> {
        self.per_cpu_maps.get_mut().lookup_elem(key)
    }
    fn update_elem(&mut self, key: &[u8], value: &[u8], flags: u64) -> Result<()> {
        self.per_cpu_maps.get_mut().update_elem(key, value, flags)
    }
    fn delete_elem(&mut self, key: &[u8]) -> Result<()> {
        self.per_cpu_maps.get_mut().delete_elem(key)
    }
    fn for_each_elem(&mut self, cb: BpfCallBackFn, ctx: *const u8, flags: u64) -> Result<u32> {
        self.per_cpu_maps.get_mut().for_each_elem(cb, ctx, flags)
    }
    fn lookup_and_delete_elem(&mut self, key: &[u8], value: &mut [u8]) -> Result<()> {
        self.per_cpu_maps
            .get_mut()
            .lookup_and_delete_elem(key, value)
    }
    /// Cross-CPU lookup: read `cpu`'s copy of the map.
    fn lookup_percpu_elem(&mut self, key: &[u8], cpu: u32) -> Result<Option<&[u8]>> {
        // SAFETY (as per force_get_mut's contract): reading another CPU's
        // slot without synchronization; callers accept the race.
        unsafe {
            self.per_cpu_maps
                .force_get_mut(ProcessorId::new(cpu))
                .lookup_elem(key)
        }
    }
    fn get_next_key(&self, key: Option<&[u8]>, next_key: &mut [u8]) -> Result<()> {
        self.per_cpu_maps.get_mut().get_next_key(key, next_key)
    }
    fn first_value_ptr(&self) -> Result<*const u8> {
        self.per_cpu_maps.get_mut().first_value_ptr()
    }
}

151
kernel/src/bpf/map/lru.rs Normal file
View File

@ -0,0 +1,151 @@
use super::{BpfCallBackFn, BpfMapCommonOps, Result};
use crate::bpf::map::util::BpfMapMeta;
use crate::mm::percpu::{PerCpu, PerCpuVar};
use crate::smp::cpu::ProcessorId;
use alloc::vec::Vec;
use core::fmt::Debug;
use core::num::NonZero;
use lru::LruCache;
use system_error::SystemError;
type BpfHashMapKey = Vec<u8>;
type BpfHashMapValue = Vec<u8>;
/// This map is the LRU (Least Recently Used) variant of the BPF_MAP_TYPE_HASH.
/// It is a generic map type that stores a fixed maximum number of key/value pairs.
/// When the map starts to get at capacity, the approximately least recently
/// used elements is removed to make room for new elements.
///
/// See https://docs.ebpf.io/linux/map-type/BPF_MAP_TYPE_LRU_HASH/
#[derive(Debug)]
pub struct LruMap {
    // Capacity requested at creation; kept for reference only — eviction is
    // enforced by the LruCache itself.
    _max_entries: u32,
    data: LruCache<BpfHashMapKey, BpfHashMapValue>,
}
impl LruMap {
    /// Create an LRU hash map sized to `attr.max_entries`.
    ///
    /// Rejects zero-sized values or a zero capacity with `EINVAL`.
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        if attr.value_size == 0 || attr.max_entries == 0 {
            return Err(SystemError::EINVAL);
        }
        let capacity =
            NonZero::new(attr.max_entries as usize).ok_or(SystemError::EINVAL)?;
        Ok(Self {
            _max_entries: attr.max_entries,
            data: LruCache::new(capacity),
        })
    }
}
impl BpfMapCommonOps for LruMap {
    /// Look up a value; `get` also marks the entry as most-recently-used.
    fn lookup_elem(&mut self, key: &[u8]) -> Result<Option<&[u8]>> {
        let value = self.data.get(key).map(|v| v.as_slice());
        Ok(value)
    }
    /// Insert or overwrite; `put` evicts the least-recently-used entry when
    /// the cache is at capacity.
    fn update_elem(&mut self, key: &[u8], value: &[u8], _flags: u64) -> Result<()> {
        self.data.put(key.to_vec(), value.to_vec());
        Ok(())
    }
    /// Remove an entry; deleting a missing key is not an error.
    fn delete_elem(&mut self, key: &[u8]) -> Result<()> {
        self.data.pop(key);
        Ok(())
    }
    /// Visit entries from most- to least-recently-used until `cb` returns
    /// non-zero; returns the number of entries for which `cb` returned 0.
    fn for_each_elem(&mut self, cb: BpfCallBackFn, ctx: *const u8, flags: u64) -> Result<u32> {
        if flags != 0 {
            return Err(SystemError::EINVAL);
        }
        let mut total_used = 0;
        for (key, value) in self.data.iter() {
            let res = cb(key, value, ctx);
            // return value: 0 - continue, 1 - stop and return
            if res != 0 {
                break;
            }
            total_used += 1;
        }
        Ok(total_used)
    }
    /// Copy the value for `key` into `value` and remove the entry.
    ///
    /// Uses a single `pop` instead of `get` + `pop`: the old двух-lookup
    /// version also promoted the entry in the LRU order right before
    /// deleting it, which is wasted work.
    fn lookup_and_delete_elem(&mut self, key: &[u8], value: &mut [u8]) -> Result<()> {
        let v = self.data.pop(key).ok_or(SystemError::ENOENT)?;
        value.copy_from_slice(&v);
        Ok(())
    }
    /// Return the key following `key` in LRU iteration order (MRU first).
    ///
    /// NOTE(review): if `key` is `Some` but absent, the iterator is
    /// exhausted and this returns `ENOENT`; Linux restarts from the first
    /// key in that case — TODO confirm which is intended.
    fn get_next_key(&self, key: Option<&[u8]>, next_key: &mut [u8]) -> Result<()> {
        let mut iter = self.data.iter();
        if let Some(key) = key {
            for (k, _) in iter.by_ref() {
                if k.as_slice() == key {
                    break;
                }
            }
        }
        let res = iter.next();
        match res {
            Some((k, _)) => {
                next_key.copy_from_slice(k.as_slice());
                Ok(())
            }
            None => Err(SystemError::ENOENT),
        }
    }
}
/// Per-CPU variant of [LruMap]: one independent LRU hash map per CPU.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_LRU_PERCPU_HASH/
pub struct PerCpuLruMap {
    per_cpu_maps: PerCpuVar<LruMap>,
}
impl Debug for PerCpuLruMap {
    /// Render as `PerCpuLruMap { maps: .. }`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut builder = f.debug_struct("PerCpuLruMap");
        builder.field("maps", &self.per_cpu_maps);
        builder.finish()
    }
}
impl PerCpuLruMap {
    /// Build one independent [LruMap] for every possible CPU.
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        let maps = (0..PerCpu::MAX_CPU_NUM)
            .map(|_| LruMap::new(attr))
            .collect::<Result<Vec<_>>>()?;
        let per_cpu_maps = PerCpuVar::new(maps).ok_or(SystemError::EINVAL)?;
        Ok(PerCpuLruMap { per_cpu_maps })
    }
}
// Every operation except `lookup_percpu_elem` is delegated to the map that
// belongs to the CPU this code is currently running on.
impl BpfMapCommonOps for PerCpuLruMap {
    fn lookup_elem(&mut self, key: &[u8]) -> Result<Option<&[u8]>> {
        self.per_cpu_maps.get_mut().lookup_elem(key)
    }
    fn update_elem(&mut self, key: &[u8], value: &[u8], flags: u64) -> Result<()> {
        self.per_cpu_maps.get_mut().update_elem(key, value, flags)
    }
    fn delete_elem(&mut self, key: &[u8]) -> Result<()> {
        self.per_cpu_maps.get_mut().delete_elem(key)
    }
    fn for_each_elem(&mut self, cb: BpfCallBackFn, ctx: *const u8, flags: u64) -> Result<u32> {
        self.per_cpu_maps.get_mut().for_each_elem(cb, ctx, flags)
    }
    fn lookup_and_delete_elem(&mut self, key: &[u8], value: &mut [u8]) -> Result<()> {
        self.per_cpu_maps
            .get_mut()
            .lookup_and_delete_elem(key, value)
    }
    fn lookup_percpu_elem(&mut self, key: &[u8], cpu: u32) -> Result<Option<&[u8]>> {
        // NOTE(review): `force_get_mut` reaches into another CPU's map;
        // presumably safe under the enclosing BpfMap SpinLock — TODO confirm.
        unsafe {
            self.per_cpu_maps
                .force_get_mut(ProcessorId::new(cpu))
                .lookup_elem(key)
        }
    }
    fn get_next_key(&self, key: Option<&[u8]>, next_key: &mut [u8]) -> Result<()> {
        self.per_cpu_maps.get_mut().get_next_key(key, next_key)
    }
}

416
kernel/src/bpf/map/mod.rs Normal file
View File

@ -0,0 +1,416 @@
mod array_map;
mod hash_map;
mod lru;
mod queue;
mod util;
use super::Result;
use crate::bpf::map::array_map::{ArrayMap, PerCpuArrayMap, PerfEventArrayMap};
use crate::bpf::map::hash_map::PerCpuHashMap;
use crate::bpf::map::util::{BpfMapGetNextKeyArg, BpfMapMeta, BpfMapUpdateArg};
use crate::filesystem::vfs::file::{File, FileMode};
use crate::filesystem::vfs::syscall::ModeType;
use crate::filesystem::vfs::{FilePrivateData, FileSystem, FileType, IndexNode, Metadata};
use crate::include::bindings::linux_bpf::{bpf_attr, bpf_map_type};
use crate::libs::casting::DowncastArc;
use crate::libs::spinlock::{SpinLock, SpinLockGuard};
use crate::process::ProcessManager;
use crate::syscall::user_access::{UserBufferReader, UserBufferWriter};
use alloc::boxed::Box;
use alloc::string::String;
use alloc::sync::Arc;
use alloc::vec::Vec;
use core::any::Any;
use core::fmt::Debug;
use intertrait::CastFromSync;
use log::{error, info};
use system_error::SystemError;
/// A BPF map exposed to user space as a file descriptor (its inode).
#[derive(Debug)]
pub struct BpfMap {
    // The concrete map implementation, serialized by a spin lock.
    inner_map: SpinLock<Box<dyn BpfMapCommonOps>>,
    // Creation-time attributes (type, key/value sizes, capacity).
    meta: BpfMapMeta,
}

// Callback used by `for_each_elem`; returning non-zero stops iteration.
pub type BpfCallBackFn = fn(key: &[u8], value: &[u8], ctx: *const u8) -> i32;
/// Operations shared by all BPF map types.
///
/// Every method has a default `ENOSYS` body, so a concrete map only
/// implements the operations its type actually supports.
pub trait BpfMapCommonOps: Send + Sync + Debug + CastFromSync {
    /// Lookup an element in the map.
    ///
    /// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_lookup_elem/
    fn lookup_elem(&mut self, _key: &[u8]) -> Result<Option<&[u8]>> {
        Err(SystemError::ENOSYS)
    }
    /// Update an element in the map.
    ///
    /// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_update_elem/
    fn update_elem(&mut self, _key: &[u8], _value: &[u8], _flags: u64) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Delete an element from the map.
    ///
    /// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_map_delete_elem/
    fn delete_elem(&mut self, _key: &[u8]) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// For each element in map, call callback_fn function with map,
    /// callback_ctx and other map-specific parameters.
    ///
    /// See https://ebpf-docs.dylanreimerink.nl/linux/helper-function/bpf_for_each_map_elem/
    fn for_each_elem(&mut self, _cb: BpfCallBackFn, _ctx: *const u8, _flags: u64) -> Result<u32> {
        Err(SystemError::ENOSYS)
    }
    /// Look up an element with the given key in the map referred to by the file descriptor fd,
    /// and if found, delete the element.
    fn lookup_and_delete_elem(&mut self, _key: &[u8], _value: &mut [u8]) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Perform a lookup in percpu map for an entry associated to key on cpu.
    fn lookup_percpu_elem(&mut self, _key: &[u8], _cpu: u32) -> Result<Option<&[u8]>> {
        Err(SystemError::ENOSYS)
    }
    /// Get the next key in the map. If key is None, get the first key.
    ///
    /// Called from syscall
    fn get_next_key(&self, _key: Option<&[u8]>, _next_key: &mut [u8]) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Push an element value in map.
    fn push_elem(&mut self, _value: &[u8], _flags: u64) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Pop an element value from map.
    fn pop_elem(&mut self, _value: &mut [u8]) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Peek an element value from map.
    fn peek_elem(&self, _value: &mut [u8]) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Freeze the map.
    ///
    /// It's useful for .rodata maps.
    fn freeze(&self) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Get the first value pointer.
    fn first_value_ptr(&self) -> Result<*const u8> {
        Err(SystemError::ENOSYS)
    }
}
// Allows `Arc<dyn BpfMapCommonOps>` to be downcast to a concrete map type.
impl DowncastArc for dyn BpfMapCommonOps {
    fn as_any_arc(self: Arc<Self>) -> Arc<dyn Any> {
        self
    }
}
impl BpfMap {
    /// Wrap a concrete map implementation together with its metadata.
    ///
    /// `meta.key_size` is intentionally NOT asserted non-zero here: queue
    /// and stack maps (BPF_MAP_TYPE_QUEUE / BPF_MAP_TYPE_STACK) require
    /// `key_size == 0` (see `QueueMap::new`), so a blanket
    /// `assert_ne!(meta.key_size, 0)` would panic the kernel on a valid,
    /// user-supplied map definition. Each constructor validates its own
    /// metadata instead.
    pub fn new(map: Box<dyn BpfMapCommonOps>, meta: BpfMapMeta) -> Self {
        BpfMap {
            inner_map: SpinLock::new(map),
            meta,
        }
    }
    /// Lock guarding the underlying map implementation.
    pub fn inner_map(&self) -> &SpinLock<Box<dyn BpfMapCommonOps>> {
        &self.inner_map
    }
    /// Key size in bytes (0 for keyless maps such as queue/stack).
    pub fn key_size(&self) -> usize {
        self.meta.key_size as usize
    }
    /// Value size in bytes.
    pub fn value_size(&self) -> usize {
        self.meta.value_size as usize
    }
}
// A BpfMap is exposed to user space only as a file descriptor: open/close
// are no-ops, regular read/write are rejected, and all real access goes
// through the bpf() syscall commands.
impl IndexNode for BpfMap {
    fn open(&self, _data: SpinLockGuard<FilePrivateData>, _mode: &FileMode) -> Result<()> {
        Ok(())
    }
    fn close(&self, _data: SpinLockGuard<FilePrivateData>) -> Result<()> {
        Ok(())
    }
    // Byte-stream reads are not meaningful for a map fd.
    fn read_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &mut [u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    // Byte-stream writes are not meaningful for a map fd.
    fn write_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &[u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    fn metadata(&self) -> Result<Metadata> {
        let meta = Metadata {
            mode: ModeType::from_bits_truncate(0o755),
            file_type: FileType::File,
            ..Default::default()
        };
        Ok(meta)
    }
    fn resize(&self, _len: usize) -> Result<()> {
        Ok(())
    }
    // NOTE: calling fs() on a map fd panics — maps live outside any
    // filesystem.
    fn fs(&self) -> Arc<dyn FileSystem> {
        todo!("BpfMap does not have a filesystem")
    }
    fn as_any_ref(&self) -> &dyn Any {
        self
    }
    fn list(&self) -> Result<Vec<String>> {
        Err(SystemError::ENOSYS)
    }
}
/// Create a map and return a file descriptor that refers to
/// the map. The close-on-exec file descriptor flag
/// is automatically enabled for the new file descriptor.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_MAP_CREATE/
pub fn bpf_map_create(attr: &bpf_attr) -> Result<usize> {
let map_meta = BpfMapMeta::try_from(attr)?;
info!("The map attr is {:#?}", map_meta);
let map: Box<dyn BpfMapCommonOps> = match map_meta.map_type {
bpf_map_type::BPF_MAP_TYPE_ARRAY => {
let array_map = ArrayMap::new(&map_meta)?;
Box::new(array_map)
}
bpf_map_type::BPF_MAP_TYPE_PERCPU_ARRAY => {
let per_cpu_array_map = PerCpuArrayMap::new(&map_meta)?;
Box::new(per_cpu_array_map)
}
bpf_map_type::BPF_MAP_TYPE_PERF_EVENT_ARRAY => {
let perf_event_array_map = PerfEventArrayMap::new(&map_meta)?;
Box::new(perf_event_array_map)
}
bpf_map_type::BPF_MAP_TYPE_CPUMAP
| bpf_map_type::BPF_MAP_TYPE_DEVMAP
| bpf_map_type::BPF_MAP_TYPE_DEVMAP_HASH => {
error!("bpf map type {:?} not implemented", map_meta.map_type);
Err(SystemError::EINVAL)?
}
bpf_map_type::BPF_MAP_TYPE_HASH => {
let hash_map = hash_map::BpfHashMap::new(&map_meta)?;
Box::new(hash_map)
}
bpf_map_type::BPF_MAP_TYPE_PERCPU_HASH => {
let per_cpu_hash_map = PerCpuHashMap::new(&map_meta)?;
Box::new(per_cpu_hash_map)
}
bpf_map_type::BPF_MAP_TYPE_QUEUE => {
let queue_map = queue::QueueMap::new(&map_meta)?;
Box::new(queue_map)
}
bpf_map_type::BPF_MAP_TYPE_STACK => {
let stack_map = queue::StackMap::new(&map_meta)?;
Box::new(stack_map)
}
bpf_map_type::BPF_MAP_TYPE_LRU_HASH => {
let lru_hash_map = lru::LruMap::new(&map_meta)?;
Box::new(lru_hash_map)
}
bpf_map_type::BPF_MAP_TYPE_LRU_PERCPU_HASH => {
let lru_per_cpu_hash_map = lru::PerCpuLruMap::new(&map_meta)?;
Box::new(lru_per_cpu_hash_map)
}
_ => {
unimplemented!("bpf map type {:?} not implemented", map_meta.map_type)
}
};
let bpf_map = BpfMap::new(map, map_meta);
let fd_table = ProcessManager::current_pcb().fd_table();
let file = File::new(Arc::new(bpf_map), FileMode::O_RDWR | FileMode::O_CLOEXEC)?;
let fd = fd_table.write().alloc_fd(file, None).map(|x| x as usize)?;
info!("create map with fd: [{}]", fd);
Ok(fd)
}
/// Create or update an element (key/value pair) in a specified map.
///
/// Reads `key_size` bytes of key and `value_size` bytes of value from the
/// user-space pointers in `attr`, then forwards to the map's `update_elem`.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_MAP_UPDATE_ELEM/
pub fn bpf_map_update_elem(attr: &bpf_attr) -> Result<usize> {
    let arg = BpfMapUpdateArg::from(attr);
    info!("<bpf_map_update_elem>: {:#x?}", arg);
    let map = get_map_file(arg.map_fd as i32)?;
    let meta = &map.meta;
    let key_size = meta.key_size as usize;
    let value_size = meta.value_size as usize;

    // Validate both user buffers before touching the map.
    let key_buf = UserBufferReader::new(arg.key as *const u8, key_size, true)?;
    let value_buf = UserBufferReader::new(arg.value as *const u8, value_size, true)?;

    let key = key_buf.read_from_user(0)?;
    let value = value_buf.read_from_user(0)?;
    map.inner_map.lock().update_elem(key, value, arg.flags)?;
    info!("bpf_map_update_elem ok");
    Ok(0)
}
/// Freeze the map referenced by `attr.map_fd` (BPF_MAP_FREEZE).
///
/// After freezing, further writes from user space are rejected by maps that
/// support the operation (e.g. .rodata maps).
pub fn bpf_map_freeze(attr: &bpf_attr) -> Result<usize> {
    let map_fd = BpfMapUpdateArg::from(attr).map_fd;
    info!("<bpf_map_freeze>: map_fd: {:}", map_fd);
    get_map_file(map_fd as i32)?.inner_map.lock().freeze()?;
    Ok(0)
}
/// Look up an element by key in a specified map and return its value.
///
/// Copies the found value into the user-space buffer at `attr.value`;
/// returns `ENOENT` when the key is absent.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_MAP_LOOKUP_ELEM/
pub fn bpf_lookup_elem(attr: &bpf_attr) -> Result<usize> {
    let arg = BpfMapUpdateArg::from(attr);
    let map = get_map_file(arg.map_fd as _)?;
    let key_size = map.meta.key_size as usize;
    let value_size = map.meta.value_size as usize;

    let key_buf = UserBufferReader::new(arg.key as *const u8, key_size, true)?;
    let mut value_buf = UserBufferWriter::new(arg.value as *mut u8, value_size, true)?;
    let key = key_buf.read_from_user(0)?;

    let mut inner = map.inner_map.lock();
    match inner.lookup_elem(key)? {
        Some(found) => {
            value_buf.copy_to_user(found, 0)?;
            Ok(0)
        }
        None => Err(SystemError::ENOENT),
    }
}
/// Look up an element by key in a specified map and return the key of the next element.
///
/// - If key is `None`, the operation returns zero and sets the next_key pointer to the key of the first element.
/// - If key is `Some(T)`, the operation returns zero and sets the next_key pointer to the key of the next element.
/// - If key is the last element, returns -1 and errno is set to ENOENT.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_MAP_GET_NEXT_KEY/
pub fn bpf_map_get_next_key(attr: &bpf_attr) -> Result<usize> {
    let arg = BpfMapGetNextKeyArg::from(attr);
    // info!("<bpf_map_get_next_key>: {:#x?}", arg);
    let map = get_map_file(arg.map_fd as i32)?;
    let meta = &map.meta;
    let key_size = meta.key_size as usize;
    // A null key pointer means "give me the first key".
    let key = if let Some(key_ptr) = arg.key {
        let key_buf = UserBufferReader::new(key_ptr as *const u8, key_size, true)?;
        let key = key_buf.read_from_user(0)?.to_vec();
        Some(key)
    } else {
        None
    };
    let key = key.as_deref();
    // The map writes the next key directly into the user buffer.
    let mut next_key_buf = UserBufferWriter::new(arg.next_key as *mut u8, key_size, true)?;
    let inner = map.inner_map.lock();
    let next_key = next_key_buf.buffer(0)?;
    inner.get_next_key(key, next_key)?;
    // info!("next_key: {:?}", next_key);
    Ok(0)
}
/// Delete the element with the given key from the specified map.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_MAP_DELETE_ELEM/
pub fn bpf_map_delete_elem(attr: &bpf_attr) -> Result<usize> {
    let arg = BpfMapUpdateArg::from(attr);
    let map = get_map_file(arg.map_fd as i32)?;
    let key_size = map.meta.key_size as usize;
    let key_buf = UserBufferReader::new(arg.key as *const u8, key_size, true)?;
    let key = key_buf.read_from_user(0)?;
    map.inner_map.lock().delete_elem(key)?;
    Ok(0)
}
/// Iterate and fetch multiple elements in a map.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_MAP_LOOKUP_BATCH/
pub fn bpf_map_lookup_batch(_attr: &bpf_attr) -> Result<usize> {
todo!()
}
/// Look up an element with the given key in the map referred to by the file descriptor fd,
/// and if found, delete the element.
///
/// For BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK map types, the flags argument needs to be set to 0,
/// but for other map types, it may be specified as:
/// - BPF_F_LOCK : If this flag is set, the command will acquire the spin-lock of the map value we are looking up.
///
/// If the map contains no spin-lock in its value, -EINVAL will be returned by the command.
///
/// The BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK map types implement this command as a “pop” operation,
/// deleting the top element rather than one corresponding to key.
/// The key and key_len parameters should be zeroed when issuing this operation for these map types.
///
/// This command is only valid for the following map types:
/// - BPF_MAP_TYPE_QUEUE
/// - BPF_MAP_TYPE_STACK
/// - BPF_MAP_TYPE_HASH
/// - BPF_MAP_TYPE_PERCPU_HASH
/// - BPF_MAP_TYPE_LRU_HASH
/// - BPF_MAP_TYPE_LRU_PERCPU_HASH
///
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_MAP_LOOKUP_AND_DELETE_ELEM/
pub fn bpf_map_lookup_and_delete_elem(attr: &bpf_attr) -> Result<usize> {
    let arg = BpfMapUpdateArg::from(attr);
    // info!("<bpf_map_lookup_and_delete_elem>: {:#x?}", arg);
    let map = get_map_file(arg.map_fd as i32)?;
    let meta = &map.meta;
    let key_size = meta.key_size as usize;
    let value_size = meta.value_size as usize;
    let key_buf = UserBufferReader::new(arg.key as *const u8, key_size, true)?;
    // The map writes the deleted value directly into the user buffer.
    let mut value_buf = UserBufferWriter::new(arg.value as *mut u8, value_size, true)?;
    let value = value_buf.buffer(0)?;
    let key = key_buf.read_from_user(0)?;
    let mut inner = map.inner_map.lock();
    inner.lookup_and_delete_elem(key, value)?;
    Ok(0)
}
/// Resolve a user fd to the [BpfMap] behind it.
///
/// Returns `EBADF` for an unknown fd and `EINVAL` when the fd refers to
/// something that is not a BPF map.
fn get_map_file(fd: i32) -> Result<Arc<BpfMap>> {
    let file = ProcessManager::current_pcb()
        .fd_table()
        .read()
        .get_file_by_fd(fd)
        .ok_or(SystemError::EBADF)?;
    file.inode()
        .downcast_arc::<BpfMap>()
        .ok_or(SystemError::EINVAL)
}

154
kernel/src/bpf/map/queue.rs Normal file
View File

@ -0,0 +1,154 @@
use super::{BpfMapCommonOps, Result};
use crate::bpf::map::util::{BpfMapMeta, BpfMapUpdateElemFlags};
use alloc::vec::Vec;
use core::fmt::Debug;
use core::ops::Deref;
use core::ops::DerefMut;
use system_error::SystemError;
type BpfQueueValue = Vec<u8>;
/// BPF_MAP_TYPE_QUEUE provides FIFO storage and BPF_MAP_TYPE_STACK provides LIFO storage for BPF programs.
/// These maps support peek, pop and push operations that are exposed to BPF programs through the respective helpers.
/// These operations are exposed to userspace applications using the existing bpf syscall in the following way:
/// - `BPF_MAP_LOOKUP_ELEM` -> `peek`
/// - `BPF_MAP_UPDATE_ELEM` -> `push`
/// - `BPF_MAP_LOOKUP_AND_DELETE_ELEM ` -> `pop`
///
/// See https://docs.kernel.org/bpf/map_queue_stack.html
pub trait SpecialMap: Debug + Send + Sync + 'static {
    /// Insert `value`; behavior on a full map depends on `flags`
    /// (BPF_EXIST evicts an element, otherwise ENOSPC).
    fn push(&mut self, value: BpfQueueValue, flags: BpfMapUpdateElemFlags) -> Result<()>;
    /// Removes the first element and returns it.
    fn pop(&mut self) -> Option<BpfQueueValue>;
    /// Returns the first element without removing it.
    fn peek(&self) -> Option<&BpfQueueValue>;
}
/// The queue map type is a generic map type, resembling a FIFO (First-In First-Out) queue.
///
/// This map type has no keys, only values. The size and type of the values can be specified by the user
/// to fit a large variety of use cases. The typical use-case for this map type is to keep track of
/// a pool of elements such as available network ports when implementing NAT (network address translation).
///
/// As apposed to most map types, this map type uses a custom set of helpers to pop, peek and push elements.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_QUEUE/
#[derive(Debug)]
pub struct QueueMap {
    // Fixed capacity; push fails (or evicts, with BPF_EXIST) beyond this.
    max_entries: u32,
    // Front of the queue is index 0. NOTE(review): `Vec::remove(0)` is O(n)
    // per pop; a VecDeque would make pops O(1) — consider if this becomes hot.
    data: Vec<BpfQueueValue>,
}
impl QueueMap {
    /// Create a queue map.
    ///
    /// Queue maps are keyless: `key_size` must be 0, while the value size
    /// and capacity must be non-zero; otherwise `EINVAL`.
    pub fn new(attr: &BpfMapMeta) -> Result<Self> {
        let valid = attr.value_size != 0 && attr.max_entries != 0 && attr.key_size == 0;
        if !valid {
            return Err(SystemError::EINVAL);
        }
        Ok(Self {
            max_entries: attr.max_entries,
            data: Vec::with_capacity(attr.max_entries as usize),
        })
    }
}
impl SpecialMap for QueueMap {
    /// Append at the back; when full, BPF_EXIST evicts the front (oldest)
    /// element, otherwise the push fails with ENOSPC.
    fn push(&mut self, value: BpfQueueValue, flags: BpfMapUpdateElemFlags) -> Result<()> {
        if self.data.len() == self.max_entries as usize {
            if !flags.contains(BpfMapUpdateElemFlags::BPF_EXIST) {
                return Err(SystemError::ENOSPC);
            }
            // remove the first element
            self.data.remove(0);
        }
        self.data.push(value);
        Ok(())
    }
    /// FIFO: remove and return the front element, if any.
    fn pop(&mut self) -> Option<BpfQueueValue> {
        if self.data.is_empty() {
            None
        } else {
            Some(self.data.remove(0))
        }
    }
    /// FIFO: the front element is the next one to be popped.
    fn peek(&self) -> Option<&BpfQueueValue> {
        self.data.first()
    }
}
/// The stack map type is a generic map type, resembling a stack data structure.
///
/// Reuses [QueueMap]'s storage via the newtype pattern; only push/pop/peek
/// change to LIFO semantics.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/map-type/BPF_MAP_TYPE_STACK/
#[derive(Debug)]
pub struct StackMap(QueueMap);
impl StackMap {
pub fn new(attr: &BpfMapMeta) -> Result<Self> {
QueueMap::new(attr).map(StackMap)
}
}
// Expose the inner QueueMap's fields (`data`, `max_entries`) directly.
impl Deref for StackMap {
    type Target = QueueMap;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
// Mutable counterpart of the Deref impl above.
impl DerefMut for StackMap {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
impl SpecialMap for StackMap {
    /// Push on top of the stack; when full, BPF_EXIST evicts the current
    /// top element before pushing.
    ///
    /// NOTE(review): Linux's queue_stack map drops the *oldest* (bottom)
    /// element when a full stack is pushed with BPF_EXIST, whereas this
    /// replaces the newest (top) — TODO confirm intended semantics.
    fn push(&mut self, value: BpfQueueValue, flags: BpfMapUpdateElemFlags) -> Result<()> {
        if self.data.len() == self.max_entries as usize {
            if flags.contains(BpfMapUpdateElemFlags::BPF_EXIST) {
                // remove the last element
                self.data.pop();
            } else {
                return Err(SystemError::ENOSPC);
            }
        }
        self.data.push(value);
        Ok(())
    }
    // LIFO: the back of the Vec is the top of the stack.
    fn pop(&mut self) -> Option<BpfQueueValue> {
        self.data.pop()
    }
    fn peek(&self) -> Option<&BpfQueueValue> {
        self.data.last()
    }
}
// Blanket impl: any queue/stack-style map automatically gets the common map
// interface. Keys are ignored — these map types are keyless.
impl<T: SpecialMap> BpfMapCommonOps for T {
    /// Equal to [QueueMap::peek]
    fn lookup_elem(&mut self, _key: &[u8]) -> Result<Option<&[u8]>> {
        Ok(self.peek().map(|v| v.as_slice()))
    }
    /// Equal to [QueueMap::push]
    fn update_elem(&mut self, _key: &[u8], value: &[u8], flags: u64) -> Result<()> {
        // Unknown flag bits are silently dropped by from_bits_truncate.
        let flag = BpfMapUpdateElemFlags::from_bits_truncate(flags);
        self.push(value.to_vec(), flag)
    }
    /// Equal to [QueueMap::pop]
    fn lookup_and_delete_elem(&mut self, _key: &[u8], value: &mut [u8]) -> Result<()> {
        // copy_from_slice panics if `value` is not exactly the element's
        // length; the syscall layer sizes it to value_size.
        if let Some(v) = self.pop() {
            value.copy_from_slice(&v);
            Ok(())
        } else {
            Err(SystemError::ENOENT)
        }
    }
    fn push_elem(&mut self, value: &[u8], flags: u64) -> Result<()> {
        self.update_elem(&[], value, flags)
    }
    fn pop_elem(&mut self, value: &mut [u8]) -> Result<()> {
        self.lookup_and_delete_elem(&[], value)
    }
    fn peek_elem(&self, value: &mut [u8]) -> Result<()> {
        self.peek()
            .map(|v| value.copy_from_slice(v))
            .ok_or(SystemError::ENOENT)
    }
}

100
kernel/src/bpf/map/util.rs Normal file
View File

@ -0,0 +1,100 @@
use crate::include::bindings::linux_bpf::{bpf_attr, bpf_map_type};
use alloc::string::{String, ToString};
use core::ffi::CStr;
use num_traits::FromPrimitive;
use system_error::SystemError;
/// Validated map-creation attributes, decoded from the raw `bpf_attr` union.
#[derive(Debug, Clone)]
pub struct BpfMapMeta {
    pub map_type: bpf_map_type,
    // Key size in bytes; 0 for keyless maps (queue/stack).
    pub key_size: u32,
    // Value size in bytes.
    pub value_size: u32,
    // Maximum number of elements the map may hold.
    pub max_entries: u32,
    // Creation flags; currently unused by the map implementations.
    pub _map_flags: u32,
    // Human-readable name supplied by user space; informational only.
    pub _map_name: String,
}
impl TryFrom<&bpf_attr> for BpfMapMeta {
    type Error = SystemError;
    /// Decode the BPF_MAP_CREATE variant of the `bpf_attr` union.
    ///
    /// Fails with `EINVAL` if the map name is not NUL-terminated valid
    /// UTF-8, or the map type number is unknown.
    fn try_from(value: &bpf_attr) -> Result<Self, Self::Error> {
        // SAFETY-relevant: `__bindgen_anon_1` is the map-create member of
        // the bindgen union; callers must only use this on BPF_MAP_CREATE.
        let u = unsafe { &value.__bindgen_anon_1 };
        let map_name_slice = unsafe {
            core::slice::from_raw_parts(u.map_name.as_ptr() as *const u8, u.map_name.len())
        };
        let map_name = CStr::from_bytes_until_nul(map_name_slice)
            .map_err(|_| SystemError::EINVAL)?
            .to_str()
            .map_err(|_| SystemError::EINVAL)?
            .to_string();
        let map_type = bpf_map_type::from_u32(u.map_type).ok_or(SystemError::EINVAL)?;
        Ok(BpfMapMeta {
            map_type,
            key_size: u.key_size,
            value_size: u.value_size,
            max_entries: u.max_entries,
            _map_flags: u.map_flags,
            _map_name: map_name,
        })
    }
}
/// Arguments for the element-level map commands (update/lookup/delete/...).
#[derive(Debug)]
pub struct BpfMapUpdateArg {
    pub map_fd: u32,
    // User-space pointer to the key buffer.
    pub key: u64,
    // User-space pointer to the value buffer.
    pub value: u64,
    pub flags: u64,
}
impl From<&bpf_attr> for BpfMapUpdateArg {
    /// Decode the element-operation variant of the `bpf_attr` union.
    fn from(value: &bpf_attr) -> Self {
        // SAFETY-relevant: `__bindgen_anon_2` is only meaningful for the
        // element-level map commands that call this conversion.
        unsafe {
            let u = &value.__bindgen_anon_2;
            BpfMapUpdateArg {
                map_fd: u.map_fd,
                key: u.key,
                value: u.__bindgen_anon_1.value,
                flags: u.flags,
            }
        }
    }
}
/// Arguments for BPF_MAP_GET_NEXT_KEY.
#[derive(Debug)]
pub struct BpfMapGetNextKeyArg {
    pub map_fd: u32,
    // User-space key pointer; `None` (NULL) requests the first key.
    pub key: Option<u64>,
    // User-space buffer the next key is written into.
    pub next_key: u64,
}
impl From<&bpf_attr> for BpfMapGetNextKeyArg {
    /// Decode the get-next-key variant of the `bpf_attr` union; a NULL key
    /// pointer becomes `None` ("return the first key").
    fn from(value: &bpf_attr) -> Self {
        unsafe {
            let u = &value.__bindgen_anon_2;
            BpfMapGetNextKeyArg {
                map_fd: u.map_fd,
                key: if u.key != 0 { Some(u.key) } else { None },
                next_key: u.__bindgen_anon_1.next_key,
            }
        }
    }
}
/// Round up `x` to the nearest multiple of `align`.
///
/// `align` must be a non-zero power of two — the bit trick below is only
/// correct in that case (e.g. `round_up(9, 4) == 12`, `round_up(8, 8) == 8`).
/// Passing a non-power-of-two silently produced wrong results before; it is
/// now caught in debug builds.
#[inline]
pub fn round_up(x: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two(), "round_up: align must be a power of two");
    (x + align - 1) & !(align - 1)
}
bitflags! {
    /// flags for BPF_MAP_UPDATE_ELEM command
    ///
    /// Mirrors the kernel's BPF_ANY/BPF_NOEXIST/BPF_EXIST/BPF_F_LOCK values.
    pub struct BpfMapUpdateElemFlags: u64 {
        /// create new element or update existing
        const BPF_ANY = 0;
        /// create new element if it didn't exist
        const BPF_NOEXIST = 1;
        /// update existing element
        const BPF_EXIST = 2;
        /// spin_lock-ed map_lookup/map_update
        const BPF_F_LOCK = 4;
    }
}

50
kernel/src/bpf/mod.rs Normal file
View File

@ -0,0 +1,50 @@
pub mod helper;
pub mod map;
pub mod prog;
use crate::include::bindings::linux_bpf::{bpf_attr, bpf_cmd};
use crate::syscall::user_access::UserBufferReader;
use crate::syscall::Syscall;
use log::error;
use num_traits::FromPrimitive;
use system_error::SystemError;
type Result<T> = core::result::Result<T, SystemError>;
impl Syscall {
    /// Entry point of the bpf() syscall.
    ///
    /// Copies the `size`-byte attribute union from user space, validates the
    /// command number, and dispatches to [bpf].
    pub fn sys_bpf(cmd: u32, attr: *mut u8, size: u32) -> Result<usize> {
        let buf = UserBufferReader::new(attr, size as usize, true)?;
        let attr = buf.read_one_from_user::<bpf_attr>(0)?;
        let cmd = bpf_cmd::from_u32(cmd).ok_or(SystemError::EINVAL)?;
        bpf(cmd, attr)
    }
}
pub fn bpf(cmd: bpf_cmd, attr: &bpf_attr) -> Result<usize> {
let res = match cmd {
// Map related commands
bpf_cmd::BPF_MAP_CREATE => map::bpf_map_create(attr),
bpf_cmd::BPF_MAP_UPDATE_ELEM => map::bpf_map_update_elem(attr),
bpf_cmd::BPF_MAP_LOOKUP_ELEM => map::bpf_lookup_elem(attr),
bpf_cmd::BPF_MAP_GET_NEXT_KEY => map::bpf_map_get_next_key(attr),
bpf_cmd::BPF_MAP_DELETE_ELEM => map::bpf_map_delete_elem(attr),
bpf_cmd::BPF_MAP_LOOKUP_AND_DELETE_ELEM => map::bpf_map_lookup_and_delete_elem(attr),
bpf_cmd::BPF_MAP_LOOKUP_BATCH => map::bpf_map_lookup_batch(attr),
bpf_cmd::BPF_MAP_FREEZE => map::bpf_map_freeze(attr),
// Program related commands
bpf_cmd::BPF_PROG_LOAD => prog::bpf_prog_load(attr),
// Object creation commands
bpf_cmd::BPF_BTF_LOAD => {
error!("bpf cmd {:?} not implemented", cmd);
return Err(SystemError::ENOSYS);
}
ty => {
unimplemented!("bpf cmd {:?} not implemented", ty)
}
};
res
}
/// Initialize the BPF system
///
/// Currently this only registers the eBPF helper-function table; map and
/// program state is created lazily per syscall.
pub fn init_bpf_system() {
    helper::init_helper_functions();
}

123
kernel/src/bpf/prog/mod.rs Normal file
View File

@ -0,0 +1,123 @@
mod util;
mod verifier;
use super::Result;
use crate::bpf::map::BpfMap;
use crate::bpf::prog::util::{BpfProgMeta, BpfProgVerifierInfo};
use crate::bpf::prog::verifier::BpfProgVerifier;
use crate::filesystem::vfs::file::{File, FileMode};
use crate::filesystem::vfs::syscall::ModeType;
use crate::filesystem::vfs::{FilePrivateData, FileSystem, FileType, IndexNode, Metadata};
use crate::include::bindings::linux_bpf::bpf_attr;
use crate::libs::spinlock::SpinLockGuard;
use crate::process::ProcessManager;
use alloc::string::String;
use alloc::sync::Arc;
use alloc::vec::Vec;
use core::any::Any;
use system_error::SystemError;
/// A loaded (and verified) eBPF program, exposed to user space as an fd.
#[derive(Debug)]
pub struct BpfProg {
    meta: BpfProgMeta,
    // Raw pointers to the Arc<BpfMap>s this program references after
    // relocation; each is released exactly once in Drop.
    raw_file_ptr: Vec<usize>,
}
impl BpfProg {
    /// Wrap verified program metadata; no maps are referenced yet.
    pub fn new(meta: BpfProgMeta) -> Self {
        Self {
            meta,
            raw_file_ptr: Vec::new(),
        }
    }
    /// Raw instruction bytes (8 bytes per eBPF instruction).
    pub fn insns(&self) -> &[u8] {
        &self.meta.insns
    }
    /// Mutable instruction bytes; used by relocation to patch map addresses.
    pub fn insns_mut(&mut self) -> &mut [u8] {
        &mut self.meta.insns
    }
    /// Record a raw map pointer so the reference can be released in Drop.
    /// The pointer must stay valid for the lifetime of this program.
    pub fn insert_map(&mut self, map_ptr: usize) {
        self.raw_file_ptr.push(map_ptr);
    }
}
// Like BpfMap, a program only exists as a file descriptor: open/close are
// no-ops and byte-stream read/write are rejected.
impl IndexNode for BpfProg {
    fn open(&self, _data: SpinLockGuard<FilePrivateData>, _mode: &FileMode) -> Result<()> {
        Ok(())
    }
    fn close(&self, _data: SpinLockGuard<FilePrivateData>) -> Result<()> {
        Ok(())
    }
    fn read_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &mut [u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    fn write_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &[u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    fn metadata(&self) -> Result<Metadata> {
        let meta = Metadata {
            mode: ModeType::from_bits_truncate(0o755),
            file_type: FileType::File,
            ..Default::default()
        };
        Ok(meta)
    }
    fn resize(&self, _len: usize) -> Result<()> {
        Ok(())
    }
    // NOTE: calling fs() on a program fd panics — programs live outside any
    // filesystem.
    fn fs(&self) -> Arc<dyn FileSystem> {
        panic!("BpfProg does not have a filesystem")
    }
    fn as_any_ref(&self) -> &dyn Any {
        self
    }
    fn list(&self) -> Result<Vec<String>> {
        Err(SystemError::ENOSYS)
    }
}
impl Drop for BpfProg {
    /// Release the map references the program took during relocation.
    fn drop(&mut self) {
        // SAFETY: each entry in `raw_file_ptr` is presumably a pointer
        // produced by `Arc::into_raw` on an `Arc<BpfMap>` during relocation
        // (see `insert_map`), and is reconstructed exactly once here —
        // TODO confirm against the verifier's relocation code.
        unsafe {
            for ptr in self.raw_file_ptr.iter() {
                let file = Arc::from_raw(*ptr as *const u8 as *const BpfMap);
                drop(file)
            }
        }
    }
}
/// Load a BPF program into the kernel.
///
/// Decodes the program attributes, runs the verifier (which also performs
/// map relocation), and installs the verified program as a new fd in the
/// current process's fd table.
///
/// See https://ebpf-docs.dylanreimerink.nl/linux/syscall/BPF_PROG_LOAD/
pub fn bpf_prog_load(attr: &bpf_attr) -> Result<usize> {
    let args = BpfProgMeta::try_from(attr)?;
    // info!("bpf_prog_load: {:#?}", args);
    let log_info = BpfProgVerifierInfo::from(attr);
    let prog = BpfProg::new(args);
    let fd_table = ProcessManager::current_pcb().fd_table();
    // NOTE: the user's verifier log buffer is not wired through yet; an
    // empty slice is passed instead.
    let prog = BpfProgVerifier::new(prog, log_info.log_level, &mut []).verify(&fd_table)?;
    let file = File::new(Arc::new(prog), FileMode::O_RDWR)?;
    let fd = fd_table.write().alloc_fd(file, None).map(|x| x as usize)?;
    Ok(fd)
}

112
kernel/src/bpf/prog/util.rs Normal file
View File

@ -0,0 +1,112 @@
use crate::include::bindings::linux_bpf::{bpf_attach_type, bpf_attr, bpf_prog_type};
use crate::syscall::user_access::{check_and_clone_cstr, UserBufferReader};
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::ffi::CStr;
use core::fmt::Debug;
use num_traits::FromPrimitive;
use system_error::SystemError;
bitflags::bitflags! {
    /// Verbosity flags for the verifier log (mirrors the kernel's
    /// BPF_LOG_LEVEL* values).
    pub struct VerifierLogLevel: u32 {
        /// Sets no verifier logging.
        const DISABLE = 0;
        /// Enables debug verifier logging.
        const DEBUG = 1;
        /// Enables verbose verifier logging (implies DEBUG).
        const VERBOSE = 2 | Self::DEBUG.bits();
        /// Enables verifier stats.
        const STATS = 4;
    }
}
/// Verifier-log parameters supplied by user space with BPF_PROG_LOAD.
#[derive(Debug)]
pub struct BpfProgVerifierInfo {
    /// This attribute specifies the level/detail of the log output. Valid values are.
    pub log_level: VerifierLogLevel,
    /// This attributes indicates the size of the memory region in bytes
    /// indicated by `log_buf` which can safely be written to by the kernel.
    pub _log_buf_size: u32,
    /// This attributes can be set to a pointer to a memory region
    /// allocated/reserved by the loader process where the verifier log will
    /// be written to.
    /// The detail of the log is set by log_level. The verifier log
    /// is often the only indication in addition to the error code of
    /// why the syscall command failed to load the program.
    ///
    /// The log is also written to on success. If the kernel runs out of
    /// space in the buffer while loading, the loading process will fail
    /// and the command will return with an error code of -ENOSPC. So it
    /// is important to correctly size the buffer when enabling logging.
    pub _log_buf_ptr: usize,
}
impl From<&bpf_attr> for BpfProgVerifierInfo {
    /// Decode the BPF_PROG_LOAD variant of the `bpf_attr` union.
    fn from(attr: &bpf_attr) -> Self {
        // SAFETY-relevant: `__bindgen_anon_3` is the prog-load member of the
        // bindgen union; only the BPF_PROG_LOAD path calls this conversion.
        unsafe {
            let u = &attr.__bindgen_anon_3;
            Self {
                log_level: VerifierLogLevel::from_bits_truncate(u.log_level),
                _log_buf_size: u.log_size,
                _log_buf_ptr: u.log_buf as usize,
            }
        }
    }
}
/// Decoded BPF_PROG_LOAD attributes: program type, instructions and
/// identifying metadata.
pub struct BpfProgMeta {
    pub prog_flags: u32,
    pub prog_type: bpf_prog_type,
    pub expected_attach_type: bpf_attach_type,
    // Raw instruction bytes; 8 bytes per eBPF instruction.
    pub insns: Vec<u8>,
    pub license: String,
    pub kern_version: u32,
    pub name: String,
}
// Manual Debug: the raw instruction bytes are summarized as an instruction
// count (len / 8, one eBPF instruction being 8 bytes) instead of dumped.
impl Debug for BpfProgMeta {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("BpfProgMeta")
            .field("prog_flags", &self.prog_flags)
            .field("prog_type", &self.prog_type)
            .field("expected_attach_type", &self.expected_attach_type)
            .field("insns_len", &(self.insns.len() / 8))
            .field("license", &self.license)
            .field("kern_version", &self.kern_version)
            .field("name", &self.name)
            .finish()
    }
}
impl TryFrom<&bpf_attr> for BpfProgMeta {
    type Error = SystemError;
    /// Decode and copy-in the BPF_PROG_LOAD attributes.
    ///
    /// Fails with `EINVAL` on unknown program/attach types, a bad name, or
    /// a non-UTF-8 license string.
    fn try_from(attr: &bpf_attr) -> Result<Self, Self::Error> {
        let u = unsafe { &attr.__bindgen_anon_3 };
        let prog_type = bpf_prog_type::from_u32(u.prog_type).ok_or(SystemError::EINVAL)?;
        let expected_attach_type =
            bpf_attach_type::from_u32(u.expected_attach_type).ok_or(SystemError::EINVAL)?;
        unsafe {
            // Each eBPF instruction is 8 bytes, so insn_cnt * 8 bytes are
            // copied from user space.
            // NOTE(review): `u.insn_cnt as usize * 8` could overflow for a
            // hostile insn_cnt on 32-bit targets — TODO confirm the buffer
            // reader rejects such sizes.
            let insns_buf =
                UserBufferReader::new(u.insns as *mut u8, u.insn_cnt as usize * 8, true)?;
            let insns = insns_buf.read_from_user::<u8>(0)?.to_vec();
            let name_slice =
                core::slice::from_raw_parts(u.prog_name.as_ptr() as *const u8, u.prog_name.len());
            let prog_name = CStr::from_bytes_until_nul(name_slice)
                .map_err(|_| SystemError::EINVAL)?
                .to_str()
                .map_err(|_| SystemError::EINVAL)?
                .to_string();
            let license = check_and_clone_cstr(u.license as *const u8, None)?;
            Ok(Self {
                prog_flags: u.prog_flags,
                prog_type,
                expected_attach_type,
                insns,
                license: license.into_string().map_err(|_| SystemError::EINVAL)?,
                kern_version: u.kern_version,
                name: prog_name,
            })
        }
    }
}

View File

@ -0,0 +1,131 @@
use super::super::Result;
use crate::bpf::map::BpfMap;
use crate::bpf::prog::util::VerifierLogLevel;
use crate::bpf::prog::BpfProg;
use crate::filesystem::vfs::file::FileDescriptorVec;
use crate::include::bindings::linux_bpf::*;
use crate::libs::casting::DowncastArc;
use crate::libs::rwlock::RwLock;
use alloc::{sync::Arc, vec::Vec};
use log::{error, info};
use rbpf::ebpf;
use rbpf::ebpf::to_insn_vec;
use system_error::SystemError;
/// The BPF program verifier.
///
/// See https://docs.kernel.org/bpf/verifier.html
#[derive(Debug)]
pub struct BpfProgVerifier<'a> {
    // Program being verified; returned to the caller on success.
    prog: BpfProg,
    // Requested verbosity of the verifier log (not emitted yet).
    _log_level: VerifierLogLevel,
    // User-supplied buffer for the verifier log (not written yet).
    _log_buf: &'a mut [u8],
}
impl<'a> BpfProgVerifier<'a> {
pub fn new(prog: BpfProg, log_level: VerifierLogLevel, log_buf: &'a mut [u8]) -> Self {
Self {
prog,
_log_level: log_level,
_log_buf: log_buf,
}
}
/// Relocate the program.
///
/// This function will relocate the program, and update the program's instructions.
fn relocation(&mut self, fd_table: &Arc<RwLock<FileDescriptorVec>>) -> Result<()> {
let instructions = self.prog.insns_mut();
let mut fmt_insn = to_insn_vec(instructions);
let mut index = 0;
let mut raw_file_ptr = vec![];
loop {
if index >= fmt_insn.len() {
break;
}
let mut insn = fmt_insn[index].clone();
if insn.opc == ebpf::LD_DW_IMM {
// relocate the instruction
let mut next_insn = fmt_insn[index + 1].clone();
// the imm is the map_fd because user lib has already done the relocation
let map_fd = insn.imm as usize;
let src_reg = insn.src;
// See https://www.kernel.org/doc/html/latest/bpf/standardization/instruction-set.html#id23
let ptr = match src_reg as u32 {
BPF_PSEUDO_MAP_VALUE => {
// dst = map_val(map_by_fd(imm)) + next_imm
// map_val(map) gets the address of the first value in a given map
let file = fd_table
.read()
.get_file_by_fd(map_fd as i32)
.ok_or(SystemError::EBADF)?;
let bpf_map = file
.inode()
.downcast_arc::<BpfMap>()
.ok_or(SystemError::EINVAL)?;
let first_value_ptr =
bpf_map.inner_map().lock().first_value_ptr()? as usize;
let offset = next_insn.imm as usize;
info!(
"Relocate for BPF_PSEUDO_MAP_VALUE, instruction index: {}, map_fd: {}",
index, map_fd
);
Some(first_value_ptr + offset)
}
BPF_PSEUDO_MAP_FD => {
// dst = map_by_fd(imm)
// map_by_fd(imm) means to convert a 32-bit file descriptor into an address of a map
let bpf_map = fd_table
.read()
.get_file_by_fd(map_fd as i32)
.ok_or(SystemError::EBADF)?
.inode()
.downcast_arc::<BpfMap>()
.ok_or(SystemError::EINVAL)?;
// todo!(warning: We need release after prog unload)
let map_ptr = Arc::into_raw(bpf_map) as usize;
info!(
"Relocate for BPF_PSEUDO_MAP_FD, instruction index: {}, map_fd: {}, ptr: {:#x}",
index, map_fd, map_ptr
);
raw_file_ptr.push(map_ptr);
Some(map_ptr)
}
ty => {
error!(
"relocation for ty: {} not implemented, instruction index: {}",
ty, index
);
None
}
};
if let Some(ptr) = ptr {
// The current ins store the map_data_ptr low 32 bits,
// the next ins store the map_data_ptr high 32 bits
insn.imm = ptr as i32;
next_insn.imm = (ptr >> 32) as i32;
fmt_insn[index] = insn;
fmt_insn[index + 1] = next_insn;
index += 2;
} else {
index += 1;
}
} else {
index += 1;
}
}
let fmt_insn = fmt_insn
.iter()
.flat_map(|ins| ins.to_vec())
.collect::<Vec<u8>>();
instructions.copy_from_slice(&fmt_insn);
for ptr in raw_file_ptr {
self.prog.insert_map(ptr);
}
Ok(())
}
pub fn verify(mut self, fd_table: &Arc<RwLock<FileDescriptorVec>>) -> Result<BpfProg> {
self.relocation(fd_table)?;
Ok(self.prog)
}
}

View File

@ -11,3 +11,12 @@ static inline int strlen(const char *s) {
}
return __res;
}
// Compare two NUL-terminated strings.
//
// Matches the C standard's strcmp(): bytes are compared as unsigned char,
// so the sign of the result is well-defined even for bytes >= 0x80 on
// platforms where plain `char` is signed.
static inline int strcmp(const char *s1, const char *s2) {
    // If *s1 is non-zero and equal to *s2, then *s2 is non-zero too,
    // so a single test per iteration suffices.
    while (*s1 && *s1 == *s2) {
        ++s1;
        ++s2;
    }
    return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
}

View File

@ -0,0 +1,66 @@
use alloc::boxed::Box;
use alloc::string::String;
use kprobe::{CallBackFunc, KprobeBuilder, ProbeArgs};
use log::warn;
use system_error::SystemError;
/// Everything needed to build and install a kprobe.
pub struct KprobeInfo {
    // Invoked before the probed instruction executes.
    pub pre_handler: fn(&dyn ProbeArgs),
    // Invoked after the single-stepped instruction completes.
    pub post_handler: fn(&dyn ProbeArgs),
    // Optional handler for faults raised while the probe is active.
    pub fault_handler: Option<fn(&dyn ProbeArgs)>,
    // Optional callback fired together with the post-handler.
    pub event_callback: Option<Box<dyn CallBackFunc>>,
    // Symbol to probe; mutually exclusive with `addr`.
    pub symbol: Option<String>,
    // Explicit address to probe; mutually exclusive with `symbol`.
    pub addr: Option<usize>,
    // Byte offset added to the resolved probe address.
    pub offset: usize,
    // Whether the probe starts in the enabled state.
    pub enable: bool,
}
extern "C" {
fn addr_from_symbol(symbol: *const u8) -> usize;
}
impl TryFrom<KprobeInfo> for KprobeBuilder {
type Error = SystemError;
fn try_from(kprobe_info: KprobeInfo) -> Result<Self, Self::Error> {
// 检查参数: symbol和addr必须有一个但不能同时有
if kprobe_info.symbol.is_none() && kprobe_info.addr.is_none() {
return Err(SystemError::EINVAL);
}
if kprobe_info.symbol.is_some() && kprobe_info.addr.is_some() {
return Err(SystemError::EINVAL);
}
let func_addr = if let Some(symbol) = kprobe_info.symbol.clone() {
let mut symbol_sting = symbol;
if !symbol_sting.ends_with("\0") {
symbol_sting.push('\0');
}
let symbol = symbol_sting.as_ptr();
let func_addr = unsafe { addr_from_symbol(symbol) };
if func_addr == 0 {
warn!(
"register_kprobe: the symbol: {:?} not found",
kprobe_info.symbol
);
return Err(SystemError::ENXIO);
}
func_addr
} else {
kprobe_info.addr.unwrap()
};
let mut builder = KprobeBuilder::new(
kprobe_info.symbol,
func_addr,
kprobe_info.offset,
kprobe_info.pre_handler,
kprobe_info.post_handler,
kprobe_info.enable,
);
if let Some(fault_handler) = kprobe_info.fault_handler {
builder = builder.with_fault_handler(fault_handler);
}
if let Some(event_callback) = kprobe_info.event_callback {
builder = builder.with_event_callback(event_callback);
}
Ok(builder)
}
}

View File

@ -0,0 +1,183 @@
use crate::debug::kprobe::args::KprobeInfo;
use crate::libs::rwlock::RwLock;
use crate::libs::spinlock::SpinLock;
use alloc::collections::BTreeMap;
use alloc::sync::Arc;
use alloc::vec::Vec;
use kprobe::{Kprobe, KprobeBuilder, KprobeOps, KprobePoint};
use system_error::SystemError;
pub mod args;
#[cfg(feature = "kprobe_test")]
mod test;
/// A kprobe shared behind a read-write lock.
pub type LockKprobe = Arc<RwLock<Kprobe>>;
/// Global registry of installed kprobes, keyed by their break/debug addresses.
pub static KPROBE_MANAGER: SpinLock<KprobeManager> = SpinLock::new(KprobeManager::new());
/// One shared probe point per instrumented address, reused by every kprobe on it.
static KPROBE_POINT_LIST: SpinLock<BTreeMap<usize, Arc<KprobePoint>>> =
    SpinLock::new(BTreeMap::new());
/// Manages every installed kprobe probe point.
#[derive(Debug, Default)]
pub struct KprobeManager {
    // Keyed by break_address(): kprobes fired by the breakpoint exception.
    break_list: BTreeMap<usize, Vec<LockKprobe>>,
    // Keyed by debug_address(): kprobes resumed by the single-step exception.
    debug_list: BTreeMap<usize, Vec<LockKprobe>>,
}
impl KprobeManager {
    pub const fn new() -> Self {
        KprobeManager {
            break_list: BTreeMap::new(),
            debug_list: BTreeMap::new(),
        }
    }
    /// Register `kprobe` under both of its probe-point addresses.
    ///
    /// ## Parameters
    /// - `kprobe`: the kprobe instance
    pub fn insert_kprobe(&mut self, kprobe: LockKprobe) {
        let probe_point = kprobe.read().probe_point().clone();
        self.insert_break_point(probe_point.break_address(), kprobe.clone());
        self.insert_debug_point(probe_point.debug_address(), kprobe);
    }
    /// Append `kprobe` to the list for the breakpoint address
    /// (`KprobePoint::break_address()` / `KprobeBuilder::probe_addr()`).
    fn insert_break_point(&mut self, address: usize, kprobe: LockKprobe) {
        self.break_list.entry(address).or_default().push(kprobe);
    }
    /// Append `kprobe` to the list for its single-step address
    /// (`KprobePoint::debug_address()`).
    fn insert_debug_point(&mut self, address: usize, kprobe: LockKprobe) {
        self.debug_list.entry(address).or_default().push(kprobe);
    }
    /// Kprobes registered on breakpoint `address`, if any.
    pub fn get_break_list(&self, address: usize) -> Option<&Vec<LockKprobe>> {
        self.break_list.get(&address)
    }
    /// Kprobes registered on single-step `address`, if any.
    pub fn get_debug_list(&self, address: usize) -> Option<&Vec<LockKprobe>> {
        self.debug_list.get(&address)
    }
    /// Number of kprobes registered on breakpoint `address`.
    pub fn kprobe_num(&self, address: usize) -> usize {
        self.break_list_len(address)
    }
    #[inline]
    fn break_list_len(&self, address: usize) -> usize {
        self.break_list
            .get(&address)
            .map(|list| list.len())
            .unwrap_or(0)
    }
    #[inline]
    fn debug_list_len(&self, address: usize) -> usize {
        self.debug_list
            .get(&address)
            .map(|list| list.len())
            .unwrap_or(0)
    }
    /// Unregister `kprobe` from both of its probe-point addresses.
    ///
    /// ## Parameters
    /// - `kprobe`: the kprobe instance
    pub fn remove_kprobe(&mut self, kprobe: &LockKprobe) {
        let probe_point = kprobe.read().probe_point().clone();
        self.remove_one_break(probe_point.break_address(), kprobe);
        self.remove_one_debug(probe_point.debug_address(), kprobe);
    }
    /// Remove `kprobe` from the break list at `address`, dropping the map
    /// entry once the list becomes empty.
    ///
    /// Uses a single `get_mut` lookup instead of a retain followed by a
    /// second length query.
    fn remove_one_break(&mut self, address: usize, kprobe: &LockKprobe) {
        if let Some(list) = self.break_list.get_mut(&address) {
            list.retain(|x| !Arc::ptr_eq(x, kprobe));
            if list.is_empty() {
                self.break_list.remove(&address);
            }
        }
    }
    /// Remove `kprobe` from the debug list at `address`, dropping the map
    /// entry once no kprobe remains there.
    fn remove_one_debug(&mut self, address: usize, kprobe: &LockKprobe) {
        if let Some(list) = self.debug_list.get_mut(&address) {
            list.retain(|x| !Arc::ptr_eq(x, kprobe));
            if list.is_empty() {
                self.debug_list.remove(&address);
            }
        }
    }
}
#[cfg(feature = "kprobe_test")]
#[allow(unused)]
/// This function is only used for testing kprobe.
/// Delegates to the `test` module's self-test routine.
pub fn kprobe_test() {
    test::kprobe_test();
}
/// # Register a kprobe
///
/// Resolves `symbol` to an address (or uses the explicit address) and
/// installs a probe there. If a probe point already exists at that address it
/// is shared instead of instrumenting the instruction again.
///
/// ## Parameters
/// - `kprobe_info`: description of the probe to install
pub fn register_kprobe(kprobe_info: KprobeInfo) -> Result<LockKprobe, SystemError> {
    let kprobe_builder = KprobeBuilder::try_from(kprobe_info)?;
    let address = kprobe_builder.probe_addr();
    // Reuse the probe point if this address is already instrumented.
    let existed_point = KPROBE_POINT_LIST.lock().get(&address).map(Clone::clone);
    let kprobe = match existed_point {
        Some(existed_point) => {
            kprobe_builder
                .with_probe_point(existed_point.clone())
                .install()
                .0
        }
        None => {
            let (kprobe, probe_point) = kprobe_builder.install();
            KPROBE_POINT_LIST.lock().insert(address, probe_point);
            kprobe
        }
    };
    let kprobe = Arc::new(RwLock::new(kprobe));
    KPROBE_MANAGER.lock().insert_kprobe(kprobe.clone());
    Ok(kprobe)
}
/// # Unregister a kprobe
///
/// Removes `kprobe` from the global manager; if it was the last kprobe on its
/// address, the shared probe point is freed as well.
///
/// ## Parameters
/// - `kprobe`: the installed kprobe
pub fn unregister_kprobe(kprobe: LockKprobe) {
    let kprobe_addr = kprobe.read().probe_point().break_address();
    // Hold the manager lock across both the removal and the count check so a
    // concurrent register_kprobe() cannot slip in between and then have its
    // probe point removed from KPROBE_POINT_LIST underneath it.
    let mut manager = KPROBE_MANAGER.lock();
    manager.remove_kprobe(&kprobe);
    // If no other kprobe remains on this address, drop the probe point.
    if manager.kprobe_num(kprobe_addr) == 0 {
        KPROBE_POINT_LIST.lock().remove(&kprobe_addr);
    }
}

View File

@ -0,0 +1,84 @@
use crate::arch::interrupt::TrapFrame;
use crate::debug::kprobe::{register_kprobe, unregister_kprobe, KprobeInfo};
use alloc::string::ToString;
use kprobe::ProbeArgs;
use log::info;
/// Probe target for the kprobe self-test; `#[inline(never)]` guarantees a
/// real, probe-able function symbol exists in the binary.
#[inline(never)]
fn detect_func(x: usize, y: usize) -> usize {
    let hart = 0;
    info!("detect_func: hart_id: {}, x: {}, y:{}", hart, x, y);
    hart
}
/// Test pre-handler: runs before the probed instruction, logging the
/// trap-frame address.
fn pre_handler(regs: &dyn ProbeArgs) {
    let pt_regs = regs.as_any().downcast_ref::<TrapFrame>().unwrap();
    info!(
        "call pre_handler, the sp is {:#x}",
        pt_regs as *const _ as usize
    );
}
/// Test post-handler: runs after the single-stepped instruction completes.
fn post_handler(regs: &dyn ProbeArgs) {
    let pt_regs = regs.as_any().downcast_ref::<TrapFrame>().unwrap();
    info!(
        "call post_handler, the sp is {:#x}",
        pt_regs as *const _ as usize
    );
}
/// Test fault-handler: logs the trap-frame address when a fault occurs while
/// the probe is active.
fn fault_handler(regs: &dyn ProbeArgs) {
    let pt_regs = regs.as_any().downcast_ref::<TrapFrame>().unwrap();
    info!(
        "call fault_handler, the sp is {:#x}",
        pt_regs as *const _ as usize
    );
}
/// End-to-end smoke test: install two kprobes (one by raw address, one by
/// symbol name) on `detect_func`, trigger them, unregister both, and trigger
/// again to confirm the handlers no longer fire.
pub fn kprobe_test() {
    info!("kprobe test for [detect_func]: {:#x}", detect_func as usize);
    // First probe: located by raw address.
    let kprobe_info = KprobeInfo {
        pre_handler,
        post_handler,
        fault_handler: Some(fault_handler),
        event_callback: None,
        symbol: None,
        addr: Some(detect_func as usize),
        offset: 0,
        enable: true,
    };
    let kprobe = register_kprobe(kprobe_info).unwrap();
    let new_pre_handler = |regs: &dyn ProbeArgs| {
        let pt_regs = regs.as_any().downcast_ref::<TrapFrame>().unwrap();
        info!(
            "call new pre_handler, the sp is {:#x}",
            pt_regs as *const _ as usize
        );
    };
    // Second probe: located by symbol name; shares the same probe point.
    let kprobe_info = KprobeInfo {
        pre_handler: new_pre_handler,
        post_handler,
        fault_handler: Some(fault_handler),
        event_callback: None,
        symbol: Some("dragonos_kernel::debug::kprobe::test::detect_func".to_string()),
        addr: None,
        offset: 0,
        enable: true,
    };
    let kprobe2 = register_kprobe(kprobe_info).unwrap();
    info!(
        "install 2 kprobes at [detect_func]: {:#x}",
        detect_func as usize
    );
    // Both probes' handlers should fire on this call.
    detect_func(1, 2);
    unregister_kprobe(kprobe);
    unregister_kprobe(kprobe2);
    info!(
        "uninstall 2 kprobes at [detect_func]: {:#x}",
        detect_func as usize
    );
    // No handlers should fire after unregistration.
    detect_func(1, 2);
    info!("kprobe test end");
}

View File

@ -1 +1,2 @@
pub mod klog;
pub mod kprobe;

View File

@ -1,5 +1,6 @@
#include "traceback.h"
#include <common/printk.h>
#include <common/string.h>
#include <process/process.h>
int lookup_kallsyms(uint64_t addr, int level)
@ -26,6 +27,18 @@ int lookup_kallsyms(uint64_t addr, int level)
return -1;
}
// Resolve a kernel symbol name to its address via the kallsyms tables.
// Returns 0 when the symbol is not found.
uint64_t addr_from_symbol(const char *symbol)
{
    const char *str = (const char *)&kallsyms_names;
    // Linear scan: kallsyms_names_index[i] is the offset of the i-th name.
    for (uint64_t i = 0; i < kallsyms_num; ++i)
    {
        if (strcmp(&str[kallsyms_names_index[i]], symbol) == 0)
            return kallsyms_address[i];
    }
    return 0;
}
/**
* @brief 追溯内核栈调用情况
*

View File

@ -14,4 +14,5 @@ extern const char *kallsyms_names __attribute__((weak));
*
* @param regs 内核栈结构体
*/
void traceback(struct pt_regs *regs);
void traceback(struct pt_regs *regs);
uint64_t addr_from_symbol(const char *symbol);

View File

@ -11,8 +11,6 @@ use crate::{
libs::rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard},
};
use system_error::SystemError;
use super::{
class::Class,
device::{
@ -24,29 +22,36 @@ use super::{
kset::KSet,
subsys::SubSysPrivate,
};
use crate::filesystem::sysfs::file::sysfs_emit_str;
use crate::filesystem::sysfs::{Attribute, AttributeGroup, SysFSOpsSupport};
use crate::filesystem::vfs::syscall::ModeType;
use crate::libs::lazy_init::Lazy;
use system_error::SystemError;
#[inline(always)]
pub fn cpu_device_manager() -> &'static CpuDeviceManager {
return &CpuDeviceManager;
}
static CPU_DEVICE_MANAGER: Lazy<CpuDeviceManager> = Lazy::new();
#[derive(Debug)]
pub struct CpuDeviceManager;
pub struct CpuDeviceManager {
_root_device: Arc<CpuSubSystemFakeRootDevice>,
}
impl CpuDeviceManager {
/// 初始化设备驱动模型的CPU子系统
///
/// 参考 https://code.dragonos.org.cn/xref/linux-6.1.9/drivers/base/cpu.c?fi=get_cpu_device#622
pub fn init(&self) -> Result<(), SystemError> {
pub fn init() -> Result<(), SystemError> {
let cpu_subsys = CpuSubSystem::new();
let root_device = CpuSubSystemFakeRootDevice::new();
subsystem_manager()
.subsys_system_register(
&(cpu_subsys as Arc<dyn Bus>),
&(root_device as Arc<dyn Device>),
&(root_device.clone() as Arc<dyn Device>),
)
.expect("register cpu subsys failed");
let manager = Self {
_root_device: root_device,
};
CPU_DEVICE_MANAGER.init(manager);
return Ok(());
}
}
@ -190,6 +195,10 @@ impl Device for CpuSubSystemFakeRootDevice {
fn set_dev_parent(&self, dev_parent: Option<Weak<dyn Device>>) {
self.inner.write().device_common.parent = dev_parent;
}
fn attribute_groups(&self) -> Option<&'static [&'static dyn AttributeGroup]> {
Some(&[&AttrGroupCpu])
}
}
impl KObject for CpuSubSystemFakeRootDevice {
@ -249,3 +258,70 @@ impl KObject for CpuSubSystemFakeRootDevice {
*self.kobj_state_mut() = state;
}
}
/// sysfs attribute group attached to the CPU subsystem root device
/// (/sys/devices/system/cpu).
#[derive(Debug)]
pub struct AttrGroupCpu;
impl AttributeGroup for AttrGroupCpu {
    // No sub-directory: attributes appear directly under the cpu node.
    fn name(&self) -> Option<&str> {
        None
    }
    fn attrs(&self) -> &[&'static dyn Attribute] {
        &[&AttrCpuPossible, &AttrCpuOnline]
    }
    // Returning None keeps each attribute's own mode().
    fn is_visible(
        &self,
        _kobj: Arc<dyn KObject>,
        _attr: &'static dyn Attribute,
    ) -> Option<ModeType> {
        None
    }
}
/// /sys/devices/system/cpu/possible — the range of CPUs that could ever be
/// brought up, formatted Linux-style as "0-N".
#[derive(Debug)]
pub struct AttrCpuPossible;
impl Attribute for AttrCpuPossible {
    fn name(&self) -> &str {
        "possible"
    }
    fn mode(&self) -> ModeType {
        ModeType::S_IRUGO
    }
    fn support(&self) -> SysFSOpsSupport {
        SysFSOpsSupport::ATTR_SHOW
    }
    fn show(&self, _kobj: Arc<dyn KObject>, buf: &mut [u8]) -> Result<usize, SystemError> {
        let cpu_manager = crate::smp::cpu::smp_cpu_manager();
        // NOTE(review): assumes at least one possible CPU — `cpus - 1` would
        // underflow on zero; confirm the cpu manager guarantees this.
        let cpus = cpu_manager.possible_cpus_count();
        let data = format!("0-{}", cpus - 1);
        sysfs_emit_str(buf, &data)
    }
}
/// /sys/devices/system/cpu/online — formatted Linux-style as "0-N".
#[derive(Debug)]
pub struct AttrCpuOnline;
impl Attribute for AttrCpuOnline {
    fn name(&self) -> &str {
        "online"
    }
    fn mode(&self) -> ModeType {
        ModeType::S_IRUGO
    }
    fn support(&self) -> SysFSOpsSupport {
        SysFSOpsSupport::ATTR_SHOW
    }
    fn show(&self, _kobj: Arc<dyn KObject>, buf: &mut [u8]) -> Result<usize, SystemError> {
        let cpu_manager = crate::smp::cpu::smp_cpu_manager();
        // NOTE(review): this reports present_cpus_count, not a true "online"
        // count — acceptable only if present == online here; verify.
        let cpus = cpu_manager.present_cpus_count();
        let data = format!("0-{}", cpus - 1);
        sysfs_emit_str(buf, &data)
    }
}

View File

@ -3,7 +3,7 @@ use system_error::SystemError;
use super::{
class::classes_init,
cpu::cpu_device_manager,
cpu::CpuDeviceManager,
device::{bus::buses_init, init::devices_init},
firmware::firmware_init,
hypervisor::hypervisor_init,
@ -20,7 +20,7 @@ pub fn driver_init() -> Result<(), SystemError> {
hypervisor_init()?;
platform_bus_init()?;
serio_bus_init()?;
cpu_device_manager().init()?;
CpuDeviceManager::init()?;
// 至此,已完成设备驱动模型的初始化
return Ok(());

View File

@ -0,0 +1,33 @@
use crate::arch::interrupt::TrapFrame;
use crate::arch::kprobe::clear_single_step;
use crate::debug::kprobe::KPROBE_MANAGER;
use kprobe::{KprobeOps, ProbeArgs};
use log::debug;
use system_error::SystemError;
/// Handler for the single-step (debug) exception raised after a kprobe's
/// breakpoint has been serviced.
#[derive(Debug)]
pub struct DebugException;
impl DebugException {
    /// Entry point from the architecture trap handler.
    pub fn handle(frame: &mut TrapFrame) -> Result<(), SystemError> {
        Self::post_kprobe_handler(frame)
    }
    /// Run every enabled kprobe's post-handler and event callback registered
    /// on the faulting address, then clear single-step state and resume.
    fn post_kprobe_handler(frame: &mut TrapFrame) -> Result<(), SystemError> {
        let pc = frame.debug_address();
        if let Some(kprobe_list) = KPROBE_MANAGER.lock().get_debug_list(pc) {
            for kprobe in kprobe_list {
                let guard = kprobe.read();
                if guard.is_enabled() {
                    guard.call_post_handler(frame);
                    guard.call_event_callback(frame);
                }
            }
            // All kprobes at this address share one probe point, so the
            // return address can be taken from the first entry.
            let return_address = kprobe_list[0].read().probe_point().return_address();
            clear_single_step(frame, return_address);
        } else {
            debug!("There is no kprobe on pc {:#x}", pc);
        }
        Ok(())
    }
}

View File

@ -0,0 +1,37 @@
use crate::arch::interrupt::TrapFrame;
use crate::arch::kprobe::setup_single_step;
use crate::debug::kprobe::KPROBE_MANAGER;
use crate::exception::debug::DebugException;
use kprobe::{KprobeOps, ProbeArgs};
use system_error::SystemError;
/// Handler for the breakpoint exception planted by kprobes.
#[derive(Debug)]
pub struct EBreak;
impl EBreak {
    /// Entry point from the architecture trap handler.
    pub fn handle(frame: &mut TrapFrame) -> Result<(), SystemError> {
        Self::kprobe_handler(frame)
    }
    /// Run the pre-handlers of every enabled kprobe at the faulting address,
    /// then arrange for the displaced instruction to be single-stepped.
    fn kprobe_handler(frame: &mut TrapFrame) -> Result<(), SystemError> {
        let break_addr = frame.break_address();
        let guard = KPROBE_MANAGER.lock();
        let kprobe_list = guard.get_break_list(break_addr);
        if let Some(kprobe_list) = kprobe_list {
            for kprobe in kprobe_list {
                let guard = kprobe.read();
                if guard.is_enabled() {
                    guard.call_pre_handler(frame);
                }
            }
            // All kprobes on this address share one probe point.
            let single_step_address = kprobe_list[0].read().probe_point().single_step_address();
            // setup_single_step
            setup_single_step(frame, single_step_address);
        } else {
            // For some architectures, they do not support single step execution,
            // and we need to use breakpoint exceptions to simulate
            drop(guard);
            DebugException::handle(frame)?;
        }
        Ok(())
    }
}

View File

@ -4,7 +4,9 @@ use system_error::SystemError;
use crate::arch::CurrentIrqArch;
pub mod debug;
pub mod dummychip;
pub mod ebreak;
pub mod handle;
pub mod init;
pub mod ipi;

View File

@ -11,6 +11,8 @@ use system_error::SystemError;
use super::{Dirent, FileType, IndexNode, InodeId, Metadata, SpecialNodeData};
use crate::filesystem::eventfd::EventFdInode;
use crate::libs::lazy_init::Lazy;
use crate::perf::PerfEventInode;
use crate::{
arch::MMArch,
driver::{
@ -125,7 +127,7 @@ impl FileMode {
/// 页面缓存
pub struct PageCache {
xarray: SpinLock<XArray<Arc<Page>>>,
inode: Option<Weak<dyn IndexNode>>,
inode: Lazy<Weak<dyn IndexNode>>,
}
impl core::fmt::Debug for PageCache {
@ -148,13 +150,19 @@ impl PageCache {
pub fn new(inode: Option<Weak<dyn IndexNode>>) -> Arc<PageCache> {
let page_cache = Self {
xarray: SpinLock::new(XArray::new()),
inode,
inode: {
let v: Lazy<Weak<dyn IndexNode>> = Lazy::new();
if let Some(inode) = inode {
v.init(inode);
}
v
},
};
Arc::new(page_cache)
}
pub fn inode(&self) -> Option<Weak<dyn IndexNode>> {
self.inode.clone()
self.inode.try_get().cloned()
}
pub fn add_page(&self, offset: usize, page: &Arc<Page>) {
@ -176,8 +184,12 @@ impl PageCache {
cursor.remove();
}
pub fn set_inode(&mut self, inode: Weak<dyn IndexNode>) {
self.inode = Some(inode)
pub fn set_inode(&self, inode: Weak<dyn IndexNode>) -> Result<(), SystemError> {
if self.inode.initialized() {
return Err(SystemError::EINVAL);
}
self.inode.init(inode);
Ok(())
}
}
@ -603,11 +615,15 @@ impl File {
inode.inner().lock().remove_epoll(epoll)
}
_ => {
let inode = self.inode.downcast_ref::<EventFdInode>();
if let Some(inode) = inode {
return inode.remove_epoll(epoll);
}
let inode = self
.inode
.downcast_ref::<EventFdInode>()
.downcast_ref::<PerfEventInode>()
.ok_or(SystemError::ENOSYS)?;
inode.remove_epoll(epoll)
return inode.remove_epoll(epoll);
}
}
}
@ -745,7 +761,6 @@ impl FileDescriptorVec {
// 把文件描述符数组对应位置设置为空
let file = self.fds[fd as usize].take().unwrap();
return Ok(file);
}

View File

@ -125,6 +125,9 @@ bitflags! {
}
pub trait IndexNode: Any + Sync + Send + Debug + CastFromSync {
fn mmap(&self, _start: usize, _len: usize, _offset: usize) -> Result<(), SystemError> {
return Err(SystemError::ENOSYS);
}
/// @brief 打开文件
///
/// @return 成功Ok()

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,10 @@
#[allow(clippy::module_inception)]
#![allow(
dead_code,
non_camel_case_types,
non_snake_case,
clippy::all,
missing_docs,
clippy::module_inception
)]
pub mod bindings;
pub mod linux_bpf;

View File

@ -1,5 +1,3 @@
use log::warn;
use crate::{
arch::{
init::{early_setup_arch, setup_arch, setup_arch_post},
@ -30,6 +28,7 @@ use crate::{
clocksource::clocksource_boot_finish, timekeeping::timekeeping_init, timer::timer_init,
},
};
use log::warn;
use super::{
boot::{boot_callback_except_early, boot_callbacks},
@ -89,9 +88,8 @@ fn do_start_kernel() {
kthread_init();
setup_arch_post().expect("setup_arch_post failed");
clocksource_boot_finish();
Futex::init();
crate::bpf::init_bpf_system();
#[cfg(all(target_arch = "x86_64", feature = "kvm"))]
crate::virt::kvm::kvm_init();
}

View File

@ -21,6 +21,7 @@
#![feature(slice_ptr_get)]
#![feature(sync_unsafe_cell)]
#![feature(vec_into_raw_parts)]
#![feature(c_variadic)]
#![cfg_attr(target_os = "none", no_std)]
#![allow(internal_features)]
// clippy的配置
@ -46,6 +47,7 @@ mod arch;
mod libs;
#[macro_use]
mod include;
mod bpf;
mod debug;
mod driver; // 如果driver依赖了libs应该在libs后面导出
mod exception;
@ -55,12 +57,12 @@ mod ipc;
mod misc;
mod mm;
mod net;
mod perf;
mod process;
mod sched;
mod smp;
mod syscall;
mod time;
#[cfg(target_arch = "x86_64")]
mod virt;

View File

@ -272,16 +272,16 @@ impl PageFaultHandler {
/// - VmFaultReason: 页面错误处理信息标志
pub unsafe fn do_fault(pfm: &mut PageFaultMessage) -> VmFaultReason {
if !pfm.flags().contains(FaultFlags::FAULT_FLAG_WRITE) {
return Self::do_read_fault(pfm);
Self::do_read_fault(pfm)
} else if !pfm
.vma()
.lock_irqsave()
.vm_flags()
.contains(VmFlags::VM_SHARED)
{
return Self::do_cow_fault(pfm);
Self::do_cow_fault(pfm)
} else {
return Self::do_shared_fault(pfm);
Self::do_shared_fault(pfm)
}
}

View File

@ -377,7 +377,7 @@ impl InnerAddressSpace {
PageFrameCount::from_bytes(len).unwrap(),
prot_flags,
map_flags,
move |page, count, vm_flags, flags, mapper, flusher| {
|page, count, vm_flags, flags, mapper, flusher| {
if allocate_at_once {
VMA::zeroed(
page,
@ -386,7 +386,7 @@ impl InnerAddressSpace {
flags,
mapper,
flusher,
file,
file.clone(),
Some(pgoff),
)
} else {
@ -394,13 +394,17 @@ impl InnerAddressSpace {
VirtRegion::new(page.virt_address(), count.data() * MMArch::PAGE_SIZE),
vm_flags,
flags,
file,
file.clone(),
Some(pgoff),
false,
)))
}
},
)?;
// todo!(impl mmap for other file)
// https://github.com/DragonOS-Community/DragonOS/pull/912#discussion_r1765334272
let file = file.unwrap();
let _ = file.inode().mmap(start_vaddr.data(), len, offset);
return Ok(start_page);
}

333
kernel/src/perf/bpf.rs Normal file
View File

@ -0,0 +1,333 @@
use super::{PerfEventOps, Result};
use crate::arch::mm::LockedFrameAllocator;
use crate::arch::MMArch;
use crate::filesystem::vfs::file::PageCache;
use crate::filesystem::vfs::{FilePrivateData, FileSystem, IndexNode};
use crate::include::bindings::linux_bpf::{
perf_event_header, perf_event_mmap_page, perf_event_type,
};
use crate::libs::spinlock::{SpinLock, SpinLockGuard};
use crate::mm::allocator::page_frame::{FrameAllocator, PageFrameCount, PhysPageFrame};
use crate::mm::page::{page_manager_lock_irqsave, Page};
use crate::mm::{MemoryManagementArch, PhysAddr};
use crate::perf::util::{LostSamples, PerfProbeArgs, PerfSample, SampleHeader};
use alloc::string::String;
use alloc::sync::Arc;
use alloc::vec::Vec;
use core::any::Any;
use core::fmt::Debug;
use system_error::SystemError;
const PAGE_SIZE: usize = MMArch::PAGE_SIZE;
/// Perf event backed by a user-mmap'ed ring buffer, used to stream eBPF
/// program output to user space.
#[derive(Debug)]
pub struct BpfPerfEvent {
    // Original perf_event_open arguments (unused after construction).
    _args: PerfProbeArgs,
    // Mutable event state, guarded by a spinlock.
    data: SpinLock<BpfPerfEventData>,
}
#[derive(Debug)]
pub struct BpfPerfEventData {
    // Toggled via PerfEventOps::enable / disable.
    enabled: bool,
    // The mmap'ed ring buffer (control page + data region).
    mmap_page: RingPage,
    // Pages backing the mmap, shared with the VFS page cache.
    page_cache: Arc<PageCache>,
    // Offset passed to mmap (recorded; not otherwise used here).
    offset: usize,
}
/// A perf ring buffer: one control page (`perf_event_mmap_page`) followed by
/// a data region of `size - PAGE_SIZE` bytes.
#[derive(Debug)]
pub struct RingPage {
    // Total mapping size in bytes (multiple of PAGE_SIZE, at least 2 pages).
    size: usize,
    // Kernel virtual address of the mapping.
    ptr: usize,
    // Bytes usable for records: size - PAGE_SIZE.
    data_region_size: usize,
    // Samples dropped since the last LOST record was emitted.
    lost: usize,
    // Physical address of the first frame (needed to free the frames on drop).
    phys_addr: PhysAddr,
}
impl RingPage {
    /// Placeholder ring used before `BpfPerfEvent::do_mmap` allocates memory.
    pub fn empty() -> Self {
        RingPage {
            ptr: 0,
            size: 0,
            data_region_size: 0,
            lost: 0,
            phys_addr: PhysAddr::new(0),
        }
    }
    /// Initialise a ring over the mapped region starting at virtual `start`.
    pub fn new_init(start: usize, len: usize, phys_addr: PhysAddr) -> Self {
        Self::init(start as _, len, phys_addr)
    }
    /// Lay out the control page and record the data-region geometry.
    fn init(ptr: *mut u8, size: usize, phys_addr: PhysAddr) -> Self {
        assert_eq!(size % PAGE_SIZE, 0);
        assert!(size / PAGE_SIZE >= 2);
        // The first page will be filled with perf_event_mmap_page
        unsafe {
            // SAFETY: the caller maps at least two writable pages at `ptr`.
            let perf_event_mmap_page = &mut *(ptr as *mut perf_event_mmap_page);
            perf_event_mmap_page.data_offset = PAGE_SIZE as u64;
            perf_event_mmap_page.data_size = (size - PAGE_SIZE) as u64;
            // user will read sample or lost record from data_tail
            perf_event_mmap_page.data_tail = 0;
            // kernel will write sample or lost record from data_head
            perf_event_mmap_page.data_head = 0;
            // It is a ring buffer.
        }
        RingPage {
            ptr: ptr as usize,
            size,
            data_region_size: size - PAGE_SIZE,
            lost: 0,
            phys_addr,
        }
    }
    /// Whether `data_size` bytes fit in the ring given the current head/tail.
    ///
    /// NOTE(review): the "full" test reserves one slot ((head+1)%size == tail)
    /// but the capacity formula below lets head catch up to tail, which is
    /// indistinguishable from an empty ring — confirm against the consumer.
    fn can_write(&self, data_size: usize, data_tail: usize, data_head: usize) -> bool {
        if (data_head + 1) % self.data_region_size == data_tail {
            // The buffer is full
            return false;
        }
        let capacity = if data_head >= data_tail {
            self.data_region_size - data_head + data_tail
        } else {
            data_tail - data_head
        };
        data_size <= capacity
    }
    /// Publish one sample into the ring, emitting a LOST record first when
    /// earlier samples were dropped. A sample that does not fit is counted in
    /// `self.lost` and silently discarded (the call still returns Ok).
    pub fn write_event(&mut self, data: &[u8]) -> Result<()> {
        // SAFETY assumption: `ptr` points at the control page set up in init().
        let data_tail = unsafe { &mut (*(self.ptr as *mut perf_event_mmap_page)).data_tail };
        let data_head = unsafe { &mut (*(self.ptr as *mut perf_event_mmap_page)).data_head };
        // data_tail..data_head is the region that can be written
        // check if there is enough space to write the event
        let sample_size = PerfSample::calculate_size(data.len());
        let can_write_sample =
            self.can_write(sample_size, *data_tail as usize, *data_head as usize);
        if !can_write_sample {
            //we need record it to the lost record
            self.lost += 1;
            Ok(())
        } else {
            // we can write the sample to the page
            // If the lost record is not zero, we need to write the lost record first.
            let can_write_lost_record = self.can_write(
                size_of::<LostSamples>(),
                *data_tail as usize,
                *data_head as usize,
            );
            if self.lost > 0 && can_write_lost_record {
                let new_data_head = self.write_lost(*data_head as usize)?;
                *data_head = new_data_head as u64;
                self.lost = 0;
                // Retry now that the backlog marker has been flushed.
                self.write_event(data)
            } else {
                let new_data_head = self.write_sample(data, *data_head as usize)?;
                *data_head = new_data_head as u64;
                Ok(())
            }
        }
    }
    /// Write any data to the page.
    ///
    /// Return the new data_head
    fn write_any(&mut self, data: &[u8], data_head: usize) -> Result<usize> {
        let data_region_len = self.data_region_size;
        // Skip the control page; the rest of the mapping is the data region.
        let data_region = self.as_mut_slice()[PAGE_SIZE..].as_mut();
        let data_len = data.len();
        let end = (data_head + data_len) % data_region_len;
        let start = data_head;
        if start < end {
            data_region[start..end].copy_from_slice(data);
        } else {
            // Wrap-around: split the copy at the end of the region.
            let first_len = data_region_len - start;
            data_region[start..start + first_len].copy_from_slice(&data[..first_len]);
            data_region[0..end].copy_from_slice(&data[first_len..]);
        }
        Ok(end)
    }
    /// Write a sample to the page.
    fn write_sample(&mut self, data: &[u8], data_head: usize) -> Result<usize> {
        let perf_sample = PerfSample {
            s_hdr: SampleHeader {
                header: perf_event_header {
                    type_: perf_event_type::PERF_RECORD_SAMPLE as u32,
                    misc: 0,
                    size: size_of::<SampleHeader>() as u16 + data.len() as u16,
                },
                size: data.len() as u32,
            },
            value: data,
        };
        // Header first, then the payload; each copy may wrap around the ring.
        let new_head = self.write_any(perf_sample.s_hdr.as_bytes(), data_head)?;
        self.write_any(perf_sample.value, new_head)
    }
    /// Write a lost record to the page.
    ///
    /// Return the new data_head
    fn write_lost(&mut self, data_head: usize) -> Result<usize> {
        let lost = LostSamples {
            header: perf_event_header {
                type_: perf_event_type::PERF_RECORD_LOST as u32,
                misc: 0,
                size: size_of::<LostSamples>() as u16,
            },
            id: 0,
            count: self.lost as u64,
        };
        self.write_any(lost.as_bytes(), data_head)
    }
    /// True when the consumer has unread records (tail lags head).
    pub fn readable(&self) -> bool {
        let data_tail = unsafe { &(*(self.ptr as *mut perf_event_mmap_page)).data_tail };
        let data_head = unsafe { &(*(self.ptr as *mut perf_event_mmap_page)).data_head };
        data_tail != data_head
    }
    /// The whole mapping (control page + data region) as bytes.
    pub fn as_slice(&self) -> &[u8] {
        unsafe { core::slice::from_raw_parts(self.ptr as *const u8, self.size) }
    }
    /// The whole mapping as mutable bytes.
    pub fn as_mut_slice(&mut self) -> &mut [u8] {
        unsafe { core::slice::from_raw_parts_mut(self.ptr as *mut u8, self.size) }
    }
}
impl BpfPerfEvent {
    /// Create a disabled event with an empty ring; the buffer is allocated
    /// lazily when user space mmaps the event fd.
    pub fn new(args: PerfProbeArgs) -> Self {
        BpfPerfEvent {
            _args: args,
            data: SpinLock::new(BpfPerfEventData {
                enabled: false,
                mmap_page: RingPage::empty(),
                page_cache: PageCache::new(None),
                offset: 0,
            }),
        }
    }
    /// Allocate backing frames for an mmap request, register them with the
    /// page cache, and initialise the ring-buffer header in the first page.
    pub fn do_mmap(&self, _start: usize, len: usize, offset: usize) -> Result<()> {
        let mut data = self.data.lock();
        // alloc page frame
        let (phy_addr, page_count) =
            unsafe { LockedFrameAllocator.allocate(PageFrameCount::new(len / PAGE_SIZE)) }
                .ok_or(SystemError::ENOSPC)?;
        let mut page_manager_guard = page_manager_lock_irqsave();
        let mut cur_phys = PhysPageFrame::new(phy_addr);
        for i in 0..page_count.data() {
            let page = Arc::new(Page::new(true, cur_phys.phys_address()));
            let paddr = cur_phys.phys_address();
            page_manager_guard.insert(paddr, &page);
            data.page_cache.add_page(i, &page);
            cur_phys = cur_phys.next();
        }
        let virt_addr = unsafe { MMArch::phys_2_virt(phy_addr) }.ok_or(SystemError::EFAULT)?;
        // create mmap page
        let mmap_page = RingPage::new_init(virt_addr.data(), len, phy_addr);
        data.mmap_page = mmap_page;
        data.offset = offset;
        Ok(())
    }
    /// Append one sample (or lost record) to the ring buffer.
    pub fn write_event(&self, data: &[u8]) -> Result<()> {
        let mut inner_data = self.data.lock();
        inner_data.mmap_page.write_event(data)?;
        Ok(())
    }
}
impl Drop for BpfPerfEvent {
    /// Tear down the ring-buffer frames when the event is destroyed.
    // NOTE(review): assumes remove_page drops the last reference so the frame
    // is actually freed — confirm against the page manager's semantics.
    fn drop(&mut self) {
        let mut page_manager_guard = page_manager_lock_irqsave();
        let data = self.data.lock();
        let phy_addr = data.mmap_page.phys_addr;
        let len = data.mmap_page.size;
        let page_count = PageFrameCount::new(len / PAGE_SIZE);
        let mut cur_phys = PhysPageFrame::new(phy_addr);
        for _ in 0..page_count.data() {
            page_manager_guard.remove_page(&cur_phys.phys_address());
            cur_phys = cur_phys.next();
        }
    }
}
impl IndexNode for BpfPerfEvent {
    /// mmap hands the request to `do_mmap`, which allocates the ring buffer.
    fn mmap(&self, start: usize, len: usize, offset: usize) -> Result<()> {
        self.do_mmap(start, len, offset)
    }
    // Records are consumed through the mmap'ed ring, never via read(2).
    fn read_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &mut [u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        panic!("PerfEventInode does not support read")
    }
    // Records are produced by the kernel, never via write(2).
    fn write_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &[u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        panic!("PerfEventInode does not support write")
    }
    // This inode lives outside any mounted filesystem.
    fn fs(&self) -> Arc<dyn FileSystem> {
        panic!("PerfEventInode does not have a filesystem")
    }
    fn as_any_ref(&self) -> &dyn Any {
        self
    }
    // Not a directory.
    fn list(&self) -> Result<Vec<String>> {
        Err(SystemError::ENOSYS)
    }
    /// Expose the ring-buffer pages so the generic mmap path can map them.
    fn page_cache(&self) -> Option<Arc<PageCache>> {
        Some(self.data.lock().page_cache.clone())
    }
}
impl PerfEventOps for BpfPerfEvent {
    /// Mark the event as enabled so incoming samples are recorded.
    fn enable(&self) -> Result<()> {
        let mut guard = self.data.lock();
        guard.enabled = true;
        Ok(())
    }
    /// Mark the event as disabled.
    fn disable(&self) -> Result<()> {
        let mut guard = self.data.lock();
        guard.enabled = false;
        Ok(())
    }
    /// The event is readable when the ring buffer holds unread samples.
    fn readable(&self) -> bool {
        let guard = self.data.lock();
        guard.mmap_page.readable()
    }
}
/// Create a BPF-output perf event (`PERF_COUNT_SW_BPF_OUTPUT`) from the
/// parsed `perf_event_open` arguments.
pub fn perf_event_open_bpf(args: PerfProbeArgs) -> BpfPerfEvent {
    BpfPerfEvent::new(args)
}

159
kernel/src/perf/kprobe.rs Normal file
View File

@ -0,0 +1,159 @@
use super::Result;
use crate::arch::interrupt::TrapFrame;
use crate::arch::kprobe::KProbeContext;
use crate::bpf::helper::BPF_HELPER_FUN_SET;
use crate::bpf::prog::BpfProg;
use crate::debug::kprobe::args::KprobeInfo;
use crate::debug::kprobe::{register_kprobe, unregister_kprobe, LockKprobe};
use crate::filesystem::vfs::file::{File, PageCache};
use crate::filesystem::vfs::{FilePrivateData, FileSystem, IndexNode};
use crate::libs::casting::DowncastArc;
use crate::libs::spinlock::SpinLockGuard;
use crate::perf::util::PerfProbeArgs;
use crate::perf::PerfEventOps;
use alloc::boxed::Box;
use alloc::string::String;
use alloc::sync::Arc;
use alloc::vec::Vec;
use core::any::Any;
use core::fmt::Debug;
use kprobe::{CallBackFunc, ProbeArgs};
use rbpf::EbpfVmRawOwned;
use system_error::SystemError;
/// A perf event backed by a kernel probe (kprobe).
#[derive(Debug)]
pub struct KprobePerfEvent {
    /// Parsed `perf_event_open` arguments, kept for debugging/inspection.
    _args: PerfProbeArgs,
    /// The registered kprobe this event drives.
    kprobe: LockKprobe,
}
impl Drop for KprobePerfEvent {
    /// Remove the kprobe from the kernel when the event fd is closed.
    fn drop(&mut self) {
        unregister_kprobe(self.kprobe.clone());
    }
}
impl KprobePerfEvent {
    /// Attach an eBPF program (given as an open program fd's `File`) to this
    /// kprobe: build a VM over the program's instructions, register the kernel
    /// helper set, and install the VM as the probe's event callback.
    ///
    /// # Errors
    /// Returns `EINVAL` if the file is not a BPF program or the VM rejects it.
    pub fn do_set_bpf_prog(&self, prog_file: Arc<File>) -> Result<()> {
        let bpf_prog = prog_file
            .inode()
            .downcast_arc::<BpfProg>()
            .ok_or(SystemError::EINVAL)?;
        let insns = bpf_prog.insns();
        let mut vm =
            EbpfVmRawOwned::new(Some(insns.to_vec())).map_err(|_| SystemError::EINVAL)?;
        vm.register_helper_set(BPF_HELPER_FUN_SET.get())
            .map_err(|_| SystemError::EINVAL)?;
        // Wrap the VM in a callback object and hand it to the kprobe.
        let callback = Box::new(KprobePerfCallBack::new(bpf_prog, vm));
        self.kprobe.write().update_event_callback(callback);
        Ok(())
    }
}
/// Callback installed on a kprobe that runs an eBPF program on every hit.
pub struct KprobePerfCallBack {
    /// Keeps the BPF program file alive while the callback is installed.
    _bpf_prog_file: Arc<BpfProg>,
    /// VM holding the loaded program; executed on each probe hit.
    vm: EbpfVmRawOwned,
}
impl KprobePerfCallBack {
    /// Bundle the BPF program file (held to keep its instructions alive) with
    /// the VM that executes it.
    fn new(bpf_prog_file: Arc<BpfProg>, vm: EbpfVmRawOwned) -> Self {
        Self {
            _bpf_prog_file: bpf_prog_file,
            vm,
        }
    }
}
impl CallBackFunc for KprobePerfCallBack {
    /// Run the attached eBPF program, passing it a `pt_regs`-style context
    /// built from the trap frame as the program's raw data buffer.
    fn call(&self, trap_frame: &dyn ProbeArgs) {
        let trap_frame = trap_frame.as_any().downcast_ref::<TrapFrame>().unwrap();
        let mut pt_regs = KProbeContext::from(trap_frame);
        // SAFETY: `pt_regs` is a live local we hold exclusively; viewing it as
        // a byte slice of exactly its own size is sound. (The previous code
        // cast a *shared* reference to a mutable pointer before calling
        // `from_raw_parts_mut`, which is undefined behavior.)
        let probe_context = unsafe {
            core::slice::from_raw_parts_mut(
                &mut pt_regs as *mut KProbeContext as *mut u8,
                size_of::<KProbeContext>(),
            )
        };
        // The program's result is intentionally ignored: a probe callback has
        // no way to report failure.
        let _res = self
            .vm
            .execute_program(probe_context)
            .map_err(|_| SystemError::EINVAL);
    }
}
impl IndexNode for KprobePerfEvent {
    /// Reading a kprobe event fd is not supported; return `ENOSYS` instead of
    /// panicking, since user space can always call `read(2)` on the fd.
    fn read_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &mut [u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    /// Writing is likewise unsupported; see [`Self::read_at`].
    fn write_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &[u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    fn fs(&self) -> Arc<dyn FileSystem> {
        // The trait forces a value and no filesystem exists for this inode.
        panic!("fs not implemented for PerfEvent");
    }
    fn as_any_ref(&self) -> &dyn Any {
        self
    }
    fn list(&self) -> Result<Vec<String>> {
        Err(SystemError::ENOSYS)
    }
    /// Kprobe events have no mmap ring buffer, hence no page cache.
    fn page_cache(&self) -> Option<Arc<PageCache>> {
        None
    }
}
impl PerfEventOps for KprobePerfEvent {
    /// Install the given eBPF program as this kprobe's event callback.
    fn set_bpf_prog(&self, bpf_prog: Arc<File>) -> Result<()> {
        self.do_set_bpf_prog(bpf_prog)
    }
    /// Arm the underlying kprobe.
    fn enable(&self) -> Result<()> {
        let mut guard = self.kprobe.write();
        guard.enable();
        Ok(())
    }
    /// Disarm the underlying kprobe.
    fn disable(&self) -> Result<()> {
        let mut guard = self.kprobe.write();
        guard.disable();
        Ok(())
    }
    /// Kprobe events report themselves as always readable.
    fn readable(&self) -> bool {
        true
    }
}
/// Create a [`KprobePerfEvent`] for the symbol named in `args`.
///
/// The probe is registered disabled; it is armed later via
/// `PERF_EVENT_IOC_ENABLE` on the returned event's fd.
pub fn perf_event_open_kprobe(args: PerfProbeArgs) -> KprobePerfEvent {
    let symbol = args.name.clone();
    log::info!("create kprobe for symbol: {symbol}");
    let info = KprobeInfo {
        symbol: Some(symbol),
        addr: None,
        offset: 0,
        enable: false,
        // No-op handlers: the real work happens in the event callback that
        // `set_bpf_prog` installs later.
        pre_handler: |_| {},
        post_handler: |_| {},
        fault_handler: None,
        event_callback: None,
    };
    let kprobe = register_kprobe(info).expect("create kprobe failed");
    KprobePerfEvent {
        _args: args,
        kprobe,
    }
}

337
kernel/src/perf/mod.rs Normal file
View File

@ -0,0 +1,337 @@
mod bpf;
mod kprobe;
mod util;
use crate::filesystem::vfs::file::{File, FileMode, PageCache};
use crate::filesystem::vfs::syscall::ModeType;
use crate::filesystem::vfs::{
FilePrivateData, FileSystem, FileType, FsInfo, IndexNode, Metadata, SuperBlock,
};
use crate::include::bindings::linux_bpf::{
perf_event_attr, perf_event_sample_format, perf_sw_ids, perf_type_id,
};
use crate::libs::casting::DowncastArc;
use crate::libs::spinlock::{SpinLock, SpinLockGuard};
use crate::mm::fault::{PageFaultHandler, PageFaultMessage};
use crate::mm::VmFaultReason;
use crate::net::event_poll::{EPollEventType, EPollItem, EventPoll, KernelIoctlData};
use crate::perf::bpf::BpfPerfEvent;
use crate::perf::util::{PerfEventIoc, PerfEventOpenFlags, PerfProbeArgs};
use crate::process::ProcessManager;
use crate::syscall::user_access::UserBufferReader;
use crate::syscall::Syscall;
use alloc::boxed::Box;
use alloc::collections::LinkedList;
use alloc::string::String;
use alloc::sync::{Arc, Weak};
use alloc::vec::Vec;
use core::any::Any;
use core::ffi::c_void;
use core::fmt::Debug;
use core::ops::Deref;
use intertrait::{CastFrom, CastFromSync};
use log::info;
use num_traits::FromPrimitive;
use system_error::SystemError;
type Result<T> = core::result::Result<T, SystemError>;
/// Common interface of all perf event kinds (kprobe, BPF output, ...).
pub trait PerfEventOps: Send + Sync + Debug + CastFromSync + CastFrom + IndexNode {
    /// Set the bpf program for the perf event.
    ///
    /// Default: `ENOSYS`; only event types that can run a BPF program
    /// (e.g. kprobe events) override this.
    fn set_bpf_prog(&self, _bpf_prog: Arc<File>) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Enable the perf event so it starts producing/recording samples.
    fn enable(&self) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Disable the perf event.
    fn disable(&self) -> Result<()> {
        Err(SystemError::ENOSYS)
    }
    /// Whether the perf event currently has data to read (drives poll/epoll).
    fn readable(&self) -> bool;
}
/// VFS inode wrapper around a concrete perf event, giving it fd semantics
/// (ioctl, poll/epoll, mmap).
#[derive(Debug)]
pub struct PerfEventInode {
    /// The concrete perf event implementation.
    event: Box<dyn PerfEventOps>,
    /// Epoll registrations on this fd, woken when new samples arrive.
    epitems: SpinLock<LinkedList<Arc<EPollItem>>>,
}
impl PerfEventInode {
pub fn new(event: Box<dyn PerfEventOps>) -> Self {
Self {
event,
epitems: SpinLock::new(LinkedList::new()),
}
}
pub fn remove_epoll(
&self,
epoll: &Weak<SpinLock<EventPoll>>,
) -> core::result::Result<(), SystemError> {
let is_remove = !self
.epitems
.lock_irqsave()
.extract_if(|x| x.epoll().ptr_eq(epoll))
.collect::<Vec<_>>()
.is_empty();
if is_remove {
return Ok(());
}
Err(SystemError::ENOENT)
}
fn do_poll(&self) -> Result<usize> {
let mut events = EPollEventType::empty();
if self.event.readable() {
events |= EPollEventType::EPOLLIN | EPollEventType::EPOLLRDNORM;
}
return Ok(events.bits() as usize);
}
fn epoll_callback(&self) -> Result<()> {
let pollflag = EPollEventType::from_bits_truncate(self.do_poll()? as u32);
// 唤醒epoll中等待的进程
EventPoll::wakeup_epoll(&self.epitems, Some(pollflag))
}
}
impl Deref for PerfEventInode {
    type Target = Box<dyn PerfEventOps>;
    /// Forward calls on the inode to the wrapped perf event.
    fn deref(&self) -> &Self::Target {
        &self.event
    }
}
impl IndexNode for PerfEventInode {
    /// Delegate mmap to the concrete event (only BPF events support it).
    fn mmap(&self, start: usize, len: usize, offset: usize) -> Result<()> {
        self.event.mmap(start, len, offset)
    }
    fn open(&self, _data: SpinLockGuard<FilePrivateData>, _mode: &FileMode) -> Result<()> {
        Ok(())
    }
    fn close(&self, _data: SpinLockGuard<FilePrivateData>) -> Result<()> {
        Ok(())
    }
    /// Reading perf event counters is not implemented yet; return `ENOSYS`
    /// instead of panicking, since user space can call `read(2)` on any fd.
    fn read_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &mut [u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    /// Writing is likewise unsupported; see [`Self::read_at`].
    fn write_at(
        &self,
        _offset: usize,
        _len: usize,
        _buf: &[u8],
        _data: SpinLockGuard<FilePrivateData>,
    ) -> Result<usize> {
        Err(SystemError::ENOSYS)
    }
    fn poll(&self, _private_data: &FilePrivateData) -> Result<usize> {
        self.do_poll()
    }
    fn metadata(&self) -> Result<Metadata> {
        let meta = Metadata {
            mode: ModeType::from_bits_truncate(0o755),
            file_type: FileType::File,
            ..Default::default()
        };
        Ok(meta)
    }
    fn resize(&self, _len: usize) -> Result<()> {
        Ok(())
    }
    /// Dispatch the perf ioctl commands: enable, disable, and attach-BPF.
    fn ioctl(&self, cmd: u32, data: usize, _private_data: &FilePrivateData) -> Result<usize> {
        let req = PerfEventIoc::from_u32(cmd).ok_or(SystemError::EINVAL)?;
        info!("perf_event_ioctl: request: {:?}, arg: {}", req, data);
        match req {
            PerfEventIoc::Enable => {
                self.event.enable()?;
                Ok(0)
            }
            PerfEventIoc::Disable => {
                self.event.disable()?;
                Ok(0)
            }
            PerfEventIoc::SetBpf => {
                info!("perf_event_ioctl: PERF_EVENT_IOC_SET_BPF, arg: {}", data);
                // `data` is the fd of an already-loaded BPF program.
                let bpf_prog_fd = data;
                let fd_table = ProcessManager::current_pcb().fd_table();
                let file = fd_table
                    .read()
                    .get_file_by_fd(bpf_prog_fd as _)
                    .ok_or(SystemError::EBADF)?;
                self.event.set_bpf_prog(file)?;
                Ok(0)
            }
        }
    }
    /// In-kernel ioctl used by epoll to register an `EPollItem` on this fd.
    fn kernel_ioctl(
        &self,
        arg: Arc<dyn KernelIoctlData>,
        _data: &FilePrivateData,
    ) -> core::result::Result<usize, SystemError> {
        let epitem = arg
            .arc_any()
            .downcast::<EPollItem>()
            .map_err(|_| SystemError::EFAULT)?;
        self.epitems.lock().push_back(epitem);
        Ok(0)
    }
    /// Hand out the fake perf filesystem so page faults on perf mappings are
    /// routed through [`PerfFakeFs`]'s filemap handlers.
    fn fs(&self) -> Arc<dyn FileSystem> {
        Arc::new(PerfFakeFs)
    }
    fn as_any_ref(&self) -> &dyn Any {
        self
    }
    fn list(&self) -> Result<Vec<String>> {
        Err(SystemError::ENOSYS)
    }
    fn page_cache(&self) -> Option<Arc<PageCache>> {
        self.event.page_cache()
    }
}
/// Stub filesystem for perf event inodes. It exists only so that page faults
/// on perf mmap regions can be routed through the generic filemap handlers;
/// every other `FileSystem` method is unsupported.
#[derive(Debug)]
struct PerfFakeFs;
impl FileSystem for PerfFakeFs {
    fn root_inode(&self) -> Arc<dyn IndexNode> {
        panic!("PerfFakeFs does not have a root inode")
    }
    fn info(&self) -> FsInfo {
        panic!("PerfFakeFs does not have a filesystem info")
    }
    fn as_any_ref(&self) -> &dyn Any {
        self
    }
    fn name(&self) -> &str {
        "perf"
    }
    fn super_block(&self) -> SuperBlock {
        panic!("PerfFakeFs does not have a super block")
    }
    /// Resolve a page fault on a perf mmap region via the generic filemap
    /// handler, which pulls pages from the event's page cache.
    unsafe fn fault(&self, pfm: &mut PageFaultMessage) -> VmFaultReason {
        // (was: bound to a temporary then returned — clippy::let_and_return)
        PageFaultHandler::filemap_fault(pfm)
    }
    /// Pre-map a range of already-cached pages around a fault.
    unsafe fn map_pages(
        &self,
        pfm: &mut PageFaultMessage,
        start_pgoff: usize,
        end_pgoff: usize,
    ) -> VmFaultReason {
        PageFaultHandler::filemap_map_pages(pfm, start_pgoff, end_pgoff)
    }
}
impl Syscall {
    /// Entry point of the `perf_event_open(2)` system call: copy the
    /// `perf_event_attr` struct from user space and hand it to
    /// [`perf_event_open`].
    pub fn sys_perf_event_open(
        attr: *const u8,
        pid: i32,
        cpu: i32,
        group_fd: i32,
        flags: u32,
    ) -> Result<usize> {
        let reader = UserBufferReader::new(
            attr as *const perf_event_attr,
            size_of::<perf_event_attr>(),
            true,
        )?;
        let attr = reader.read_one_from_user(0)?;
        perf_event_open(attr, pid, cpu, group_fd, flags)
    }
}
pub fn perf_event_open(
attr: &perf_event_attr,
pid: i32,
cpu: i32,
group_fd: i32,
flags: u32,
) -> Result<usize> {
let args = PerfProbeArgs::try_from(attr, pid, cpu, group_fd, flags)?;
log::info!("perf_event_process: {:#?}", args);
let file_mode = if args
.flags
.contains(PerfEventOpenFlags::PERF_FLAG_FD_CLOEXEC)
{
FileMode::O_RDWR | FileMode::O_CLOEXEC
} else {
FileMode::O_RDWR
};
let event: Box<dyn PerfEventOps> = match args.type_ {
// Kprobe
// See /sys/bus/event_source/devices/kprobe/type
perf_type_id::PERF_TYPE_MAX => {
let kprobe_event = kprobe::perf_event_open_kprobe(args);
Box::new(kprobe_event)
}
perf_type_id::PERF_TYPE_SOFTWARE => {
// For bpf prog output
assert_eq!(args.config, perf_sw_ids::PERF_COUNT_SW_BPF_OUTPUT);
assert_eq!(
args.sample_type,
Some(perf_event_sample_format::PERF_SAMPLE_RAW)
);
let bpf_event = bpf::perf_event_open_bpf(args);
Box::new(bpf_event)
}
_ => {
unimplemented!("perf_event_process: unknown type: {:?}", args);
}
};
let page_cache = event.page_cache();
let perf_event = Arc::new(PerfEventInode::new(event));
if let Some(cache) = page_cache {
cache.set_inode(Arc::downgrade(&(perf_event.clone() as _)))?;
}
let file = File::new(perf_event, file_mode)?;
let fd_table = ProcessManager::current_pcb().fd_table();
let fd = fd_table.write().alloc_fd(file, None).map(|x| x as usize)?;
Ok(fd)
}
/// Write `data` as one raw sample into the BPF perf event referred to by
/// `fd`, then wake any epoll waiters on that fd.
///
/// Backs the `bpf_perf_event_output` helper; `_ctx` and `_flags` are
/// currently unused.
pub fn perf_event_output(_ctx: *mut c_void, fd: usize, _flags: u32, data: &[u8]) -> Result<()> {
    let file = get_perf_event_file(fd)?;
    // Peel Arc<PerfEventInode> -> PerfEventInode -> Box<dyn PerfEventOps> ->
    // dyn PerfEventOps, then downcast to the concrete BPF event; only BPF
    // output events accept raw samples.
    let bpf_event_file = file.deref().deref();
    let bpf_event_file = bpf_event_file
        .deref()
        .ref_any()
        .downcast_ref::<BpfPerfEvent>()
        .ok_or(SystemError::EINVAL)?;
    bpf_event_file.write_event(data)?;
    file.epoll_callback()?;
    Ok(())
}
/// Resolve `fd` in the current process's fd table to a perf event inode.
///
/// # Errors
/// `EBADF` if the fd is not open, `EINVAL` if it is not a perf event fd.
fn get_perf_event_file(fd: usize) -> Result<Arc<PerfEventInode>> {
    let fd_table = ProcessManager::current_pcb().fd_table();
    let guard = fd_table.read();
    let file = guard.get_file_by_fd(fd as _).ok_or(SystemError::EBADF)?;
    drop(guard);
    file.inode()
        .downcast_arc::<PerfEventInode>()
        .ok_or(SystemError::EINVAL)
}

123
kernel/src/perf/util.rs Normal file
View File

@ -0,0 +1,123 @@
use crate::include::bindings::linux_bpf::{
perf_event_attr, perf_event_header, perf_event_sample_format, perf_sw_ids, perf_type_id,
};
use crate::syscall::user_access::check_and_clone_cstr;
use alloc::string::String;
use num_traits::FromPrimitive;
use system_error::SystemError;
bitflags! {
    /// `flags` argument of `perf_event_open(2)`; values mirror the Linux UAPI
    /// `PERF_FLAG_*` constants.
    pub struct PerfEventOpenFlags: u32 {
        const PERF_FLAG_FD_NO_GROUP = 1;
        const PERF_FLAG_FD_OUTPUT = 2;
        const PERF_FLAG_PID_CGROUP = 4;
        const PERF_FLAG_FD_CLOEXEC = 8;
    }
}
/// The `PerfEventIoc` enum is used to define the ioctl commands for perf events.
///
/// See https://elixir.bootlin.com/linux/v6.1/source/include/uapi/linux/perf_event.h#L544
#[repr(u32)]
#[derive(Debug, Copy, Clone, FromPrimitive)]
pub enum PerfEventIoc {
    /// Equivalent to [crate::include::bindings::linux_bpf::AYA_PERF_EVENT_IOC_ENABLE].
    Enable = 9216,
    /// Equivalent to [crate::include::bindings::linux_bpf::AYA_PERF_EVENT_IOC_DISABLE].
    Disable = 9217,
    /// Equivalent to [crate::include::bindings::linux_bpf::AYA_PERF_EVENT_IOC_SET_BPF].
    SetBpf = 1074013192,
}
#[derive(Debug, Clone)]
#[allow(unused)]
/// `perf_event_open` syscall arguments, decoded from the raw attr struct.
pub struct PerfProbeArgs {
    // Software event id taken from `attr.config`.
    pub config: perf_sw_ids,
    // Probed symbol name for kprobe events; empty otherwise.
    pub name: String,
    // Probe offset, from `attr.config2`.
    pub offset: u64,
    // Size of the attr struct as reported by user space.
    pub size: u32,
    // Event type (kprobe events arrive as PERF_TYPE_MAX).
    pub type_: perf_type_id,
    pub pid: i32,
    pub cpu: i32,
    pub group_fd: i32,
    pub flags: PerfEventOpenFlags,
    // Requested sample format, if `attr.sample_type` mapped to a known value.
    pub sample_type: Option<perf_event_sample_format>,
}
impl PerfProbeArgs {
    /// Parse the raw user-supplied `perf_event_attr` into kernel-side args.
    ///
    /// # Errors
    /// Returns `EINVAL` if `type_`/`config` do not map to known enum values,
    /// or if the probed symbol name is not valid UTF-8.
    pub fn try_from(
        attr: &perf_event_attr,
        pid: i32,
        cpu: i32,
        group_fd: i32,
        flags: u32,
    ) -> Result<Self, SystemError> {
        let ty = perf_type_id::from_u32(attr.type_).ok_or(SystemError::EINVAL)?;
        let config = perf_sw_ids::from_u32(attr.config as u32).ok_or(SystemError::EINVAL)?;
        // PERF_TYPE_MAX is used here as the kprobe event type; `config1` then
        // carries a user-space pointer to the symbol name.
        let name = if ty == perf_type_id::PERF_TYPE_MAX {
            let name_ptr = unsafe { attr.__bindgen_anon_3.config1 } as *const u8;
            let name = check_and_clone_cstr(name_ptr, None)?;
            name.into_string().map_err(|_| SystemError::EINVAL)?
        } else {
            String::new()
        };
        let sample_ty = perf_event_sample_format::from_u32(attr.sample_type as u32);
        let args = PerfProbeArgs {
            config,
            name,
            // `config2` holds the probe offset for kprobe events.
            offset: unsafe { attr.__bindgen_anon_4.config2 },
            size: attr.size,
            type_: ty,
            pid,
            cpu,
            group_fd,
            flags: PerfEventOpenFlags::from_bits_truncate(flags),
            sample_type: sample_ty,
        };
        Ok(args)
    }
}
/// The event type in our particular use case will be `PERF_RECORD_SAMPLE` or `PERF_RECORD_LOST`.
/// `PERF_RECORD_SAMPLE` indicating that there is an actual sample after this header.
/// And `PERF_RECORD_LOST` indicating that there is a record lost header following the perf event header.
#[repr(C)]
#[derive(Debug)]
pub struct LostSamples {
    // Common perf record header (type `PERF_RECORD_LOST`).
    pub header: perf_event_header,
    // Sample stream id — presumably unused here; TODO confirm against readers.
    pub id: u64,
    // Number of records that were dropped.
    pub count: u64,
}
impl LostSamples {
    /// View this record as raw bytes for copying into the ring buffer.
    pub fn as_bytes(&self) -> &[u8] {
        // SAFETY: `Self` is `#[repr(C)]` and fully initialized; reading its
        // exact size in bytes from `self` is sound for the borrow's lifetime.
        unsafe { core::slice::from_raw_parts(self as *const Self as *const u8, size_of::<Self>()) }
    }
}
/// Header preceding a `PERF_RECORD_SAMPLE` payload in the ring buffer.
#[repr(C)]
#[derive(Debug)]
pub struct SampleHeader {
    // Common perf record header (type `PERF_RECORD_SAMPLE`).
    pub header: perf_event_header,
    // Size in bytes of the raw payload that follows.
    pub size: u32,
}
impl SampleHeader {
    /// View this header as raw bytes for copying into the ring buffer.
    pub fn as_bytes(&self) -> &[u8] {
        // SAFETY: `Self` is `#[repr(C)]` and fully initialized; reading its
        // exact size in bytes from `self` is sound for the borrow's lifetime.
        unsafe { core::slice::from_raw_parts(self as *const Self as *const u8, size_of::<Self>()) }
    }
}
/// A complete sample record: header plus borrowed raw payload.
#[repr(C)]
#[derive(Debug)]
pub struct PerfSample<'a> {
    pub s_hdr: SampleHeader,
    pub value: &'a [u8],
}
impl<'a> PerfSample<'a> {
    /// Total number of bytes a serialized sample occupies in the ring buffer:
    /// the fixed header plus the payload of `value_size` bytes.
    pub fn calculate_size(value_size: usize) -> usize {
        value_size + size_of::<SampleHeader>()
    }
}

View File

@ -134,7 +134,6 @@ impl SmpCpuManager {
&self.possible_cpus
}
#[allow(dead_code)]
pub fn possible_cpus_count(&self) -> u32 {
self.possible_cnt.load(core::sync::atomic::Ordering::SeqCst)
}

View File

@ -1159,6 +1159,20 @@ impl Syscall {
let flags = args[1] as u32;
Self::sys_eventfd(initval, flags)
}
SYS_BPF => {
let cmd = args[0] as u32;
let attr = args[1] as *mut u8;
let size = args[2] as u32;
Self::sys_bpf(cmd, attr, size)
}
SYS_PERF_EVENT_OPEN => {
let attr = args[0] as *const u8;
let pid = args[1] as i32;
let cpu = args[2] as i32;
let group_fd = args[3] as i32;
let flags = args[4] as u32;
Self::sys_perf_event_open(attr, pid, cpu, group_fd, flags)
}
_ => panic!("Unsupported syscall ID: {}", syscall_num),
};

View File

@ -2,8 +2,6 @@ use core::{
ffi::{c_int, c_longlong},
time::Duration,
};
use log::warn;
use num_traits::FromPrimitive;
use system_error::SystemError;
@ -139,7 +137,7 @@ impl Syscall {
pub fn clock_gettime(clock_id: c_int, tp: *mut PosixTimeSpec) -> Result<usize, SystemError> {
let clock_id = PosixClockID::try_from(clock_id)?;
if clock_id != PosixClockID::Realtime {
warn!("clock_gettime: currently only support Realtime clock, but got {:?}. Defaultly return realtime!!!\n", clock_id);
// warn!("clock_gettime: currently only support Realtime clock, but got {:?}. Defaultly return realtime!!!\n", clock_id);
}
if tp.is_null() {
return Err(SystemError::EFAULT);