Add mlsdisk as a component

Co-authored-by: Shaowei Song <songshaowei.ssw@antgroup.com>
Author: Qingsong Chen
Date: 2024-12-27 11:49:46 +00:00
Committed by: Tate, Hongliang Tian
Parent: 6e691d5838
Commit: 56a137dc56
45 changed files with 13832 additions and 182 deletions

View File

@ -0,0 +1,95 @@
// SPDX-License-Identifier: MPL-2.0
use core::fmt;
/// The error codes used in this crate.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum Errno {
/// Transaction aborted.
TxAborted,
/// Not found.
NotFound,
/// Invalid arguments.
InvalidArgs,
/// Out of memory.
OutOfMemory,
/// Out of disk space.
OutOfDisk,
/// IO error.
IoFailed,
/// Permission denied.
PermissionDenied,
/// Unsupported.
Unsupported,
/// OS-specific unknown error.
OsSpecUnknown,
/// Encryption operation failed.
EncryptFailed,
/// Decryption operation failed.
DecryptFailed,
/// MAC (Message Authentication Code) mismatched.
MacMismatched,
/// Not aligned to `BLOCK_SIZE`.
NotBlockSizeAligned,
/// Try lock failed.
TryLockFailed,
}
/// The error type used in this crate, carrying an `Errno` and an optional error message.
#[derive(Clone, Debug)]
pub struct Error {
errno: Errno,
msg: Option<&'static str>,
}
impl Error {
/// Creates a new error with the given error type and no error message.
pub const fn new(errno: Errno) -> Self {
Error { errno, msg: None }
}
/// Creates a new error with the given error type and the error message.
pub const fn with_msg(errno: Errno, msg: &'static str) -> Self {
Error {
errno,
msg: Some(msg),
}
}
/// Returns the error type.
pub fn errno(&self) -> Errno {
self.errno
}
}
impl From<Errno> for Error {
fn from(errno: Errno) -> Self {
Error::new(errno)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{:?}", self)
}
}
impl fmt::Display for Errno {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{:?}", self)
}
}
#[macro_export]
macro_rules! return_errno {
($errno: expr) => {
return core::result::Result::Err($crate::Error::new($errno))
};
}
#[macro_export]
macro_rules! return_errno_with_msg {
($errno: expr, $msg: expr) => {
return core::result::Result::Err($crate::Error::with_msg($errno, $msg))
};
}

View File

@ -0,0 +1,243 @@
// SPDX-License-Identifier: MPL-2.0
//! This module provides APIs to represent buffers whose
//! sizes are block aligned. The advantage of using the
//! APIs provided by this module over Rust std's counterparts
//! is that the invariant of block-aligned length is enforced
//! at the type level, eliminating the need for runtime checks.
//!
//! There are three main types:
//! * `Buf`: An owned, heap-allocated buffer whose length is
//! a multiple of the block size.
//! * `BufRef`: An immutably-borrowed buffer whose length
//! is a multiple of the block size.
//! * `BufMut`: A mutably-borrowed buffer whose length is
//! a multiple of the block size.
//!
//! The basic usage is simple: replace the usage of `Box<[u8]>`
//! with `Buf`, `&[u8]` with `BufRef`,
//! and `&mut [u8]` with `BufMut`.
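//!
//! # Examples
//!
//! An illustrative sketch (imports omitted) of wrapping a plain byte slice
//! and iterating over it block by block; it assumes the slice is non-empty
//! and its length is a multiple of `BLOCK_SIZE`:
//!
//! ```
//! fn count_blocks(bytes: &[u8]) -> usize {
//!     // Fails if `bytes` is empty or not block-size aligned.
//!     let buf_ref = BufRef::try_from(bytes).unwrap();
//!     buf_ref.iter().map(|block| block.nblocks()).sum()
//! }
//! ```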
use alloc::vec;
use core::convert::TryFrom;
use lending_iterator::prelude::*;
use super::BLOCK_SIZE;
use crate::prelude::*;
/// An owned buffer whose length is a multiple of the block size.
pub struct Buf(Vec<u8>);
impl Buf {
/// Allocates a specified number of blocks as a memory buffer.
pub fn alloc(num_blocks: usize) -> Result<Self> {
if num_blocks == 0 {
return_errno_with_msg!(
InvalidArgs,
"num_blocks must be greater than 0 for allocation"
)
}
let buffer = vec![0; num_blocks * BLOCK_SIZE];
Ok(Self(buffer))
}
/// Returns the number of blocks of owned buffer.
pub fn nblocks(&self) -> usize {
self.0.len() / BLOCK_SIZE
}
/// Returns the immutable slice of owned buffer.
pub fn as_slice(&self) -> &[u8] {
self.0.as_slice()
}
/// Returns the mutable slice of owned buffer.
pub fn as_mut_slice(&mut self) -> &mut [u8] {
self.0.as_mut_slice()
}
/// Converts to immutably-borrowed buffer `BufRef`.
pub fn as_ref(&self) -> BufRef<'_> {
BufRef(self.as_slice())
}
/// Converts to mutably-borrowed buffer `BufMut`.
pub fn as_mut(&mut self) -> BufMut<'_> {
BufMut(self.as_mut_slice())
}
}
/// An immutably-borrowed buffer whose length is a multiple of the block size.
#[derive(Clone, Copy)]
pub struct BufRef<'a>(&'a [u8]);
impl BufRef<'_> {
/// Returns the immutable slice of borrowed buffer.
pub fn as_slice(&self) -> &[u8] {
self.0
}
/// Returns the number of blocks of borrowed buffer.
pub fn nblocks(&self) -> usize {
self.0.len() / BLOCK_SIZE
}
/// Returns an iterator for immutable buffers of `BLOCK_SIZE`.
pub fn iter(&self) -> BufIter<'_> {
BufIter {
buf: BufRef(self.as_slice()),
offset: 0,
}
}
}
impl<'a> TryFrom<&'a [u8]> for BufRef<'a> {
type Error = crate::error::Error;
fn try_from(buf: &'a [u8]) -> Result<Self> {
if buf.is_empty() {
return_errno_with_msg!(InvalidArgs, "empty buf in `BufRef::try_from`");
}
if buf.len() % BLOCK_SIZE != 0 {
return_errno_with_msg!(
NotBlockSizeAligned,
"buf not block size aligned `BufRef::try_from`"
);
}
let new_self = Self(buf);
Ok(new_self)
}
}
/// A mutably-borrowed buffer whose length is a multiple of the block size.
pub struct BufMut<'a>(&'a mut [u8]);
impl BufMut<'_> {
/// Returns the immutable slice of borrowed buffer.
pub fn as_slice(&self) -> &[u8] {
self.0
}
/// Returns the mutable slice of borrowed buffer.
pub fn as_mut_slice(&mut self) -> &mut [u8] {
self.0
}
/// Returns the number of blocks of borrowed buffer.
pub fn nblocks(&self) -> usize {
self.0.len() / BLOCK_SIZE
}
/// Returns an iterator for immutable buffers of `BLOCK_SIZE`.
pub fn iter(&self) -> BufIter<'_> {
BufIter {
buf: BufRef(self.as_slice()),
offset: 0,
}
}
/// Returns an iterator for mutable buffers of `BLOCK_SIZE`.
pub fn iter_mut(&mut self) -> BufIterMut<'_> {
BufIterMut {
buf: BufMut(self.as_mut_slice()),
offset: 0,
}
}
}
impl<'a> TryFrom<&'a mut [u8]> for BufMut<'a> {
type Error = crate::error::Error;
fn try_from(buf: &'a mut [u8]) -> Result<Self> {
if buf.is_empty() {
return_errno_with_msg!(InvalidArgs, "empty buf in `BufMut::try_from`");
}
if buf.len() % BLOCK_SIZE != 0 {
return_errno_with_msg!(
NotBlockSizeAligned,
"buf not block size aligned `BufMut::try_from`"
);
}
let new_self = Self(buf);
Ok(new_self)
}
}
/// Iterator for immutable buffers of `BLOCK_SIZE`.
pub struct BufIter<'a> {
buf: BufRef<'a>,
offset: usize,
}
impl<'a> Iterator for BufIter<'a> {
type Item = BufRef<'a>;
fn next(&mut self) -> Option<Self::Item> {
if self.offset >= self.buf.0.len() {
return None;
}
let offset = self.offset;
self.offset += BLOCK_SIZE;
BufRef::try_from(&self.buf.0[offset..offset + BLOCK_SIZE]).ok()
}
}
/// Iterator for mutable buffers of `BLOCK_SIZE`.
pub struct BufIterMut<'a> {
buf: BufMut<'a>,
offset: usize,
}
#[gat]
impl LendingIterator for BufIterMut<'_> {
type Item<'next> = BufMut<'next>;
fn next(&mut self) -> Option<Self::Item<'_>> {
if self.offset >= self.buf.0.len() {
return None;
}
let offset = self.offset;
self.offset += BLOCK_SIZE;
BufMut::try_from(&mut self.buf.0[offset..offset + BLOCK_SIZE]).ok()
}
}
#[cfg(test)]
mod tests {
use lending_iterator::LendingIterator;
use super::{Buf, BufMut, BufRef, BLOCK_SIZE};
fn iterate_buf_ref<'a>(buf: BufRef<'a>) {
for block in buf.iter() {
assert_eq!(block.as_slice().len(), BLOCK_SIZE);
assert_eq!(block.nblocks(), 1);
}
}
fn iterate_buf_mut<'a>(mut buf: BufMut<'a>) {
let mut iter_mut = buf.iter_mut();
while let Some(mut block) = iter_mut.next() {
assert_eq!(block.as_mut_slice().len(), BLOCK_SIZE);
assert_eq!(block.nblocks(), 1);
}
}
#[test]
fn buf() {
let mut buf = Buf::alloc(10).unwrap();
assert_eq!(buf.nblocks(), 10);
assert_eq!(buf.as_slice().len(), 10 * BLOCK_SIZE);
iterate_buf_ref(buf.as_ref());
iterate_buf_mut(buf.as_mut());
let mut buf = [0u8; BLOCK_SIZE];
iterate_buf_ref(BufRef::try_from(buf.as_slice()).unwrap());
iterate_buf_mut(BufMut::try_from(buf.as_mut_slice()).unwrap());
}
}

View File

@ -0,0 +1,133 @@
// SPDX-License-Identifier: MPL-2.0
use core::sync::atomic::{AtomicUsize, Ordering};
use inherit_methods_macro::inherit_methods;
use super::{Buf, BufMut, BufRef};
use crate::{os::Mutex, prelude::*};
/// A log of data blocks that can support random reads and append-only
/// writes.
///
/// # Thread safety
///
/// `BlockLog` is a data structure of interior mutability.
/// It is ok to perform I/O on a `BlockLog` concurrently in multiple threads.
/// `BlockLog` promises the serialization of the append operations, i.e.,
/// concurrent appends are carried out as if they are done one by one.
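///
/// # Examples
///
/// An illustrative sketch (imports omitted) of appending a block through any
/// `BlockLog` implementation and reading it back:
///
/// ```
/// fn append_then_read<L: BlockLog>(log: &L) -> Result<()> {
///     let mut buf = Buf::alloc(1)?;
///     buf.as_mut_slice().fill(0xab);
///     let pos = log.append(buf.as_ref())?;
///     log.flush()?;
///     let mut read_buf = Buf::alloc(1)?;
///     log.read(pos, read_buf.as_mut())?;
///     assert_eq!(read_buf.as_slice(), buf.as_slice());
///     Ok(())
/// }
/// ```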
pub trait BlockLog: Sync + Send {
/// Read one or multiple blocks at a specified position.
fn read(&self, pos: BlockId, buf: BufMut) -> Result<()>;
/// Append one or multiple blocks at the end,
/// returning the ID of the first newly-appended block.
fn append(&self, buf: BufRef) -> Result<BlockId>;
/// Ensure that blocks are persisted to the disk.
fn flush(&self) -> Result<()>;
/// Returns the number of blocks.
fn nblocks(&self) -> usize;
}
macro_rules! impl_blocklog_for {
($typ:ty,$from:tt) => {
#[inherit_methods(from = $from)]
impl<T: BlockLog> BlockLog for $typ {
fn read(&self, pos: BlockId, buf: BufMut) -> Result<()>;
fn append(&self, buf: BufRef) -> Result<BlockId>;
fn flush(&self) -> Result<()>;
fn nblocks(&self) -> usize;
}
};
}
impl_blocklog_for!(&T, "(**self)");
impl_blocklog_for!(&mut T, "(**self)");
impl_blocklog_for!(Box<T>, "(**self)");
impl_blocklog_for!(Arc<T>, "(**self)");
/// An in-memory log that implements `BlockLog`.
pub struct MemLog {
log: Mutex<Buf>,
append_pos: AtomicUsize,
}
impl BlockLog for MemLog {
fn read(&self, pos: BlockId, mut buf: BufMut) -> Result<()> {
let nblocks = buf.nblocks();
if pos + nblocks > self.nblocks() {
return_errno_with_msg!(InvalidArgs, "read range out of bound");
}
let log = self.log.lock();
let read_buf = &log.as_slice()[Self::offset(pos)..Self::offset(pos) + nblocks * BLOCK_SIZE];
buf.as_mut_slice().copy_from_slice(read_buf);
Ok(())
}
fn append(&self, buf: BufRef) -> Result<BlockId> {
let nblocks = buf.nblocks();
let mut log = self.log.lock();
let pos = self.append_pos.load(Ordering::Acquire);
if pos + nblocks > log.nblocks() {
return_errno_with_msg!(InvalidArgs, "append range out of bound");
}
let write_buf =
&mut log.as_mut_slice()[Self::offset(pos)..Self::offset(pos) + nblocks * BLOCK_SIZE];
write_buf.copy_from_slice(buf.as_slice());
self.append_pos.fetch_add(nblocks, Ordering::Release);
Ok(pos)
}
fn flush(&self) -> Result<()> {
Ok(())
}
fn nblocks(&self) -> usize {
self.append_pos.load(Ordering::Acquire)
}
}
impl MemLog {
/// Creates an in-memory log with a capacity of `num_blocks` blocks.
pub fn create(num_blocks: usize) -> Result<Self> {
Ok(Self {
log: Mutex::new(Buf::alloc(num_blocks)?),
append_pos: AtomicUsize::new(0),
})
}
fn offset(pos: BlockId) -> usize {
pos * BLOCK_SIZE
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn mem_log() -> Result<()> {
let total_blocks = 64;
let append_nblocks = 8;
let mem_log = MemLog::create(total_blocks)?;
assert_eq!(mem_log.nblocks(), 0);
let mut append_buf = Buf::alloc(append_nblocks)?;
let content = 5_u8;
append_buf.as_mut_slice().fill(content);
let append_pos = mem_log.append(append_buf.as_ref())?;
assert_eq!(append_pos, 0);
assert_eq!(mem_log.nblocks(), append_nblocks);
mem_log.flush()?;
let mut read_buf = Buf::alloc(1)?;
let read_pos = 7 as BlockId;
mem_log.read(read_pos, read_buf.as_mut())?;
assert_eq!(
read_buf.as_slice(),
&append_buf.as_slice()[read_pos * BLOCK_SIZE..(read_pos + 1) * BLOCK_SIZE]
);
Ok(())
}
}

View File

@ -0,0 +1,114 @@
// SPDX-License-Identifier: MPL-2.0
use super::{BlockLog, BlockSet, BufMut, BufRef};
use crate::{os::Mutex, prelude::*};
/// `BlockRing<S>` emulates a block log (`BlockLog`) with infinite
/// storage capacity by using a block set (`S: BlockSet`) of finite storage
/// capacity.
///
/// `BlockRing<S>` uses the entire storage space provided by the underlying
/// block set (`S`) for user data, maintaining no extra metadata.
/// Having no metadata, `BlockRing<S>` places three responsibilities on
/// its user:
///
/// 1. Tracking the valid block range for read.
/// `BlockRing<S>` accepts reads at any position regardless of whether the
/// position refers to a valid block. It blindly redirects the read request to
/// the underlying block set after moduloing the target position by the
/// size of the block set.
///
/// 2. Setting the cursor for appending new blocks.
/// `BlockRing<S>` won't remember the progress of writing blocks after reboot.
/// Thus, after a `BlockRing<S>` is instantiated, the user must specify the
/// append cursor (using the `set_cursor` method) before appending new blocks.
///
/// 3. Avoiding overwriting valid data blocks mistakenly.
/// As the underlying storage is used in a ring-buffer style, old
/// blocks must be overwritten to accommodate new blocks. The user must ensure
/// that the underlying storage is big enough to avoid overwriting any useful
/// data.
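///
/// # Examples
///
/// An illustrative sketch (imports omitted) of the typical workflow: set the
/// append cursor once after instantiation, then append and read:
///
/// ```
/// fn append_one_block<S: BlockSet>(storage: S) -> Result<()> {
///     let ring = BlockRing::new(storage);
///     // The cursor must be set before the first append.
///     ring.set_cursor(0);
///     let mut buf = Buf::alloc(1)?;
///     buf.as_mut_slice().fill(0x5a);
///     let pos = ring.append(buf.as_ref())?;
///     ring.flush()?;
///     let mut read_buf = Buf::alloc(1)?;
///     ring.read(pos, read_buf.as_mut())?;
///     assert_eq!(read_buf.as_slice(), buf.as_slice());
///     Ok(())
/// }
/// ```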
pub struct BlockRing<S> {
storage: S,
// The cursor for appending new blocks
cursor: Mutex<Option<BlockId>>,
}
impl<S: BlockSet> BlockRing<S> {
/// Creates a new instance.
pub fn new(storage: S) -> Self {
Self {
storage,
cursor: Mutex::new(None),
}
}
/// Sets the cursor for appending new blocks.
///
/// # Panics
///
/// Calling the `append` method without first setting the append cursor
/// via `set_cursor` causes a panic.
pub fn set_cursor(&self, new_cursor: BlockId) {
*self.cursor.lock() = Some(new_cursor);
}
/// Returns a reference to the underlying storage.
pub fn storage(&self) -> &S {
&self.storage
}
}
impl<S: BlockSet> BlockLog for BlockRing<S> {
fn read(&self, pos: BlockId, buf: BufMut) -> Result<()> {
let pos = pos % self.storage.nblocks();
self.storage.read(pos, buf)
}
fn append(&self, buf: BufRef) -> Result<BlockId> {
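// The logical append cursor grows monotonically and is returned to the
// caller as the block ID; only the on-disk position (`pos` below) wraps
// around the underlying storage.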
let cursor = self
.cursor
.lock()
.expect("cursor must be set before appending new blocks");
let pos = cursor % self.storage.nblocks();
let new_cursor = cursor + buf.nblocks();
self.storage.write(pos, buf)?;
self.set_cursor(new_cursor);
Ok(cursor)
}
fn flush(&self) -> Result<()> {
self.storage.flush()
}
fn nblocks(&self) -> usize {
self.cursor.lock().unwrap_or(0)
}
}
#[cfg(test)]
mod tests {
use super::BlockRing;
use crate::layers::bio::{BlockLog, Buf, MemDisk};
#[test]
fn block_ring() {
let num_blocks = 16;
let disk = MemDisk::create(num_blocks).unwrap();
let block_ring = BlockRing::new(disk);
block_ring.set_cursor(num_blocks);
assert_eq!(block_ring.nblocks(), num_blocks);
let mut append_buf = Buf::alloc(1).unwrap();
append_buf.as_mut_slice().fill(1);
let pos = block_ring.append(append_buf.as_ref()).unwrap();
assert_eq!(pos, num_blocks);
assert_eq!(block_ring.nblocks(), num_blocks + 1);
let mut read_buf = Buf::alloc(1).unwrap();
block_ring
.read(pos % num_blocks, read_buf.as_mut())
.unwrap();
assert_eq!(read_buf.as_slice(), append_buf.as_slice());
}
}

View File

@ -0,0 +1,227 @@
// SPDX-License-Identifier: MPL-2.0
use core::ops::Range;
use inherit_methods_macro::inherit_methods;
use super::{Buf, BufMut, BufRef};
use crate::{error::Errno, os::Mutex, prelude::*};
/// A fixed set of data blocks that can support random reads and writes.
///
/// # Thread safety
///
/// `BlockSet` is a data structure of interior mutability.
/// It is ok to perform I/O on a `BlockSet` concurrently in multiple threads.
/// `BlockSet` promises the atomicity of reading and writing individual blocks.
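///
/// # Examples
///
/// An illustrative sketch (imports omitted) of the byte-granularity helpers
/// `write_slice` and `read_slice`, which internally operate on whole blocks:
///
/// ```
/// fn overwrite_bytes<S: BlockSet>(disk: &S, offset: usize, bytes: &[u8]) -> Result<()> {
///     disk.write_slice(offset, bytes)?;
///     let mut check = vec![0u8; bytes.len()];
///     disk.read_slice(offset, &mut check)?;
///     assert_eq!(&check[..], bytes);
///     Ok(())
/// }
/// ```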
pub trait BlockSet: Sync + Send {
/// Read one or multiple blocks at a specified position.
fn read(&self, pos: BlockId, buf: BufMut) -> Result<()>;
/// Read a slice of bytes at a specified byte offset.
fn read_slice(&self, offset: usize, buf: &mut [u8]) -> Result<()> {
let start_pos = offset / BLOCK_SIZE;
let end_pos = (offset + buf.len()).div_ceil(BLOCK_SIZE);
if end_pos > self.nblocks() {
return_errno_with_msg!(Errno::InvalidArgs, "read_slice position is out of range");
}
let nblocks = end_pos - start_pos;
let mut blocks = Buf::alloc(nblocks)?;
self.read(start_pos, blocks.as_mut())?;
let offset = offset % BLOCK_SIZE;
buf.copy_from_slice(&blocks.as_slice()[offset..offset + buf.len()]);
Ok(())
}
/// Write one or multiple blocks at a specified position.
fn write(&self, pos: BlockId, buf: BufRef) -> Result<()>;
/// Write a slice of bytes at a specified byte offset.
fn write_slice(&self, offset: usize, buf: &[u8]) -> Result<()> {
let start_pos = offset / BLOCK_SIZE;
let end_pos = (offset + buf.len()).div_ceil(BLOCK_SIZE);
if end_pos > self.nblocks() {
return_errno_with_msg!(Errno::InvalidArgs, "write_slice position is out of range");
}
let nblocks = end_pos - start_pos;
let mut blocks = Buf::alloc(nblocks)?;
// Maybe we should read the first block partially.
let start_offset = offset % BLOCK_SIZE;
if start_offset != 0 {
let mut start_block = Buf::alloc(1)?;
self.read(start_pos, start_block.as_mut())?;
blocks.as_mut_slice()[..start_offset]
.copy_from_slice(&start_block.as_slice()[..start_offset]);
}
// Copy the input buffer to the write buffer.
let end_offset = start_offset + buf.len();
blocks.as_mut_slice()[start_offset..end_offset].copy_from_slice(buf);
// Maybe we should read the last block partially.
if end_offset % BLOCK_SIZE != 0 {
let mut end_block = Buf::alloc(1)?;
// The tail bytes to preserve reside in the last block of the write range.
self.read(end_pos - 1, end_block.as_mut())?;
blocks.as_mut_slice()[end_offset..]
.copy_from_slice(&end_block.as_slice()[end_offset % BLOCK_SIZE..]);
}
// Write blocks.
self.write(start_pos, blocks.as_ref())?;
Ok(())
}
/// Get a subset of the blocks in the block set.
fn subset(&self, range: Range<BlockId>) -> Result<Self>
where
Self: Sized;
/// Ensure that blocks are persisted to the disk.
fn flush(&self) -> Result<()>;
/// Returns the number of blocks.
fn nblocks(&self) -> usize;
}
macro_rules! impl_blockset_for {
($typ:ty,$from:tt,$subset_fn:expr) => {
#[inherit_methods(from = $from)]
impl<T: BlockSet> BlockSet for $typ {
fn read(&self, pos: BlockId, buf: BufMut) -> Result<()>;
fn read_slice(&self, offset: usize, buf: &mut [u8]) -> Result<()>;
fn write(&self, pos: BlockId, buf: BufRef) -> Result<()>;
fn write_slice(&self, offset: usize, buf: &[u8]) -> Result<()>;
fn flush(&self) -> Result<()>;
fn nblocks(&self) -> usize;
fn subset(&self, range: Range<BlockId>) -> Result<Self> {
let closure = $subset_fn;
closure(self, range)
}
}
};
}
impl_blockset_for!(&T, "(**self)", |_this, _range| {
return_errno_with_msg!(Errno::NotFound, "cannot return `Self` by `subset` of `&T`");
});
impl_blockset_for!(&mut T, "(**self)", |_this, _range| {
return_errno_with_msg!(
Errno::NotFound,
"cannot return `Self` by `subset` of `&mut T`"
);
});
impl_blockset_for!(Box<T>, "(**self)", |this: &T, range| {
this.subset(range).map(|v| Box::new(v))
});
impl_blockset_for!(Arc<T>, "(**self)", |this: &Arc<T>, range| {
(**this).subset(range).map(|v| Arc::new(v))
});
/// An in-memory disk that implements `BlockSet`.
///
/// The `region` is the accessible subset.
#[derive(Clone)]
pub struct MemDisk {
disk: Arc<Mutex<Buf>>,
region: Range<BlockId>,
}
impl MemDisk {
/// Creates a `MemDisk` with the given number of blocks.
pub fn create(num_blocks: usize) -> Result<Self> {
let blocks = Buf::alloc(num_blocks)?;
Ok(Self {
disk: Arc::new(Mutex::new(blocks)),
region: Range {
start: 0,
end: num_blocks,
},
})
}
}
impl BlockSet for MemDisk {
fn read(&self, pos: BlockId, mut buf: BufMut) -> Result<()> {
if pos + buf.nblocks() > self.region.end {
return_errno_with_msg!(Errno::InvalidArgs, "read position is out of range");
}
let offset = (self.region.start + pos) * BLOCK_SIZE;
let buf_len = buf.as_slice().len();
let disk = self.disk.lock();
buf.as_mut_slice()
.copy_from_slice(&disk.as_slice()[offset..offset + buf_len]);
Ok(())
}
fn write(&self, pos: BlockId, buf: BufRef) -> Result<()> {
if pos + buf.nblocks() > self.region.end {
return_errno_with_msg!(Errno::InvalidArgs, "write position is out of range");
}
let offset = (self.region.start + pos) * BLOCK_SIZE;
let buf_len = buf.as_slice().len();
let mut disk = self.disk.lock();
disk.as_mut_slice()[offset..offset + buf_len].copy_from_slice(buf.as_slice());
Ok(())
}
fn subset(&self, range: Range<BlockId>) -> Result<Self> {
if self.region.start + range.end > self.region.end {
return_errno_with_msg!(Errno::InvalidArgs, "subset is out of range");
}
Ok(MemDisk {
disk: self.disk.clone(),
region: Range {
start: self.region.start + range.start,
end: self.region.start + range.end,
},
})
}
fn flush(&self) -> Result<()> {
Ok(())
}
fn nblocks(&self) -> usize {
self.region.len()
}
}
#[cfg(test)]
mod tests {
use core::ops::Range;
use crate::layers::bio::{BlockSet, Buf, MemDisk};
#[test]
fn mem_disk() {
let num_blocks = 64;
let disk = MemDisk::create(num_blocks).unwrap();
assert_eq!(disk.nblocks(), 64);
let mut buf = Buf::alloc(1).unwrap();
buf.as_mut_slice().fill(1);
disk.write(32, buf.as_ref()).unwrap();
let range = Range { start: 32, end: 64 };
let subset = disk.subset(range).unwrap();
assert_eq!(subset.nblocks(), 32);
buf.as_mut_slice().fill(0);
subset.read(0, buf.as_mut()).unwrap();
assert_eq!(buf.as_ref().as_slice(), [1u8; 4096]);
subset.write_slice(4096 - 4, &[2u8; 8]).unwrap();
let mut buf = [0u8; 16];
subset.read_slice(4096 - 8, &mut buf).unwrap();
assert_eq!(buf, [1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0]);
}
}

View File

@ -0,0 +1,24 @@
// SPDX-License-Identifier: MPL-2.0
//! The layer of untrusted block I/O.
use static_assertions::assert_eq_size;
mod block_buf;
mod block_log;
mod block_ring;
mod block_set;
pub use self::{
block_buf::{Buf, BufMut, BufRef},
block_log::{BlockLog, MemLog},
block_ring::BlockRing,
block_set::{BlockSet, MemDisk},
};
/// The ID (i.e., the index) of a block.
pub type BlockId = usize;
/// The size of a block in bytes (4 KiB).
pub const BLOCK_SIZE: usize = 0x1000;
/// The size of a serialized `BlockId` in bytes.
pub const BID_SIZE: usize = core::mem::size_of::<BlockId>();
// This definition of BlockId assumes the target architecture is 64-bit
assert_eq_size!(usize, u64);

View File

@ -0,0 +1,358 @@
// SPDX-License-Identifier: MPL-2.0
use ostd_pod::Pod;
use super::{Iv, Key, Mac, VersionId};
use crate::{
layers::bio::{BlockSet, Buf, BLOCK_SIZE},
os::{Aead, Mutex},
prelude::*,
};
/// A cryptographically-protected blob of user data.
///
/// `CryptoBlob<B>` allows a variable length of user data to be securely
/// written to and read from a fixed, pre-allocated block set
/// (represented by `B: BlockSet`) on disk. Naturally, the length of the user
/// data must be smaller than the capacity of the block set.
///
/// # On-disk format
///
/// The on-disk format of `CryptoBlob` is shown below.
///
/// ```
/// ┌─────────┬─────────┬─────────┬──────────────────────────────┐
/// │VersionId│ MAC │ Length │ Encrypted Payload │
/// │ (8B) │ (16B) │ (8B) │ (Length bytes) │
/// └─────────┴─────────┴─────────┴──────────────────────────────┘
/// ```
///
/// The version ID increments by one each time the `CryptoBlob` is updated.
/// The MAC protects the integrity of the length and the encrypted payload.
///
/// # Security
///
/// To ensure the confidentiality and integrity of user data, `CryptoBlob`
/// takes several measures:
///
/// 1. Each instance of `CryptoBlob` is associated with a randomly-generated,
/// unique encryption key.
/// 2. Each instance of `CryptoBlob` maintains a version ID, which is
/// automatically incremented by one upon each write.
/// 3. The user data written to a `CryptoBlob` is protected with authenticated
/// encryption before being persisted to the disk.
/// The encryption takes the current version ID as the IV and generates a MAC
/// as the output.
/// 4. To read user data from a `CryptoBlob`, the untrusted on-disk data is
/// first decrypted with the encryption key associated with this object and
/// its integrity is validated. Optionally, the user can check the version ID
/// of the decrypted user data and see if the version ID is up-to-date.
///
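/// # Examples
///
/// An illustrative sketch (imports omitted) of creating a blob, overwriting
/// its content, and reading it back; the `roundtrip` helper is made up for
/// the example:
///
/// ```
/// fn roundtrip<B: BlockSet>(block_set: B) -> Result<()> {
///     let mut blob = CryptoBlob::create(block_set, b"hello")?;
///     assert_eq!(blob.version_id(), Some(0));
///     blob.write(b"world")?; // bumps the version ID to 1
///     let mut buf = [0u8; 16];
///     let len = blob.read(&mut buf)?;
///     assert_eq!(&buf[..len], b"world");
///     Ok(())
/// }
/// ```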
pub struct CryptoBlob<B> {
block_set: B,
key: Key,
header: Mutex<Option<Header>>,
}
#[repr(C)]
#[derive(Copy, Clone, Pod)]
struct Header {
version: VersionId,
mac: Mac,
payload_len: usize,
}
impl<B: BlockSet> CryptoBlob<B> {
/// The size of the header of a crypto blob in bytes.
pub const HEADER_NBYTES: usize = core::mem::size_of::<Header>();
/// Opens an existing `CryptoBlob`.
///
/// The capacity of this `CryptoBlob` object is determined by the size
/// of `block_set: B`.
pub fn open(key: Key, block_set: B) -> Self {
Self {
block_set,
key,
header: Mutex::new(None),
}
}
/// Creates a new `CryptoBlob`.
///
/// The encryption key of a `CryptoBlob` is generated randomly so that
/// no two `CryptoBlob` instances shall ever use the same key.
pub fn create(block_set: B, init_data: &[u8]) -> Result<Self> {
let capacity = block_set.nblocks() * BLOCK_SIZE - Self::HEADER_NBYTES;
if init_data.len() > capacity {
return_errno_with_msg!(OutOfDisk, "init_data is too large");
}
let nblocks = (Self::HEADER_NBYTES + init_data.len()).div_ceil(BLOCK_SIZE);
let mut block_buf = Buf::alloc(nblocks)?;
// Encrypt init_data.
let aead = Aead::new();
let key = Key::random();
let version: VersionId = 0;
let mut iv = Iv::new_zeroed();
iv.as_bytes_mut()[..version.as_bytes().len()].copy_from_slice(version.as_bytes());
let output = &mut block_buf.as_mut_slice()
[Self::HEADER_NBYTES..Self::HEADER_NBYTES + init_data.len()];
let mac = aead.encrypt(init_data, &key, &iv, &[], output)?;
// Store header.
let header = Header {
version,
mac,
payload_len: init_data.len(),
};
block_buf.as_mut_slice()[..Self::HEADER_NBYTES].copy_from_slice(header.as_bytes());
// Write to `BlockSet`.
block_set.write(0, block_buf.as_ref())?;
Ok(Self {
block_set,
key,
header: Mutex::new(Some(header)),
})
}
/// Write the buffer to the disk as the latest version of the content of
/// this `CryptoBlob`.
///
/// The size of the buffer must not be greater than the capacity of this
/// `CryptoBlob`.
///
/// Each successful write increments the version ID by one. If
/// there is no valid version ID, an `Error` will be returned.
/// The user can obtain a valid version ID either by a successful call to
/// `read` or by calling `recover_from` with another valid `CryptoBlob`.
///
/// # Security
///
/// This content is guaranteed to be confidential as long as the key is not
/// known to an attacker.
pub fn write(&mut self, buf: &[u8]) -> Result<VersionId> {
if buf.len() > self.capacity() {
return_errno_with_msg!(OutOfDisk, "write data is too large");
}
let nblocks = (Self::HEADER_NBYTES + buf.len()).div_ceil(BLOCK_SIZE);
let mut block_buf = Buf::alloc(nblocks)?;
// Encrypt payload.
let aead = Aead::new();
let version = match self.version_id() {
Some(version) => version + 1,
None => return_errno_with_msg!(NotFound, "write with no valid version ID"),
};
let mut iv = Iv::new_zeroed();
iv.as_bytes_mut()[..version.as_bytes().len()].copy_from_slice(version.as_bytes());
let output =
&mut block_buf.as_mut_slice()[Self::HEADER_NBYTES..Self::HEADER_NBYTES + buf.len()];
let mac = aead.encrypt(buf, &self.key, &iv, &[], output)?;
// Store header.
let header = Header {
version,
mac,
payload_len: buf.len(),
};
block_buf.as_mut_slice()[..Self::HEADER_NBYTES].copy_from_slice(header.as_bytes());
// Write to `BlockSet`.
self.block_set.write(0, block_buf.as_ref())?;
*self.header.lock() = Some(header);
Ok(version)
}
/// Read the content of the `CryptoBlob` from the disk into the buffer.
///
/// The given buffer must have a length that is no less than the size of
/// the plaintext content of this `CryptoBlob`.
///
/// # Security
///
/// This content, including its length, is guaranteed to be authentic.
pub fn read(&self, buf: &mut [u8]) -> Result<usize> {
let header = match *self.header.lock() {
Some(header) => header,
None => {
let mut header = Header::new_zeroed();
self.block_set.read_slice(0, header.as_bytes_mut())?;
header
}
};
if header.payload_len > self.capacity() {
return_errno_with_msg!(OutOfDisk, "payload_len is greater than the capacity");
}
if header.payload_len > buf.len() {
return_errno_with_msg!(OutOfDisk, "read_buf is too small");
}
let nblock = (Self::HEADER_NBYTES + header.payload_len).div_ceil(BLOCK_SIZE);
let mut block_buf = Buf::alloc(nblock)?;
self.block_set.read(0, block_buf.as_mut())?;
// Decrypt payload.
let aead = Aead::new();
let version = header.version;
let mut iv = Iv::new_zeroed();
iv.as_bytes_mut()[..version.as_bytes().len()].copy_from_slice(version.as_bytes());
let input =
&block_buf.as_slice()[Self::HEADER_NBYTES..Self::HEADER_NBYTES + header.payload_len];
let output = &mut buf[..header.payload_len];
aead.decrypt(input, &self.key, &iv, &[], &header.mac, output)?;
*self.header.lock() = Some(header);
Ok(header.payload_len)
}
/// Returns the key associated with this `CryptoBlob`.
pub fn key(&self) -> &Key {
&self.key
}
/// Returns the current version ID.
///
/// # Security
///
/// It is valid after a successful call to `create`, `read` or `write`.
/// The user can also obtain a version ID from another valid `CryptoBlob`
/// (usually a backup) through the `recover_from` method.
pub fn version_id(&self) -> Option<VersionId> {
self.header.lock().map(|header| header.version)
}
/// Recover from another `CryptoBlob`.
///
/// If this `CryptoBlob` doesn't have a valid version ID, e.g., because payload
/// decryption failed during `read`, the user can call this method to recover
/// the version ID and payload from another `CryptoBlob` (usually a backup).
pub fn recover_from(&mut self, other: &CryptoBlob<B>) -> Result<()> {
if self.capacity() != other.capacity() {
return_errno_with_msg!(InvalidArgs, "capacity not aligned, recover failed");
}
if self.header.lock().is_some() {
return_errno_with_msg!(InvalidArgs, "no need to recover");
}
let nblocks = self.block_set.nblocks();
// Read version ID and payload from another `CryptoBlob`.
let mut read_buf = Buf::alloc(nblocks)?;
let payload_len = other.read(read_buf.as_mut_slice())?;
let version = other.version_id().unwrap();
// Encrypt payload.
let aead = Aead::new();
let mut iv = Iv::new_zeroed();
iv.as_bytes_mut()[..version.as_bytes().len()].copy_from_slice(version.as_bytes());
let input = &read_buf.as_slice()[..payload_len];
let mut write_buf = Buf::alloc(nblocks)?;
let output =
&mut write_buf.as_mut_slice()[Self::HEADER_NBYTES..Self::HEADER_NBYTES + payload_len];
let mac = aead.encrypt(input, self.key(), &iv, &[], output)?;
// Store header.
let header = Header {
version,
mac,
payload_len,
};
write_buf.as_mut_slice()[..Self::HEADER_NBYTES].copy_from_slice(header.as_bytes());
// Write to `BlockSet`.
self.block_set.write(0, write_buf.as_ref())?;
*self.header.lock() = Some(header);
Ok(())
}
/// Returns the current MAC of encrypted payload.
///
/// # Security
///
/// It is valid after a successful call to `create`, `read` or `write`.
pub fn current_mac(&self) -> Option<Mac> {
self.header.lock().map(|header| header.mac)
}
/// Returns the capacity of this `CryptoBlob` in bytes.
pub fn capacity(&self) -> usize {
self.block_set.nblocks() * BLOCK_SIZE - Self::HEADER_NBYTES
}
/// Returns the number of blocks occupied by the underlying `BlockSet`.
pub fn nblocks(&self) -> usize {
self.block_set.nblocks()
}
}
#[cfg(test)]
mod tests {
use super::CryptoBlob;
use crate::layers::bio::{BlockSet, MemDisk, BLOCK_SIZE};
#[test]
fn create() {
let disk = MemDisk::create(2).unwrap();
let init_data = [1u8; BLOCK_SIZE];
let blob = CryptoBlob::create(disk, &init_data).unwrap();
println!("blob key: {:?}", blob.key());
assert_eq!(blob.version_id(), Some(0));
assert_eq!(blob.nblocks(), 2);
assert_eq!(
blob.capacity(),
2 * BLOCK_SIZE - CryptoBlob::<MemDisk>::HEADER_NBYTES
);
}
#[test]
fn open_and_read() {
let disk = MemDisk::create(4).unwrap();
let key = {
let subset = disk.subset(0..2).unwrap();
let init_data = [1u8; 1024];
let blob = CryptoBlob::create(subset, &init_data).unwrap();
blob.key
};
let subset = disk.subset(0..2).unwrap();
let blob = CryptoBlob::open(key, subset);
assert_eq!(blob.version_id(), None);
assert_eq!(blob.nblocks(), 2);
let mut buf = [0u8; BLOCK_SIZE];
let payload_len = blob.read(&mut buf).unwrap();
assert_eq!(buf[..payload_len], [1u8; 1024]);
}
#[test]
fn write() {
let disk = MemDisk::create(2).unwrap();
let init_data = [0u8; BLOCK_SIZE];
let mut blob = CryptoBlob::create(disk, &init_data).unwrap();
let write_buf = [1u8; 1024];
blob.write(&write_buf).unwrap();
let mut read_buf = [0u8; 1024];
blob.read(&mut read_buf).unwrap();
assert_eq!(read_buf, [1u8; 1024]);
assert_eq!(blob.version_id(), Some(1));
}
#[test]
fn recover_from() {
let disk = MemDisk::create(2).unwrap();
let init_data = [1u8; 1024];
let subset0 = disk.subset(0..1).unwrap();
let mut blob0 = CryptoBlob::create(subset0, &init_data).unwrap();
assert_eq!(blob0.version_id(), Some(0));
blob0.write(&init_data).unwrap();
assert_eq!(blob0.version_id(), Some(1));
let subset1 = disk.subset(1..2).unwrap();
let mut blob1 = CryptoBlob::open(blob0.key, subset1);
assert_eq!(blob1.version_id(), None);
blob1.recover_from(&blob0).unwrap();
let mut read_buf = [0u8; BLOCK_SIZE];
let payload_len = blob1.read(&mut read_buf).unwrap();
assert_eq!(read_buf[..payload_len], [1u8; 1024]);
assert_eq!(blob1.version_id(), Some(1));
}
}

View File

@ -0,0 +1,401 @@
// SPDX-License-Identifier: MPL-2.0
use core::ops::Range;
use lending_iterator::prelude::*;
use ostd_pod::Pod;
use super::{Iv, Key, Mac};
use crate::{
layers::bio::{BlockId, BlockLog, Buf, BLOCK_SIZE},
os::Aead,
prelude::*,
};
/// A cryptographically-protected chain of blocks.
///
/// `CryptoChain<L>` allows writing and reading a sequence of
/// consecutive blocks securely to and from an untrusted storage of data log
/// `L: BlockLog`.
/// The target use case of `CryptoChain` is to implement secure journals,
/// where old data are scanned and new data are appended.
///
/// # On-disk format
///
/// The on-disk format of each block is shown below.
///
/// ```text
/// ┌─────────────────────┬───────┬──────────┬──────────┬──────────┬─────────┐
/// │ Encrypted payload │ Gap │ Length │ PreMac │ CurrMac │ IV │
/// │(Length <= 4KB - 48B)│ │ (4B) │ (16B) │ (16B) │ (12B) │
/// └─────────────────────┴───────┴──────────┴──────────┴──────────┴─────────┘
///
/// ◄─────────────────────────── Block size (4KB) ──────────────────────────►
/// ```
///
/// Each block begins with the encrypted user payload. The size of the payload
/// must be smaller than the block size as each block ends with a footer
/// (in plaintext).
/// The footer consists of four parts: the length of the payload (in bytes),
/// the MAC of the previous block, the MAC of the current block, the IV used
/// for encrypting the current block.
/// The MAC of a block protects the encrypted payload, its length, and the MAC
/// of the previous block.
///
/// # Security
///
/// Each `CryptoChain` is assigned a randomly-generated encryption key.
/// Each block is encrypted using this key and a randomly-generated IV.
/// This setup ensures the confidentiality of the payload, and even identical
/// payloads result in different ciphertexts.
///
/// `CryptoChain` is called a "chain" of blocks because each block
/// not only stores its own MAC, but also the MAC of its previous block.
/// This effectively forms a "chain" (much like a blockchain),
/// ensuring the order and consecutiveness of the sequence of blocks.
///
/// Due to this chain structure, the integrity of a `CryptoChain` can be ensured
/// by verifying the MAC of the last block. Once the integrity of the last block
/// is verified, the integrity of all previous blocks can also be verified.
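///
/// # Examples
///
/// An illustrative sketch (imports omitted) of appending a few payloads and
/// reading one of them back:
///
/// ```
/// fn append_and_read<L: BlockLog>(block_log: L) -> Result<()> {
///     let mut chain = CryptoChain::new(block_log);
///     chain.append(b"first payload")?;
///     chain.append(b"second payload")?;
///     chain.flush()?;
///     // A block is 4 KiB, so this buffer is large enough for any payload.
///     let mut buf = [0u8; 4096];
///     let len = chain.read(1, &mut buf)?;
///     assert_eq!(&buf[..len], b"second payload");
///     Ok(())
/// }
/// ```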
pub struct CryptoChain<L> {
block_log: L,
key: Key,
block_range: Range<BlockId>,
block_macs: Vec<Mac>,
}
#[repr(C)]
#[derive(Copy, Clone, Pod)]
struct Footer {
len: u32,
pre_mac: Mac,
this_mac: Mac,
this_iv: Iv,
}
impl<L: BlockLog> CryptoChain<L> {
/// The available payload size in each chained block; it is smaller than
/// the block size because each block ends with a plaintext footer.
pub const AVAIL_BLOCK_SIZE: usize = BLOCK_SIZE - core::mem::size_of::<Footer>();
/// Construct a new `CryptoChain` using `block_log: L` as the storage.
pub fn new(block_log: L) -> Self {
Self {
block_log,
block_range: 0..0,
key: Key::random(),
block_macs: Vec::new(),
}
}
/// Recover an existing `CryptoChain` backed by `block_log: L`,
/// starting from its `from` block.
pub fn recover(key: Key, block_log: L, from: BlockId) -> Recovery<L> {
Recovery::new(block_log, key, from)
}
/// Read a block at a specified position.
///
/// The length of the given buffer should not be smaller than the payload
/// length stored in the block's `Footer`.
///
/// # Security
///
/// The authenticity of the block is guaranteed.
pub fn read(&self, pos: BlockId, buf: &mut [u8]) -> Result<usize> {
if !self.block_range().contains(&pos) {
return_errno_with_msg!(NotFound, "read position is out of range");
}
// Read block and get footer.
let mut block_buf = Buf::alloc(1)?;
self.block_log.read(pos, block_buf.as_mut())?;
let footer: Footer = Pod::from_bytes(&block_buf.as_slice()[Self::AVAIL_BLOCK_SIZE..]);
let payload_len = footer.len as usize;
if payload_len > Self::AVAIL_BLOCK_SIZE || payload_len > buf.len() {
return_errno_with_msg!(OutOfDisk, "wrong payload_len or the read_buf is too small");
}
// Check the footer MAC to ensure the order and consecutiveness of blocks.
let this_mac = self.block_macs.get(pos - self.block_range.start).unwrap();
if footer.this_mac.as_bytes() != this_mac.as_bytes() {
return_errno_with_msg!(NotFound, "check footer MAC failed");
}
// Decrypt payload.
let aead = Aead::new();
aead.decrypt(
&block_buf.as_slice()[..payload_len],
self.key(),
&footer.this_iv,
&footer.pre_mac,
&footer.this_mac,
&mut buf[..payload_len],
)?;
Ok(payload_len)
}
/// Append a block at the end.
///
/// The length of the given buffer must not be larger than `AVAIL_BLOCK_SIZE`.
///
/// # Security
///
/// The confidentiality of the block is guaranteed.
pub fn append(&mut self, buf: &[u8]) -> Result<()> {
if buf.len() > Self::AVAIL_BLOCK_SIZE {
return_errno_with_msg!(OutOfDisk, "append data is too large");
}
let mut block_buf = Buf::alloc(1)?;
// Encrypt payload.
let aead = Aead::new();
let this_iv = Iv::random();
let pre_mac = self.block_macs.last().copied().unwrap_or_default();
let output = &mut block_buf.as_mut_slice()[..buf.len()];
let this_mac = aead.encrypt(buf, self.key(), &this_iv, &pre_mac, output)?;
// Store footer.
let footer = Footer {
len: buf.len() as _,
pre_mac,
this_mac,
this_iv,
};
let buf = &mut block_buf.as_mut_slice()[Self::AVAIL_BLOCK_SIZE..];
buf.copy_from_slice(footer.as_bytes());
self.block_log.append(block_buf.as_ref())?;
self.block_range.end += 1;
self.block_macs.push(this_mac);
Ok(())
}
/// Ensures the persistence of data.
pub fn flush(&self) -> Result<()> {
self.block_log.flush()
}
/// Trim the blocks before a specified position (exclusive).
///
/// The purpose of this method is to free some memory used for keeping the
/// MACs of accessible blocks. After trimming, the range of accessible
/// blocks is shrunk accordingly.
pub fn trim(&mut self, before_block: BlockId) {
// We must ensure the invariance that there is at least one valid block
// after trimming.
debug_assert!(before_block < self.block_range.end);
if before_block <= self.block_range.start {
return;
}
let num_blocks_trimmed = before_block - self.block_range.start;
self.block_range.start = before_block;
self.block_macs.drain(..num_blocks_trimmed);
}
/// Returns the range of blocks that are accessible through the `CryptoChain`.
pub fn block_range(&self) -> &Range<BlockId> {
&self.block_range
}
/// Returns the underlying block log.
pub fn inner_log(&self) -> &L {
&self.block_log
}
/// Returns the encryption key of the `CryptoChain`.
pub fn key(&self) -> &Key {
&self.key
}
}
/// `Recovery<L>` represents an instance of `CryptoChain<L>` being recovered.
///
/// An object `Recovery<L>` attempts to recover as many valid blocks of
/// a `CryptoChain` as possible. A block is valid if and only if its real MAC
/// is equal to the MAC value recorded in its successor.
///
/// For the last block, which does not have a successor block, the user
/// can obtain its MAC from `Recovery<L>` and verify the MAC by comparing it
/// with an expected value from another trusted source.
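///
/// # Examples
///
/// An illustrative sketch (imports omitted) of recovering as many blocks as
/// possible and then opening the chain, assuming `key` and `block_log` come
/// from a previously persisted `CryptoChain`:
///
/// ```
/// fn recover_chain<L: BlockLog>(key: Key, block_log: L, from: BlockId) -> CryptoChain<L> {
///     let mut recovery = CryptoChain::recover(key, block_log, from);
///     // Each successful call to `next` yields one more valid payload.
///     while let Some(_payload) = recovery.next() {}
///     recovery.open()
/// }
/// ```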
pub struct Recovery<L> {
block_log: L,
key: Key,
block_range: Range<BlockId>,
block_macs: Vec<Mac>,
read_buf: Buf,
payload: Buf,
}
impl<L: BlockLog> Recovery<L> {
/// Construct a new `Recovery` from the `first_block` of
/// `block_log: L`, using a cryptographic `key`.
pub fn new(block_log: L, key: Key, first_block: BlockId) -> Self {
Self {
block_log,
key,
block_range: first_block..first_block,
block_macs: Vec::new(),
read_buf: Buf::alloc(1).unwrap(),
payload: Buf::alloc(1).unwrap(),
}
}
/// Returns the number of valid blocks.
///
/// Each successful call to `next` increments the number of valid blocks.
pub fn num_blocks(&self) -> usize {
self.block_range.len()
}
/// Returns the range of valid blocks.
///
/// Each successful call to `next` increments the upper bound by one.
pub fn block_range(&self) -> &Range<BlockId> {
&self.block_range
}
/// Returns the MACs of valid blocks.
///
/// Each successful call to `next` pushes the MAC of the new valid block.
pub fn block_macs(&self) -> &[Mac] {
&self.block_macs
}
/// Open a `CryptoChain<L>` from the recovery object.
///
/// The user should call `next` to retrieve as many valid blocks as possible.
pub fn open(self) -> CryptoChain<L> {
CryptoChain {
block_log: self.block_log,
key: self.key,
block_range: self.block_range,
block_macs: self.block_macs,
}
}
}
#[gat]
impl<L: BlockLog> LendingIterator for Recovery<L> {
type Item<'a> = &'a [u8];
fn next(&mut self) -> Option<Self::Item<'_>> {
let next_block_id = self.block_range.end;
self.block_log
.read(next_block_id, self.read_buf.as_mut())
.ok()?;
// Deserialize footer.
let footer: Footer =
Pod::from_bytes(&self.read_buf.as_slice()[CryptoChain::<L>::AVAIL_BLOCK_SIZE..]);
let payload_len = footer.len as usize;
if payload_len > CryptoChain::<L>::AVAIL_BLOCK_SIZE {
return None;
}
// Decrypt payload.
let aead = Aead::new();
aead.decrypt(
&self.read_buf.as_slice()[..payload_len],
&self.key,
&footer.this_iv,
&footer.pre_mac,
&footer.this_mac,
&mut self.payload.as_mut_slice()[..payload_len],
)
.ok()?;
// Crypto blocks are chained: each block stores not only
// the MAC of its own, but also the MAC of its previous block.
// So we need to check whether the two MAC values are the same.
// The one exception is that the `pre_mac` of the first block
// is NOT checked.
if self
.block_macs()
.last()
.is_some_and(|mac| mac.as_bytes() != footer.pre_mac.as_bytes())
{
return None;
}
self.block_range.end += 1;
self.block_macs.push(footer.this_mac);
Some(&self.payload.as_slice()[..payload_len])
}
}
#[cfg(test)]
mod tests {
use lending_iterator::LendingIterator;
use super::CryptoChain;
use crate::layers::bio::{BlockLog, BlockRing, BlockSet, MemDisk};
#[test]
fn new() {
let disk = MemDisk::create(16).unwrap();
let block_ring = BlockRing::new(disk);
block_ring.set_cursor(0);
let chain = CryptoChain::new(block_ring);
assert_eq!(chain.block_log.nblocks(), 0);
assert_eq!(chain.block_range.start, 0);
assert_eq!(chain.block_range.end, 0);
assert_eq!(chain.block_macs.len(), 0);
}
#[test]
fn append_trim_and_read() {
let disk = MemDisk::create(16).unwrap();
let block_ring = BlockRing::new(disk);
block_ring.set_cursor(0);
let mut chain = CryptoChain::new(block_ring);
let data = [1u8; 1024];
chain.append(&data[..256]).unwrap();
chain.append(&data[..512]).unwrap();
assert_eq!(chain.block_range.end, 2);
assert_eq!(chain.block_macs.len(), 2);
chain.trim(1);
assert_eq!(chain.block_range.start, 1);
assert_eq!(chain.block_range.end, 2);
assert_eq!(chain.block_macs.len(), 1);
let mut buf = [0u8; 1024];
let len = chain.read(1, &mut buf).unwrap();
assert_eq!(len, 512);
assert_eq!(buf[..512], [1u8; 512]);
}
#[test]
fn recover() {
let disk = MemDisk::create(16).unwrap();
let key = {
let sub_disk = disk.subset(0..8).unwrap();
let block_ring = BlockRing::new(sub_disk);
block_ring.set_cursor(0);
let data = [1u8; 1024];
let mut chain = CryptoChain::new(block_ring);
for _ in 0..4 {
chain.append(&data).unwrap();
}
chain.flush().unwrap();
chain.key
};
let sub_disk = disk.subset(0..8).unwrap();
let block_ring = BlockRing::new(sub_disk);
let mut recover = CryptoChain::recover(key, block_ring, 2);
while let Some(payload) = recover.next() {
assert_eq!(payload.len(), 1024);
}
let chain = recover.open();
assert_eq!(chain.block_range(), &(2..4));
assert_eq!(chain.block_macs.len(), 2);
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,18 @@
// SPDX-License-Identifier: MPL-2.0
//! The layer of cryptographic constructs.
mod crypto_blob;
mod crypto_chain;
mod crypto_log;
pub use self::{
crypto_blob::CryptoBlob,
crypto_chain::CryptoChain,
crypto_log::{CryptoLog, NodeCache, RootMhtMeta},
};
/// The key type used by authenticated encryption (AEAD).
pub type Key = crate::os::AeadKey;
/// The initialization vector (IV) type used by authenticated encryption (AEAD).
pub type Iv = crate::os::AeadIv;
/// The message authentication code (MAC) type produced by authenticated encryption (AEAD).
pub type Mac = crate::os::AeadMac;
/// The version ID of some cryptographically-protected content, incremented on each update.
pub type VersionId = u64;

View File

@ -0,0 +1,154 @@
// SPDX-License-Identifier: MPL-2.0
use core::marker::PhantomData;
use serde::{ser::SerializeSeq, Deserialize, Serialize};
use crate::prelude::*;
/// An edit of type `Edit<S>` is an incremental change to a state of type `S`.
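///
/// # Examples
///
/// An illustrative sketch (imports omitted) of a counter state and an edit
/// that increments it; the names `Counter` and `AddEdit` are made up for the
/// example:
///
/// ```
/// struct Counter {
///     count: i64,
/// }
///
/// #[derive(Serialize, Deserialize)]
/// struct AddEdit {
///     delta: i64,
/// }
///
/// impl Edit<Counter> for AddEdit {
///     fn apply_to(&self, state: &mut Counter) {
///         state.count += self.delta;
///     }
/// }
/// ```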
pub trait Edit<S>: Serialize + for<'de> Deserialize<'de> {
/// Apply this edit to a state.
fn apply_to(&self, state: &mut S);
}
/// A group of edits to a state.
pub struct EditGroup<E: Edit<S>, S> {
edits: Vec<E>,
_s: PhantomData<S>,
}
impl<E: Edit<S>, S> EditGroup<E, S> {
/// Creates an empty edit group.
pub fn new() -> Self {
Self {
edits: Vec::new(),
_s: PhantomData,
}
}
/// Adds an edit to the group.
pub fn push(&mut self, edit: E) {
self.edits.push(edit);
}
/// Returns an iterator to the contained edits.
pub fn iter(&self) -> impl Iterator<Item = &E> {
self.edits.iter()
}
/// Clears the edit group by removing all contained edits.
pub fn clear(&mut self) {
self.edits.clear()
}
/// Returns whether the edit group contains no edits.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Returns the length of the edit group.
pub fn len(&self) -> usize {
self.edits.len()
}
}
impl<E: Edit<S>, S> Edit<S> for EditGroup<E, S> {
fn apply_to(&self, state: &mut S) {
for edit in &self.edits {
edit.apply_to(state);
}
}
}
impl<E: Edit<S>, S> Serialize for EditGroup<E, S> {
fn serialize<Se>(&self, serializer: Se) -> core::result::Result<Se::Ok, Se::Error>
where
Se: serde::Serializer,
{
let mut seq = serializer.serialize_seq(Some(self.len()))?;
for edit in &self.edits {
seq.serialize_element(edit)?
}
seq.end()
}
}
impl<'de, E: Edit<S>, S> Deserialize<'de> for EditGroup<E, S> {
fn deserialize<D>(deserializer: D) -> core::result::Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct EditsVisitor<E: Edit<S>, S> {
_p: PhantomData<(E, S)>,
}
impl<'a, E: Edit<S>, S> serde::de::Visitor<'a> for EditsVisitor<E, S> {
type Value = EditGroup<E, S>;
fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
formatter.write_str("an edit group")
}
fn visit_seq<A>(self, mut seq: A) -> core::result::Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'a>,
{
let mut edits = Vec::with_capacity(seq.size_hint().unwrap_or(0));
while let Some(e) = seq.next_element()? {
edits.push(e);
}
Ok(EditGroup {
edits,
_s: PhantomData,
})
}
}
deserializer.deserialize_seq(EditsVisitor { _p: PhantomData })
}
}
#[cfg(test)]
mod tests {
use serde::{Deserialize, Serialize};
use super::*;
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct XEdit {
x: i32,
}
struct XState {
sum: i32,
}
impl Edit<XState> for XEdit {
fn apply_to(&self, state: &mut XState) {
(*state).sum += self.x;
}
}
#[test]
fn serde_edit() {
let mut group = EditGroup::<XEdit, XState>::new();
let mut sum = 0;
for x in 0..10 {
sum += x;
let edit = XEdit { x };
group.push(edit);
}
let mut state = XState { sum: 0 };
group.apply_to(&mut state);
assert_eq!(state.sum, sum);
let mut buf = [0u8; 64];
let ser = postcard::to_slice(&group, buf.as_mut_slice()).unwrap();
println!("serialize len: {} data: {:?}", ser.len(), ser);
let de: EditGroup<XEdit, XState> = postcard::from_bytes(buf.as_slice()).unwrap();
println!("deserialize edits: {:?}", de.edits);
assert_eq!(de.len(), group.len());
assert_eq!(de.edits.as_slice(), group.edits.as_slice());
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,13 @@
// SPDX-License-Identifier: MPL-2.0
//! The layer of edit journal.
mod edits;
mod journal;
pub use self::{
edits::{Edit, EditGroup},
journal::{
CompactPolicy, DefaultCompactPolicy, EditJournal, EditJournalMeta, NeverCompactPolicy,
},
};

View File

@ -0,0 +1,480 @@
// SPDX-License-Identifier: MPL-2.0
//! Chunk-based storage management.
//!
//! A chunk is a group of consecutive blocks.
//! As the size of a chunk is much greater than that of a block,
//! the number of chunks is naturally far smaller than that of blocks.
//! This makes it possible to keep all metadata for chunks in memory.
//! Thus, managing chunks is more efficient than managing blocks.
//!
//! The primary API provided by this module is the chunk allocator,
//! `ChunkAlloc`, which tracks whether chunks are free or not.
//!
//! # Examples
//!
//! Chunk allocators are used within transactions.
//!
//! ```
//! fn alloc_chunks(chunk_alloc: &ChunkAlloc, num_chunks: usize) -> Option<Vec<ChunkId>> {
//! let mut tx = chunk_alloc.new_tx();
//! let res: Option<Vec<ChunkId>> = tx.context(|| {
//! let mut chunk_ids = Vec::new();
//! for _ in 0..num_chunks {
//! chunk_ids.push(chunk_alloc.alloc()?);
//! }
//! Some(chunk_ids)
//! });
//! if res.is_some() {
//! tx.commit().ok()?;
//! } else {
//! tx.abort();
//! }
//! res
//! }
//! ```
//!
//! The above example showcases the power of transaction atomicity:
//! if anything goes wrong (e.g., allocation failures) during the transaction,
//! then the transaction can be aborted and all changes made to `chunk_alloc`
//! during the transaction will be rolled back automatically.
use serde::{Deserialize, Serialize};
use crate::{
layers::edit::Edit,
os::{HashMap, Mutex},
prelude::*,
tx::{CurrentTx, TxData, TxProvider},
util::BitMap,
};
/// The ID of a chunk.
pub type ChunkId = usize;
/// Number of blocks of a chunk.
pub const CHUNK_NBLOCKS: usize = 1024;
/// The chunk size is a multiple of the block size.
pub const CHUNK_SIZE: usize = CHUNK_NBLOCKS * BLOCK_SIZE;
/// A chunk allocator tracks which chunks are free.
#[derive(Clone)]
pub struct ChunkAlloc {
state: Arc<Mutex<ChunkAllocState>>,
tx_provider: Arc<TxProvider>,
}
impl ChunkAlloc {
/// Creates a chunk allocator that manages a specified number of
/// chunks (`capacity`). Initially, all chunks are free.
pub fn new(capacity: usize, tx_provider: Arc<TxProvider>) -> Self {
let state = ChunkAllocState::new(capacity);
Self::from_parts(state, tx_provider)
}
/// Constructs a `ChunkAlloc` from its parts.
pub(super) fn from_parts(mut state: ChunkAllocState, tx_provider: Arc<TxProvider>) -> Self {
state.in_journal = false;
let new_self = Self {
state: Arc::new(Mutex::new(state)),
tx_provider,
};
// TX data
new_self
.tx_provider
.register_data_initializer(Box::new(ChunkAllocEdit::new));
// Commit handler
new_self.tx_provider.register_commit_handler({
let state = new_self.state.clone();
move |current: CurrentTx<'_>| {
let state = state.clone();
current.data_with(move |edit: &ChunkAllocEdit| {
if edit.edit_table.is_empty() {
return;
}
let mut state = state.lock();
edit.apply_to(&mut state);
});
}
});
// Abort handler
new_self.tx_provider.register_abort_handler({
let state = new_self.state.clone();
move |current: CurrentTx<'_>| {
let state = state.clone();
current.data_with(move |edit: &ChunkAllocEdit| {
let mut state = state.lock();
for chunk_id in edit.iter_allocated_chunks() {
state.dealloc(chunk_id);
}
});
}
});
new_self
}
/// Creates a new transaction for the chunk allocator.
pub fn new_tx(&self) -> CurrentTx<'_> {
self.tx_provider.new_tx()
}
/// Allocates a chunk, returning its ID.
pub fn alloc(&self) -> Option<ChunkId> {
let chunk_id = {
let mut state = self.state.lock();
state.alloc()? // Update global state immediately
};
let mut current_tx = self.tx_provider.current();
current_tx.data_mut_with(|edit: &mut ChunkAllocEdit| {
edit.alloc(chunk_id);
});
Some(chunk_id)
}
/// Allocates `count` chunks, returning the IDs of the newly-allocated
/// chunks, or `None` if any allocation fails.
pub fn alloc_batch(&self, count: usize) -> Option<Vec<ChunkId>> {
let chunk_ids = {
let mut ids = Vec::with_capacity(count);
let mut state = self.state.lock();
for _ in 0..count {
match state.alloc() {
Some(id) => ids.push(id),
None => {
ids.iter().for_each(|id| state.dealloc(*id));
return None;
}
}
}
ids.sort_unstable();
ids
};
let mut current_tx = self.tx_provider.current();
current_tx.data_mut_with(|edit: &mut ChunkAllocEdit| {
for chunk_id in &chunk_ids {
edit.alloc(*chunk_id);
}
});
Some(chunk_ids)
}
/// Deallocates the chunk of a given ID.
///
/// # Panics
///
/// Deallocating a free chunk causes panic.
pub fn dealloc(&self, chunk_id: ChunkId) {
let mut current_tx = self.tx_provider.current();
current_tx.data_mut_with(|edit: &mut ChunkAllocEdit| {
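// If the chunk was allocated within this very TX, its allocation record
// is dropped from the edit and the chunk is freed in the global state
// right away. Otherwise, a `Dealloc` record is kept in the edit and only
// applied to the global state when the TX commits.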
let should_dealloc_now = edit.dealloc(chunk_id);
if should_dealloc_now {
let mut state = self.state.lock();
state.dealloc(chunk_id);
}
});
}
/// Deallocates the set of chunks of given IDs.
///
/// # Panics
///
/// Deallocating a free chunk causes panic.
pub fn dealloc_batch<I>(&self, chunk_ids: I)
where
I: Iterator<Item = ChunkId>,
{
let mut current_tx = self.tx_provider.current();
current_tx.data_mut_with(|edit: &mut ChunkAllocEdit| {
let mut state = self.state.lock();
for chunk_id in chunk_ids {
let should_dealloc_now = edit.dealloc(chunk_id);
if should_dealloc_now {
state.dealloc(chunk_id);
}
}
});
}
/// Returns the capacity of the allocator, which is the number of chunks.
pub fn capacity(&self) -> usize {
self.state.lock().capacity()
}
/// Returns the number of free chunks.
pub fn free_count(&self) -> usize {
self.state.lock().free_count()
}
}
impl Debug for ChunkAlloc {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let state = self.state.lock();
f.debug_struct("ChunkAlloc")
.field("bitmap_free_count", &state.free_count)
.field("bitmap_next_free", &state.next_free)
.finish()
}
}
////////////////////////////////////////////////////////////////////////////////
// Persistent State
////////////////////////////////////////////////////////////////////////////////
/// The persistent state of a chunk allocator.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ChunkAllocState {
// A bitmap where each bit indicates whether a corresponding chunk
// has been allocated.
alloc_map: BitMap,
// The number of free chunks.
free_count: usize,
// The next free chunk ID. Used to narrow the scope of
// searching for free chunk IDs.
next_free: usize,
/// Whether the state is in the journal or not.
in_journal: bool,
}
// TODO: Separate persistent and volatile state of `ChunkAlloc`
impl ChunkAllocState {
/// Creates a persistent state for managing chunks of the specified number.
/// Initially, all chunks are free.
pub fn new(capacity: usize) -> Self {
Self {
alloc_map: BitMap::repeat(false, capacity),
free_count: capacity,
next_free: 0,
in_journal: false,
}
}
/// Creates a persistent state in the journal. The state in the journal and
/// the state that `RawLogStore` manages behave differently on allocation
/// and when applying edits.
pub fn new_in_journal(capacity: usize) -> Self {
Self {
alloc_map: BitMap::repeat(false, capacity),
free_count: capacity,
next_free: 0,
in_journal: true,
}
}
/// Allocates a chunk, returning its ID, or `None` if no chunk is free.
pub fn alloc(&mut self) -> Option<ChunkId> {
// Bail out early if all chunks are allocated; otherwise the
// `first_zero` searches below would find nothing.
if self.free_count == 0 {
return None;
}
let mut next_free = self.next_free;
if next_free == self.alloc_map.len() {
next_free = 0;
}
let free_chunk_id = {
if let Some(chunk_id) = self.alloc_map.first_zero(next_free) {
chunk_id
} else {
self.alloc_map
.first_zero(0)
.expect("there must exist a zero")
}
};
self.alloc_map.set(free_chunk_id, true);
self.free_count -= 1;
self.next_free = free_chunk_id + 1;
Some(free_chunk_id)
}
/// Deallocates the chunk of a given ID.
///
/// # Panics
///
/// Deallocating a free chunk causes panic.
pub fn dealloc(&mut self, chunk_id: ChunkId) {
debug_assert!(self.alloc_map[chunk_id]);
self.alloc_map.set(chunk_id, false);
self.free_count += 1;
}
/// Returns the total number of chunks.
pub fn capacity(&self) -> usize {
self.alloc_map.len()
}
/// Returns the number of free chunks.
pub fn free_count(&self) -> usize {
self.free_count
}
/// Returns whether a specific chunk is allocated.
pub fn is_chunk_allocated(&self, chunk_id: ChunkId) -> bool {
self.alloc_map[chunk_id]
}
}
////////////////////////////////////////////////////////////////////////////////
// Persistent Edit
////////////////////////////////////////////////////////////////////////////////
/// A persistent edit to the state of a chunk allocator.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ChunkAllocEdit {
edit_table: HashMap<ChunkId, ChunkEdit>,
}
/// The smallest unit of a persistent edit to the
/// state of a chunk allocator, which is
/// a chunk being either allocated or deallocated.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
enum ChunkEdit {
Alloc,
Dealloc,
}
impl ChunkAllocEdit {
/// Creates a new empty edit table.
pub fn new() -> Self {
Self {
edit_table: HashMap::new(),
}
}
/// Records a chunk allocation in the edit.
pub fn alloc(&mut self, chunk_id: ChunkId) {
let old_edit = self.edit_table.insert(chunk_id, ChunkEdit::Alloc);
// It must be a logical error if an edit has already been recorded
// for the chunk. If the existing edit is `ChunkEdit::Alloc`, this would be
// a double allocation. If it is `ChunkEdit::Dealloc`, the deallocation can
// only take effect after the edit is committed, so it is impossible to
// allocate the chunk again now.
assert!(old_edit.is_none());
}
/// Records a chunk deallocation in the edit.
///
/// The return value indicates whether the chunk being deallocated
/// was previously recorded in the edit as being allocated.
/// If so, the chunk can be deallocated in the `ChunkAllocState`.
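///
/// # Example
///
/// A minimal, illustrative sketch (not a verified doctest); the chunk IDs are
/// arbitrary placeholders:
///
/// ```ignore
/// let mut edit = ChunkAllocEdit::new();
/// edit.alloc(7);
/// // Chunk 7 was allocated within this edit, so it can be freed in place.
/// assert!(edit.dealloc(7));
/// // Chunk 8 was not allocated in this edit; its deallocation is only recorded.
/// assert!(!edit.dealloc(8));
/// ```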
pub fn dealloc(&mut self, chunk_id: ChunkId) -> bool {
match self.edit_table.get(&chunk_id) {
None => {
self.edit_table.insert(chunk_id, ChunkEdit::Dealloc);
false
}
Some(&ChunkEdit::Alloc) => {
self.edit_table.remove(&chunk_id);
true
}
Some(&ChunkEdit::Dealloc) => {
panic!("a chunk must not be deallocated twice");
}
}
}
/// Returns an iterator over all allocated chunks.
pub fn iter_allocated_chunks(&self) -> impl Iterator<Item = ChunkId> + '_ {
self.edit_table.iter().filter_map(|(id, edit)| {
if *edit == ChunkEdit::Alloc {
Some(*id)
} else {
None
}
})
}
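/// Returns whether the edit contains no chunk edits.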
pub fn is_empty(&self) -> bool {
self.edit_table.is_empty()
}
}
impl Edit<ChunkAllocState> for ChunkAllocEdit {
fn apply_to(&self, state: &mut ChunkAllocState) {
let mut to_be_deallocated = Vec::new();
for (&chunk_id, chunk_edit) in &self.edit_table {
match chunk_edit {
ChunkEdit::Alloc => {
if state.in_journal {
let _allocated_id = state.alloc().unwrap();
}
// Except for the journal state, nothing needs to be done
}
ChunkEdit::Dealloc => {
to_be_deallocated.push(chunk_id);
}
}
}
for chunk_id in to_be_deallocated {
state.dealloc(chunk_id);
}
}
}
impl TxData for ChunkAllocEdit {}
#[cfg(test)]
mod tests {
use super::*;
fn new_chunk_alloc() -> ChunkAlloc {
let cap = 1024_usize;
let tx_provider = TxProvider::new();
let chunk_alloc = ChunkAlloc::new(cap, tx_provider);
assert_eq!(chunk_alloc.capacity(), cap);
assert_eq!(chunk_alloc.free_count(), cap);
chunk_alloc
}
fn do_alloc_dealloc_tx(chunk_alloc: &ChunkAlloc, alloc_cnt: usize, dealloc_cnt: usize) -> Tx {
debug_assert!(alloc_cnt <= chunk_alloc.capacity() && dealloc_cnt <= alloc_cnt);
let mut tx = chunk_alloc.new_tx();
tx.context(|| {
let chunk_id = chunk_alloc.alloc().unwrap();
let chunk_ids = chunk_alloc.alloc_batch(alloc_cnt - 1).unwrap();
let allocated_chunk_ids: Vec<ChunkId> = core::iter::once(chunk_id)
.chain(chunk_ids.into_iter())
.collect();
chunk_alloc.dealloc(allocated_chunk_ids[0]);
chunk_alloc.dealloc_batch(
allocated_chunk_ids[alloc_cnt - dealloc_cnt + 1..alloc_cnt]
.iter()
.cloned(),
);
});
tx
}
#[test]
fn chunk_alloc_dealloc_tx_commit() -> Result<()> {
let chunk_alloc = new_chunk_alloc();
let cap = chunk_alloc.capacity();
let (alloc_cnt, dealloc_cnt) = (cap, cap);
let mut tx = do_alloc_dealloc_tx(&chunk_alloc, alloc_cnt, dealloc_cnt);
tx.commit()?;
assert_eq!(chunk_alloc.free_count(), cap - alloc_cnt + dealloc_cnt);
Ok(())
}
#[test]
fn chunk_alloc_dealloc_tx_abort() -> Result<()> {
let chunk_alloc = new_chunk_alloc();
let cap = chunk_alloc.capacity();
let (alloc_cnt, dealloc_cnt) = (cap / 2, cap / 4);
let mut tx = do_alloc_dealloc_tx(&chunk_alloc, alloc_cnt, dealloc_cnt);
tx.abort();
assert_eq!(chunk_alloc.free_count(), cap);
Ok(())
}
}

View File

@ -0,0 +1,14 @@
// SPDX-License-Identifier: MPL-2.0
//! The layer of transactional logging.
//!
//! `TxLogStore` is a transactional, log-oriented file system.
//! It supports creating, deleting, listing, reading, and writing `TxLog`s.
//! Each `TxLog` is an append-only log, and is assigned a unique `TxLogId`.
//! All `TxLogStore`'s APIs should be called within transactions (`TX`).
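//!
//! # Example
//!
//! A minimal, illustrative sketch (not a verified doctest); `tx_log_store` is an
//! already-created `TxLogStore` and `"some_bucket"` is a hypothetical bucket name:
//!
//! ```ignore
//! let mut tx = tx_log_store.new_tx();
//! let res: Result<()> = tx.context(|| {
//!     // `buf_ref` is a block-aligned `BufRef` prepared by the caller
//!     let log = tx_log_store.create_log("some_bucket")?;
//!     log.append(buf_ref)?;
//!     Ok(())
//! });
//! if res.is_ok() { tx.commit()?; } else { tx.abort(); }
//! ```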
mod chunk;
mod raw_log;
mod tx_log;
pub use self::tx_log::{TxLog, TxLogId, TxLogStore};

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,132 @@
// SPDX-License-Identifier: MPL-2.0
//! Compaction in `TxLsmTree`.
use core::marker::PhantomData;
use super::{
mem_table::ValueEx, sstable::SSTable, tx_lsm_tree::SSTABLE_CAPACITY, LsmLevel, RecordKey,
RecordValue, SyncId, TxEventListener,
};
use crate::{
layers::{bio::BlockSet, log::TxLogStore},
os::{JoinHandle, Mutex},
prelude::*,
};
/// A `Compactor` is currently used to perform asynchronous compaction
/// and implements the specific compaction algorithm of `TxLsmTree`.
pub(super) struct Compactor<K, V> {
handle: Mutex<Option<JoinHandle<Result<()>>>>,
phantom: PhantomData<(K, V)>,
}
impl<K: RecordKey<K>, V: RecordValue> Compactor<K, V> {
/// Create a new `Compactor` instance.
pub fn new() -> Self {
Self {
handle: Mutex::new(None),
phantom: PhantomData,
}
}
/// Record current compaction thread handle.
pub fn record_handle(&self, handle: JoinHandle<Result<()>>) {
let mut handle_opt = self.handle.lock();
assert!(handle_opt.is_none());
let _ = handle_opt.insert(handle);
}
/// Wait until the compaction is finished.
pub fn wait_compaction(&self) -> Result<()> {
if let Some(handle) = self.handle.lock().take() {
handle.join().unwrap()
} else {
Ok(())
}
}
/// Core function for compacting overlapped records and building new SSTs.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn compact_records_and_build_ssts<D: BlockSet + 'static>(
upper_records: impl Iterator<Item = (K, ValueEx<V>)>,
lower_records: impl Iterator<Item = (K, ValueEx<V>)>,
tx_log_store: &Arc<TxLogStore<D>>,
event_listener: &Arc<dyn TxEventListener<K, V>>,
to_level: LsmLevel,
sync_id: SyncId,
) -> Result<Vec<SSTable<K, V>>> {
let mut created_ssts = Vec::new();
let mut upper_iter = upper_records.peekable();
let mut lower_iter = lower_records.peekable();
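// Merge the two sorted record streams into SSTs of at most `SSTABLE_CAPACITY`
// records each; on equal keys, the record from the upper level is newer and is
// compacted with the lower-level one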
loop {
let mut record_cnt = 0;
let records_iter = core::iter::from_fn(|| {
if record_cnt == SSTABLE_CAPACITY {
return None;
}
record_cnt += 1;
match (upper_iter.peek(), lower_iter.peek()) {
(Some((upper_k, _)), Some((lower_k, _))) => match upper_k.cmp(lower_k) {
core::cmp::Ordering::Less => upper_iter.next(),
core::cmp::Ordering::Greater => lower_iter.next(),
core::cmp::Ordering::Equal => {
let (k, new_v_ex) = upper_iter.next().unwrap();
let (_, old_v_ex) = lower_iter.next().unwrap();
let (next_v_ex, dropped_v_opt) =
Self::compact_value_ex(new_v_ex, old_v_ex);
if let Some(dropped_v) = dropped_v_opt {
event_listener.on_drop_record(&(k, dropped_v)).unwrap();
}
Some((k, next_v_ex))
}
},
(Some(_), None) => upper_iter.next(),
(None, Some(_)) => lower_iter.next(),
(None, None) => None,
}
});
let mut records_iter = records_iter.peekable();
if records_iter.peek().is_none() {
break;
}
let new_log = tx_log_store.create_log(to_level.bucket())?;
let new_sst = SSTable::build(records_iter, sync_id, &new_log, None)?;
created_ssts.push(new_sst);
}
Ok(created_ssts)
}
/// Compact two `ValueEx<V>`s with the same key, returning
/// the compacted value and the dropped value if any.
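///
/// For example, merging a new `Unsynced(v1)` with an old `Synced(v0)` yields
/// `SyncedAndUnsynced(v0, v1)` and drops nothing, while merging two `Synced`
/// values keeps the new one and drops the old one.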
fn compact_value_ex(new: ValueEx<V>, old: ValueEx<V>) -> (ValueEx<V>, Option<V>) {
match (new, old) {
(ValueEx::Synced(new_v), ValueEx::Synced(old_v)) => {
(ValueEx::Synced(new_v), Some(old_v))
}
(ValueEx::Unsynced(new_v), ValueEx::Synced(old_v)) => {
(ValueEx::SyncedAndUnsynced(old_v, new_v), None)
}
(ValueEx::Unsynced(new_v), ValueEx::Unsynced(old_v)) => {
(ValueEx::Unsynced(new_v), Some(old_v))
}
(ValueEx::Unsynced(new_v), ValueEx::SyncedAndUnsynced(old_sv, old_usv)) => {
(ValueEx::SyncedAndUnsynced(old_sv, new_v), Some(old_usv))
}
(ValueEx::SyncedAndUnsynced(new_sv, new_usv), ValueEx::Synced(old_sv)) => {
(ValueEx::SyncedAndUnsynced(new_sv, new_usv), Some(old_sv))
}
_ => {
unreachable!()
}
}
}
}

View File

@ -0,0 +1,402 @@
// SPDX-License-Identifier: MPL-2.0
//! MemTable.
use core::ops::Range;
use super::{tx_lsm_tree::OnDropRecodeFn, AsKV, RangeQueryCtx, RecordKey, RecordValue, SyncId};
use crate::{
os::{BTreeMap, Condvar, CvarMutex, Mutex, RwLock, RwLockReadGuard},
prelude::*,
};
/// Manager for a mutable `MemTable` and an immutable `MemTable`
/// in a `TxLsmTree`.
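///
/// # Example
///
/// A minimal, illustrative sketch (not a verified doctest), assuming key and value
/// types that implement `RecordKey`/`RecordValue` (e.g., `usize`/`u16` as in the
/// tests below):
///
/// ```ignore
/// let manager = MemTableManager::<usize, u16>::new(0, 4, None);
/// if manager.put(1, 11) {
///     // The mutable `MemTable` is full; once the immutable one has been
///     // persisted, the two tables can be switched.
///     manager.switch()?;
/// }
/// assert_eq!(manager.get(&1), Some(11));
/// ```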
pub(super) struct MemTableManager<K: RecordKey<K>, V> {
mutable: Mutex<MemTable<K, V>>,
immutable: RwLock<MemTable<K, V>>, // Read-only most of the time
cvar: Condvar,
is_full: CvarMutex<bool>,
}
/// MemTable for LSM-Tree.
///
/// Manages key-value records in memory, organized and bounded by a capacity.
/// Each `MemTable` is sync-aware (tagged with the current sync ID).
/// Both synced and unsynced records can co-exist.
/// Also supports a user-defined callback invoked when a record is dropped.
pub(super) struct MemTable<K: RecordKey<K>, V> {
table: BTreeMap<K, ValueEx<V>>,
size: usize,
cap: usize,
sync_id: SyncId,
unsynced_range: Option<Range<K>>,
on_drop_record: Option<Arc<OnDropRecodeFn<K, V>>>,
}
/// An extended value which is sync-aware.
/// At most one synced and one unsynced value can coexist at the same time.
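/// For example, a record synced with value `v0` and later overwritten with `v1`
/// before the next sync is represented as `SyncedAndUnsynced(v0, v1)`.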
#[derive(Clone, Debug)]
pub(super) enum ValueEx<V> {
Synced(V),
Unsynced(V),
SyncedAndUnsynced(V, V),
}
impl<K: RecordKey<K>, V: RecordValue> MemTableManager<K, V> {
/// Creates a new `MemTableManager` given the current master sync ID,
/// the capacity and the callback when dropping records.
pub fn new(
sync_id: SyncId,
capacity: usize,
on_drop_record_in_memtable: Option<Arc<OnDropRecodeFn<K, V>>>,
) -> Self {
let mutable = Mutex::new(MemTable::new(
capacity,
sync_id,
on_drop_record_in_memtable.clone(),
));
let immutable = RwLock::new(MemTable::new(capacity, sync_id, on_drop_record_in_memtable));
Self {
mutable,
immutable,
cvar: Condvar::new(),
is_full: CvarMutex::new(false),
}
}
/// Gets the target value of the given key from the `MemTable`s.
pub fn get(&self, key: &K) -> Option<V> {
if let Some(value) = self.mutable.lock().get(key) {
return Some(*value);
}
if let Some(value) = self.immutable.read().get(key) {
return Some(*value);
}
None
}
/// Gets the range of values from the `MemTable`s.
pub fn get_range(&self, range_query_ctx: &mut RangeQueryCtx<K, V>) -> bool {
let is_completed = self.mutable.lock().get_range(range_query_ctx);
if is_completed {
return is_completed;
}
self.immutable.read().get_range(range_query_ctx)
}
/// Puts a key-value pair into the mutable `MemTable`, and
/// returns whether the mutable `MemTable` is full.
pub fn put(&self, key: K, value: V) -> bool {
let mut is_full = self.is_full.lock().unwrap();
while *is_full {
is_full = self.cvar.wait(is_full).unwrap();
}
debug_assert!(!*is_full);
let mut mutable = self.mutable.lock();
let _ = mutable.put(key, value);
if mutable.at_capacity() {
*is_full = true;
}
*is_full
}
/// Sync the mutable `MemTable` with the given sync ID.
pub fn sync(&self, sync_id: SyncId) {
self.mutable.lock().sync(sync_id)
}
/// Switches the two `MemTable`s. Should only be called when
/// the mutable `MemTable` is full and the immutable `MemTable` is
/// ready to be cleared.
pub fn switch(&self) -> Result<()> {
let mut is_full = self.is_full.lock().unwrap();
debug_assert!(*is_full);
let mut mutable = self.mutable.lock();
let sync_id = mutable.sync_id();
let mut immutable = self.immutable.write();
immutable.clear();
core::mem::swap(&mut *mutable, &mut *immutable);
debug_assert!(mutable.is_empty() && immutable.at_capacity());
// Update sync ID of the switched mutable `MemTable`
mutable.sync(sync_id);
*is_full = false;
self.cvar.notify_all();
Ok(())
}
/// Gets the immutable `MemTable` instance (read-only).
pub fn immutable_memtable(&self) -> RwLockReadGuard<MemTable<K, V>> {
self.immutable.read()
}
}
impl<K: RecordKey<K>, V: RecordValue> MemTable<K, V> {
/// Creates a new `MemTable`, given the capacity, the current sync ID,
/// and the callback for dropping records.
pub fn new(
cap: usize,
sync_id: SyncId,
on_drop_record: Option<Arc<OnDropRecodeFn<K, V>>>,
) -> Self {
Self {
table: BTreeMap::new(),
size: 0,
cap,
sync_id,
unsynced_range: None,
on_drop_record,
}
}
/// Gets the target value given the key.
pub fn get(&self, key: &K) -> Option<&V> {
let value_ex = self.table.get(key)?;
Some(value_ex.get())
}
/// Range query, returns whether the request is completed.
pub fn get_range(&self, range_query_ctx: &mut RangeQueryCtx<K, V>) -> bool {
debug_assert!(!range_query_ctx.is_completed());
let target_range = range_query_ctx.range_uncompleted().unwrap();
for (k, v_ex) in self.table.range(target_range) {
range_query_ctx.complete(*k, *v_ex.get());
}
range_query_ctx.is_completed()
}
/// Puts a new K-V record into the table, dropping the old one if present.
pub fn put(&mut self, key: K, value: V) -> Option<V> {
let dropped_value = if let Some(value_ex) = self.table.get_mut(&key) {
if let Some(dropped) = value_ex.put(value) {
let _ = self
.on_drop_record
.as_ref()
.map(|on_drop_record| on_drop_record(&(key, dropped)));
Some(dropped)
} else {
self.size += 1;
None
}
} else {
let _ = self.table.insert(key, ValueEx::new(value));
self.size += 1;
None
};
if let Some(range) = self.unsynced_range.as_mut() {
if range.is_empty() {
*range = key..key + 1;
} else {
let start = key.min(range.start);
let end = (key + 1).max(range.end);
*range = start..end;
}
}
dropped_value
}
/// Syncs the table, updates the sync ID, and drops any replaced values.
pub fn sync(&mut self, sync_id: SyncId) {
debug_assert!(self.sync_id <= sync_id);
if self.sync_id == sync_id {
return;
}
let filter_unsynced: Box<dyn Iterator<Item = _>> = if let Some(range) = &self.unsynced_range
{
Box::new(
self.table
.range_mut(range.clone())
.filter(|(_, v_ex)| v_ex.contains_unsynced()),
)
} else {
Box::new(
self.table
.iter_mut()
.filter(|(_, v_ex)| v_ex.contains_unsynced()),
)
};
for (k, v_ex) in filter_unsynced {
if let Some(dropped) = v_ex.sync() {
let _ = self
.on_drop_record
.as_ref()
.map(|on_drop_record| on_drop_record(&(*k, dropped)));
self.size -= 1;
}
}
self.sync_id = sync_id;
// Insert an empty range upon first sync
let _ = self
.unsynced_range
.get_or_insert_with(|| K::new_uninit()..K::new_uninit());
}
/// Return the sync ID of this table.
pub fn sync_id(&self) -> SyncId {
self.sync_id
}
/// Return an iterator over the table.
pub fn iter(&self) -> impl Iterator<Item = (&K, &ValueEx<V>)> {
self.table.iter()
}
/// Return the number of records in the table.
pub fn size(&self) -> usize {
self.size
}
/// Return whether the table is empty.
pub fn is_empty(&self) -> bool {
self.size == 0
}
/// Return whether the table is full.
pub fn at_capacity(&self) -> bool {
self.size >= self.cap
}
/// Clear all records from the table.
pub fn clear(&mut self) {
self.table.clear();
self.size = 0;
self.unsynced_range = None;
}
}
impl<V: RecordValue> ValueEx<V> {
/// Creates a new unsynced value.
pub fn new(value: V) -> Self {
Self::Unsynced(value)
}
/// Gets the most recent value.
pub fn get(&self) -> &V {
match self {
Self::Synced(v) => v,
Self::Unsynced(v) => v,
Self::SyncedAndUnsynced(_, v) => v,
}
}
/// Puts a new value, returning the replaced value, if any.
fn put(&mut self, value: V) -> Option<V> {
let existed = core::mem::take(self);
match existed {
ValueEx::Synced(v) => {
*self = Self::SyncedAndUnsynced(v, value);
None
}
ValueEx::Unsynced(v) => {
*self = Self::Unsynced(value);
Some(v)
}
ValueEx::SyncedAndUnsynced(sv, usv) => {
*self = Self::SyncedAndUnsynced(sv, value);
Some(usv)
}
}
}
/// Syncs the value, returning the replaced value, if any.
fn sync(&mut self) -> Option<V> {
debug_assert!(self.contains_unsynced());
let existed = core::mem::take(self);
match existed {
ValueEx::Unsynced(v) => {
*self = Self::Synced(v);
None
}
ValueEx::SyncedAndUnsynced(sv, usv) => {
*self = Self::Synced(usv);
Some(sv)
}
ValueEx::Synced(_) => unreachable!(),
}
}
/// Whether the value contains an unsynced value.
pub fn contains_unsynced(&self) -> bool {
match self {
ValueEx::Unsynced(_) | ValueEx::SyncedAndUnsynced(_, _) => true,
ValueEx::Synced(_) => false,
}
}
}
impl<V: RecordValue> Default for ValueEx<V> {
fn default() -> Self {
Self::Unsynced(V::new_uninit())
}
}
impl<K: RecordKey<K>, V: RecordValue> Debug for MemTableManager<K, V> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("MemTableManager")
.field("mutable_memtable_size", &self.mutable.lock().size())
.field("immutable_memtable_size", &self.immutable_memtable().size())
.finish()
}
}
#[cfg(test)]
mod tests {
use core::sync::atomic::{AtomicU16, Ordering};
use super::*;
#[test]
fn memtable_fns() -> Result<()> {
impl RecordValue for u16 {}
let drop_count = Arc::new(AtomicU16::new(0));
let dc = drop_count.clone();
let drop_fn = move |_: &dyn AsKV<usize, u16>| {
dc.fetch_add(1, Ordering::Relaxed);
};
let mut table = MemTable::<usize, u16>::new(4, 0, Some(Arc::new(drop_fn)));
table.put(1, 11);
table.put(2, 12);
table.put(2, 22);
assert_eq!(drop_count.load(Ordering::Relaxed), 1);
assert_eq!(table.size(), 2);
assert_eq!(table.at_capacity(), false);
table.sync(1);
table.put(2, 32);
assert_eq!(table.size(), 3);
assert_eq!(*table.get(&2).unwrap(), 32);
table.sync(2);
assert_eq!(drop_count.load(Ordering::Relaxed), 2);
table.put(2, 52);
table.put(3, 13);
assert_eq!(table.at_capacity(), true);
let mut range_query_ctx = RangeQueryCtx::new(2, 2);
assert_eq!(table.get_range(&mut range_query_ctx), true);
assert_eq!(range_query_ctx.into_results(), vec![(2, 52), (3, 13)]);
assert_eq!(table.sync_id(), 2);
table.clear();
assert_eq!(table.is_empty(), true);
Ok(())
}
}

View File

@ -0,0 +1,79 @@
// SPDX-License-Identifier: MPL-2.0
//! The layer of the transactional LSM-Tree.
//!
//! This module provides the implementation for `TxLsmTree`.
//! `TxLsmTree` is similar to a general-purpose LSM-Tree, supporting `put()`, `get()`, and `get_range()`
//! on key-value records, which are managed in MemTables and SSTables.
//!
//! `TxLsmTree` is transactional in the sense that
//! 1) it supports `sync()` that guarantees changes are persisted atomically and irreversibly;
//! synchronized records and unsynchronized records can co-exist.
//! 2) its internal data is securely stored in `TxLogStore` (L3) and updated in transactions for consistency;
//! WALs and SSTables are stored and managed in `TxLogStore`.
//!
//! `TxLsmTree` supports piggybacking callbacks during compaction and recovery.
//!
//! # Usage Example
//!
//! Create a `TxLsmTree` then put some records into it.
//!
//! ```
//! // Prepare an underlying disk (implement `BlockSet`) first
//! let nblocks = 1024;
//! let mem_disk = MemDisk::create(nblocks)?;
//!
//! // Prepare an underlying `TxLogStore` (L3) for storing WALs and SSTs
//! let tx_log_store = Arc::new(TxLogStore::format(mem_disk)?);
//!
//! // Create a `TxLsmTree` with the created `TxLogStore`
//! let tx_lsm_tree: TxLsmTree<BlockId, String, MemDisk> =
//! TxLsmTree::format(tx_log_store, Arc::new(YourFactory), None)?;
//!
//! // Put some key-value records into the tree
//! for i in 0..10 {
//! let k = i as BlockId;
//! let v = i.to_string();
//! tx_lsm_tree.put(k, v)?;
//! }
//!
//! // Issue a sync operation to the tree to ensure persistency
//! tx_lsm_tree.sync()?;
//!
//! // Use `get()` (or `get_range()`) to query the tree
//! let target_value = tx_lsm_tree.get(&5).unwrap();
//! // Check the previously put value
//! assert_eq!(target_value, "5");
//!
//! // `TxLsmTree` supports user-defined per-TX callbacks
//! struct YourFactory;
//! struct YourListener;
//!
//! impl<K, V> TxEventListenerFactory<K, V> for YourFactory {
//! // Supports creating a per-TX listener (upon compaction or upon recovery)
//! fn new_event_listener(&self, tx_type: TxType) -> Arc<dyn TxEventListener<K, V>> {
//! Arc::new(YourListener::new(tx_type))
//! }
//! }
//!
//! // Supports defining callbacks invoked when a record is added or dropped, or
//! // at some critical points during a TX
//! impl<K, V> TxEventListener<K, V> for YourListener {
//! /* details omitted, see the API for more */
//! }
//! ```
mod compaction;
mod mem_table;
mod range_query_ctx;
mod sstable;
mod tx_lsm_tree;
mod wal;
pub use self::{
range_query_ctx::RangeQueryCtx,
tx_lsm_tree::{
AsKV, LsmLevel, RecordKey, RecordValue, SyncId, SyncIdStore, TxEventListener,
TxEventListenerFactory, TxLsmTree, TxType,
},
};

View File

@ -0,0 +1,96 @@
// SPDX-License-Identifier: MPL-2.0
//! Context for range query.
use core::ops::RangeInclusive;
use super::{RecordKey, RecordValue};
use crate::{prelude::*, util::BitMap};
/// Context for a range query request.
/// It tracks the completion status of each slot within the range.
/// A "slot" refers to one specific key-value pair of the query.
#[derive(Debug)]
pub struct RangeQueryCtx<K, V> {
start: K,
num_values: usize,
complete_table: BitMap,
min_uncompleted: usize,
res: Vec<(K, V)>,
}
impl<K: RecordKey<K>, V: RecordValue> RangeQueryCtx<K, V> {
/// Creates a new context with the given start key
/// and the number of values to query.
pub fn new(start: K, num_values: usize) -> Self {
Self {
start,
num_values,
complete_table: BitMap::repeat(false, num_values),
min_uncompleted: 0,
res: Vec::with_capacity(num_values),
}
}
/// Gets the uncompleted range within the whole query range, returning `None`
/// if all slots are already completed.
pub fn range_uncompleted(&self) -> Option<RangeInclusive<K>> {
if self.is_completed() {
return None;
}
debug_assert!(self.min_uncompleted < self.num_values);
let first_uncompleted = self.start + self.min_uncompleted;
let last_uncompleted = self.start + self.complete_table.last_zero()?;
Some(first_uncompleted..=last_uncompleted)
}
/// Whether the uncompleted range contains the target key.
pub fn contains_uncompleted(&self, key: &K) -> bool {
let nth = *key - self.start;
nth < self.num_values && !self.complete_table[nth]
}
/// Returns whether the range query context is completed, meaning
/// all slots are filled with the corresponding values.
pub fn is_completed(&self) -> bool {
self.min_uncompleted == self.num_values
}
/// Completes one slot within the range, given the specific
/// key and the queried value.
pub fn complete(&mut self, key: K, value: V) {
let nth = key - self.start;
if self.complete_table[nth] {
return;
}
self.res.push((key, value));
self.complete_table.set(nth, true);
self.update_min_uncompleted(nth);
}
/// Mark the specific slot as completed.
pub fn mark_completed(&mut self, key: K) {
let nth = key - self.start;
self.complete_table.set(nth, true);
self.update_min_uncompleted(nth);
}
/// Turn the context into final results.
pub fn into_results(self) -> Vec<(K, V)> {
debug_assert!(self.is_completed());
self.res
}
fn update_min_uncompleted(&mut self, completed_nth: usize) {
if self.min_uncompleted == completed_nth {
if let Some(next_uncompleted) = self.complete_table.first_zero(completed_nth) {
self.min_uncompleted = next_uncompleted;
} else {
// Indicate all slots are completed
self.min_uncompleted = self.num_values;
}
}
}
}

View File

@ -0,0 +1,779 @@
// SPDX-License-Identifier: MPL-2.0
//! Sorted String Table.
use alloc::vec;
use core::{marker::PhantomData, mem::size_of, num::NonZeroUsize, ops::RangeInclusive};
use lru::LruCache;
use ostd_pod::Pod;
use super::{
mem_table::ValueEx, tx_lsm_tree::AsKVex, RangeQueryCtx, RecordKey, RecordValue, SyncId,
TxEventListener,
};
use crate::{
layers::{
bio::{BlockSet, Buf, BufMut, BufRef, BID_SIZE},
log::{TxLog, TxLogId, TxLogStore},
},
os::Mutex,
prelude::*,
};
/// Sorted String Table (SST) for `TxLsmTree`.
///
/// Responsible for storing and managing key-value records on a `TxLog` (L3).
/// Records are serialized, sorted, and organized on the `TxLog`.
/// Supports three access modes: point query, range query and whole scan.
pub(super) struct SSTable<K, V> {
id: TxLogId,
footer: Footer<K>,
cache: Mutex<LruCache<BlockId, Arc<RecordBlock>>>,
phantom: PhantomData<(K, V)>,
}
/// Footer of an `SSTable`; contains its own metadata and the
/// index entries for locating record blocks.
#[derive(Debug)]
struct Footer<K> {
meta: FooterMeta,
index: Vec<IndexEntry<K>>,
}
/// Footer metadata to describe a `SSTable`.
#[repr(C)]
#[derive(Copy, Clone, Pod, Debug)]
struct FooterMeta {
num_index: u16,
index_nblocks: u16,
total_records: u32,
record_block_size: u32,
sync_id: SyncId,
}
const FOOTER_META_SIZE: usize = size_of::<FooterMeta>();
/// Index entry to describe a `RecordBlock` in a `SSTable`.
#[derive(Debug)]
struct IndexEntry<K> {
pos: BlockId,
first: K,
last: K,
}
/// A block full of serialized records.
struct RecordBlock {
buf: Vec<u8>,
}
const RECORD_BLOCK_NBLOCKS: usize = 32;
/// The size of a `RecordBlock`, which is a multiple of `BLOCK_SIZE`.
const RECORD_BLOCK_SIZE: usize = RECORD_BLOCK_NBLOCKS * BLOCK_SIZE;
/// Accessor for a query.
enum QueryAccessor<K> {
Point(K),
Range(RangeInclusive<K>),
}
/// Iterator over `RecordBlock` for query purpose.
struct BlockQueryIter<'a, K, V> {
block: &'a RecordBlock,
offset: usize,
accessor: &'a QueryAccessor<K>,
phantom: PhantomData<(K, V)>,
}
/// Accessor for a whole table scan.
struct ScanAccessor<'a, K, V> {
all_synced: bool,
discard_unsynced: bool,
event_listener: Option<&'a Arc<dyn TxEventListener<K, V>>>,
}
/// Iterator over `RecordBlock` for scan purpose.
struct BlockScanIter<'a, K, V> {
block: Arc<RecordBlock>,
offset: usize,
accessor: ScanAccessor<'a, K, V>,
}
/// Iterator over `SSTable`.
pub(super) struct SstIter<'a, K, V, D> {
sst: &'a SSTable<K, V>,
curr_nth_index: usize,
curr_rb_iter: Option<BlockScanIter<'a, K, V>>,
tx_log_store: &'a Arc<TxLogStore<D>>,
}
/// Format on a `TxLog`:
///
/// ```text
/// | [Record] | [Record] |...| Footer |
/// |K|flag|V(V)| ... | [Record] |...| [IndexEntry] | FooterMeta |
/// |RECORD_BLOCK_SIZE|RECORD_BLOCK_SIZE|...| |
/// ```
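///
/// Each record consists of the key, a one-byte flag, and one or two serialized
/// values depending on the flag; a record block is sealed once its remaining
/// capacity cannot hold another maximum-sized record.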
impl<K: RecordKey<K>, V: RecordValue> SSTable<K, V> {
const K_SIZE: usize = size_of::<K>();
const V_SIZE: usize = size_of::<V>();
const FLAG_SIZE: usize = size_of::<RecordFlag>();
const MIN_RECORD_SIZE: usize = BID_SIZE + Self::FLAG_SIZE + Self::V_SIZE;
const MAX_RECORD_SIZE: usize = BID_SIZE + Self::FLAG_SIZE + 2 * Self::V_SIZE;
const INDEX_ENTRY_SIZE: usize = BID_SIZE + 2 * Self::K_SIZE;
const CACHE_CAP: usize = 1024;
/// Returns the ID of this `SSTable`, which is the same as the ID
/// of the underlying `TxLog`.
pub fn id(&self) -> TxLogId {
self.id
}
/// Returns the sync ID of this `SSTable`, which may be smaller than the
/// current master sync ID.
pub fn sync_id(&self) -> SyncId {
self.footer.meta.sync_id
}
/// The range of keys covered by this `SSTable`.
pub fn range(&self) -> RangeInclusive<K> {
RangeInclusive::new(
self.footer.index[0].first,
self.footer.index[self.footer.meta.num_index as usize - 1].last,
)
}
/// Returns whether the target key is within the range; being within the range
/// does not guarantee that the `SSTable` actually contains the key.
pub fn is_within_range(&self, key: &K) -> bool {
self.range().contains(key)
}
/// Returns whether the target range overlaps with the range of this `SSTable`.
pub fn overlap_with(&self, rhs_range: &RangeInclusive<K>) -> bool {
let lhs_range = self.range();
!(lhs_range.end() < rhs_range.start() || lhs_range.start() > rhs_range.end())
}
// Accessing functions below
/// Point query.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn access_point<D: BlockSet + 'static>(
&self,
key: &K,
tx_log_store: &Arc<TxLogStore<D>>,
) -> Result<V> {
debug_assert!(self.range().contains(key));
let target_rb_pos = self
.footer
.index
.iter()
.find_map(|entry| {
if entry.is_within_range(key) {
Some(entry.pos)
} else {
None
}
})
.ok_or(Error::with_msg(NotFound, "target key not found in sst"))?;
let accessor = QueryAccessor::Point(*key);
let target_rb = self.target_record_block(target_rb_pos, tx_log_store)?;
let mut iter = BlockQueryIter::<'_, K, V> {
block: &target_rb,
offset: 0,
accessor: &accessor,
phantom: PhantomData,
};
iter.find_map(|(k, v_opt)| if k == *key { v_opt } else { None })
.ok_or(Error::with_msg(NotFound, "target value not found in SST"))
}
/// Range query.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn access_range<D: BlockSet + 'static>(
&self,
range_query_ctx: &mut RangeQueryCtx<K, V>,
tx_log_store: &Arc<TxLogStore<D>>,
) -> Result<()> {
debug_assert!(!range_query_ctx.is_completed());
let range_uncompleted = range_query_ctx.range_uncompleted().unwrap();
let target_rbs = self.footer.index.iter().filter_map(|entry| {
if entry.overlap_with(&range_uncompleted) {
Some(entry.pos)
} else {
None
}
});
let accessor = QueryAccessor::Range(range_uncompleted.clone());
for target_rb_pos in target_rbs {
let target_rb = self.target_record_block(target_rb_pos, tx_log_store)?;
let iter = BlockQueryIter::<'_, K, V> {
block: &target_rb,
offset: 0,
accessor: &accessor,
phantom: PhantomData,
};
let targets: Vec<_> = iter
.filter_map(|(k, v_opt)| {
if range_uncompleted.contains(&k) {
Some((k, v_opt.unwrap()))
} else {
None
}
})
.collect();
for (target_k, target_v) in targets {
range_query_ctx.complete(target_k, target_v);
}
}
Ok(())
}
/// Locates the target record block given its position; the block
/// resides in either the cache or the log.
fn target_record_block<D: BlockSet + 'static>(
&self,
target_pos: BlockId,
tx_log_store: &Arc<TxLogStore<D>>,
) -> Result<Arc<RecordBlock>> {
let mut cache = self.cache.lock();
if let Some(cached_rb) = cache.get(&target_pos) {
Ok(cached_rb.clone())
} else {
let mut rb = RecordBlock::from_buf(vec![0; RECORD_BLOCK_SIZE]);
// TODO: Avoid opening the log on every call
let tx_log = tx_log_store.open_log(self.id, false)?;
tx_log.read(target_pos, BufMut::try_from(rb.as_mut_slice()).unwrap())?;
let rb = Arc::new(rb);
cache.put(target_pos, rb.clone());
Ok(rb)
}
}
/// Returns an iterator over this `SSTable`.
/// The optional `event_listener` is invoked when records are dropped
/// during iteration.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn iter<'a, D: BlockSet + 'static>(
&'a self,
sync_id: SyncId,
discard_unsynced: bool,
tx_log_store: &'a Arc<TxLogStore<D>>,
event_listener: Option<&'a Arc<dyn TxEventListener<K, V>>>,
) -> SstIter<'a, K, V, D> {
let all_synced = sync_id > self.sync_id();
let accessor = ScanAccessor {
all_synced,
discard_unsynced,
event_listener,
};
let first_rb = self
.target_record_block(self.footer.index[0].pos, tx_log_store)
.unwrap();
SstIter {
sst: self,
curr_nth_index: 0,
curr_rb_iter: Some(BlockScanIter {
block: first_rb,
offset: 0,
accessor,
}),
tx_log_store,
}
}
/// Scan the whole SST and collect all records.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn access_scan<D: BlockSet + 'static>(
&self,
sync_id: SyncId,
discard_unsynced: bool,
tx_log_store: &Arc<TxLogStore<D>>,
event_listener: Option<&Arc<dyn TxEventListener<K, V>>>,
) -> Result<Vec<(K, ValueEx<V>)>> {
let all_records = self
.iter(sync_id, discard_unsynced, tx_log_store, event_listener)
.collect();
Ok(all_records)
}
// Building functions below
/// Builds an SST from the given records; once built, the SST is immutable.
/// The optional `event_listener` is invoked when records are added.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn build<'a, D: BlockSet + 'static, I, KVex>(
records_iter: I,
sync_id: SyncId,
tx_log: &'a Arc<TxLog<D>>,
event_listener: Option<&'a Arc<dyn TxEventListener<K, V>>>,
) -> Result<Self>
where
I: Iterator<Item = KVex>,
KVex: AsKVex<K, V>,
Self: 'a,
{
let mut cache = LruCache::new(NonZeroUsize::new(Self::CACHE_CAP).unwrap());
let (total_records, index_vec) =
Self::build_record_blocks(records_iter, tx_log, &mut cache, event_listener)?;
let footer = Self::build_footer::<D>(index_vec, total_records, sync_id, tx_log)?;
Ok(Self {
id: tx_log.id(),
footer,
cache: Mutex::new(cache),
phantom: PhantomData,
})
}
/// Builds all the record blocks from the given records. Puts the blocks into
/// the log and the cache.
fn build_record_blocks<'a, D: BlockSet + 'static, I, KVex>(
records_iter: I,
tx_log: &'a TxLog<D>,
cache: &mut LruCache<BlockId, Arc<RecordBlock>>,
event_listener: Option<&'a Arc<dyn TxEventListener<K, V>>>,
) -> Result<(usize, Vec<IndexEntry<K>>)>
where
I: Iterator<Item = KVex>,
KVex: AsKVex<K, V>,
Self: 'a,
{
let mut index_vec = Vec::new();
let mut total_records = 0;
let mut pos = 0 as BlockId;
let (mut first_k, mut curr_k) = (None, None);
let mut inner_offset = 0;
let mut block_buf = Vec::with_capacity(RECORD_BLOCK_SIZE);
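// Fill record blocks one by one; a block is sealed and appended to the log
// once its remaining capacity cannot hold another maximum-sized record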
for kv_ex in records_iter {
let (key, value_ex) = (*kv_ex.key(), kv_ex.value_ex());
total_records += 1;
if inner_offset == 0 {
debug_assert!(block_buf.is_empty());
let _ = first_k.insert(key);
}
let _ = curr_k.insert(key);
block_buf.extend_from_slice(key.as_bytes());
inner_offset += Self::K_SIZE;
match value_ex {
ValueEx::Synced(v) => {
block_buf.push(RecordFlag::Synced as u8);
block_buf.extend_from_slice(v.as_bytes());
if let Some(listener) = event_listener {
listener.on_add_record(&(&key, v))?;
}
inner_offset += 1 + Self::V_SIZE;
}
ValueEx::Unsynced(v) => {
block_buf.push(RecordFlag::Unsynced as u8);
block_buf.extend_from_slice(v.as_bytes());
if let Some(listener) = event_listener {
listener.on_add_record(&(&key, v))?;
}
inner_offset += 1 + Self::V_SIZE;
}
ValueEx::SyncedAndUnsynced(sv, usv) => {
block_buf.push(RecordFlag::SyncedAndUnsynced as u8);
block_buf.extend_from_slice(sv.as_bytes());
block_buf.extend_from_slice(usv.as_bytes());
if let Some(listener) = event_listener {
listener.on_add_record(&(&key, sv))?;
listener.on_add_record(&(&key, usv))?;
}
inner_offset += Self::MAX_RECORD_SIZE;
}
}
let cap_remained = RECORD_BLOCK_SIZE - inner_offset;
if cap_remained >= Self::MAX_RECORD_SIZE {
continue;
}
let index_entry = IndexEntry {
pos,
first: first_k.unwrap(),
last: key,
};
build_one_record_block(&index_entry, &mut block_buf, tx_log, cache)?;
index_vec.push(index_entry);
pos += RECORD_BLOCK_NBLOCKS;
inner_offset = 0;
block_buf.clear();
}
debug_assert!(total_records > 0);
if !block_buf.is_empty() {
let last_entry = IndexEntry {
pos,
first: first_k.unwrap(),
last: curr_k.unwrap(),
};
build_one_record_block(&last_entry, &mut block_buf, tx_log, cache)?;
index_vec.push(last_entry);
}
fn build_one_record_block<K: RecordKey<K>, D: BlockSet + 'static>(
entry: &IndexEntry<K>,
buf: &mut Vec<u8>,
tx_log: &TxLog<D>,
cache: &mut LruCache<BlockId, Arc<RecordBlock>>,
) -> Result<()> {
buf.resize(RECORD_BLOCK_SIZE, 0);
let record_block = RecordBlock::from_buf(buf.clone());
tx_log.append(BufRef::try_from(record_block.as_slice()).unwrap())?;
cache.put(entry.pos, Arc::new(record_block));
Ok(())
}
Ok((total_records, index_vec))
}
/// Builds the footer from the given index entries. The footer block will be appended
/// to the SST log's end.
fn build_footer<'a, D: BlockSet + 'static>(
index_vec: Vec<IndexEntry<K>>,
total_records: usize,
sync_id: SyncId,
tx_log: &'a TxLog<D>,
) -> Result<Footer<K>>
where
Self: 'a,
{
let footer_buf_len = align_up(
index_vec.len() * Self::INDEX_ENTRY_SIZE + FOOTER_META_SIZE,
BLOCK_SIZE,
);
let mut append_buf = Vec::with_capacity(footer_buf_len);
for entry in &index_vec {
append_buf.extend_from_slice(&entry.pos.to_le_bytes());
append_buf.extend_from_slice(entry.first.as_bytes());
append_buf.extend_from_slice(entry.last.as_bytes());
}
append_buf.resize(footer_buf_len, 0);
let meta = FooterMeta {
num_index: index_vec.len() as _,
index_nblocks: (footer_buf_len / BLOCK_SIZE) as _,
total_records: total_records as _,
record_block_size: RECORD_BLOCK_SIZE as _,
sync_id,
};
append_buf[footer_buf_len - FOOTER_META_SIZE..].copy_from_slice(meta.as_bytes());
tx_log.append(BufRef::try_from(&append_buf[..]).unwrap())?;
Ok(Footer {
meta,
index: index_vec,
})
}
/// Builds an SST from a `TxLog`, loading the footer and the index blocks.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn from_log<D: BlockSet + 'static>(tx_log: &Arc<TxLog<D>>) -> Result<Self> {
let nblocks = tx_log.nblocks();
let mut rbuf = Buf::alloc(1)?;
// Load footer block (last block)
tx_log.read(nblocks - 1, rbuf.as_mut())?;
let meta = FooterMeta::from_bytes(&rbuf.as_slice()[BLOCK_SIZE - FOOTER_META_SIZE..]);
let mut rbuf = Buf::alloc(meta.index_nblocks as _)?;
tx_log.read(nblocks - meta.index_nblocks as usize, rbuf.as_mut())?;
let mut index = Vec::with_capacity(meta.num_index as _);
let mut cache = LruCache::new(NonZeroUsize::new(Self::CACHE_CAP).unwrap());
let mut record_block = vec![0; RECORD_BLOCK_SIZE];
for i in 0..meta.num_index as _ {
let buf =
&rbuf.as_slice()[i * Self::INDEX_ENTRY_SIZE..(i + 1) * Self::INDEX_ENTRY_SIZE];
let pos = BlockId::from_le_bytes(buf[..BID_SIZE].try_into().unwrap());
let first = K::from_bytes(&buf[BID_SIZE..BID_SIZE + Self::K_SIZE]);
let last =
K::from_bytes(&buf[Self::INDEX_ENTRY_SIZE - Self::K_SIZE..Self::INDEX_ENTRY_SIZE]);
tx_log.read(pos, BufMut::try_from(&mut record_block[..]).unwrap())?;
let _ = cache.put(pos, Arc::new(RecordBlock::from_buf(record_block.clone())));
index.push(IndexEntry { pos, first, last })
}
let footer = Footer { meta, index };
Ok(Self {
id: tx_log.id(),
footer,
cache: Mutex::new(cache),
phantom: PhantomData,
})
}
}
impl<K: RecordKey<K>> IndexEntry<K> {
pub fn range(&self) -> RangeInclusive<K> {
self.first..=self.last
}
pub fn is_within_range(&self, key: &K) -> bool {
self.range().contains(key)
}
pub fn overlap_with(&self, rhs_range: &RangeInclusive<K>) -> bool {
let lhs_range = self.range();
!(lhs_range.end() < rhs_range.start() || lhs_range.start() > rhs_range.end())
}
}
impl RecordBlock {
pub fn from_buf(buf: Vec<u8>) -> Self {
debug_assert_eq!(buf.len(), RECORD_BLOCK_SIZE);
Self { buf }
}
pub fn as_slice(&self) -> &[u8] {
&self.buf
}
pub fn as_mut_slice(&mut self) -> &mut [u8] {
&mut self.buf
}
}
impl<K: RecordKey<K>> QueryAccessor<K> {
pub fn hit_target(&self, target: &K) -> bool {
match self {
QueryAccessor::Point(k) => k == target,
QueryAccessor::Range(range) => range.contains(target),
}
}
}
impl<K: RecordKey<K>, V: RecordValue> Iterator for BlockQueryIter<'_, K, V> {
type Item = (K, Option<V>);
fn next(&mut self) -> Option<Self::Item> {
let mut offset = self.offset;
let buf_slice = &self.block.buf;
let (k_size, v_size) = (SSTable::<K, V>::K_SIZE, SSTable::<K, V>::V_SIZE);
if offset + SSTable::<K, V>::MIN_RECORD_SIZE > RECORD_BLOCK_SIZE {
return None;
}
let key = K::from_bytes(&buf_slice[offset..offset + k_size]);
offset += k_size;
let flag = RecordFlag::from(buf_slice[offset]);
offset += 1;
if flag == RecordFlag::Invalid {
return None;
}
let hit_target = self.accessor.hit_target(&key);
let value_opt = match flag {
RecordFlag::Synced | RecordFlag::Unsynced => {
let v_opt = if hit_target {
Some(V::from_bytes(&buf_slice[offset..offset + v_size]))
} else {
None
};
offset += v_size;
v_opt
}
RecordFlag::SyncedAndUnsynced => {
let v_opt = if hit_target {
Some(V::from_bytes(
&buf_slice[offset + v_size..offset + 2 * v_size],
))
} else {
None
};
offset += 2 * v_size;
v_opt
}
_ => unreachable!(),
};
self.offset = offset;
Some((key, value_opt))
}
}
impl<K: RecordKey<K>, V: RecordValue> Iterator for BlockScanIter<'_, K, V> {
type Item = (K, ValueEx<V>);
fn next(&mut self) -> Option<Self::Item> {
let mut offset = self.offset;
let buf_slice = &self.block.buf;
let (k_size, v_size) = (SSTable::<K, V>::K_SIZE, SSTable::<K, V>::V_SIZE);
let (all_synced, discard_unsynced, event_listener) = (
self.accessor.all_synced,
self.accessor.discard_unsynced,
&self.accessor.event_listener,
);
let (key, value_ex) = loop {
if offset + SSTable::<K, V>::MIN_RECORD_SIZE > RECORD_BLOCK_SIZE {
return None;
}
let key = K::from_bytes(&buf_slice[offset..offset + k_size]);
offset += k_size;
let flag = RecordFlag::from(buf_slice[offset]);
offset += 1;
if flag == RecordFlag::Invalid {
return None;
}
let v_ex = match flag {
RecordFlag::Synced => {
let v = V::from_bytes(&buf_slice[offset..offset + v_size]);
offset += v_size;
ValueEx::Synced(v)
}
RecordFlag::Unsynced => {
let v = V::from_bytes(&buf_slice[offset..offset + v_size]);
offset += v_size;
if all_synced {
ValueEx::Synced(v)
} else if discard_unsynced {
if let Some(listener) = event_listener {
listener.on_drop_record(&(key, v)).unwrap();
}
continue;
} else {
ValueEx::Unsynced(v)
}
}
RecordFlag::SyncedAndUnsynced => {
let sv = V::from_bytes(&buf_slice[offset..offset + v_size]);
offset += v_size;
let usv = V::from_bytes(&buf_slice[offset..offset + v_size]);
offset += v_size;
if all_synced {
if let Some(listener) = event_listener {
listener.on_drop_record(&(key, sv)).unwrap();
}
ValueEx::Synced(usv)
} else if discard_unsynced {
if let Some(listener) = event_listener {
listener.on_drop_record(&(key, usv)).unwrap();
}
ValueEx::Synced(sv)
} else {
ValueEx::SyncedAndUnsynced(sv, usv)
}
}
_ => unreachable!(),
};
break (key, v_ex);
};
self.offset = offset;
Some((key, value_ex))
}
}
impl<K: RecordKey<K>, V: RecordValue, D: BlockSet + 'static> Iterator for SstIter<'_, K, V, D> {
type Item = (K, ValueEx<V>);
fn next(&mut self) -> Option<Self::Item> {
// Iterate over the current record block first
if let Some(next) = self.curr_rb_iter.as_mut().unwrap().next() {
return Some(next);
}
let curr_rb_iter = self.curr_rb_iter.take().unwrap();
self.curr_nth_index += 1;
// Iteration goes to the end
if self.curr_nth_index >= self.sst.footer.meta.num_index as _ {
return None;
}
// Ready to iterate the next record block
let next_pos = self.sst.footer.index[self.curr_nth_index].pos;
let next_rb = self
.sst
.target_record_block(next_pos, self.tx_log_store)
.unwrap();
let mut next_rb_iter = BlockScanIter {
block: next_rb,
offset: 0,
accessor: curr_rb_iter.accessor,
};
let next = next_rb_iter.next()?;
let _ = self.curr_rb_iter.insert(next_rb_iter);
Some(next)
}
}
impl<K: Debug, V> Debug for SSTable<K, V> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("SSTable")
.field("id", &self.id)
.field("footer", &self.footer.meta)
.field(
"range",
&RangeInclusive::new(
&self.footer.index[0].first,
&self.footer.index[self.footer.meta.num_index as usize - 1].last,
),
)
.finish()
}
}
/// Flag byte for records in an SSTable.
#[derive(PartialEq, Eq, Debug)]
#[repr(u8)]
enum RecordFlag {
Synced = 7,
Unsynced = 11,
SyncedAndUnsynced = 19,
Invalid,
}
impl From<u8> for RecordFlag {
fn from(value: u8) -> Self {
match value {
7 => RecordFlag::Synced,
11 => RecordFlag::Unsynced,
19 => RecordFlag::SyncedAndUnsynced,
_ => RecordFlag::Invalid,
}
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,279 @@
// SPDX-License-Identifier: MPL-2.0
//! Transactions in WriteAhead Log.
use alloc::vec;
use core::{fmt::Debug, mem::size_of};
use ostd_pod::Pod;
use super::{AsKV, SyncId};
use crate::{
layers::{
bio::{BlockId, BlockSet, Buf, BufRef},
log::{TxLog, TxLogId, TxLogStore},
},
os::Mutex,
prelude::*,
tx::CurrentTx,
};
/// The bucket name of WAL.
pub(super) const BUCKET_WAL: &str = "WAL";
/// WAL append TX in `TxLsmTree`.
///
/// A `WalAppendTx` is used to append records to, sync, and discard WALs.
/// A WAL stores and manages the key-value records that are going to be
/// put into the `MemTable`. Its space is backed by a `TxLog` (L3).
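///
/// On the log, every entry starts with a one-byte `WalAppendFlag`: a `Record`
/// flag is followed by the serialized key and value, while a `Sync` flag is
/// followed by a little-endian `SyncId`. Buffered entries are zero-padded to
/// the block size before being appended.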
#[derive(Clone)]
pub(super) struct WalAppendTx<D> {
inner: Arc<Mutex<WalTxInner<D>>>,
}
struct WalTxInner<D> {
/// The appended WAL of ongoing Tx.
appended_log: Option<Arc<TxLog<D>>>,
/// Current log ID of WAL for later use.
log_id: Option<TxLogId>,
/// Store current sync ID as the first record of WAL.
sync_id: SyncId,
/// A buffer to cache appended records.
record_buf: Vec<u8>,
/// Store for WALs.
tx_log_store: Arc<TxLogStore<D>>,
}
impl<D: BlockSet + 'static> WalAppendTx<D> {
const BUF_CAP: usize = 1024 * BLOCK_SIZE;
/// Prepare a new WAL TX.
pub fn new(store: &Arc<TxLogStore<D>>, sync_id: SyncId) -> Self {
Self {
inner: Arc::new(Mutex::new(WalTxInner {
appended_log: None,
log_id: None,
sync_id,
record_buf: Vec::with_capacity(Self::BUF_CAP),
tx_log_store: store.clone(),
})),
}
}
/// Append phase for an Append TX, mainly to append new records to the WAL.
pub fn append<K: Pod, V: Pod>(&self, record: &dyn AsKV<K, V>) -> Result<()> {
let mut inner = self.inner.lock();
if inner.appended_log.is_none() {
inner.prepare()?;
}
{
let record_buf = &mut inner.record_buf;
record_buf.push(WalAppendFlag::Record as u8);
record_buf.extend_from_slice(record.key().as_bytes());
record_buf.extend_from_slice(record.value().as_bytes());
}
const MAX_RECORD_SIZE: usize = 49;
if inner.record_buf.len() <= Self::BUF_CAP - MAX_RECORD_SIZE {
return Ok(());
}
inner.align_record_buf();
let wal_tx = inner.tx_log_store.current_tx();
let wal_log = inner.appended_log.as_ref().unwrap();
self.flush_buf(&inner.record_buf, &wal_tx, wal_log)?;
inner.record_buf.clear();
Ok(())
}
/// Commit phase for an Append TX, mainly to commit (or abort) the TX.
/// After being committed, the WAL is sealed. Returns the corresponding log ID.
///
/// # Panics
///
/// This method panics if current WAL's TX does not exist.
pub fn commit(&self) -> Result<TxLogId> {
let mut inner = self.inner.lock();
let wal_log = inner
.appended_log
.take()
.expect("current WAL TX must exist");
let wal_id = inner.log_id.take().unwrap();
debug_assert_eq!(wal_id, wal_log.id());
if !inner.record_buf.is_empty() {
inner.align_record_buf();
let wal_tx = inner.tx_log_store.current_tx();
self.flush_buf(&inner.record_buf, &wal_tx, &wal_log)?;
inner.record_buf.clear();
}
drop(wal_log);
inner.tx_log_store.current_tx().commit()?;
Ok(wal_id)
}
/// Appends the current sync ID to the WAL, then commits the TX to ensure the WAL's persistency.
/// Saves the log ID for later appending.
pub fn sync(&self, sync_id: SyncId) -> Result<()> {
let mut inner = self.inner.lock();
if inner.appended_log.is_none() {
inner.prepare()?;
}
inner.record_buf.push(WalAppendFlag::Sync as u8);
inner.record_buf.extend_from_slice(&sync_id.to_le_bytes());
inner.sync_id = sync_id;
inner.align_record_buf();
let wal_log = inner.appended_log.take().unwrap();
self.flush_buf(
&inner.record_buf,
&inner.tx_log_store.current_tx(),
&wal_log,
)?;
inner.record_buf.clear();
drop(wal_log);
inner.tx_log_store.current_tx().commit()
}
/// Flushes the buffer to the backed log.
fn flush_buf(
&self,
record_buf: &[u8],
wal_tx: &CurrentTx<'_>,
log: &Arc<TxLog<D>>,
) -> Result<()> {
debug_assert!(!record_buf.is_empty() && record_buf.len() % BLOCK_SIZE == 0);
let res = wal_tx.context(|| {
let buf = BufRef::try_from(record_buf).unwrap();
log.append(buf)
});
if res.is_err() {
wal_tx.abort();
}
res
}
/// Collects only the synced records and the maximum sync ID in the WAL.
pub fn collect_synced_records_and_sync_id<K: Pod, V: Pod>(
wal: &TxLog<D>,
) -> Result<(Vec<(K, V)>, SyncId)> {
let nblocks = wal.nblocks();
let mut records = Vec::new();
// TODO: Allocate separate buffers for large WAL
let mut buf = Buf::alloc(nblocks)?;
wal.read(0 as BlockId, buf.as_mut())?;
let buf_slice = buf.as_slice();
let k_size = size_of::<K>();
let v_size = size_of::<V>();
let total_bytes = nblocks * BLOCK_SIZE;
let mut offset = 0;
let (mut max_sync_id, mut synced_len) = (None, 0);
loop {
const MIN_RECORD_SIZE: usize = 9;
if offset > total_bytes - MIN_RECORD_SIZE {
break;
}
let flag = WalAppendFlag::try_from(buf_slice[offset]);
offset += 1;
if flag.is_err() {
continue;
}
match flag.unwrap() {
WalAppendFlag::Record => {
let record = {
let k = K::from_bytes(&buf_slice[offset..offset + k_size]);
let v =
V::from_bytes(&buf_slice[offset + k_size..offset + k_size + v_size]);
offset += k_size + v_size;
(k, v)
};
records.push(record);
}
WalAppendFlag::Sync => {
let sync_id = SyncId::from_le_bytes(
buf_slice[offset..offset + size_of::<SyncId>()]
.try_into()
.unwrap(),
);
offset += size_of::<SyncId>();
let _ = max_sync_id.insert(sync_id);
synced_len = records.len();
}
}
}
if let Some(max_sync_id) = max_sync_id {
records.truncate(synced_len);
Ok((records, max_sync_id))
} else {
Ok((vec![], 0))
}
}
}
impl<D: BlockSet + 'static> WalTxInner<D> {
/// Prepare phase for an Append TX, mainly to create a new TX and a new WAL.
pub fn prepare(&mut self) -> Result<()> {
debug_assert!(self.appended_log.is_none());
let appended_log = {
let store = &self.tx_log_store;
let wal_tx = store.new_tx();
let log_id_opt = self.log_id;
let res = wal_tx.context(|| {
if let Some(log_id) = log_id_opt {
store.open_log(log_id, true)
} else {
store.create_log(BUCKET_WAL)
}
});
if res.is_err() {
wal_tx.abort();
}
let wal_log = res?;
let _ = self.log_id.insert(wal_log.id());
wal_log
};
let _ = self.appended_log.insert(appended_log);
// Record the sync ID at the beginning of the WAL
debug_assert!(self.record_buf.is_empty());
self.record_buf.push(WalAppendFlag::Sync as u8);
self.record_buf
.extend_from_slice(&self.sync_id.to_le_bytes());
Ok(())
}
fn align_record_buf(&mut self) {
let aligned_len = align_up(self.record_buf.len(), BLOCK_SIZE);
self.record_buf.resize(aligned_len, 0);
}
}
/// Two content kinds in a WAL.
#[derive(PartialEq, Eq, Debug)]
#[repr(u8)]
enum WalAppendFlag {
Record = 13,
Sync = 23,
}
impl TryFrom<u8> for WalAppendFlag {
type Error = Error;
fn try_from(value: u8) -> Result<Self> {
match value {
13 => Ok(WalAppendFlag::Record),
23 => Ok(WalAppendFlag::Sync),
_ => Err(Error::new(InvalidArgs)),
}
}
}

View File

@ -0,0 +1,291 @@
// SPDX-License-Identifier: MPL-2.0
//! Block I/O (BIO).
use alloc::collections::VecDeque;
use core::{
any::{Any, TypeId},
ptr::NonNull,
sync::atomic::{AtomicUsize, Ordering},
};
use hashbrown::HashMap;
use crate::{
os::{Mutex, MutexGuard},
prelude::*,
Buf,
};
/// A queue for managing block I/O requests (`BioReq`).
/// It provides a concurrency-safe way to store and manage
/// block I/O requests that need to be processed by a block device.
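///
/// # Example
///
/// A minimal, illustrative sketch (not a verified doctest); `req` is a `BioReq`
/// built elsewhere, e.g., via `BioReqBuilder`:
///
/// ```ignore
/// let queue = BioReqQueue::new();
/// queue.enqueue(req)?;
/// while let Some(req) = queue.dequeue() {
///     // Process the request, then complete it with a response.
/// }
/// assert!(queue.is_empty());
/// ```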
pub struct BioReqQueue {
queue: Mutex<VecDeque<BioReq>>,
num_reqs: AtomicUsize,
}
impl BioReqQueue {
/// Create a new `BioReqQueue` instance.
pub fn new() -> Self {
Self {
queue: Mutex::new(VecDeque::new()),
num_reqs: AtomicUsize::new(0),
}
}
/// Enqueue a block I/O request.
pub fn enqueue(&self, req: BioReq) -> Result<()> {
req.submit();
self.queue.lock().push_back(req);
self.num_reqs.fetch_add(1, Ordering::Release);
Ok(())
}
/// Dequeue a block I/O request.
pub fn dequeue(&self) -> Option<BioReq> {
if let Some(req) = self.queue.lock().pop_front() {
self.num_reqs.fetch_sub(1, Ordering::Release);
Some(req)
} else {
debug_assert_eq!(self.num_reqs.load(Ordering::Acquire), 0);
None
}
}
/// Returns the number of pending requests in this queue.
pub fn num_reqs(&self) -> usize {
self.num_reqs.load(Ordering::Acquire)
}
/// Returns whether there are no pending requests in this queue.
pub fn is_empty(&self) -> bool {
self.num_reqs() == 0
}
}
/// A block I/O request.
pub struct BioReq {
type_: BioType,
addr: BlockId,
nblocks: u32,
bufs: Mutex<Vec<Buf>>,
status: Mutex<BioStatus>,
on_complete: Option<BioReqOnCompleteFn>,
ext: Mutex<HashMap<TypeId, Box<dyn Any + Send + Sync>>>,
}
/// The type of a block request.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BioType {
/// A read request.
Read,
/// A write request.
Write,
/// A sync request.
Sync,
}
/// A response from a block device.
pub type BioResp = Result<()>;
/// The type of the callback function invoked upon the completion of
/// a block I/O request.
pub type BioReqOnCompleteFn = fn(/* req = */ &BioReq, /* resp = */ &BioResp);
/// The status describing a block I/O request.
#[derive(Clone, Debug)]
enum BioStatus {
Init,
Submitted,
Completed(BioResp),
}
impl BioReq {
/// Returns the type of the request.
pub fn type_(&self) -> BioType {
self.type_
}
/// Returns the starting address of requested blocks.
///
/// The return value is meaningless if the request is not a read or write.
pub fn addr(&self) -> BlockId {
self.addr
}
/// Access the immutable buffers with a closure.
pub fn access_bufs_with<F, R>(&self, mut f: F) -> R
where
F: FnMut(&[Buf]) -> R,
{
let bufs = self.bufs.lock();
(f)(&bufs)
}
/// Access the mutable buffers with a closure.
pub(super) fn access_mut_bufs_with<F, R>(&self, mut f: F) -> R
where
F: FnMut(&mut [Buf]) -> R,
{
let mut bufs = self.bufs.lock();
(f)(&mut bufs)
}
/// Take the buffers out of the request.
pub(super) fn take_bufs(&self) -> Vec<Buf> {
let mut bufs = self.bufs.lock();
let mut ret_bufs = Vec::new();
core::mem::swap(&mut *bufs, &mut ret_bufs);
ret_bufs
}
/// Returns the number of buffers associated with the request.
///
/// If the request is a sync, then the returned value is meaningless.
pub fn nbufs(&self) -> usize {
self.bufs.lock().len()
}
/// Returns the number of blocks to read or write by this request.
///
/// If the request is a sync, then the returned value is meaningless.
pub fn nblocks(&self) -> usize {
self.nblocks as usize
}
/// Returns the extensions of the request.
///
/// The extensions of a request is a set of objects that may be added, removed,
/// or accessed by block devices and their users. Each of the extension objects
/// must have a different type. To avoid conflicts, it is recommended to use only
/// private types for the extension objects.
pub fn ext(&self) -> MutexGuard<HashMap<TypeId, Box<dyn Any + Send + Sync>>> {
self.ext.lock()
}
/// Update the status of the request to "completed" by giving the response
/// to the request.
///
/// After invoking this API, the request is considered completed, which
/// means the request must have taken effect. For example, a completed read
/// request must have all its buffers filled with data.
///
/// # Panics
///
/// If the request has not been submitted yet, or has been completed already,
/// this method will panic.
pub(super) fn complete(&self, resp: BioResp) {
let mut status = self.status.lock();
match *status {
BioStatus::Submitted => {
if let Some(on_complete) = self.on_complete {
(on_complete)(self, &resp);
}
*status = BioStatus::Completed(resp);
}
_ => panic!("cannot complete before submitting or complete twice"),
}
}
/// Mark the request as submitted.
pub(super) fn submit(&self) {
let mut status = self.status.lock();
match *status {
BioStatus::Init => *status = BioStatus::Submitted,
_ => unreachable!(),
}
}
}
/// A builder for `BioReq`.
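///
/// # Example
///
/// A minimal, illustrative sketch (not a verified doctest):
///
/// ```ignore
/// let buf = Buf::alloc(1)?;
/// let req = BioReqBuilder::new(BioType::Write)
///     .addr(0 as BlockId)
///     .bufs(vec![buf])
///     .on_complete(|_req, resp| debug_assert!(resp.is_ok()))
///     .build();
/// assert_eq!(req.nblocks(), 1);
/// ```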
pub struct BioReqBuilder {
type_: BioType,
addr: Option<BlockId>,
bufs: Option<Vec<Buf>>,
on_complete: Option<BioReqOnCompleteFn>,
ext: Option<HashMap<TypeId, Box<dyn Any + Send + Sync>>>,
}
impl BioReqBuilder {
/// Creates a builder of a block request of the given type.
pub fn new(type_: BioType) -> Self {
Self {
type_,
addr: None,
bufs: None,
on_complete: None,
ext: None,
}
}
/// Specify the block address of the request.
pub fn addr(mut self, addr: BlockId) -> Self {
self.addr = Some(addr);
self
}
/// Give the buffers of the request.
pub fn bufs(mut self, bufs: Vec<Buf>) -> Self {
self.bufs = Some(bufs);
self
}
/// Specify a callback invoked when the request is complete.
pub fn on_complete(mut self, on_complete: BioReqOnCompleteFn) -> Self {
self.on_complete = Some(on_complete);
self
}
/// Add an extension object to the request.
pub fn ext<T: Any + Send + Sync + Sized>(mut self, obj: T) -> Self {
if self.ext.is_none() {
self.ext = Some(HashMap::new());
}
let _ = self
.ext
.as_mut()
.unwrap()
.insert(TypeId::of::<T>(), Box::new(obj));
self
}
/// Build the request.
pub fn build(mut self) -> BioReq {
let type_ = self.type_;
if type_ == BioType::Sync {
debug_assert!(
self.addr.is_none(),
"addr is only meaningful for a read or write",
);
debug_assert!(
self.bufs.is_none(),
"bufs is only meaningful for a read or write",
);
}
let addr = self.addr.unwrap_or(0 as BlockId);
let bufs = self.bufs.take().unwrap_or_default();
let nblocks = {
let nbytes = bufs
.iter()
.map(|buf| buf.as_slice().len())
.fold(0_usize, |sum, len| sum.saturating_add(len));
(nbytes / BLOCK_SIZE) as u32
};
let ext = self.ext.take().unwrap_or_default();
let on_complete = self.on_complete.take();
BioReq {
type_,
addr,
nblocks,
bufs: Mutex::new(bufs),
status: Mutex::new(BioStatus::Init),
on_complete,
ext: Mutex::new(ext),
}
}
}

View File

@ -0,0 +1,403 @@
// SPDX-License-Identifier: MPL-2.0
//! Block allocation.
use alloc::vec;
use core::{
mem::size_of,
num::NonZeroUsize,
sync::atomic::{AtomicBool, AtomicUsize, Ordering},
};
use ostd_pod::Pod;
use serde::{Deserialize, Serialize};
use super::sworndisk::Hba;
use crate::{
layers::{
bio::{BlockSet, Buf, BufRef, BID_SIZE},
log::{TxLog, TxLogStore},
},
os::{BTreeMap, Condvar, CvarMutex, Mutex},
prelude::*,
util::BitMap,
};
/// The bucket name of block validity table.
const BUCKET_BLOCK_VALIDITY_TABLE: &str = "BVT";
/// The bucket name of block alloc/dealloc log.
const BUCKET_BLOCK_ALLOC_LOG: &str = "BAL";
/// Block validity table. Global block allocator for `SwornDisk`,
/// which manages the validity of user data blocks.
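///
/// # Example
///
/// A minimal, illustrative sketch (not a verified doctest):
///
/// ```ignore
/// let table = AllocTable::new(NonZeroUsize::new(1024).unwrap());
/// let hba = table.alloc().unwrap();
/// let hbas = table.alloc_batch(NonZeroUsize::new(8).unwrap())?;
/// assert_eq!(hbas.len(), 8);
/// ```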
pub(super) struct AllocTable {
bitmap: Mutex<BitMap>,
next_avail: AtomicUsize,
nblocks: NonZeroUsize,
is_dirty: AtomicBool,
cvar: Condvar,
num_free: CvarMutex<usize>,
}
/// Per-TX block allocator in `SwornDisk`, recording the validity
/// of user data blocks within each TX. All metadata is stored in
/// `TxLog`s of the bucket `BAL` during the TX for durability and recovery purposes.
pub(super) struct BlockAlloc<D> {
alloc_table: Arc<AllocTable>, // Point to the global allocator
diff_table: Mutex<BTreeMap<Hba, AllocDiff>>, // Per-TX diffs of block validity
store: Arc<TxLogStore<D>>, // Store for diff log from L3
diff_log: Mutex<Option<Arc<TxLog<D>>>>, // Opened diff log (currently not in-use)
}
/// Incremental diff of block validity.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[repr(u8)]
enum AllocDiff {
Alloc = 3,
Dealloc = 7,
Invalid,
}
const DIFF_RECORD_SIZE: usize = size_of::<AllocDiff>() + size_of::<Hba>();
impl AllocTable {
/// Create a new `AllocTable` given the total number of blocks.
pub fn new(nblocks: NonZeroUsize) -> Self {
Self {
bitmap: Mutex::new(BitMap::repeat(true, nblocks.get())),
next_avail: AtomicUsize::new(0),
nblocks,
is_dirty: AtomicBool::new(false),
cvar: Condvar::new(),
num_free: CvarMutex::new(nblocks.get()),
}
}
/// Allocate a free slot for a new block, returning `None`
/// if there are no free slots.
pub fn alloc(&self) -> Option<Hba> {
let mut bitmap = self.bitmap.lock();
let next_avail = self.next_avail.load(Ordering::Acquire);
let hba = if let Some(hba) = bitmap.first_one(next_avail) {
hba
} else {
bitmap.first_one(0)?
};
bitmap.set(hba, false);
self.next_avail.store(hba + 1, Ordering::Release);
Some(hba as Hba)
}
/// Allocate multiple free slots for a bunch of new blocks, blocking
/// until there are enough free slots to satisfy the whole request.
pub fn alloc_batch(&self, count: NonZeroUsize) -> Result<Vec<Hba>> {
let cnt = count.get();
let mut num_free = self.num_free.lock().unwrap();
while *num_free < cnt {
// TODO: May not be woken, may require manual triggering of a compaction in L4
num_free = self.cvar.wait(num_free).unwrap();
}
debug_assert!(*num_free >= cnt);
let hbas = self.do_alloc_batch(count).unwrap();
debug_assert_eq!(hbas.len(), cnt);
*num_free -= cnt;
let _ = self
.is_dirty
.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed);
Ok(hbas)
}
fn do_alloc_batch(&self, count: NonZeroUsize) -> Option<Vec<Hba>> {
let count = count.get();
debug_assert!(count > 0);
let mut bitmap = self.bitmap.lock();
let mut next_avail = self.next_avail.load(Ordering::Acquire);
if next_avail + count > self.nblocks.get() {
next_avail = bitmap.first_one(0)?;
}
let hbas = if let Some(hbas) = bitmap.first_ones(next_avail, count) {
hbas
} else {
next_avail = bitmap.first_one(0)?;
bitmap.first_ones(next_avail, count)?
};
hbas.iter().for_each(|hba| bitmap.set(*hba, false));
next_avail = hbas.last().unwrap() + 1;
self.next_avail.store(next_avail, Ordering::Release);
Some(hbas)
}
/// Recover the `AllocTable` from the latest `BVT` log and a bunch of `BAL` logs
/// in the given store.
pub fn recover<D: BlockSet + 'static>(
nblocks: NonZeroUsize,
store: &Arc<TxLogStore<D>>,
) -> Result<Self> {
let tx = store.new_tx();
let res: Result<_> = tx.context(|| {
// Recover the block validity table from `BVT` log first
let bvt_log_res = store.open_log_in(BUCKET_BLOCK_VALIDITY_TABLE);
let mut bitmap = match bvt_log_res {
Ok(bvt_log) => {
let mut buf = Buf::alloc(bvt_log.nblocks())?;
bvt_log.read(0 as BlockId, buf.as_mut())?;
postcard::from_bytes(buf.as_slice()).map_err(|_| {
Error::with_msg(InvalidArgs, "deserialize block validity table failed")
})?
}
Err(e) => {
if e.errno() != NotFound {
return Err(e);
}
BitMap::repeat(true, nblocks.get())
}
};
// Iterate each `BAL` log and apply each diff, from older to newer
let bal_log_ids_res = store.list_logs_in(BUCKET_BLOCK_ALLOC_LOG);
if let Err(e) = &bal_log_ids_res
&& e.errno() == NotFound
{
let next_avail = bitmap.first_one(0).unwrap_or(0);
let num_free = bitmap.count_ones();
return Ok(Self {
bitmap: Mutex::new(bitmap),
next_avail: AtomicUsize::new(next_avail),
nblocks,
is_dirty: AtomicBool::new(false),
cvar: Condvar::new(),
num_free: CvarMutex::new(num_free),
});
}
let mut bal_log_ids = bal_log_ids_res?;
bal_log_ids.sort();
for bal_log_id in bal_log_ids {
let bal_log_res = store.open_log(bal_log_id, false);
if let Err(e) = &bal_log_res
&& e.errno() == NotFound
{
continue;
}
let bal_log = bal_log_res?;
let log_nblocks = bal_log.nblocks();
let mut buf = Buf::alloc(log_nblocks)?;
bal_log.read(0 as BlockId, buf.as_mut())?;
let buf_slice = buf.as_slice();
let mut offset = 0;
while offset <= log_nblocks * BLOCK_SIZE - DIFF_RECORD_SIZE {
let diff = AllocDiff::from(buf_slice[offset]);
offset += 1;
if diff == AllocDiff::Invalid {
continue;
}
let bid = BlockId::from_bytes(&buf_slice[offset..offset + BID_SIZE]);
offset += BID_SIZE;
match diff {
AllocDiff::Alloc => bitmap.set(bid, false),
AllocDiff::Dealloc => bitmap.set(bid, true),
_ => unreachable!(),
}
}
}
let next_avail = bitmap.first_one(0).unwrap_or(0);
let num_free = bitmap.count_ones();
Ok(Self {
bitmap: Mutex::new(bitmap),
next_avail: AtomicUsize::new(next_avail),
nblocks,
is_dirty: AtomicBool::new(false),
cvar: Condvar::new(),
num_free: CvarMutex::new(num_free),
})
});
let recov_self = res.map_err(|_| {
tx.abort();
Error::with_msg(TxAborted, "recover block validity table TX aborted")
})?;
tx.commit()?;
Ok(recov_self)
}
/// Persist the block validity table to the `BVT` log. GC all existing `BVT` and `BAL` logs.
pub fn do_compaction<D: BlockSet + 'static>(&self, store: &Arc<TxLogStore<D>>) -> Result<()> {
if !self.is_dirty.load(Ordering::Relaxed) {
return Ok(());
}
// Serialize the block validity table
let bitmap = self.bitmap.lock();
const BITMAP_MAX_SIZE: usize = 1792 * BLOCK_SIZE; // TBD
let mut ser_buf = vec![0; BITMAP_MAX_SIZE];
let ser_len = postcard::to_slice::<BitMap>(&bitmap, &mut ser_buf)
.map_err(|_| Error::with_msg(InvalidArgs, "serialize block validity table failed"))?
.len();
ser_buf.resize(align_up(ser_len, BLOCK_SIZE), 0);
drop(bitmap);
// Persist the serialized block validity table to `BVT` log
// and GC any old `BVT` logs and `BAL` logs
let tx = store.new_tx();
let res: Result<_> = tx.context(|| {
if let Ok(bvt_log_ids) = store.list_logs_in(BUCKET_BLOCK_VALIDITY_TABLE) {
for bvt_log_id in bvt_log_ids {
store.delete_log(bvt_log_id)?;
}
}
let bvt_log = store.create_log(BUCKET_BLOCK_VALIDITY_TABLE)?;
bvt_log.append(BufRef::try_from(&ser_buf[..]).unwrap())?;
if let Ok(bal_log_ids) = store.list_logs_in(BUCKET_BLOCK_ALLOC_LOG) {
for bal_log_id in bal_log_ids {
store.delete_log(bal_log_id)?;
}
}
Ok(())
});
if res.is_err() {
tx.abort();
return_errno_with_msg!(TxAborted, "persist block validity table TX aborted");
}
tx.commit()?;
self.is_dirty.store(false, Ordering::Relaxed);
Ok(())
}
/// Mark a specific slot deallocated.
pub fn set_deallocated(&self, nth: usize) {
let mut num_free = self.num_free.lock().unwrap();
self.bitmap.lock().set(nth, true);
*num_free += 1;
const AVG_ALLOC_COUNT: usize = 1024;
if *num_free >= AVG_ALLOC_COUNT {
self.cvar.notify_one();
}
}
}
impl<D: BlockSet + 'static> BlockAlloc<D> {
/// Create a new `BlockAlloc` with the given global allocator and store.
pub fn new(alloc_table: Arc<AllocTable>, store: Arc<TxLogStore<D>>) -> Self {
Self {
alloc_table,
diff_table: Mutex::new(BTreeMap::new()),
store,
diff_log: Mutex::new(None),
}
}
/// Record a diff of `Alloc`.
pub fn alloc_block(&self, block_id: Hba) -> Result<()> {
let mut diff_table = self.diff_table.lock();
let replaced = diff_table.insert(block_id, AllocDiff::Alloc);
debug_assert!(
replaced != Some(AllocDiff::Alloc),
"can't allocate a block twice"
);
Ok(())
}
/// Record a diff of `Dealloc`.
pub fn dealloc_block(&self, block_id: Hba) -> Result<()> {
let mut diff_table = self.diff_table.lock();
let replaced = diff_table.insert(block_id, AllocDiff::Dealloc);
debug_assert!(
replaced != Some(AllocDiff::Dealloc),
"can't deallocate a block twice"
);
Ok(())
}
/// Prepare the block validity diff log.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn prepare_diff_log(&self) -> Result<()> {
// Do nothing for now
Ok(())
}
/// Persist the metadata in diff table to the block validity diff log.
///
/// # Panics
///
/// This method must be called within a TX. Otherwise, this method panics.
pub fn update_diff_log(&self) -> Result<()> {
let diff_table = self.diff_table.lock();
if diff_table.is_empty() {
return Ok(());
}
let diff_log = self.store.create_log(BUCKET_BLOCK_ALLOC_LOG)?;
const MAX_BUF_SIZE: usize = 1024 * BLOCK_SIZE;
let mut diff_buf = Vec::with_capacity(MAX_BUF_SIZE);
for (block_id, block_diff) in diff_table.iter() {
diff_buf.push(*block_diff as u8);
diff_buf.extend_from_slice(block_id.as_bytes());
if diff_buf.len() + DIFF_RECORD_SIZE > MAX_BUF_SIZE {
diff_buf.resize(align_up(diff_buf.len(), BLOCK_SIZE), 0);
diff_log.append(BufRef::try_from(&diff_buf[..]).unwrap())?;
diff_buf.clear();
}
}
if diff_buf.is_empty() {
return Ok(());
}
diff_buf.resize(align_up(diff_buf.len(), BLOCK_SIZE), 0);
diff_log.append(BufRef::try_from(&diff_buf[..]).unwrap())
}
/// Update the metadata in diff table to the in-memory block validity table.
pub fn update_alloc_table(&self) {
let diff_table = self.diff_table.lock();
let alloc_table = &self.alloc_table;
let mut num_free = alloc_table.num_free.lock().unwrap();
let mut bitmap = alloc_table.bitmap.lock();
let mut num_dealloc = 0_usize;
for (block_id, block_diff) in diff_table.iter() {
match block_diff {
AllocDiff::Alloc => {
debug_assert!(!bitmap[*block_id]);
}
AllocDiff::Dealloc => {
debug_assert!(!bitmap[*block_id]);
bitmap.set(*block_id, true);
num_dealloc += 1;
}
AllocDiff::Invalid => unreachable!(),
};
}
*num_free += num_dealloc;
const AVG_ALLOC_COUNT: usize = 1024;
if *num_free >= AVG_ALLOC_COUNT {
alloc_table.cvar.notify_one();
}
}
}
impl From<u8> for AllocDiff {
fn from(value: u8) -> Self {
match value {
3 => AllocDiff::Alloc,
7 => AllocDiff::Dealloc,
_ => AllocDiff::Invalid,
}
}
}
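// An illustrative sketch of how `AllocTable` hands out and reclaims slots
// in memory (no `TxLogStore` involved); the block count of 8 and the batch
// size of 2 are arbitrary values chosen for the example.
#[cfg(test)]
mod tests {
use core::num::NonZeroUsize;
use super::*;
#[test]
fn alloc_table_slot_lifecycle() {
let table = AllocTable::new(NonZeroUsize::new(8).unwrap());
// A fresh table hands out the first free slot, i.e., HBA 0.
let hba = table.alloc().unwrap();
assert_eq!(hba, 0);
// Deallocation marks the slot free again.
table.set_deallocated(hba);
// Batch allocation continues scanning from `next_avail`.
let hbas = table.alloc_batch(NonZeroUsize::new(2).unwrap()).unwrap();
assert_eq!(hbas, [1, 2]);
}
}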

View File

@ -0,0 +1,137 @@
// SPDX-License-Identifier: MPL-2.0
//! Data buffering.
use core::ops::RangeInclusive;
use super::sworndisk::RecordKey;
use crate::{
layers::bio::{BufMut, BufRef},
os::{BTreeMap, Condvar, CvarMutex, Mutex},
prelude::*,
};
/// A buffer to cache data blocks before they are written to disk.
#[derive(Debug)]
pub(super) struct DataBuf {
buf: Mutex<BTreeMap<RecordKey, Arc<DataBlock>>>,
cap: usize,
cvar: Condvar,
is_full: CvarMutex<bool>,
}
/// User data block.
pub(super) struct DataBlock([u8; BLOCK_SIZE]);
impl DataBuf {
/// Create a new empty data buffer with a given capacity.
pub fn new(cap: usize) -> Self {
Self {
buf: Mutex::new(BTreeMap::new()),
cap,
cvar: Condvar::new(),
is_full: CvarMutex::new(false),
}
}
/// Get the buffered data block with the key and copy
/// the content into `buf`.
pub fn get(&self, key: RecordKey, buf: &mut BufMut) -> Option<()> {
debug_assert_eq!(buf.nblocks(), 1);
if let Some(block) = self.buf.lock().get(&key) {
buf.as_mut_slice().copy_from_slice(block.as_slice());
Some(())
} else {
None
}
}
/// Get the buffered data blocks whose keys are within the given range.
pub fn get_range(&self, range: RangeInclusive<RecordKey>) -> Vec<(RecordKey, Arc<DataBlock>)> {
self.buf
.lock()
.iter()
.filter_map(|(k, v)| {
if range.contains(k) {
Some((*k, v.clone()))
} else {
None
}
})
.collect()
}
/// Put the data block in `buf` into the buffer. Return
/// whether the buffer is full after insertion.
pub fn put(&self, key: RecordKey, buf: BufRef) -> bool {
debug_assert_eq!(buf.nblocks(), 1);
let mut is_full = self.is_full.lock().unwrap();
while *is_full {
is_full = self.cvar.wait(is_full).unwrap();
}
debug_assert!(!*is_full);
let mut data_buf = self.buf.lock();
let _ = data_buf.insert(key, DataBlock::from_buf(buf));
if data_buf.len() >= self.cap {
*is_full = true;
}
*is_full
}
/// Return the number of data blocks of the buffer.
pub fn nblocks(&self) -> usize {
self.buf.lock().len()
}
/// Return whether the buffer is full.
pub fn at_capacity(&self) -> bool {
self.nblocks() >= self.cap
}
/// Return whether the buffer is empty.
pub fn is_empty(&self) -> bool {
self.nblocks() == 0
}
/// Empty the buffer.
pub fn clear(&self) {
let mut is_full = self.is_full.lock().unwrap();
self.buf.lock().clear();
if *is_full {
*is_full = false;
self.cvar.notify_all();
}
}
/// Return all the buffered data blocks.
pub fn all_blocks(&self) -> Vec<(RecordKey, Arc<DataBlock>)> {
self.buf
.lock()
.iter()
.map(|(k, v)| (*k, v.clone()))
.collect()
}
}
impl DataBlock {
/// Create a new data block from the given `buf`.
pub fn from_buf(buf: BufRef) -> Arc<Self> {
debug_assert_eq!(buf.nblocks(), 1);
Arc::new(DataBlock(buf.as_slice().try_into().unwrap()))
}
/// Return the immutable slice of the data block.
pub fn as_slice(&self) -> &[u8] {
&self.0
}
}
impl Debug for DataBlock {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("DataBlock")
.field("first 16 bytes", &&self.0[..16])
.finish()
}
}
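// An illustrative sketch of the buffer's put/get round trip; the capacity
// of 4, the LBA and the fill byte are arbitrary values chosen for the example.
#[cfg(test)]
mod tests {
use super::*;
use crate::layers::bio::Buf;
#[test]
fn data_buf_put_then_get() {
let data_buf = DataBuf::new(4);
let mut wbuf = Buf::alloc(1).unwrap();
wbuf.as_mut_slice().fill(7u8);
let key = RecordKey { lba: 0 };
// A single insertion does not fill a buffer with capacity 4.
assert!(!data_buf.put(key, wbuf.as_ref()));
// The block can be read back from the buffer before any flush.
let mut rbuf = Buf::alloc(1).unwrap();
assert!(data_buf.get(key, &mut rbuf.as_mut()).is_some());
assert_eq!(rbuf.as_slice()[0], 7u8);
assert_eq!(data_buf.nblocks(), 1);
}
}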

View File

@ -0,0 +1,41 @@
// SPDX-License-Identifier: MPL-2.0
//! The layer of secure virtual disk.
//!
//! `SwornDisk` provides three block I/O interfaces, `read()`, `write()` and `sync()`.
//! `SwornDisk` protects a logical block of user data using authenticated encryption.
//! The metadata of the encrypted logical blocks are inserted into a secure index `TxLsmTree`.
//!
//! The untrusted host disk space backing `SwornDisk` is managed by `BlockAlloc`. Block reclamation can be
//! deferred to user-defined callbacks on `TxLsmTree`.
//! `SwornDisk` supports buffering written logical blocks.
//!
//! # Usage Example
//!
//! Write, sync then read blocks from `SwornDisk`.
//!
//! ```
//! let nblocks = 1024;
//! let mem_disk = MemDisk::create(nblocks)?;
//! let root_key = Key::random();
//! let sworndisk = SwornDisk::create(mem_disk.clone(), root_key, None)?;
//!
//! let num_rw = 128;
//! let mut rw_buf = Buf::alloc(1)?;
//! for i in 0..num_rw {
//! rw_buf.as_mut_slice().fill(i as u8);
//! sworndisk.write(i as Lba, rw_buf.as_ref())?;
//! }
//! sworndisk.sync()?;
//! for i in 0..num_rw {
//! sworndisk.read(i as Lba, rw_buf.as_mut())?;
//! assert_eq!(rw_buf.as_slice()[0], i as u8);
//! }
//! ```
mod bio;
mod block_alloc;
mod data_buf;
mod sworndisk;
pub use self::sworndisk::SwornDisk;

View File

@ -0,0 +1,881 @@
// SPDX-License-Identifier: MPL-2.0
//! SwornDisk as a block device.
//!
//! API: submit_bio(), submit_bio_sync(), create(), open(),
//! read(), readv(), write(), writev(), sync().
//!
//! Responsible for managing a `TxLsmTree`, in which the TX logs (WAL and SSTs)
//! are stored; an untrusted disk that stores user data; and a `BlockAlloc` that
//! manages the allocation metadata of data blocks. `TxLsmTree` and `BlockAlloc`
//! are manipulated within internal transactions.
use core::{
num::NonZeroUsize,
ops::{Add, Sub},
sync::atomic::{AtomicBool, Ordering},
};
use ostd::mm::VmIo;
use ostd_pod::Pod;
use super::{
bio::{BioReq, BioReqQueue, BioResp, BioType},
block_alloc::{AllocTable, BlockAlloc},
data_buf::DataBuf,
};
use crate::{
layers::{
bio::{BlockId, BlockSet, Buf, BufMut, BufRef},
log::TxLogStore,
lsm::{
AsKV, LsmLevel, RangeQueryCtx, RecordKey as RecordK, RecordValue as RecordV,
SyncIdStore, TxEventListener, TxEventListenerFactory, TxLsmTree, TxType,
},
},
os::{Aead, AeadIv as Iv, AeadKey as Key, AeadMac as Mac, RwLock},
prelude::*,
tx::CurrentTx,
};
/// Logical Block Address.
pub type Lba = BlockId;
/// Host Block Address.
pub type Hba = BlockId;
/// SwornDisk.
pub struct SwornDisk<D: BlockSet> {
inner: Arc<DiskInner<D>>,
}
/// Inner structures of `SwornDisk`.
struct DiskInner<D: BlockSet> {
/// Block I/O request queue.
bio_req_queue: BioReqQueue,
/// A `TxLsmTree` to store metadata of the logical blocks.
logical_block_table: TxLsmTree<RecordKey, RecordValue, D>,
/// The underlying disk where user data is stored.
user_data_disk: D,
/// Manage space of the data disk.
block_validity_table: Arc<AllocTable>,
/// TX log store for managing logs in `TxLsmTree` and block alloc logs.
tx_log_store: Arc<TxLogStore<D>>,
/// A buffer to cache data blocks.
data_buf: DataBuf,
/// Root encryption key.
root_key: Key,
/// Whether `SwornDisk` is dropped.
is_dropped: AtomicBool,
/// Scope lock for control write and sync operation.
write_sync_region: RwLock<()>,
}
impl<D: BlockSet + 'static> aster_block::BlockDevice for SwornDisk<D> {
fn enqueue(
&self,
bio: aster_block::bio::SubmittedBio,
) -> core::result::Result<(), aster_block::bio::BioEnqueueError> {
use aster_block::bio::{BioStatus, BioType, SubmittedBio};
if bio.type_() == BioType::Discard {
warn!("discard operation not supported");
bio.complete(BioStatus::NotSupported);
return Ok(());
}
if bio.type_() == BioType::Flush {
let status = match self.sync() {
Ok(_) => BioStatus::Complete,
Err(_) => BioStatus::IoError,
};
bio.complete(status);
return Ok(());
}
let start_offset = bio.sid_range().start.to_offset();
let start_lba = start_offset / BLOCK_SIZE;
let end_offset = bio.sid_range().end.to_offset();
let end_lba = end_offset.div_ceil(BLOCK_SIZE);
let nblocks = end_lba - start_lba;
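// For example, with BLOCK_SIZE = 4096, a bio covering byte offsets
// 512..9216 yields start_lba = 0, end_lba = 3 and nblocks = 3; the first
// and last blocks are then only partially covered, which the read and
// write paths below account for.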
let Ok(buf) = Buf::alloc(nblocks) else {
bio.complete(BioStatus::NoSpace);
return Ok(());
};
let handle_read_bio = |mut buf: Buf| {
if self.read(start_lba, buf.as_mut()).is_err() {
return BioStatus::IoError;
}
let mut base = start_offset % BLOCK_SIZE;
bio.segments().iter().for_each(|seg| {
let offset = seg.nbytes();
let _ = seg.write_bytes(0, &buf.as_slice()[base..base + offset]);
base += offset;
});
BioStatus::Complete
};
let handle_write_bio = |mut buf: Buf| {
let mut base = start_offset % BLOCK_SIZE;
// Read the first unaligned block.
if base != 0 {
let buf_mut = BufMut::try_from(&mut buf.as_mut_slice()[..BLOCK_SIZE]).unwrap();
if self.read(start_lba, buf_mut).is_err() {
return BioStatus::IoError;
}
}
// Read the last unaligned block.
if end_offset % BLOCK_SIZE != 0 {
let offset = buf.as_slice().len() - BLOCK_SIZE;
let buf_mut = BufMut::try_from(&mut buf.as_mut_slice()[offset..]).unwrap();
if self.read(end_lba - 1, buf_mut).is_err() {
return BioStatus::IoError;
}
}
bio.segments().iter().for_each(|seg| {
let offset = seg.nbytes();
let _ = seg.read_bytes(0, &mut buf.as_mut_slice()[base..base + offset]);
base += offset;
});
if self.write(start_lba, buf.as_ref()).is_err() {
return BioStatus::IoError;
}
BioStatus::Complete
};
let status = match bio.type_() {
BioType::Read => handle_read_bio(buf),
BioType::Write => handle_write_bio(buf),
_ => BioStatus::NotSupported,
};
bio.complete(status);
Ok(())
}
fn metadata(&self) -> aster_block::BlockDeviceMeta {
use aster_block::{BlockDeviceMeta, BLOCK_SIZE, SECTOR_SIZE};
BlockDeviceMeta {
max_nr_segments_per_bio: usize::MAX,
nr_sectors: (BLOCK_SIZE / SECTOR_SIZE) * self.total_blocks(),
}
}
}
impl<D: BlockSet + 'static> SwornDisk<D> {
/// Read a specified number of blocks at a logical block address on the device.
/// The block contents will be read into a single contiguous buffer.
pub fn read(&self, lba: Lba, buf: BufMut) -> Result<()> {
self.check_rw_args(lba, buf.nblocks())?;
self.inner.read(lba, buf)
}
/// Read multiple blocks at a logical block address on the device.
/// The block contents will be read into several scattered buffers.
pub fn readv<'a>(&self, lba: Lba, bufs: &'a mut [BufMut<'a>]) -> Result<()> {
self.check_rw_args(lba, bufs.iter().fold(0, |acc, buf| acc + buf.nblocks()))?;
self.inner.readv(lba, bufs)
}
/// Write a specified number of blocks at a logical block address on the device.
/// The block contents reside in a single contiguous buffer.
pub fn write(&self, lba: Lba, buf: BufRef) -> Result<()> {
self.check_rw_args(lba, buf.nblocks())?;
let _rguard = self.inner.write_sync_region.read();
self.inner.write(lba, buf)
}
/// Write multiple blocks at a logical block address on the device.
/// The block contents reside in several scattered buffers.
pub fn writev(&self, lba: Lba, bufs: &[BufRef]) -> Result<()> {
self.check_rw_args(lba, bufs.iter().fold(0, |acc, buf| acc + buf.nblocks()))?;
let _rguard = self.inner.write_sync_region.read();
self.inner.writev(lba, bufs)
}
/// Sync all cached data in the device to the storage medium for durability.
pub fn sync(&self) -> Result<()> {
let _wguard = self.inner.write_sync_region.write();
// TODO: Error handling the sync operation
self.inner.sync().unwrap();
trace!("[SwornDisk] Sync completed. {self:?}");
Ok(())
}
/// Returns the total number of blocks in the device.
pub fn total_blocks(&self) -> usize {
self.inner.user_data_disk.nblocks()
}
/// Creates a new `SwornDisk` on the given disk, with the root encryption key.
pub fn create(
disk: D,
root_key: Key,
sync_id_store: Option<Arc<dyn SyncIdStore>>,
) -> Result<Self> {
let data_disk = Self::subdisk_for_data(&disk)?;
let lsm_tree_disk = Self::subdisk_for_logical_block_table(&disk)?;
let tx_log_store = Arc::new(TxLogStore::format(lsm_tree_disk, root_key)?);
let block_validity_table = Arc::new(AllocTable::new(
NonZeroUsize::new(data_disk.nblocks()).unwrap(),
));
let listener_factory = Arc::new(TxLsmTreeListenerFactory::new(
tx_log_store.clone(),
block_validity_table.clone(),
));
let logical_block_table = {
let table = block_validity_table.clone();
let on_drop_record_in_memtable = move |record: &dyn AsKV<RecordKey, RecordValue>| {
// Deallocate the host block when the corresponding record is dropped in `MemTable`
table.set_deallocated(record.value().hba);
};
TxLsmTree::format(
tx_log_store.clone(),
listener_factory,
Some(Arc::new(on_drop_record_in_memtable)),
sync_id_store,
)?
};
let new_self = Self {
inner: Arc::new(DiskInner {
bio_req_queue: BioReqQueue::new(),
logical_block_table,
user_data_disk: data_disk,
block_validity_table,
tx_log_store,
data_buf: DataBuf::new(DATA_BUF_CAP),
root_key,
is_dropped: AtomicBool::new(false),
write_sync_region: RwLock::new(()),
}),
};
info!("[SwornDisk] Created successfully! {:?}", &new_self);
// XXX: Would `disk::drop()` bring unexpected behavior?
Ok(new_self)
}
/// Opens the `SwornDisk` on the given disk, with the root encryption key.
pub fn open(
disk: D,
root_key: Key,
sync_id_store: Option<Arc<dyn SyncIdStore>>,
) -> Result<Self> {
let data_disk = Self::subdisk_for_data(&disk)?;
let lsm_tree_disk = Self::subdisk_for_logical_block_table(&disk)?;
let tx_log_store = Arc::new(TxLogStore::recover(lsm_tree_disk, root_key)?);
let block_validity_table = Arc::new(AllocTable::recover(
NonZeroUsize::new(data_disk.nblocks()).unwrap(),
&tx_log_store,
)?);
let listener_factory = Arc::new(TxLsmTreeListenerFactory::new(
tx_log_store.clone(),
block_validity_table.clone(),
));
let logical_block_table = {
let table = block_validity_table.clone();
let on_drop_record_in_memtable = move |record: &dyn AsKV<RecordKey, RecordValue>| {
// Deallocate the host block when the corresponding record is dropped in `MemTable`
table.set_deallocated(record.value().hba);
};
TxLsmTree::recover(
tx_log_store.clone(),
listener_factory,
Some(Arc::new(on_drop_record_in_memtable)),
sync_id_store,
)?
};
let opened_self = Self {
inner: Arc::new(DiskInner {
bio_req_queue: BioReqQueue::new(),
logical_block_table,
user_data_disk: data_disk,
block_validity_table,
data_buf: DataBuf::new(DATA_BUF_CAP),
tx_log_store,
root_key,
is_dropped: AtomicBool::new(false),
write_sync_region: RwLock::new(()),
}),
};
info!("[SwornDisk] Opened successfully! {:?}", &opened_self);
Ok(opened_self)
}
/// Submit a new block I/O request and wait for its completion (synchronous).
pub fn submit_bio_sync(&self, bio_req: BioReq) -> BioResp {
bio_req.submit();
self.inner.handle_bio_req(&bio_req)
}
// TODO: Support handling request asynchronously
/// Check whether the arguments are valid for read/write operations.
fn check_rw_args(&self, lba: Lba, buf_nblocks: usize) -> Result<()> {
if lba + buf_nblocks > self.inner.user_data_disk.nblocks() {
Err(Error::with_msg(
OutOfDisk,
"read/write out of disk capacity",
))
} else {
Ok(())
}
}
fn subdisk_for_data(disk: &D) -> Result<D> {
disk.subset(0..disk.nblocks() * 15 / 16) // TBD
}
fn subdisk_for_logical_block_table(disk: &D) -> Result<D> {
disk.subset(disk.nblocks() * 15 / 16..disk.nblocks()) // TBD
}
}
/// Capacity of the user data blocks buffer.
const DATA_BUF_CAP: usize = 1024;
impl<D: BlockSet + 'static> DiskInner<D> {
/// Read a specified number of blocks at a logical block address on the device.
/// The block contents will be read into a single contiguous buffer.
pub fn read(&self, lba: Lba, buf: BufMut) -> Result<()> {
let nblocks = buf.nblocks();
let res = if nblocks == 1 {
self.read_one_block(lba, buf)
} else {
self.read_multi_blocks(lba, &mut [buf])
};
// Allow empty read
if let Err(e) = &res
&& e.errno() == NotFound
{
warn!("[SwornDisk] read contains empty read on lba {lba}");
return Ok(());
}
res
}
/// Read multiple blocks at a logical block address on the device.
/// The block contents will be read into several scattered buffers.
pub fn readv<'a>(&self, lba: Lba, bufs: &'a mut [BufMut<'a>]) -> Result<()> {
let res = self.read_multi_blocks(lba, bufs);
// Allow empty read
if let Err(e) = &res
&& e.errno() == NotFound
{
warn!("[SwornDisk] readv contains empty read on lba {lba}");
return Ok(());
}
res
}
fn read_one_block(&self, lba: Lba, mut buf: BufMut) -> Result<()> {
debug_assert_eq!(buf.nblocks(), 1);
// Search in `DataBuf` first
if self.data_buf.get(RecordKey { lba }, &mut buf).is_some() {
return Ok(());
}
// Search in `TxLsmTree` then
let value = self.logical_block_table.get(&RecordKey { lba })?;
// Perform disk read and decryption
let mut cipher = Buf::alloc(1)?;
self.user_data_disk.read(value.hba, cipher.as_mut())?;
Aead::new().decrypt(
cipher.as_slice(),
&value.key,
&Iv::new_zeroed(),
&[],
&value.mac,
buf.as_mut_slice(),
)?;
Ok(())
}
fn read_multi_blocks<'a>(&self, lba: Lba, bufs: &'a mut [BufMut<'a>]) -> Result<()> {
let mut buf_vec = BufMutVec::from_bufs(bufs);
let nblocks = buf_vec.nblocks();
let mut range_query_ctx =
RangeQueryCtx::<RecordKey, RecordValue>::new(RecordKey { lba }, nblocks);
// Search in `DataBuf` first
for (key, data_block) in self
.data_buf
.get_range(range_query_ctx.range_uncompleted().unwrap())
{
buf_vec
.nth_buf_mut_slice(key.lba - lba)
.copy_from_slice(data_block.as_slice());
range_query_ctx.mark_completed(key);
}
if range_query_ctx.is_completed() {
return Ok(());
}
// Search in `TxLsmTree` then
self.logical_block_table.get_range(&mut range_query_ctx)?;
// Allow empty read
debug_assert!(range_query_ctx.is_completed());
let mut res = range_query_ctx.into_results();
let record_batches = {
res.sort_by(|(_, v1), (_, v2)| v1.hba.cmp(&v2.hba));
res.chunk_by(|(_, v1), (_, v2)| v2.hba - v1.hba == 1)
};
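// For example, records with HBAs [7, 8, 9, 42] form two contiguous
// batches, [7, 8, 9] and [42], each read from disk in a single request.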
// Perform disk read in batches and decryption
let mut cipher_buf = Buf::alloc(nblocks)?;
let cipher_slice = cipher_buf.as_mut_slice();
for record_batch in record_batches {
self.user_data_disk.read(
record_batch.first().unwrap().1.hba,
BufMut::try_from(&mut cipher_slice[..record_batch.len() * BLOCK_SIZE]).unwrap(),
)?;
for (nth, (key, value)) in record_batch.iter().enumerate() {
Aead::new().decrypt(
&cipher_slice[nth * BLOCK_SIZE..(nth + 1) * BLOCK_SIZE],
&value.key,
&Iv::new_zeroed(),
&[],
&value.mac,
buf_vec.nth_buf_mut_slice(key.lba - lba),
)?;
}
}
Ok(())
}
/// Write a specified number of blocks at a logical block address on the device.
/// The block contents reside in a single contiguous buffer.
pub fn write(&self, mut lba: Lba, buf: BufRef) -> Result<()> {
// Write block contents to `DataBuf` directly
for block_buf in buf.iter() {
let buf_at_capacity = self.data_buf.put(RecordKey { lba }, block_buf);
// Flush all data blocks in `DataBuf` to disk if it's full
if buf_at_capacity {
// TODO: Error handling: Should discard current write in `DataBuf`
self.flush_data_buf()?;
}
lba += 1;
}
Ok(())
}
/// Write multiple blocks at a logical block address on the device.
/// The block contents reside in several scattered buffers.
pub fn writev(&self, mut lba: Lba, bufs: &[BufRef]) -> Result<()> {
for buf in bufs {
self.write(lba, *buf)?;
lba += buf.nblocks();
}
Ok(())
}
fn flush_data_buf(&self) -> Result<()> {
let records = self.write_blocks_from_data_buf()?;
// Insert new records of data blocks to `TxLsmTree`
for (key, value) in records {
// TODO: Error handling: Should dealloc the written blocks
self.logical_block_table.put(key, value)?;
}
self.data_buf.clear();
Ok(())
}
fn write_blocks_from_data_buf(&self) -> Result<Vec<(RecordKey, RecordValue)>> {
let data_blocks = self.data_buf.all_blocks();
let num_write = data_blocks.len();
let mut records = Vec::with_capacity(num_write);
if num_write == 0 {
return Ok(records);
}
// Allocate slots for data blocks
let hbas = self
.block_validity_table
.alloc_batch(NonZeroUsize::new(num_write).unwrap())?;
debug_assert_eq!(hbas.len(), num_write);
let hba_batches = hbas.chunk_by(|hba1, hba2| hba2 - hba1 == 1);
// Perform encryption and batch disk write
let mut cipher_buf = Buf::alloc(num_write)?;
let mut cipher_slice = cipher_buf.as_mut_slice();
let mut nth = 0;
for hba_batch in hba_batches {
for (i, &hba) in hba_batch.iter().enumerate() {
let (lba, data_block) = &data_blocks[nth];
let key = Key::random();
let mac = Aead::new().encrypt(
data_block.as_slice(),
&key,
&Iv::new_zeroed(),
&[],
&mut cipher_slice[i * BLOCK_SIZE..(i + 1) * BLOCK_SIZE],
)?;
records.push((*lba, RecordValue { hba, key, mac }));
nth += 1;
}
self.user_data_disk.write(
*hba_batch.first().unwrap(),
BufRef::try_from(&cipher_slice[..hba_batch.len() * BLOCK_SIZE]).unwrap(),
)?;
cipher_slice = &mut cipher_slice[hba_batch.len() * BLOCK_SIZE..];
}
Ok(records)
}
/// Sync all cached data in the device to the storage medium for durability.
pub fn sync(&self) -> Result<()> {
self.flush_data_buf()?;
debug_assert!(self.data_buf.is_empty());
self.logical_block_table.sync()?;
// XXX: May impact performance when syncs are frequent
self.block_validity_table
.do_compaction(&self.tx_log_store)?;
self.tx_log_store.sync()?;
self.user_data_disk.flush()
}
/// Handle one block I/O request. Mark the request completed when finished,
/// returning any error that occurs.
pub fn handle_bio_req(&self, req: &BioReq) -> BioResp {
let res = match req.type_() {
BioType::Read => self.do_read(req),
BioType::Write => self.do_write(req),
BioType::Sync => self.do_sync(req),
};
req.complete(res.clone());
res
}
/// Handle a read I/O request.
fn do_read(&self, req: &BioReq) -> BioResp {
debug_assert_eq!(req.type_(), BioType::Read);
let lba = req.addr() as Lba;
let mut req_bufs = req.take_bufs();
let mut bufs = {
let mut bufs = Vec::with_capacity(req.nbufs());
for buf in req_bufs.iter_mut() {
bufs.push(BufMut::try_from(buf.as_mut_slice())?);
}
bufs
};
if bufs.len() == 1 {
let buf = bufs.remove(0);
return self.read(lba, buf);
}
self.readv(lba, &mut bufs)
}
/// Handle a write I/O request.
fn do_write(&self, req: &BioReq) -> BioResp {
debug_assert_eq!(req.type_(), BioType::Write);
let lba = req.addr() as Lba;
let req_bufs = req.take_bufs();
let bufs = {
let mut bufs = Vec::with_capacity(req.nbufs());
for buf in req_bufs.iter() {
bufs.push(BufRef::try_from(buf.as_slice())?);
}
bufs
};
self.writev(lba, &bufs)
}
/// Handle a sync I/O request.
fn do_sync(&self, req: &BioReq) -> BioResp {
debug_assert_eq!(req.type_(), BioType::Sync);
self.sync()
}
}
impl<D: BlockSet> Drop for SwornDisk<D> {
fn drop(&mut self) {
self.inner.is_dropped.store(true, Ordering::Release);
}
}
impl<D: BlockSet + 'static> Debug for SwornDisk<D> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("SwornDisk")
.field("user_data_nblocks", &self.inner.user_data_disk.nblocks())
.field("logical_block_table", &self.inner.logical_block_table)
.finish()
}
}
/// A wrapper for a slice of `BufMut`s, used in `readv()`.
struct BufMutVec<'a> {
bufs: &'a mut [BufMut<'a>],
nblocks: usize,
}
impl<'a> BufMutVec<'a> {
pub fn from_bufs(bufs: &'a mut [BufMut<'a>]) -> Self {
debug_assert!(!bufs.is_empty());
let nblocks = bufs
.iter()
.map(|buf| buf.nblocks())
.fold(0_usize, |sum, nblocks| sum.saturating_add(nblocks));
Self { bufs, nblocks }
}
pub fn nblocks(&self) -> usize {
self.nblocks
}
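/// Returns the mutable slice of the `nth` block across the scattered
/// buffers. For example, given two buffers of 2 and 3 blocks, `nth == 3`
/// resolves to the second block of the second buffer.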
pub fn nth_buf_mut_slice(&mut self, mut nth: usize) -> &mut [u8] {
debug_assert!(nth < self.nblocks);
for buf in self.bufs.iter_mut() {
let nblocks = buf.nblocks();
if nth >= buf.nblocks() {
nth -= nblocks;
} else {
return &mut buf.as_mut_slice()[nth * BLOCK_SIZE..(nth + 1) * BLOCK_SIZE];
}
}
&mut []
}
}
/// Listener factory for `TxLsmTree`.
struct TxLsmTreeListenerFactory<D> {
store: Arc<TxLogStore<D>>,
alloc_table: Arc<AllocTable>,
}
impl<D> TxLsmTreeListenerFactory<D> {
fn new(store: Arc<TxLogStore<D>>, alloc_table: Arc<AllocTable>) -> Self {
Self { store, alloc_table }
}
}
impl<D: BlockSet + 'static> TxEventListenerFactory<RecordKey, RecordValue>
for TxLsmTreeListenerFactory<D>
{
fn new_event_listener(
&self,
tx_type: TxType,
) -> Arc<dyn TxEventListener<RecordKey, RecordValue>> {
Arc::new(TxLsmTreeListener::new(
tx_type,
Arc::new(BlockAlloc::new(
self.alloc_table.clone(),
self.store.clone(),
)),
))
}
}
/// Event listener for `TxLsmTree`.
struct TxLsmTreeListener<D> {
tx_type: TxType,
block_alloc: Arc<BlockAlloc<D>>,
}
impl<D> TxLsmTreeListener<D> {
fn new(tx_type: TxType, block_alloc: Arc<BlockAlloc<D>>) -> Self {
Self {
tx_type,
block_alloc,
}
}
}
/// Register callbacks for different TXs in `TxLsmTree`.
impl<D: BlockSet + 'static> TxEventListener<RecordKey, RecordValue> for TxLsmTreeListener<D> {
fn on_add_record(&self, record: &dyn AsKV<RecordKey, RecordValue>) -> Result<()> {
match self.tx_type {
TxType::Compaction {
to_level: LsmLevel::L0,
} => self.block_alloc.alloc_block(record.value().hba),
// Major Compaction TX and Migration TX do not add new records
TxType::Compaction { .. } | TxType::Migration => {
// Do nothing
Ok(())
}
}
}
fn on_drop_record(&self, record: &dyn AsKV<RecordKey, RecordValue>) -> Result<()> {
match self.tx_type {
// Minor Compaction TX doesn't compact records
TxType::Compaction {
to_level: LsmLevel::L0,
} => {
unreachable!();
}
TxType::Compaction { .. } | TxType::Migration => {
self.block_alloc.dealloc_block(record.value().hba)
}
}
}
fn on_tx_begin(&self, tx: &mut CurrentTx<'_>) -> Result<()> {
match self.tx_type {
TxType::Compaction { .. } | TxType::Migration => {
tx.context(|| self.block_alloc.prepare_diff_log().unwrap())
}
}
Ok(())
}
fn on_tx_precommit(&self, tx: &mut CurrentTx<'_>) -> Result<()> {
match self.tx_type {
TxType::Compaction { .. } | TxType::Migration => {
tx.context(|| self.block_alloc.update_diff_log().unwrap())
}
}
Ok(())
}
fn on_tx_commit(&self) {
match self.tx_type {
TxType::Compaction { .. } | TxType::Migration => self.block_alloc.update_alloc_table(),
}
}
}
/// Key-Value record for `TxLsmTree`.
pub(super) struct Record {
key: RecordKey,
value: RecordValue,
}
/// The key of a `Record`.
#[repr(C)]
#[derive(Clone, Copy, Pod, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub(super) struct RecordKey {
/// Logical block address of user data block.
pub lba: Lba,
}
/// The value of a `Record`.
#[repr(C)]
#[derive(Clone, Copy, Pod, Debug)]
pub(super) struct RecordValue {
/// Host block address of user data block.
pub hba: Hba,
/// Encryption key of the data block.
pub key: Key,
/// MAC (authentication tag) of the encrypted data block.
pub mac: Mac,
}
impl Add<usize> for RecordKey {
type Output = Self;
fn add(self, other: usize) -> Self::Output {
Self {
lba: self.lba + other,
}
}
}
impl Sub<RecordKey> for RecordKey {
type Output = usize;
fn sub(self, other: RecordKey) -> Self::Output {
self.lba - other.lba
}
}
impl RecordK<RecordKey> for RecordKey {}
impl RecordV for RecordValue {}
impl AsKV<RecordKey, RecordValue> for Record {
fn key(&self) -> &RecordKey {
&self.key
}
fn value(&self) -> &RecordValue {
&self.value
}
}
#[cfg(test)]
mod tests {
use core::ptr::NonNull;
use std::thread;
use super::*;
use crate::layers::{bio::MemDisk, disk::bio::BioReqBuilder};
#[test]
fn sworndisk_fns() -> Result<()> {
let nblocks = 64 * 1024;
let mem_disk = MemDisk::create(nblocks)?;
let root_key = Key::random();
// Create a new `SwornDisk` then do some writes
let sworndisk = SwornDisk::create(mem_disk.clone(), root_key, None)?;
let num_rw = 1024;
// Submit a write block I/O request
let mut bufs = Vec::with_capacity(num_rw);
(0..num_rw).for_each(|i| {
let mut buf = Buf::alloc(1).unwrap();
buf.as_mut_slice().fill(i as u8);
bufs.push(buf);
});
let bio_req = BioReqBuilder::new(BioType::Write)
.addr(0 as BlockId)
.bufs(bufs)
.build();
sworndisk.submit_bio_sync(bio_req)?;
// Sync the `SwornDisk` then do some reads
sworndisk.submit_bio_sync(BioReqBuilder::new(BioType::Sync).build())?;
let mut rbuf = Buf::alloc(1)?;
for i in 0..num_rw {
sworndisk.read(i as Lba, rbuf.as_mut())?;
assert_eq!(rbuf.as_slice()[0], i as u8);
}
// Open the closed `SwornDisk` then test its data's existence
drop(sworndisk);
thread::spawn(move || -> Result<()> {
let opened_sworndisk = SwornDisk::open(mem_disk, root_key, None)?;
let mut rbuf = Buf::alloc(2)?;
opened_sworndisk.read(5 as Lba, rbuf.as_mut())?;
assert_eq!(rbuf.as_slice()[0], 5u8);
assert_eq!(rbuf.as_slice()[4096], 6u8);
Ok(())
})
.join()
.unwrap()
}
}

View File

@ -0,0 +1,14 @@
// SPDX-License-Identifier: MPL-2.0
#[path = "0-bio/mod.rs"]
pub mod bio;
#[path = "1-crypto/mod.rs"]
pub mod crypto;
#[path = "5-disk/mod.rs"]
pub mod disk;
#[path = "2-edit/mod.rs"]
pub mod edit;
#[path = "3-log/mod.rs"]
pub mod log;
#[path = "4-lsm/mod.rs"]
pub mod lsm;

View File

@ -0,0 +1,27 @@
// SPDX-License-Identifier: MPL-2.0
#![no_std]
#![deny(unsafe_code)]
#![feature(let_chains)]
#![feature(negative_impls)]
#![feature(slice_as_chunks)]
#![allow(dead_code, unused_imports)]
mod error;
mod layers;
mod os;
mod prelude;
mod tx;
mod util;
extern crate alloc;
pub use self::{
error::{Errno, Error},
layers::{
bio::{BlockId, BlockSet, Buf, BufMut, BufRef, BLOCK_SIZE},
disk::SwornDisk,
},
os::{Aead, AeadIv, AeadKey, AeadMac, Rng},
util::{Aead as _, RandomInit, Rng as _},
};

View File

@ -0,0 +1,404 @@
// SPDX-License-Identifier: MPL-2.0
//! OS-specific or OS-dependent APIs.
pub use alloc::{
boxed::Box,
collections::BTreeMap,
string::{String, ToString},
sync::{Arc, Weak},
vec::Vec,
};
use core::{
fmt,
sync::atomic::{AtomicBool, Ordering},
};
use aes_gcm::{
aead::{AeadInPlace, Key, NewAead, Nonce, Tag},
aes::Aes128,
Aes128Gcm,
};
use ctr::cipher::{NewCipher, StreamCipher};
pub use hashbrown::{HashMap, HashSet};
pub use ostd::sync::{Mutex, MutexGuard, RwLock, SpinLock};
use ostd::{
arch::read_random,
sync::{self, PreemptDisabled, WaitQueue},
task::{Task, TaskOptions},
};
use ostd_pod::Pod;
use serde::{Deserialize, Serialize};
use crate::{
error::{Errno, Error},
prelude::Result,
};
pub type RwLockReadGuard<'a, T> = sync::RwLockReadGuard<'a, T, PreemptDisabled>;
pub type RwLockWriteGuard<'a, T> = sync::RwLockWriteGuard<'a, T, PreemptDisabled>;
pub type SpinLockGuard<'a, T> = sync::SpinLockGuard<'a, T, PreemptDisabled>;
pub type Tid = u32;
/// A struct to get a unique identifier for the current thread.
pub struct CurrentThread;
impl CurrentThread {
/// Returns the `Tid` of the current kernel thread.
pub fn id() -> Tid {
let Some(task) = Task::current() else {
return 0;
};
task.data() as *const _ as u32
}
}
/// A `Condvar` (Condition Variable) is a synchronization primitive that can block threads
/// until a certain condition becomes true.
///
/// This is a copy from `aster-nix`.
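///
/// A sketch of the intended wait/notify pattern, pairing it with `CvarMutex`
/// as the `disk` layer does (the `bool` condition is an arbitrary example):
///
/// ```
/// fn wait_until_ready(ready: &CvarMutex<bool>, cvar: &Condvar) {
/// let mut is_ready = ready.lock().unwrap();
/// while !*is_ready {
/// // Release the mutex while sleeping; reacquire it upon wakeup.
/// is_ready = cvar.wait(is_ready).unwrap();
/// }
/// }
///
/// fn mark_ready(ready: &CvarMutex<bool>, cvar: &Condvar) {
/// *ready.lock().unwrap() = true;
/// cvar.notify_one();
/// }
/// ```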
pub struct Condvar {
waitqueue: Arc<WaitQueue>,
counter: SpinLock<Inner>,
}
struct Inner {
waiter_count: u64,
notify_count: u64,
}
impl Condvar {
/// Creates a new condition variable.
pub fn new() -> Self {
Condvar {
waitqueue: Arc::new(WaitQueue::new()),
counter: SpinLock::new(Inner {
waiter_count: 0,
notify_count: 0,
}),
}
}
/// Atomically releases the given `MutexGuard`,
/// blocking the current thread until the condition variable
/// is notified, after which the mutex will be reacquired.
///
/// Returns a new `MutexGuard` wrapped in `Ok` once the thread is woken up
/// and the mutex is reacquired.
pub fn wait<'a, T>(&self, guard: MutexGuard<'a, T>) -> Result<MutexGuard<'a, T>> {
let cond = || {
// Check if the notify counter is greater than 0.
let mut counter = self.counter.lock();
if counter.notify_count > 0 {
// Decrement the notify counter.
counter.notify_count -= 1;
Some(())
} else {
None
}
};
{
let mut counter = self.counter.lock();
counter.waiter_count += 1;
}
let lock = MutexGuard::get_lock(&guard);
drop(guard);
self.waitqueue.wait_until(cond);
Ok(lock.lock())
}
/// Wakes up one blocked thread waiting on this condition variable.
///
/// If there is a waiting thread, it will be unblocked
/// and allowed to reacquire the associated mutex.
/// If no threads are waiting, this function is a no-op.
pub fn notify_one(&self) {
let mut counter = self.counter.lock();
if counter.waiter_count == 0 {
return;
}
counter.notify_count += 1;
self.waitqueue.wake_one();
counter.waiter_count -= 1;
}
/// Wakes up all blocked threads waiting on this condition variable.
///
/// This method will unblock all waiting threads
/// and they will be allowed to reacquire the associated mutex.
/// If no threads are waiting, this function is a no-op.
pub fn notify_all(&self) {
let mut counter = self.counter.lock();
if counter.waiter_count == 0 {
return;
}
counter.notify_count = counter.waiter_count;
self.waitqueue.wake_all();
counter.waiter_count = 0;
}
}
impl fmt::Debug for Condvar {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Condvar").finish_non_exhaustive()
}
}
/// Wrap the `Mutex` provided by the kernel, used for `Condvar`.
#[repr(transparent)]
pub struct CvarMutex<T> {
inner: Mutex<T>,
}
// TODO: add a distinct guard type for `CvarMutex` if needed.
impl<T> CvarMutex<T> {
/// Constructs a new `CvarMutex`, wrapping the `Mutex` provided by the kernel.
pub fn new(t: T) -> Self {
Self {
inner: Mutex::new(t),
}
}
/// Acquires the lock and gives the caller access to the data protected by it.
pub fn lock(&self) -> Result<MutexGuard<'_, T>> {
let guard = self.inner.lock();
Ok(guard)
}
}
impl<T: fmt::Debug> fmt::Debug for CvarMutex<T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str("No data, since `CvarMutex` does't support `try_lock` now")
}
}
/// Spawns a new thread, returning a `JoinHandle` for it.
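///
/// A minimal usage sketch (the closure and its return value are arbitrary):
///
/// ```
/// let handle = spawn(|| 1 + 1);
/// assert_eq!(handle.join().unwrap(), 2);
/// ```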
pub fn spawn<F, T>(f: F) -> JoinHandle<T>
where
F: FnOnce() -> T + Send + Sync + 'static,
T: Send + 'static,
{
let is_finished = Arc::new(AtomicBool::new(false));
let data = Arc::new(SpinLock::new(None));
let is_finished_clone = is_finished.clone();
let data_clone = data.clone();
let task = TaskOptions::new(move || {
let data = f();
*data_clone.lock() = Some(data);
is_finished_clone.store(true, Ordering::Release);
})
.spawn()
.unwrap();
JoinHandle {
task,
is_finished,
data,
}
}
/// An owned permission to join on a thread (block on its termination).
///
/// This struct is created by the `spawn` function.
pub struct JoinHandle<T> {
task: Arc<Task>,
is_finished: Arc<AtomicBool>,
data: Arc<SpinLock<Option<T>>>,
}
impl<T> JoinHandle<T> {
/// Checks if the associated thread has finished running its main function.
pub fn is_finished(&self) -> bool {
self.is_finished.load(Ordering::Acquire)
}
/// Waits for the associated thread to finish.
pub fn join(self) -> Result<T> {
while !self.is_finished() {
Task::yield_now();
}
let data = self.data.lock().take().unwrap();
Ok(data)
}
}
impl<T> fmt::Debug for JoinHandle<T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("JoinHandle").finish_non_exhaustive()
}
}
/// A random number generator.
pub struct Rng;
impl crate::util::Rng for Rng {
fn new(_seed: &[u8]) -> Self {
Self
}
fn fill_bytes(&self, dest: &mut [u8]) -> Result<()> {
let (chunks, remain) = dest.as_chunks_mut::<8>();
chunks.iter_mut().for_each(|chunk| {
chunk.copy_from_slice(read_random().unwrap_or(0u64).as_bytes());
});
remain.copy_from_slice(&read_random().unwrap_or(0u64).as_bytes()[..remain.len()]);
Ok(())
}
}
/// A macro to define byte array types used by `Aead` or `Skcipher`.
macro_rules! new_byte_array_type {
($name:ident, $n:expr) => {
#[repr(C)]
#[derive(Copy, Clone, Pod, Debug, Default, Deserialize, Serialize)]
pub struct $name([u8; $n]);
impl core::ops::Deref for $name {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.0.as_slice()
}
}
impl core::ops::DerefMut for $name {
fn deref_mut(&mut self) -> &mut Self::Target {
self.0.as_mut_slice()
}
}
impl crate::util::RandomInit for $name {
fn random() -> Self {
use crate::util::Rng;
let mut result = Self::default();
let rng = self::Rng::new(&[]);
rng.fill_bytes(&mut result).unwrap_or_default();
result
}
}
};
}
const AES_GCM_KEY_SIZE: usize = 16;
const AES_GCM_IV_SIZE: usize = 12;
const AES_GCM_MAC_SIZE: usize = 16;
new_byte_array_type!(AeadKey, AES_GCM_KEY_SIZE);
new_byte_array_type!(AeadIv, AES_GCM_IV_SIZE);
new_byte_array_type!(AeadMac, AES_GCM_MAC_SIZE);
/// An `AEAD` cipher.
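///
/// An encrypt-then-decrypt round-trip sketch, assuming `crate::util::Aead`,
/// `crate::util::RandomInit` and `ostd_pod::Pod` are in scope (the buffer
/// length of 16 bytes is arbitrary):
///
/// ```
/// let key = AeadKey::random();
/// let iv = AeadIv::new_zeroed();
/// let plaintext = [1u8; 16];
/// let (mut ciphertext, mut decrypted) = ([0u8; 16], [0u8; 16]);
/// let mac = Aead::new().encrypt(&plaintext, &key, &iv, &[], &mut ciphertext)?;
/// Aead::new().decrypt(&ciphertext, &key, &iv, &[], &mac, &mut decrypted)?;
/// assert_eq!(plaintext, decrypted);
/// ```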
#[derive(Debug, Default)]
pub struct Aead;
impl Aead {
/// Construct an `Aead` instance.
pub fn new() -> Self {
Self
}
}
impl crate::util::Aead for Aead {
type Key = AeadKey;
type Iv = AeadIv;
type Mac = AeadMac;
fn encrypt(
&self,
input: &[u8],
key: &AeadKey,
iv: &AeadIv,
aad: &[u8],
output: &mut [u8],
) -> Result<AeadMac> {
let key = Key::<Aes128Gcm>::from_slice(key);
let nonce = Nonce::<Aes128Gcm>::from_slice(iv);
let cipher = Aes128Gcm::new(key);
output.copy_from_slice(input);
let tag = cipher
.encrypt_in_place_detached(nonce, aad, output)
.map_err(|_| Error::with_msg(Errno::EncryptFailed, "aes-128-gcm encryption failed"))?;
let mut aead_mac = AeadMac::new_zeroed();
aead_mac.copy_from_slice(&tag);
Ok(aead_mac)
}
fn decrypt(
&self,
input: &[u8],
key: &AeadKey,
iv: &AeadIv,
aad: &[u8],
mac: &AeadMac,
output: &mut [u8],
) -> Result<()> {
let key = Key::<Aes128Gcm>::from_slice(key);
let nonce = Nonce::<Aes128Gcm>::from_slice(iv);
let tag = Tag::<Aes128Gcm>::from_slice(mac);
let cipher = Aes128Gcm::new(key);
output.copy_from_slice(input);
cipher
.decrypt_in_place_detached(nonce, aad, output, tag)
.map_err(|_| Error::with_msg(Errno::DecryptFailed, "aes-128-gcm decryption failed"))
}
}
type Aes128Ctr = ctr::Ctr128LE<Aes128>;
const AES_CTR_KEY_SIZE: usize = 16;
const AES_CTR_IV_SIZE: usize = 16;
new_byte_array_type!(SkcipherKey, AES_CTR_KEY_SIZE);
new_byte_array_type!(SkcipherIv, AES_CTR_IV_SIZE);
/// A symmetric key cipher.
#[derive(Debug, Default)]
pub struct Skcipher;
// TODO: impl `Skcipher` with linux kernel Crypto API.
impl Skcipher {
/// Construct a `Skcipher` instance.
pub fn new() -> Self {
Self
}
}
impl crate::util::Skcipher for Skcipher {
type Key = SkcipherKey;
type Iv = SkcipherIv;
fn encrypt(
&self,
input: &[u8],
key: &SkcipherKey,
iv: &SkcipherIv,
output: &mut [u8],
) -> Result<()> {
let mut cipher = Aes128Ctr::new_from_slices(key, iv).unwrap();
output.copy_from_slice(input);
cipher.apply_keystream(output);
Ok(())
}
fn decrypt(
&self,
input: &[u8],
key: &SkcipherKey,
iv: &SkcipherIv,
output: &mut [u8],
) -> Result<()> {
let mut cipher = Aes128Ctr::new_from_slices(key, iv).unwrap();
output.copy_from_slice(input);
cipher.apply_keystream(output);
Ok(())
}
}

View File

@ -0,0 +1,15 @@
// SPDX-License-Identifier: MPL-2.0
pub(crate) use crate::{
error::{Errno::*, Error},
layers::bio::{BlockId, BLOCK_SIZE},
os::{Arc, Box, String, ToString, Vec, Weak},
return_errno, return_errno_with_msg,
util::{align_down, align_up, Aead as _, RandomInit, Rng as _, Skcipher as _},
};
pub(crate) type Result<T> = core::result::Result<T, Error>;
pub(crate) use core::fmt::{self, Debug};
pub(crate) use log::{debug, error, info, trace, warn};

View File

@ -0,0 +1,143 @@
// SPDX-License-Identifier: MPL-2.0
//! Get and set the current transaction of the current thread.
use core::sync::atomic::Ordering::{Acquire, Release};
use super::{Tx, TxData, TxId, TxProvider, TxStatus};
use crate::{os::CurrentThread, prelude::*};
/// The current transaction on a thread.
#[derive(Clone)]
pub struct CurrentTx<'a> {
provider: &'a TxProvider,
}
// CurrentTx is only useful and valid for the current thread
impl !Send for CurrentTx<'_> {}
impl !Sync for CurrentTx<'_> {}
impl<'a> CurrentTx<'a> {
pub(super) fn new(provider: &'a TxProvider) -> Self {
Self { provider }
}
/// Enter the context of the current TX.
///
/// While within the context of a TX, the implementation side of a TX
/// can get the current TX via `TxProvider::current`.
pub fn context<F, R>(&self, f: F) -> R
where
F: FnOnce() -> R,
{
let tx_table = self.provider.tx_table.lock();
let tid = CurrentThread::id();
if !tx_table.contains_key(&tid) {
panic!("there should be one Tx exited on the current thread");
}
assert!(tx_table.get(&tid).unwrap().status() == TxStatus::Ongoing);
drop(tx_table);
f()
}
/// Commits the current TX.
///
/// If the returned value is `Ok`, then the TX is committed successfully.
/// Otherwise, the TX is aborted.
pub fn commit(&self) -> Result<()> {
let mut tx_table = self.provider.tx_table.lock();
let Some(mut tx) = tx_table.remove(&CurrentThread::id()) else {
panic!("there should be one Tx exited on the current thread");
};
debug_assert!(tx.status() == TxStatus::Ongoing);
let res = self.provider.call_precommit_handlers();
if res.is_ok() {
self.provider.call_commit_handlers();
tx.set_status(TxStatus::Committed);
} else {
self.provider.call_abort_handlers();
tx.set_status(TxStatus::Aborted);
}
res
}
/// Aborts the current TX.
pub fn abort(&self) {
let mut tx_table = self.provider.tx_table.lock();
let Some(mut tx) = tx_table.remove(&CurrentThread::id()) else {
panic!("there should be one Tx exited on the current thread");
};
debug_assert!(tx.status() == TxStatus::Ongoing);
self.provider.call_abort_handlers();
tx.set_status(TxStatus::Aborted);
}
/// The ID of the transaction.
pub fn id(&self) -> TxId {
self.get_current_mut_with(|tx| tx.id())
}
/// Get immutable access to some type of the per-transaction data within a closure.
///
/// # Panics
///
/// The `data_with` method must _not_ be called recursively.
pub fn data_with<T: TxData, F, R>(&self, f: F) -> R
where
F: FnOnce(&T) -> R,
{
self.get_current_mut_with(|tx| {
let data = tx.data::<T>();
f(data)
})
}
/// Get mutable access to some type of the per-transaction data within a closure.
pub fn data_mut_with<T: TxData, F, R>(&mut self, f: F) -> R
where
F: FnOnce(&mut T) -> R,
{
self.get_current_mut_with(|tx| {
let data = tx.data_mut::<T>();
f(data)
})
}
/// Get a _mutable_ reference to the current transaction of the current thread,
/// passing it to a given closure.
///
/// # Panics
///
/// The `get_current_mut_with` method must be called when a TX exists on the
/// current thread (i.e., after `TxProvider::new_tx` and before the TX is
/// committed or aborted).
///
/// In addition, the `get_current_mut_with` method must _not_ be called
/// recursively.
#[allow(dropping_references)]
fn get_current_mut_with<F, R>(&self, f: F) -> R
where
F: FnOnce(&mut Tx) -> R,
{
let mut tx_table = self.provider.tx_table.lock();
let Some(tx) = tx_table.get_mut(&CurrentThread::id()) else {
panic!("there should be one Tx exited on the current thread");
};
if tx.is_accessing_data.swap(true, Acquire) {
panic!("get_current_mut_with must not be called recursively");
}
let retval: R = f(tx);
// SAFETY: At any given time, at most one mutable reference is constructed
// within the Acquire-Release section. It is also safe to drop `&mut Tx` after
// the `Release`, since dropping the reference does nothing to the `Tx` itself.
tx.is_accessing_data.store(false, Release);
retval
}
}

View File

@ -0,0 +1,435 @@
// SPDX-License-Identifier: MPL-2.0
//! Transaction management.
//!
//! Transaction management APIs serve two sides:
//!
//! * The user side of TXs uses `Tx` to use, commit, or abort TXs.
//! * The implementation side of TXs uses `TxProvider` to get notified
//! when TXs are created, committed, or aborted, by registering callbacks.
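//!
//! A typical flow on the user side, in sketch form (error handling omitted):
//!
//! ```
//! let provider = TxProvider::new();
//! let tx = provider.new_tx();
//! tx.context(|| {
//! // Read or write per-TX data here.
//! });
//! tx.commit().unwrap();
//! ```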
mod current;
use core::{
any::{Any, TypeId},
sync::atomic::{AtomicBool, AtomicU64, Ordering},
};
pub use self::current::CurrentTx;
use crate::{
os::{CurrentThread, HashMap, Mutex, RwLock, Tid},
prelude::*,
};
/// A transaction provider.
#[allow(clippy::type_complexity)]
pub struct TxProvider {
id: u64,
initializer_map: RwLock<HashMap<TypeId, Box<dyn Any + Send + Sync>>>,
precommit_handlers: RwLock<Vec<Box<dyn Fn(CurrentTx<'_>) -> Result<()> + Send + Sync>>>,
commit_handlers: RwLock<Vec<Box<dyn Fn(CurrentTx<'_>) + Send + Sync>>>,
abort_handlers: RwLock<Vec<Box<dyn Fn(CurrentTx<'_>) + Send + Sync>>>,
weak_self: Weak<Self>,
tx_table: Mutex<HashMap<Tid, Tx>>,
}
impl TxProvider {
/// Creates a new TX provider.
pub fn new() -> Arc<Self> {
static NEXT_ID: AtomicU64 = AtomicU64::new(0);
Arc::new_cyclic(|weak_self| Self {
id: NEXT_ID.fetch_add(1, Ordering::Release),
initializer_map: RwLock::new(HashMap::new()),
precommit_handlers: RwLock::new(Vec::new()),
commit_handlers: RwLock::new(Vec::new()),
abort_handlers: RwLock::new(Vec::new()),
weak_self: weak_self.clone(),
tx_table: Mutex::new(HashMap::new()),
})
}
/// Creates a new TX that is attached to this TX provider.
pub fn new_tx(&self) -> CurrentTx<'_> {
let mut tx_table = self.tx_table.lock();
let tid = CurrentThread::id();
if tx_table.contains_key(&tid) {
return self.current();
}
let tx = Tx::new(self.weak_self.clone());
let _ = tx_table.insert(tid, tx);
self.current()
}
/// Get the current TX.
///
/// # Panics
///
/// The caller of this method must be within the closure passed to
/// `Tx::context`. Otherwise, the method would panic.
pub fn current(&self) -> CurrentTx<'_> {
CurrentTx::new(self)
}
/// Register a per-TX data initializer.
///
/// The registered initializer function will be called upon the creation of
/// a TX.
pub fn register_data_initializer<T>(&self, f: Box<dyn Fn() -> T + Send + Sync>)
where
T: TxData,
{
let mut initializer_map = self.initializer_map.write();
initializer_map.insert(TypeId::of::<T>(), Box::new(f));
}
fn init_data<T>(&self) -> T
where
T: TxData,
{
let initializer_map = self.initializer_map.read();
let init_fn = initializer_map
.get(&TypeId::of::<T>())
.unwrap()
.downcast_ref::<Box<dyn Fn() -> T>>()
.unwrap();
init_fn()
}
/// Register a callback for the pre-commit stage,
/// which is before the commit stage.
///
/// Committing a TX triggers the pre-commit stage as well as the commit
/// stage of the TX.
/// On the pre-commit stage, the register callbacks will be called.
/// Pre-commit callbacks are allowed to fail (unlike commit callbacks).
/// If any pre-commit callbacks failed, the TX would be aborted and
/// the commit callbacks would not get called.
pub fn register_precommit_handler<F>(&self, f: F)
where
F: Fn(CurrentTx<'_>) -> Result<()> + Send + Sync + 'static,
{
let f = Box::new(f);
let mut precommit_handlers = self.precommit_handlers.write();
precommit_handlers.push(f);
}
fn call_precommit_handlers(&self) -> Result<()> {
let current = self.current();
let precommit_handlers = self.precommit_handlers.read();
for precommit_func in precommit_handlers.iter().rev() {
precommit_func(current.clone())?;
}
Ok(())
}
/// Register a callback for the commit stage,
/// which is after the pre-commit stage.
///
/// Committing a TX triggers first the pre-commit stage of the TX and then
//! the commit stage. The callbacks for the commit stage are not allowed
/// to fail.
pub fn register_commit_handler<F>(&self, f: F)
where
F: Fn(CurrentTx<'_>) + Send + Sync + 'static,
{
let f = Box::new(f);
let mut commit_handlers = self.commit_handlers.write();
commit_handlers.push(f);
}
fn call_commit_handlers(&self) {
let current = self.current();
let commit_handlers = self.commit_handlers.read();
for commit_func in commit_handlers.iter().rev() {
commit_func(current.clone())
}
}
/// Register a callback for the abort stage.
///
/// A TX enters the abort stage when the TX is aborted by the user
/// (via `Tx::abort`) or by a callback in the pre-commit stage.
pub fn register_abort_handler<F>(&self, f: F)
where
F: Fn(CurrentTx<'_>) + Send + Sync + 'static,
{
let f = Box::new(f);
let mut abort_handlers = self.abort_handlers.write();
abort_handlers.push(f);
}
fn call_abort_handlers(&self) {
let current = self.current();
let abort_handlers = self.abort_handlers.read();
for abort_func in abort_handlers.iter().rev() {
abort_func(current.clone())
}
}
}
/// A transaction.
pub struct Tx {
id: TxId,
provider: Weak<TxProvider>,
data_map: HashMap<TypeId, Box<dyn Any + Send + Sync>>,
status: TxStatus,
is_accessing_data: AtomicBool,
}
impl Tx {
fn new(provider: Weak<TxProvider>) -> Self {
static NEXT_ID: AtomicU64 = AtomicU64::new(0);
Self {
id: NEXT_ID.fetch_add(1, Ordering::Release),
provider,
data_map: HashMap::new(),
status: TxStatus::Ongoing,
is_accessing_data: AtomicBool::new(false),
}
}
/// Returns the TX ID.
pub fn id(&self) -> TxId {
self.id
}
/// Returns the status of the TX.
pub fn status(&self) -> TxStatus {
self.status
}
/// Sets the status of the Tx.
pub fn set_status(&mut self, status: TxStatus) {
self.status = status;
}
fn provider(&self) -> Arc<TxProvider> {
self.provider.upgrade().unwrap()
}
fn data<T>(&mut self) -> &T
where
T: TxData,
{
self.data_mut::<T>()
}
fn data_mut<T>(&mut self) -> &mut T
where
T: TxData,
{
let exists = self.data_map.contains_key(&TypeId::of::<T>());
if !exists {
// Slow path, need to initialize the data
let provider = self.provider();
let data: T = provider.init_data::<T>();
self.data_map.insert(TypeId::of::<T>(), Box::new(data));
}
// Fast path
self.data_map
.get_mut(&TypeId::of::<T>())
.unwrap()
.downcast_mut::<T>()
.unwrap()
}
}
impl Drop for Tx {
fn drop(&mut self) {
assert!(
self.status() != TxStatus::Ongoing,
"transactions must be committed or aborted explicitly"
);
}
}
/// The status of a transaction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TxStatus {
Ongoing,
Committed,
Aborted,
}
/// The ID of a transaction.
pub type TxId = u64;
/// Per-transaction data.
///
/// Use `TxProvider::register_data_initializer` to inject per-transaction data,
/// and use `CurrentTx::data_with` or `CurrentTx::data_mut_with` to access it.
pub trait TxData: Any + Send + Sync {}
#[cfg(test)]
mod tests {
use alloc::collections::BTreeSet;
use super::*;
/// `Db<T>` is a toy implementation of in-memory database for
/// a set of items of type `T`.
///
/// The most interesting feature of `Db<T>` is the support
/// of transactions. All queries and insertions to the database must
/// be performed within transactions. These transactions ensure
/// the atomicity of insertions even in the presence of concurrent execution.
/// If transactions are aborted, their changes won't take effect.
///
/// The main limitation of `Db<T>` is that it only supports
/// querying and inserting items, but not deleting them.
/// The lack of support for deletions rules out the possibility
/// of concurrent transactions conflicting with each other.
pub struct Db<T> {
all_items: Arc<Mutex<BTreeSet<T>>>,
tx_provider: Arc<TxProvider>,
}
struct DbUpdate<T> {
new_items: BTreeSet<T>,
}
impl<T: 'static> TxData for DbUpdate<T> {}
impl<T> Db<T>
where
T: Ord + 'static,
{
/// Creates an empty database.
pub fn new() -> Self {
let new_self = Self {
all_items: Arc::new(Mutex::new(BTreeSet::new())),
tx_provider: TxProvider::new(),
};
new_self
.tx_provider
.register_data_initializer(Box::new(|| DbUpdate {
new_items: BTreeSet::<T>::new(),
}));
new_self.tx_provider.register_commit_handler({
let all_items = new_self.all_items.clone();
move |mut current: CurrentTx<'_>| {
current.data_mut_with(|update: &mut DbUpdate<T>| {
let mut all_items = all_items.lock();
all_items.append(&mut update.new_items);
});
}
});
new_self
}
/// Creates a new DB transaction.
pub fn new_tx(&self) -> CurrentTx<'_> {
self.tx_provider.new_tx()
}
/// Returns whether an item is contained.
///
/// # Transaction
///
/// This method must be called within the context of a transaction.
pub fn contains(&self, item: &T) -> bool {
let is_new_item = {
let current_tx = self.tx_provider.current();
current_tx.data_with(|update: &DbUpdate<T>| update.new_items.contains(item))
};
if is_new_item {
return true;
}
let all_items = self.all_items.lock();
all_items.contains(item)
}
/// Inserts a new item into the DB.
///
/// # Transaction
///
/// This method must be called within the context of a transaction.
pub fn insert(&self, item: T) {
let all_items = self.all_items.lock();
if all_items.contains(&item) {
return;
}
let mut current_tx = self.tx_provider.current();
current_tx.data_mut_with(|update: &mut DbUpdate<_>| {
update.new_items.insert(item);
});
}
/// Collects all items of the DB.
///
/// # Transaction
///
/// This method must be called within the context of a transaction.
pub fn collect(&self) -> Vec<T>
where
T: Copy,
{
let all_items = self.all_items.lock();
let current_tx = self.tx_provider.current();
current_tx.data_with(|update: &DbUpdate<T>| {
all_items.union(&update.new_items).cloned().collect()
})
}
/// Returns the number of items in the DB.
///
/// # Transaction
///
/// This method must be called within the context of a transaction.
pub fn len(&self) -> usize {
let all_items = self.all_items.lock();
let current_tx = self.tx_provider.current();
let new_items_len = current_tx.data_with(|update: &DbUpdate<T>| update.new_items.len());
all_items.len() + new_items_len
}
}
#[test]
fn commit_takes_effect() {
let db: Db<u32> = Db::new();
let items = vec![1, 2, 3];
new_tx_and_insert_items::<u32, alloc::vec::IntoIter<u32>>(&db, items.clone().into_iter())
.commit()
.unwrap();
assert!(collect_items(&db) == items);
}
#[test]
fn abort_has_no_effect() {
let db: Db<u32> = Db::new();
let items = vec![1, 2, 3];
new_tx_and_insert_items::<u32, alloc::vec::IntoIter<u32>>(&db, items.into_iter()).abort();
assert!(collect_items(&db).len() == 0);
}
fn new_tx_and_insert_items<T, I>(db: &Db<T>, new_items: I) -> Tx
where
I: Iterator<Item = T>,
T: Copy + Ord + 'static,
{
let mut tx = db.new_tx();
tx.context(move || {
for new_item in new_items {
db.insert(new_item);
}
});
tx
}
fn collect_items<T>(db: &Db<T>) -> Vec<T>
where
T: Copy + Ord + 'static,
{
let mut tx = db.new_tx();
let items = tx.context(|| db.collect());
tx.commit().unwrap();
items
}
}

View File

@ -0,0 +1,302 @@
// SPDX-License-Identifier: MPL-2.0
use core::ops::Index;
use bittle::{Bits, BitsMut};
use serde::{Deserialize, Serialize};
use crate::prelude::*;
/// A compact array of bits.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct BitMap {
bits: Vec<u64>,
nbits: usize,
}
impl BitMap {
/// The one bit represents `true`.
const ONE: bool = true;
/// The zero bit represents `false`.
const ZERO: bool = false;
/// Create a new `BitMap` by repeating the `value` for the desired length.
pub fn repeat(value: bool, nbits: usize) -> Self {
let vec_len = nbits.div_ceil(64);
let mut bits = Vec::with_capacity(vec_len);
if value == Self::ONE {
bits.resize(vec_len, !0u64);
} else {
bits.resize(vec_len, 0u64);
}
// Set the unused bits in the last u64 to zero.
if nbits % 64 != 0 {
bits[vec_len - 1]
.iter_ones()
.filter(|index| (*index as usize) >= nbits % 64)
.for_each(|index| bits[vec_len - 1].clear_bit(index));
}
Self { bits, nbits }
}
/// Return the total number of bits.
pub fn len(&self) -> usize {
self.nbits
}
fn check_index(&self, index: usize) {
if index >= self.len() {
panic!(
"bitmap index {} is out of range, total bits {}",
index, self.nbits,
);
}
}
/// Test if the given bit is set.
///
/// Return `true` if the given bit is one bit.
///
/// # Panics
///
/// The `index` must be within the total number of bits. Otherwise, this method panics.
pub fn test_bit(&self, index: usize) -> bool {
self.check_index(index);
self.bits.test_bit(index as _)
}
/// Set the given bit with one bit.
///
/// # Panics
///
/// The `index` must be within the total number of bits. Otherwise, this method panics.
pub fn set_bit(&mut self, index: usize) {
self.check_index(index);
self.bits.set_bit(index as _);
}
/// Clear the given bit with zero bit.
///
/// # Panics
///
/// The `index` must be within the total number of bits. Otherwise, this method panics.
pub fn clear_bit(&mut self, index: usize) {
self.check_index(index);
self.bits.clear_bit(index as _)
}
/// Set the given bit with `value`.
///
/// One bit is set for `true`, and zero bit for `false`.
///
/// # Panics
///
/// The `index` must be within the total number of bits. Otherwise, this method panics.
pub fn set(&mut self, index: usize, value: bool) {
if value == Self::ONE {
self.set_bit(index);
} else {
self.clear_bit(index);
}
}
fn bits_not_in_use(&self) -> usize {
self.bits.len() * 64 - self.nbits
}
/// Get the number of one bits in the bitmap.
pub fn count_ones(&self) -> usize {
self.bits.count_ones() as _
}
/// Get the number of zero bits in the bitmap.
pub fn count_zeros(&self) -> usize {
let total_zeros = self.bits.count_zeros() as usize;
total_zeros - self.bits_not_in_use()
}
/// Find the index of the first one bit, starting from the given index (inclusively).
///
/// Return `None` if no one bit is found.
///
/// # Panics
///
/// The `from` index must be within the total number of bits. Otherwise, this method panics.
pub fn first_one(&self, from: usize) -> Option<usize> {
self.check_index(from);
let first_u64_index = from / 64;
self.bits[first_u64_index..]
.iter_ones()
.map(|index| first_u64_index * 64 + (index as usize))
.find(|&index| index >= from)
}
/// Find `count` indexes of the first one bits, starting from the given index (inclusively).
///
/// Return `None` if fewer than `count` one bits are found.
///
/// # Panics
///
/// The index `from + count - 1` must be within the total number of bits. Otherwise, this method panics.
pub fn first_ones(&self, from: usize, count: usize) -> Option<Vec<usize>> {
self.check_index(from + count - 1);
let first_u64_index = from / 64;
let ones: Vec<_> = self.bits[first_u64_index..]
.iter_ones()
.map(|index| first_u64_index * 64 + (index as usize))
.filter(|&index| index >= from)
.take(count)
.collect();
if ones.len() == count {
Some(ones)
} else {
None
}
}
/// Find the index of the last one bit.
///
/// Return `None` if no one bit is found.
pub fn last_one(&self) -> Option<usize> {
self.bits
.iter_ones()
.rev()
.map(|index| index as usize)
.next()
}
/// Find the index of the first zero bit, starting from the given index (inclusively).
///
/// Return `None` if no zero bit is found.
///
/// # Panics
///
/// The `from` index must be within the total number of bits. Otherwise, this method panics.
pub fn first_zero(&self, from: usize) -> Option<usize> {
self.check_index(from);
let first_u64_index = from / 64;
self.bits[first_u64_index..]
.iter_zeros()
.map(|index| first_u64_index * 64 + (index as usize))
.find(|&index| index >= from && index < self.len())
}
/// Find `count` indexes of the first zero bits, starting from the given index (inclusively).
///
/// Return `None` if fewer than `count` zero bits are found.
///
/// # Panics
///
/// The index `from + count - 1` must be within the total number of bits. Otherwise, this method panics.
pub fn first_zeros(&self, from: usize, count: usize) -> Option<Vec<usize>> {
self.check_index(from + count - 1);
let first_u64_index = from / 64;
let zeros: Vec<_> = self.bits[first_u64_index..]
.iter_zeros()
.map(|index| first_u64_index * 64 + (index as usize))
.filter(|&index| index >= from && index < self.len())
.take(count)
.collect();
if zeros.len() == count {
Some(zeros)
} else {
None
}
}
/// Find the index of the last zero bit.
///
/// Return `None` if no zero bit is found.
pub fn last_zero(&self) -> Option<usize> {
self.bits
.iter_zeros()
.rev()
.skip(self.bits_not_in_use())
.map(|index| index as usize)
.next()
}
}
impl Index<usize> for BitMap {
type Output = bool;
fn index(&self, index: usize) -> &Self::Output {
if self.test_bit(index) {
&BitMap::ONE
} else {
&BitMap::ZERO
}
}
}
#[cfg(test)]
mod tests {
use super::BitMap;
#[test]
fn all_true() {
let bm = BitMap::repeat(true, 100);
assert_eq!(bm.len(), 100);
assert_eq!(bm.count_ones(), 100);
assert_eq!(bm.count_zeros(), 0);
}
#[test]
fn all_false() {
let bm = BitMap::repeat(false, 100);
assert_eq!(bm.len(), 100);
assert_eq!(bm.count_ones(), 0);
assert_eq!(bm.count_zeros(), 100);
}
#[test]
fn bit_ops() {
let mut bm = BitMap::repeat(false, 100);
assert_eq!(bm.count_ones(), 0);
bm.set_bit(32);
assert_eq!(bm.count_ones(), 1);
assert_eq!(bm.test_bit(32), true);
bm.set(64, true);
assert_eq!(bm.count_ones(), 2);
assert_eq!(bm.test_bit(64), true);
bm.clear_bit(32);
assert_eq!(bm.count_ones(), 1);
assert_eq!(bm.test_bit(32), false);
bm.set(64, false);
assert_eq!(bm.count_ones(), 0);
assert_eq!(bm.test_bit(64), false);
}
#[test]
fn find_first_last() {
let mut bm = BitMap::repeat(false, 100);
bm.set_bit(64);
assert_eq!(bm.first_one(0), Some(64));
assert_eq!(bm.first_one(64), Some(64));
assert_eq!(bm.first_one(65), None);
assert_eq!(bm.first_ones(0, 1), Some(vec![64]));
assert_eq!(bm.first_ones(0, 2), None);
assert_eq!(bm.last_one(), Some(64));
let mut bm = BitMap::repeat(true, 100);
bm.clear_bit(64);
assert_eq!(bm.first_zero(0), Some(64));
assert_eq!(bm.first_zero(64), Some(64));
assert_eq!(bm.first_zero(65), None);
assert_eq!(bm.first_zeros(0, 1), Some(vec![64]));
assert_eq!(bm.first_zeros(0, 2), None);
assert_eq!(bm.last_zero(), Some(64));
}
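    #[test]
    fn first_zeros_as_allocator() {
        // An illustrative sketch, not part of the original file: `first_zeros`
        // lets a `BitMap` act as a simple free-slot tracker, where a zero bit
        // marks a free slot and a one bit marks an allocated one.
        let mut bm = BitMap::repeat(false, 128);
        let free_slots = bm.first_zeros(0, 3).unwrap();
        assert_eq!(free_slots, vec![0, 1, 2]);
        for slot in free_slots {
            bm.set_bit(slot);
        }
        assert_eq!(bm.count_ones(), 3);
        // The next search starts past the slots that are now allocated.
        assert_eq!(bm.first_zero(0), Some(3));
    }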
}

View File

@ -0,0 +1,89 @@
// SPDX-License-Identifier: MPL-2.0
use core::ops::Deref;
use crate::prelude::Result;
/// Random initialization for Key, Iv and Mac.
pub trait RandomInit: Default {
fn random() -> Self;
}
/// Authenticated Encryption with Associated Data (AEAD) algorithm.
pub trait Aead {
type Key: Deref<Target = [u8]> + RandomInit;
type Iv: Deref<Target = [u8]> + RandomInit;
type Mac: Deref<Target = [u8]> + RandomInit;
/// Encrypt the plaintext referred to by `input`, with a secret `Key`,
/// initialization vector `Iv` and additional associated data `aad`.
///
/// If the operation succeeds, the ciphertext will be written to `output`
/// and a message authentication code `Mac` will be returned. Otherwise,
/// an `Error` is returned on any fault.
fn encrypt(
&self,
input: &[u8],
key: &Self::Key,
iv: &Self::Iv,
aad: &[u8],
output: &mut [u8],
) -> Result<Self::Mac>;
/// Decrypt the ciphertext referred to by `input`, with a secret `Key`,
/// message authentication code `Mac`, initialization vector `Iv` and
/// additional associated data `aad`.
///
/// If the operation succeeds, the plaintext will be written to `output`.
/// Otherwise, an `Error` is returned on any fault.
fn decrypt(
&self,
input: &[u8],
key: &Self::Key,
iv: &Self::Iv,
aad: &[u8],
mac: &Self::Mac,
output: &mut [u8],
) -> Result<()>;
}
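// An illustrative sketch, not part of the original file: a generic helper
// showing how an `Aead` implementation is meant to be driven. The helper name,
// the empty `aad`, and the assumption that the ciphertext has the same length
// as the plaintext (the MAC is returned separately) are for illustration only.
#[allow(dead_code)]
fn seal_with_fresh_iv<A: Aead>(
    aead: &A,
    key: &A::Key,
    plaintext: &[u8],
) -> Result<(A::Iv, A::Mac, alloc::vec::Vec<u8>)> {
    // Use a fresh, random IV for every encryption; keep the returned MAC so
    // the ciphertext can be authenticated later by `decrypt`.
    let iv = A::Iv::random();
    let mut ciphertext = alloc::vec![0u8; plaintext.len()];
    let mac = aead.encrypt(plaintext, key, &iv, &[], &mut ciphertext)?;
    Ok((iv, mac, ciphertext))
}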
/// Symmetric key cipher algorithm.
pub trait Skcipher {
type Key: Deref<Target = [u8]> + RandomInit;
type Iv: Deref<Target = [u8]> + RandomInit;
/// Encrypt the plaintext referred to by `input`, with a secret `Key` and
/// initialization vector `Iv`.
///
/// If the operation succeeds, the ciphertext will be written to `output`.
/// Otherwise, an `Error` is returned on any fault.
fn encrypt(
&self,
input: &[u8],
key: &Self::Key,
iv: &Self::Iv,
output: &mut [u8],
) -> Result<()>;
/// Decrypt the ciphertext referred to by `input`, with a secret `Key` and
/// initialization vector `Iv`.
///
/// If the operation succeeds, the plaintext will be written to `output`.
/// Otherwise, an `Error` is returned on any fault.
fn decrypt(
&self,
input: &[u8],
key: &Self::Key,
iv: &Self::Iv,
output: &mut [u8],
) -> Result<()>;
}
/// Random number generator.
pub trait Rng {
/// Create an instance, with `seed` to provide secure entropy.
fn new(seed: &[u8]) -> Self;
/// Fill `dest` with random bytes.
fn fill_bytes(&self, dest: &mut [u8]) -> Result<()>;
}

View File

@ -0,0 +1,105 @@
// SPDX-License-Identifier: MPL-2.0
use core::{
fmt,
ops::{Deref, DerefMut},
sync::atomic::{AtomicBool, Ordering},
};
use crate::prelude::*;
/// An object that may be deleted lazily.
///
/// Lazy-deletion is a technique to postpone the real deletion of an object.
/// This technique allows an object to remain usable even after a decision
/// to delete the object has been made. Of course, after the "real" deletion
/// is carried out, the object will no longer be usable.
///
/// A classic example is file deletion in UNIX file systems.
///
/// ```ignore
/// int fd = open("path/to/my_file", O_RDONLY);
/// unlink("path/to/my_file");
/// // fd is still valid after unlink
/// ```
///
/// `LazyDelete<T>` enables lazy deletion of any object of `T`.
/// Here is a simple example.
///
/// ```
/// use sworndisk_v2::lazy_delete::*;
///
/// let lazy_delete_u32 = LazyDelete::new(123_u32, |obj| {
/// println!("the real deletion happens in this closure");
/// });
///
/// // The object is still usable after it is deleted (lazily)
/// LazyDelete::delete(&lazy_delete_u32);
/// assert!(*lazy_delete_u32 == 123);
///
/// // The deletion operation will be carried out when it is dropped
/// drop(lazy_delete_u32);
/// ```
#[allow(clippy::type_complexity)]
pub struct LazyDelete<T> {
obj: T,
is_deleted: AtomicBool,
delete_fn: Option<Box<dyn FnOnce(&mut T) + Send + Sync>>,
}
impl<T: fmt::Debug> fmt::Debug for LazyDelete<T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("LazyDelete")
.field("obj", &self.obj)
.field("is_deleted", &Self::is_deleted(self))
.finish()
}
}
impl<T> LazyDelete<T> {
/// Creates a new instance of `LazyDelete`.
///
/// The `delete_fn` will be called only if this instance of `LazyDelete` is
/// marked deleted by the `delete` method and only when this instance
/// of `LazyDelete` is dropped.
pub fn new<F: FnOnce(&mut T) + Send + Sync + 'static>(obj: T, delete_fn: F) -> Self {
Self {
obj,
is_deleted: AtomicBool::new(false),
delete_fn: Some(Box::new(delete_fn) as _),
}
}
/// Mark this instance deleted.
pub fn delete(this: &Self) {
this.is_deleted.store(true, Ordering::Release);
}
/// Returns whether this instance has been marked deleted.
pub fn is_deleted(this: &Self) -> bool {
this.is_deleted.load(Ordering::Acquire)
}
}
impl<T> Deref for LazyDelete<T> {
type Target = T;
fn deref(&self) -> &T {
&self.obj
}
}
impl<T> DerefMut for LazyDelete<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.obj
}
}
impl<T> Drop for LazyDelete<T> {
fn drop(&mut self) {
if Self::is_deleted(self) {
let delete_fn = self.delete_fn.take().unwrap();
(delete_fn)(&mut self.obj);
}
}
}

View File

@ -0,0 +1,22 @@
// SPDX-License-Identifier: MPL-2.0
//! Utilities.
mod bitmap;
mod crypto;
mod lazy_delete;
pub use self::{
bitmap::BitMap,
crypto::{Aead, RandomInit, Rng, Skcipher},
lazy_delete::LazyDelete,
};
/// Aligns `x` up to the smallest multiple of `align` that is no less than `x`.
pub(crate) const fn align_up(x: usize, align: usize) -> usize {
x.div_ceil(align) * align
}
/// Aligns `x` down to the largest multiple of `align` that is no greater than `x`.
pub(crate) const fn align_down(x: usize, align: usize) -> usize {
(x / align) * align
}
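// An illustrative sketch, not part of the original file: the expected rounding
// behavior of the two helpers above, using a 4096-byte alignment as an example.
#[cfg(test)]
mod align_tests {
    use super::{align_down, align_up};

    #[test]
    fn align_examples() {
        assert_eq!(align_up(0, 4096), 0);
        assert_eq!(align_up(1, 4096), 4096);
        assert_eq!(align_up(4096, 4096), 4096);
        assert_eq!(align_up(4097, 4096), 8192);
        assert_eq!(align_down(4095, 4096), 0);
        assert_eq!(align_down(4097, 4096), 4096);
    }
}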