// SPDX-License-Identifier: MPL-2.0

//! Block allocation.

use alloc::vec;
use core::{
    num::NonZeroUsize,
    sync::atomic::{AtomicBool, AtomicUsize, Ordering},
};

use ostd_pod::{IntoBytes, Pod};
use serde::{Deserialize, Serialize};

use super::mlsdisk::Hba;
use crate::{
    layers::{
        bio::{BID_SIZE, BlockSet, Buf, BufRef},
        log::{TxLog, TxLogStore},
    },
    os::{BTreeMap, Condvar, CvarMutex, Mutex},
    prelude::*,
    util::BitMap,
};

/// The bucket name of the block validity table.
const BUCKET_BLOCK_VALIDITY_TABLE: &str = "BVT";
/// The bucket name of the block alloc/dealloc log.
const BUCKET_BLOCK_ALLOC_LOG: &str = "BAL";

/// Block validity table. The global allocator for `MlsDisk`,
/// which tracks the validity (free or in-use) of user data blocks.
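///
/// # Examples
///
/// A minimal usage sketch (the block count, batch size, and error handling
/// are illustrative, not taken from real callers):
///
/// ```ignore
/// let table = AllocTable::new(NonZeroUsize::new(1024).unwrap());
/// // Waits until at least 16 slots are free, then claims them.
/// let hbas = table.alloc_batch(NonZeroUsize::new(16).unwrap())?;
/// // Return a slot to the allocator, possibly waking a waiting TX.
/// table.set_deallocated(hbas[0]);
/// ```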
pub(super) struct AllocTable {
    /// Bitmap of all blocks; a bit of `true` means the block is free.
    bitmap: Mutex<BitMap>,
    /// Hint for where the next allocation search should start.
    next_avail: AtomicUsize,
    /// Total number of blocks managed by this table.
    nblocks: NonZeroUsize,
    /// Whether the table has changed since it was last persisted.
    is_dirty: AtomicBool,
    /// Used with `num_free` to make batch allocations wait for free slots.
    cvar: Condvar,
    /// The number of free slots, tracked for batch allocation.
    num_free: CvarMutex<usize>,
}

/// Per-TX block allocator in `MlsDisk`, recording the validity
/// of user data blocks within each TX. All metadata is stored in
/// `TxLog`s of bucket `BAL` during a TX for durability and recovery purposes.
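///
/// # Examples
///
/// A sketch of the per-TX flow, assuming it runs within a TX context
/// (the identifiers are illustrative):
///
/// ```ignore
/// let block_alloc = BlockAlloc::new(alloc_table.clone(), store.clone());
/// block_alloc.alloc_block(hba)?;    // Record an `Alloc` diff
/// block_alloc.update_diff_log()?;   // Persist the diffs to a `BAL` log
/// block_alloc.update_alloc_table(); // Apply the diffs to the global table
/// ```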
pub(super) struct BlockAlloc<D> {
    alloc_table: Arc<AllocTable>, // Points to the global allocator
    diff_table: Mutex<BTreeMap<Hba, AllocDiff>>, // Per-TX diffs of block validity
    store: Arc<TxLogStore<D>>,    // Store for the diff log from L3
    diff_log: Mutex<Option<Arc<TxLog<D>>>>, // Opened diff log (currently not in-use)
}

/// Incremental diff of block validity.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[repr(u8)]
enum AllocDiff {
    // The non-trivial discriminants presumably guard against zero-filled
    // padding bytes being misread as a valid diff
    Alloc = 3,
    Dealloc = 7,
    Invalid,
}

// An on-disk diff record: a 1-byte `AllocDiff` tag followed by the `Hba`
const DIFF_RECORD_SIZE: usize = size_of::<AllocDiff>() + size_of::<Hba>();
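
// Note: with `Hba` being a block address (`BlockId`, a `usize`), a diff
// record presumably takes 1 + 8 = 9 bytes on a 64-bit target.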

impl AllocTable {
    /// Create a new `AllocTable` given the total number of blocks.
    pub fn new(nblocks: NonZeroUsize) -> Self {
        Self {
            // All blocks are initially free, hence all bits are `true`
            bitmap: Mutex::new(BitMap::repeat(true, nblocks.get())),
            next_avail: AtomicUsize::new(0),
            nblocks,
            is_dirty: AtomicBool::new(false),
            cvar: Condvar::new(),
            num_free: CvarMutex::new(nblocks.get()),
        }
    }

    /// Allocate a free slot for a new block, returning `None`
    /// if no free slot is available.
    pub fn alloc(&self) -> Option<Hba> {
        let mut bitmap = self.bitmap.lock();
        let next_avail = self.next_avail.load(Ordering::Acquire);

        // Search from the hint first; on failure, wrap around and
        // search again from the beginning
        let hba = if let Some(hba) = bitmap.first_one(next_avail) {
            hba
        } else {
            bitmap.first_one(0)?
        };
        bitmap.set(hba, false);

        self.next_avail.store(hba + 1, Ordering::Release);
        Some(hba as Hba)
    }

    /// Allocate `count` free slots for a batch of new blocks, waiting
    /// until enough slots have been freed if they are not yet available.
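    ///
    /// The wait is coordinated through `num_free` and `cvar`: deallocations
    /// increment `num_free` and eventually notify a waiter (see
    /// `set_deallocated` and `update_alloc_table`).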
    pub fn alloc_batch(&self, count: NonZeroUsize) -> Result<Vec<Hba>> {
        let cnt = count.get();
        let mut num_free = self.num_free.lock().unwrap();
        while *num_free < cnt {
            // TODO: May not be woken, may require manual triggering of a compaction in L4
            num_free = self.cvar.wait(num_free).unwrap();
        }
        debug_assert!(*num_free >= cnt);

        let hbas = self.do_alloc_batch(count).unwrap();
        debug_assert_eq!(hbas.len(), cnt);

        *num_free -= cnt;
        let _ = self
            .is_dirty
            .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed);
        Ok(hbas)
    }

    fn do_alloc_batch(&self, count: NonZeroUsize) -> Option<Vec<Hba>> {
        let count = count.get();
        debug_assert!(count > 0);
        let mut bitmap = self.bitmap.lock();
        let mut next_avail = self.next_avail.load(Ordering::Acquire);

        if next_avail + count > self.nblocks.get() {
            next_avail = bitmap.first_one(0)?;
        }

        // Collect `count` free slots starting from the hint; if not enough
        // remain, wrap around and retry from the first free slot
        let hbas = if let Some(hbas) = bitmap.first_ones(next_avail, count) {
            hbas
        } else {
            next_avail = bitmap.first_one(0)?;
            bitmap.first_ones(next_avail, count)?
        };
        hbas.iter().for_each(|hba| bitmap.set(*hba, false));

        next_avail = hbas.last().unwrap() + 1;
        self.next_avail.store(next_avail, Ordering::Release);
        Some(hbas)
    }

    /// Recover the `AllocTable` from the latest `BVT` log and any `BAL` logs
    /// in the given store.
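    ///
    /// Recovery proceeds in two steps (a summary of the logic below):
    /// 1. Deserialize the base bitmap from the latest `BVT` log, falling
    ///    back to an all-free bitmap if no such log exists.
    /// 2. Replay every `BAL` log in ascending ID order, applying each
    ///    `Alloc`/`Dealloc` diff record to the bitmap.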
    pub fn recover<D: BlockSet + 'static>(
        nblocks: NonZeroUsize,
        store: &Arc<TxLogStore<D>>,
    ) -> Result<Self> {
        let tx = store.new_tx();
        let res: Result<_> = tx.context(|| {
            // Recover the block validity table from the `BVT` log first
            let bvt_log_res = store.open_log_in(BUCKET_BLOCK_VALIDITY_TABLE);
            let mut bitmap = match bvt_log_res {
                Ok(bvt_log) => {
                    let mut buf = Buf::alloc(bvt_log.nblocks())?;
                    bvt_log.read(0 as BlockId, buf.as_mut())?;
                    postcard::from_bytes(buf.as_slice()).map_err(|_| {
                        Error::with_msg(InvalidArgs, "deserialize block validity table failed")
                    })?
                }
                Err(e) => {
                    if e.errno() != NotFound {
                        return Err(e);
                    }
                    // No `BVT` log yet: start from an all-free bitmap
                    BitMap::repeat(true, nblocks.get())
                }
            };

            // Iterate over each `BAL` log and apply each diff, from older to newer
            let bal_log_ids_res = store.list_logs_in(BUCKET_BLOCK_ALLOC_LOG);
            if let Err(e) = &bal_log_ids_res
                && e.errno() == NotFound
            {
                // No diff logs: the `BVT` bitmap is already up-to-date
                let next_avail = bitmap.first_one(0).unwrap_or(0);
                let num_free = bitmap.count_ones();
                return Ok(Self {
                    bitmap: Mutex::new(bitmap),
                    next_avail: AtomicUsize::new(next_avail),
                    nblocks,
                    is_dirty: AtomicBool::new(false),
                    cvar: Condvar::new(),
                    num_free: CvarMutex::new(num_free),
                });
            }
            let mut bal_log_ids = bal_log_ids_res?;
            bal_log_ids.sort();

            for bal_log_id in bal_log_ids {
                let bal_log_res = store.open_log(bal_log_id, false);
                if let Err(e) = &bal_log_res
                    && e.errno() == NotFound
                {
                    continue;
                }
                let bal_log = bal_log_res?;

                let log_nblocks = bal_log.nblocks();
                let mut buf = Buf::alloc(log_nblocks)?;
                bal_log.read(0 as BlockId, buf.as_mut())?;
                let buf_slice = buf.as_slice();
                let mut offset = 0;
                while offset <= log_nblocks * BLOCK_SIZE - DIFF_RECORD_SIZE {
                    let diff = AllocDiff::from(buf_slice[offset]);
                    offset += 1;
                    // Skip padding bytes at the tail of a block
                    if diff == AllocDiff::Invalid {
                        continue;
                    }
                    let bid = BlockId::from_bytes(&buf_slice[offset..offset + BID_SIZE]);
                    offset += BID_SIZE;
                    match diff {
                        AllocDiff::Alloc => bitmap.set(bid, false),
                        AllocDiff::Dealloc => bitmap.set(bid, true),
                        _ => unreachable!(),
                    }
                }
            }
            let next_avail = bitmap.first_one(0).unwrap_or(0);
            let num_free = bitmap.count_ones();

            Ok(Self {
                bitmap: Mutex::new(bitmap),
                next_avail: AtomicUsize::new(next_avail),
                nblocks,
                is_dirty: AtomicBool::new(false),
                cvar: Condvar::new(),
                num_free: CvarMutex::new(num_free),
            })
        });
        let recov_self = res.map_err(|_| {
            tx.abort();
            Error::with_msg(TxAborted, "recover block validity table TX aborted")
        })?;
        tx.commit()?;

        Ok(recov_self)
    }

    /// Persist the block validity table to the `BVT` log and GC all
    /// existing `BAL` logs.
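    ///
    /// Once the full table has been persisted, every `BAL` diff log is
    /// redundant, so they can all be deleted within the same TX.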
    pub fn do_compaction<D: BlockSet + 'static>(&self, store: &Arc<TxLogStore<D>>) -> Result<()> {
        if !self.is_dirty.load(Ordering::Relaxed) {
            return Ok(());
        }

        // Serialize the block validity table
        let bitmap = self.bitmap.lock();
        const BITMAP_MAX_SIZE: usize = 1792 * BLOCK_SIZE; // TBD
        let mut ser_buf = vec![0; BITMAP_MAX_SIZE];
        let ser_len = postcard::to_slice::<BitMap>(&bitmap, &mut ser_buf)
            .map_err(|_| Error::with_msg(InvalidArgs, "serialize block validity table failed"))?
            .len();
        ser_buf.resize(align_up(ser_len, BLOCK_SIZE), 0);
        drop(bitmap);

        // Persist the serialized block validity table to the `BVT` log
        // and GC any old `BVT` logs and `BAL` logs
        let tx = store.new_tx();
        let res: Result<_> = tx.context(|| {
            if let Ok(bvt_log_ids) = store.list_logs_in(BUCKET_BLOCK_VALIDITY_TABLE) {
                for bvt_log_id in bvt_log_ids {
                    store.delete_log(bvt_log_id)?;
                }
            }

            let bvt_log = store.create_log(BUCKET_BLOCK_VALIDITY_TABLE)?;
            bvt_log.append(BufRef::try_from(&ser_buf[..]).unwrap())?;

            if let Ok(bal_log_ids) = store.list_logs_in(BUCKET_BLOCK_ALLOC_LOG) {
                for bal_log_id in bal_log_ids {
                    store.delete_log(bal_log_id)?;
                }
            }
            Ok(())
        });
        if res.is_err() {
            tx.abort();
            return_errno_with_msg!(TxAborted, "persist block validity table TX aborted");
        }
        tx.commit()?;

        self.is_dirty.store(false, Ordering::Relaxed);
        Ok(())
    }

    /// Mark a specific slot deallocated.
    pub fn set_deallocated(&self, nth: usize) {
        let mut num_free = self.num_free.lock().unwrap();
        self.bitmap.lock().set(nth, true);

        *num_free += 1;
        // Notify a waiter only once a sizable pool of free slots has
        // accumulated, presumably to spare batch allocations futile wakeups
        const AVG_ALLOC_COUNT: usize = 1024;
        if *num_free >= AVG_ALLOC_COUNT {
            self.cvar.notify_one();
        }
    }
}

impl<D: BlockSet + 'static> BlockAlloc<D> {
    /// Create a new `BlockAlloc` with the given global allocator and store.
    pub fn new(alloc_table: Arc<AllocTable>, store: Arc<TxLogStore<D>>) -> Self {
        Self {
            alloc_table,
            diff_table: Mutex::new(BTreeMap::new()),
            store,
            diff_log: Mutex::new(None),
        }
    }

    /// Record a diff of `Alloc`.
    pub fn alloc_block(&self, block_id: Hba) -> Result<()> {
        let mut diff_table = self.diff_table.lock();
        let replaced = diff_table.insert(block_id, AllocDiff::Alloc);
        debug_assert!(
            replaced != Some(AllocDiff::Alloc),
            "can't allocate a block twice"
        );
        Ok(())
    }

    /// Record a diff of `Dealloc`.
    pub fn dealloc_block(&self, block_id: Hba) -> Result<()> {
        let mut diff_table = self.diff_table.lock();
        let replaced = diff_table.insert(block_id, AllocDiff::Dealloc);
        debug_assert!(
            replaced != Some(AllocDiff::Dealloc),
            "can't deallocate a block twice"
        );
        Ok(())
    }

    /// Prepare the block validity diff log.
    ///
    /// # Panics
    ///
    /// This method must be called within a TX. Otherwise, this method panics.
    pub fn prepare_diff_log(&self) -> Result<()> {
        // Do nothing for now
        Ok(())
    }

    /// Persist the metadata in the diff table to the block validity diff log.
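    ///
    /// Each record is laid out as sketched below (`Hba` is written via its
    /// raw `IntoBytes` representation):
    ///
    /// ```text
    /// +----------------+----------------+
    /// | AllocDiff (1B) | Hba (BID_SIZE) |
    /// +----------------+----------------+
    /// ```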
    ///
    /// # Panics
    ///
    /// This method must be called within a TX. Otherwise, this method panics.
    pub fn update_diff_log(&self) -> Result<()> {
        let diff_table = self.diff_table.lock();
        if diff_table.is_empty() {
            return Ok(());
        }

        let diff_log = self.store.create_log(BUCKET_BLOCK_ALLOC_LOG)?;

        const MAX_BUF_SIZE: usize = 1024 * BLOCK_SIZE;
        let mut diff_buf = Vec::with_capacity(MAX_BUF_SIZE);
        for (block_id, block_diff) in diff_table.iter() {
            diff_buf.push(*block_diff as u8);
            diff_buf.extend_from_slice(block_id.as_bytes());

            // Flush early if the next record would overflow the buffer
            if diff_buf.len() + DIFF_RECORD_SIZE > MAX_BUF_SIZE {
                diff_buf.resize(align_up(diff_buf.len(), BLOCK_SIZE), 0);
                diff_log.append(BufRef::try_from(&diff_buf[..]).unwrap())?;
                diff_buf.clear();
            }
        }

        if diff_buf.is_empty() {
            return Ok(());
        }
        // Zero-pad the tail to a block boundary before the final append
        diff_buf.resize(align_up(diff_buf.len(), BLOCK_SIZE), 0);
        diff_log.append(BufRef::try_from(&diff_buf[..]).unwrap())
    }

    /// Apply the metadata in the diff table to the in-memory block validity table.
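    ///
    /// Note that `num_free` is locked before `bitmap`, matching the lock
    /// order in `set_deallocated`; keeping a consistent order presumably
    /// avoids lock-order inversion between the two paths.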
    pub fn update_alloc_table(&self) {
        let diff_table = self.diff_table.lock();
        let alloc_table = &self.alloc_table;
        let mut num_free = alloc_table.num_free.lock().unwrap();
        let mut bitmap = alloc_table.bitmap.lock();
        let mut num_dealloc = 0_usize;

        for (block_id, block_diff) in diff_table.iter() {
            match block_diff {
                AllocDiff::Alloc => {
                    // The slot was already marked in-use at allocation time
                    debug_assert!(!bitmap[*block_id]);
                }
                AllocDiff::Dealloc => {
                    debug_assert!(!bitmap[*block_id]);
                    bitmap.set(*block_id, true);
                    num_dealloc += 1;
                }
                AllocDiff::Invalid => unreachable!(),
            };
        }

        *num_free += num_dealloc;
        const AVG_ALLOC_COUNT: usize = 1024;
        if *num_free >= AVG_ALLOC_COUNT {
            alloc_table.cvar.notify_one();
        }
    }
}

impl From<u8> for AllocDiff {
    fn from(value: u8) -> Self {
        match value {
            3 => AllocDiff::Alloc,
            7 => AllocDiff::Dealloc,
            // Any other byte (e.g., zero padding at a log's tail) decodes
            // to `Invalid`, which `recover` skips
            _ => AllocDiff::Invalid,
        }
    }
}