| // Copyright 2024 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| //! A strategy tracks free space and decides where allocations are to be made. |
| //! |
| //! Note that strategy *excludes*: |
| //! |
| //! * get/set byte limits |
| //! * reservation tracking of future allocations. |
| //! * deallocated but not yet usable regions (awaiting flush). |
| //! * Any TRIM-specific logic |
| //! * Volume Deletion |
| //! |
| //! These sorts of higher-level concepts should be implemented in `Allocator`. |
| //! |
| //! Strategies should be concerned only with selecting which free regions of disk to hand out. |
| |
| use { |
| crate::object_store::FxfsError, |
| anyhow::{Context, Error}, |
| std::{ |
| collections::{btree_map, BTreeMap, BTreeSet}, |
| fmt::Debug, |
| ops::Range, |
| }, |
| }; |
| |
| /// Holds the offsets of extents of a given length. |
| #[derive(Debug, Default)] |
| struct SizeBucket { |
| /// True if this bucket has overflowed, indicating that there may be more extents of this |
| /// size on disk if we look again. |
| overflow: bool, |
| offsets: BTreeSet<u64>, |
| } |
| |
| /// Tests (in allocator.rs) may set max extent size to low values for other reasons. |
| /// We want to make sure that we never go below a value that is big enough for superblocks |
| /// so the value we use here is chosen to be larger than the value allocator.rs calculates and |
| /// a convenient power of two. |
| const DEFAULT_MAX_EXTENT_SIZE: u64 = 2 << 20; |
| |
| /// The largest size bucket is a catch-all for extents this size and up so we |
| /// explicitly keep more than we would otherwise. |
| const NUM_MAX_SIZED_EXTENTS_TO_KEEP: u64 = 512; |
| |
| /// Returns the maximum free ranges we keep in RAM for a given range length. |
| /// We use the reciprocal function (2MiB/len) for this as we tend to see a lot more short |
| /// extents. One exception is 'DEFAULT_MAX_EXTENT_SIZE'. This is a catch-all bucket so we |
| /// intentionally bump this to cover the fact that a wide range of allocation sizes may hit this |
| /// bucket. |
| fn max_entries(len: u64) -> usize { |
| if len < DEFAULT_MAX_EXTENT_SIZE { |
| // With a 4kiB block size up to 2MiB (i.e. 512 possible lengths), this works out to be a |
| // total of 6748 entries. |
| (4 << 20) / len as usize |
| } else { |
| NUM_MAX_SIZED_EXTENTS_TO_KEEP as usize |
| } |
| } |
| |
| /// An allocation strategy that returns the smallest extent that is large enough to hold the |
| /// requested allocation, or the next best if no extent is big enough. |
| /// |
| /// This strategy either leads to a perfect fit of a free extent to an allocation or a small |
| /// amount of free space being left over, creating even smaller fragments of free space. |
| /// |
| /// This tendency to create smaller fragments of free space starts to affect file fragmentation |
| /// when filesystem use approaches capacity and there are nothing but small fragments of free |
| /// space remaining. |
| #[derive(Debug, Default)] |
| pub struct BestFit { |
| /// Holds free storage ranges for the filesystem ordered by length, then start. |
| ranges: BTreeMap<u64, SizeBucket>, |
| /// A secondary index used to speed up coalescing of 'free' calls. Keyed by end -> start |
| by_end: BTreeMap<u64, u64>, |
| } |
| |
| impl BestFit { |
| /// Tries to assign a set of contiguous `bytes` and returns the range, removing it |
| /// from the pool of available bytes and returning it. |
| /// |
| /// If insufficient contiguous space is available, the largest available range will |
| /// be returned. If no bytes are available, None will be returned. |
| /// |
| /// There are no special requirements on alignment of `bytes` but the caller is generally |
| /// encouraged to align to device block size. |
| pub fn allocate(&mut self, bytes: u64) -> Result<Range<u64>, FxfsError> { |
| let mut result = self.ranges.range_mut(bytes..).next(); |
| if result.is_none() { |
| // Insufficient space. Return the biggest range we have. |
| result = self.ranges.iter_mut().rev().next(); |
| } |
| if let Some((&size, size_bucket)) = result { |
| if size_bucket.offsets.is_empty() && size_bucket.overflow { |
| return Err(FxfsError::NotFound); |
| } |
| debug_assert!(!size_bucket.offsets.is_empty()); |
| let offset = size_bucket.offsets.pop_first().unwrap(); |
| if size_bucket.offsets.is_empty() && !size_bucket.overflow { |
| self.ranges.remove(&size); |
| } |
| self.by_end.remove(&(offset + size)); |
| if size > bytes { |
| self.free(offset + bytes..offset + size).expect("give extra back"); |
| Ok(offset..offset + bytes) |
| } else { |
| Ok(offset..offset + size) |
| } |
| } else { |
| Err(FxfsError::NoSpace) |
| } |
| } |
| |
| /// This is effectively an optimized path for "remove" followed by "free" on the same range. |
| /// Returns true if this resulted in changes to overall free space. |
| pub fn force_free(&mut self, range: Range<u64>) -> Result<bool, Error> { |
| let mut to_add = Vec::new(); |
| let mut last = range.start; |
| for (&end, &offset) in self.by_end.range(range.start + 1..) { |
| if offset >= range.end { |
| break; |
| } |
| if offset > last { |
| to_add.push(last..offset); |
| } |
| last = end; |
| assert!(end > range.start); |
| } |
| if last < range.end { |
| to_add.push(last..range.end); |
| } |
| Ok(if to_add.is_empty() { |
| false |
| } else { |
| for range in to_add { |
| self.free(range)?; |
| } |
| true |
| }) |
| } |
| |
| /// Removes all free extents in the given range. |
| pub fn remove(&mut self, range: Range<u64>) { |
| let mut to_add = Vec::new(); |
| let mut to_remove = Vec::new(); |
| for (&end, &offset) in self.by_end.range(range.start + 1..) { |
| if offset >= range.end { |
| break; |
| } |
| assert!(end > range.start); |
| to_remove.push(offset..end); |
| if offset < range.start { |
| to_add.push(offset..range.start); |
| } |
| if end > range.end { |
| to_add.push(range.end..end); |
| } |
| } |
| for range in to_remove { |
| self.remove_range(range); |
| } |
| for range in to_add { |
| self.free(range).unwrap(); |
| } |
| } |
| |
| /// Internal helper function. Assumes range exists. |
| fn remove_range(&mut self, range: Range<u64>) { |
| let btree_map::Entry::Occupied(mut size_bucket) = |
| self.ranges.entry(range.end - range.start) |
| else { |
| unreachable!() |
| }; |
| size_bucket.get_mut().offsets.remove(&range.start); |
| if size_bucket.get().offsets.is_empty() && !size_bucket.get().overflow { |
| size_bucket.remove_entry(); |
| } |
| assert_eq!(Some(range.start), self.by_end.remove(&range.end)); |
| } |
| |
| // Overflow markers persist until removed. |
| // Before we refresh data from disk, we need to remove these markers as there is no guarantee |
| // that we will overflow the same bucket every time we refresh. |
| pub fn reset_overflow_markers(&mut self) { |
| let mut to_remove = Vec::new(); |
| for (&size, size_bucket) in self.ranges.iter_mut() { |
| size_bucket.overflow = false; |
| if size_bucket.offsets.is_empty() { |
| to_remove.push(size); |
| } |
| } |
| for size in to_remove { |
| self.ranges.remove(&size); |
| } |
| } |
| |
| /// Internal helper function. Inserts range if room to do so, appends overflow marker if needed. |
| fn insert_range(&mut self, range: Range<u64>) -> Result<(), Error> { |
| let len = range.end - range.start; |
| let entry = self.ranges.entry(len).or_default(); |
| if !entry.offsets.insert(range.start) { |
| // This might happen if there is some high-level logic error that causes a range to be |
| // freed twice. It may indicate a disk corruption or a software bug and shouldn't happen |
| // under normal circumstances. While the low-level data structure held here will not be |
| // compromised by this error, it is probably not safe to continue if a condition such |
| // as this is detected, thus we return "Inconsistent". |
| return Err(FxfsError::Inconsistent).context("Range already in 'ranges'."); |
| } |
| let other = self.by_end.insert(range.end, range.start); |
| if let Some(_other) = other { |
| // self.by_end and self.ranges should always remain consistant. |
| // If this occurs it is almost certainly a software bug and continued use of this |
| // data structure will likely lead to undefined behaviour. |
| return Err(FxfsError::Inconsistent) |
| .context("Range already in 'by_end'. Potential logic bug."); |
| }; |
| if entry.offsets.len() > max_entries(len) { |
| let last = entry.offsets.pop_last().unwrap(); |
| if self.by_end.remove(&(last + len)) != Some(last) { |
| return Err(FxfsError::Inconsistent) |
| .context("Expected range missing or invalid 'by_end'"); |
| }; |
| entry.overflow = true; |
| } |
| Ok(()) |
| } |
| |
| /// Adds an arbitrary range of bytes to the pool of available ranges. |
| /// |
| /// Note that we keep these ranges in a map keyed by their length. To bound the size of this |
| /// map we only track ranges up to N blocks long (up to 2MB). Longer ranges |
| /// are broken up into ranges of this size. |
| pub fn free(&mut self, mut range: Range<u64>) -> Result<(), Error> { |
| // If there is a free range immediately before this one, merge with it. |
| let mut iter = self.by_end.range(range.start..); |
| let mut next_item = iter.next(); |
| if let Some((&end, &start)) = next_item { |
| if end == range.start { |
| self.remove_range(start..end); |
| range.start = start; |
| iter = self.by_end.range(range.start + 1..); |
| next_item = iter.next(); |
| } else if start < range.end { |
| // There exists a range already that overlaps with this. |
| // We have a double free or freeing of overlapping allocations. |
| // While the data-structure here remains valid after this error, the fact that |
| // an overlapping free was attempted indicates that the filesystem is not in a |
| // safe state to continue, thus we return "Inconsistent". |
| return Err(FxfsError::Inconsistent).context("overlapping free"); |
| } |
| } |
| // If there is a free range immediately after this one, merge with it. |
| if let Some((&end, &start)) = next_item { |
| if start == range.end { |
| self.remove_range(start..end); |
| range.end = end; |
| } |
| } |
| |
| // We don't allow ranges longer than maximum extent size, but if we need to split such |
| // a range, we want the smaller fragment to come first (pushing small fragments together at |
| // the start of the device). |
| while (range.end - range.start) >= DEFAULT_MAX_EXTENT_SIZE { |
| self.insert_range(range.end - DEFAULT_MAX_EXTENT_SIZE..range.end) |
| .context("adding max_extent_size fragment")?; |
| range.end -= DEFAULT_MAX_EXTENT_SIZE; |
| } |
| if range.start < range.end { |
| self.insert_range(range).context("adding final range")?; |
| } |
| Ok(()) |
| } |
| } |
| |
| #[cfg(test)] |
| mod test { |
| use super::*; |
| |
| #[test] |
| fn allocate() { |
| let mut bestfit = BestFit::default(); |
| bestfit.free(0..0).unwrap(); // NOOP |
| bestfit.free(0..100).unwrap(); |
| assert_eq!(bestfit.allocate(10), Ok(0..10)); |
| assert_eq!(bestfit.allocate(10), Ok(10..20)); |
| assert_eq!(bestfit.allocate(10), Ok(20..30)); |
| assert_eq!(bestfit.allocate(10), Ok(30..40)); |
| assert_eq!(bestfit.allocate(10), Ok(40..50)); |
| // Make some holes. |
| bestfit.free(30..40).unwrap(); |
| bestfit.free(10..20).unwrap(); |
| // Holes get filled first. |
| assert_eq!(bestfit.allocate(10), Ok(10..20)); |
| assert_eq!(bestfit.allocate(10), Ok(30..40)); |
| assert_eq!(bestfit.allocate(10), Ok(50..60)); |
| // Free a contiguous bunch of allocations at once. |
| bestfit.free(0..50).unwrap(); |
| // Return less than requested. |
| assert_eq!(bestfit.allocate(100), Ok(0..50)); |
| // Return all remaining space. |
| assert_eq!(bestfit.allocate(100), Ok(60..100)); |
| // No space left. Return None. |
| assert_eq!(bestfit.allocate(100), Err(FxfsError::NoSpace)); |
| // Now we have some more back. |
| bestfit.free(50..100).unwrap(); |
| assert_eq!(bestfit.allocate(100), Ok(50..100)); |
| } |
| |
| #[test] |
| fn remove() { |
| let mut bestfit = BestFit::default(); |
| bestfit.free(0..100).unwrap(); |
| bestfit.remove(25..50); |
| assert_eq!(bestfit.allocate(30), Ok(50..80)); |
| assert_eq!(bestfit.allocate(25), Ok(0..25)); |
| |
| // Test removal across disjoint extents. |
| let mut bestfit = BestFit::default(); |
| bestfit.free(0..20).unwrap(); |
| bestfit.free(30..50).unwrap(); |
| bestfit.remove(10..40); |
| assert_eq!(bestfit.allocate(10), Ok(0..10)); |
| assert_eq!(bestfit.allocate(10), Ok(40..50)); |
| assert_eq!(bestfit.allocate(10), Err(FxfsError::NoSpace)); |
| |
| // Test coalescing of adjacent available ranges if the request is large. |
| let mut bestfit = BestFit::default(); |
| let end = DEFAULT_MAX_EXTENT_SIZE * (NUM_MAX_SIZED_EXTENTS_TO_KEEP + 1) + 10000; |
| bestfit.free(0..end).unwrap(); |
| // We slice from the back so we have sizes: 10000,2M,2M,2M,2M,... |
| // Free up most of the 2MB tail entries -- we just needed to trigger an overflow marker. |
| bestfit.remove(DEFAULT_MAX_EXTENT_SIZE * 3 + 10000..end); |
| // Free up the firs two slices worth. (This exercises unaligned removal.) |
| bestfit.remove(0..DEFAULT_MAX_EXTENT_SIZE * 2); |
| // Allocate the last 2MB extent. |
| assert_eq!( |
| bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE), |
| Ok(DEFAULT_MAX_EXTENT_SIZE * 2 + 10000..DEFAULT_MAX_EXTENT_SIZE * 3 + 10000) |
| ); |
| // Allocate all the space. |
| assert_eq!( |
| bestfit.allocate(10000), |
| Ok(DEFAULT_MAX_EXTENT_SIZE * 2..DEFAULT_MAX_EXTENT_SIZE * 2 + 10000) |
| ); |
| // Now there is insufficient space, but we should get the overflow marker. |
| assert_eq!( |
| bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE), |
| Err(FxfsError::NotFound) // Overflow. |
| ); |
| // Reset the overflow marker. We should see NoSpace. |
| bestfit.reset_overflow_markers(); |
| assert_eq!(bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE), Err(FxfsError::NoSpace)); |
| } |
| |
| #[test] |
| fn coalescing_free() { |
| let mut bestfit = BestFit::default(); |
| // Free some bytes at the start and end. |
| bestfit.free(0..10).unwrap(); |
| bestfit.free(20..32).unwrap(); |
| // Now free the space in the middle, which should coalesce with ranges on both sides. |
| bestfit.free(10..20).unwrap(); |
| // Confirm that we can allocate one block of 32 bytes. This will fail if coalescing |
| // didn't occur. |
| bestfit.remove(0..32); |
| assert_eq!(bestfit.allocate(32), Err(FxfsError::NoSpace)); |
| } |
| |
| #[test] |
| fn max_range() { |
| let mut bestfit = BestFit::default(); |
| bestfit.free(10..10 + 10 * DEFAULT_MAX_EXTENT_SIZE).unwrap(); |
| |
| // We can't allocate bigger than DEFAULT_MAX_EXTENT_SIZE. |
| assert_eq!( |
| bestfit.allocate(2 * DEFAULT_MAX_EXTENT_SIZE), |
| Ok(10..10 + DEFAULT_MAX_EXTENT_SIZE) |
| ); |
| |
| // Make sure that coalescing still works properly |
| bestfit.free(10..10 + DEFAULT_MAX_EXTENT_SIZE).unwrap(); |
| assert_eq!(bestfit.allocate(10), Ok(10..20)); |
| assert_eq!(bestfit.allocate(10), Ok(20..30)); |
| assert_eq!( |
| bestfit.allocate(DEFAULT_MAX_EXTENT_SIZE - 20), |
| Ok(30..10 + DEFAULT_MAX_EXTENT_SIZE) |
| ); |
| } |
| |
| #[test] |
| fn fragmenting_free() { |
| // It shouldn't matter how we allocate and free here so long as there are no overlaps. |
| let mut bestfit = BestFit::default(); |
| bestfit.free(0..100).unwrap(); |
| assert_eq!(bestfit.allocate(30), Ok(0..30)); |
| assert!(bestfit.free(20..30).is_ok()); |
| assert_eq!(bestfit.allocate(10), Ok(20..30)); |
| assert!(bestfit.free(0..30).is_ok()); |
| assert!(bestfit.free(0..30).is_err()); |
| |
| // Merge left and right and middle. |
| let mut bestfit = BestFit::default(); |
| bestfit.free(10..20).unwrap(); |
| bestfit.free(30..40).unwrap(); |
| bestfit.free(0..10).unwrap(); // before |
| bestfit.free(40..50).unwrap(); // after |
| bestfit.free(20..30).unwrap(); // middle |
| |
| // Check all combinations of overlaps. |
| let mut bestfit = BestFit::default(); |
| bestfit.free(10..20).unwrap(); |
| assert!(bestfit.free(9..11).is_err()); |
| assert!(bestfit.free(19..21).is_err()); |
| assert!(bestfit.free(11..29).is_err()); |
| assert!(bestfit.free(10..20).is_err()); |
| assert!(bestfit.free(9..10).is_ok()); |
| assert!(bestfit.free(20..21).is_ok()); |
| } |
| |
| #[test] |
| fn force_free() { |
| let mut bestfit = BestFit::default(); |
| bestfit.free(0..100).unwrap(); |
| assert_eq!(bestfit.force_free(0..100).unwrap(), false); |
| assert_eq!(bestfit.force_free(10..100).unwrap(), false); |
| assert_eq!(bestfit.force_free(0..90).unwrap(), false); |
| assert_eq!(bestfit.force_free(10..90).unwrap(), false); |
| |
| let mut bestfit = BestFit::default(); |
| bestfit.free(0..40).unwrap(); |
| bestfit.free(60..100).unwrap(); |
| assert_eq!(bestfit.force_free(10..90).unwrap(), true); |
| assert_eq!(bestfit.allocate(100), Ok(0..100)); |
| |
| let mut bestfit = BestFit::default(); |
| bestfit.free(0..40).unwrap(); |
| bestfit.free(60..100).unwrap(); |
| assert_eq!(bestfit.force_free(10..100).unwrap(), true); |
| assert_eq!(bestfit.allocate(100), Ok(0..100)); |
| |
| let mut bestfit = BestFit::default(); |
| bestfit.free(10..40).unwrap(); |
| bestfit.free(60..100).unwrap(); |
| assert_eq!(bestfit.force_free(0..110).unwrap(), true); |
| assert_eq!(bestfit.allocate(110), Ok(0..110)); |
| |
| let mut bestfit = BestFit::default(); |
| bestfit.free(10..40).unwrap(); |
| bestfit.free(60..100).unwrap(); |
| assert_eq!(bestfit.force_free(10..110).unwrap(), true); |
| assert_eq!(bestfit.allocate(100), Ok(10..110)); |
| |
| let mut bestfit = BestFit::default(); |
| bestfit.free(10..40).unwrap(); |
| bestfit.free(60..100).unwrap(); |
| assert_eq!(bestfit.force_free(0..100).unwrap(), true); |
| assert_eq!(bestfit.allocate(100), Ok(0..100)); |
| |
| let mut bestfit = BestFit::default(); |
| assert_eq!(bestfit.force_free(0..100).unwrap(), true); |
| assert_eq!(bestfit.allocate(100), Ok(0..100)); |
| } |
| } |