blob: 9cf64c430710150dd13a1f1b6942e6db8b2942cb [file] [log] [blame]
// Copyright 2020 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/storage/minfs/vnode_mapper.h"
#include "src/storage/minfs/lazy_buffer.h"
#include "src/storage/minfs/minfs_private.h"
#include "src/storage/minfs/vnode.h"
namespace minfs {
// A Minfs file looks like:
//
// +-----------------------|-------------------------]------------------------+
// | direct blocks | indirect blocks | double indirect blocks |
// +-----------------------|-------------------------|------------------------+
// |<--kMinfsDirect (16)-->^<-------- [1] ---------->^<-------- [2] --------->^
// | | |
// | | |
// kIndirectFileStartBlock | |
// kDoubleIndirectFileStartBlock |
// kMaxBlocks
//
// [1]: kMinfsIndirect (31) * kDirectPerIndirect (2048) = 63488
// [2]: kMinfsDoubleIndirect (1) * kDirectPerIndirect (2048) * kDirectPerIndirect (2048) = 4194304
//
// The pointers to those blocks are arranged like this:
//
// inode.dnum inode.inum inode.dinum
// | | |
// v v v
// Data blocks Indirect blocks Double indirect blocks
// | (a) | (b)
// v v
// Data blocks Indirect blocks
// | (c)
// v
// Data blocks
//
// We support up to three *levels* of indirection.
//
// The pointers that aren't stored in an inode are stored in blocks that we need to allocate, and
// are referred to as indirect blocks. These blocks store 2048 block pointers. These blocks of block
// pointers are cached and stored in a VMO backed buffer, also known as the virtual indirect file,
// and it has the following layout:
//
// +------------------------+----------------------------+--------------------------+
// | indirect-leaf (a) | double-indirect (b) | double-indirect-leaf (c) |
// +------------------------+----------------------------+--------------------------+
// |<-kMinfsIndirect (31)-->|<-kMinfsDoublyIndirect (1)->|<--------- [1] ---------->|
//
// [1]: kMinfsDoublyIndirect (1) * kMinfsDirectPerIndirect (2048)
namespace {
// These constants are the offsets in terms of block pointers to (b) and (c) above, respectively.
constexpr uint64_t kDoubleIndirectViewStart{static_cast<uint64_t>(kMinfsIndirect) *
kMinfsDirectPerIndirect};
constexpr uint64_t kDoubleIndirectLeafViewStart{kDoubleIndirectViewStart +
static_cast<uint64_t>(kMinfsDoublyIndirect) *
kMinfsDirectPerIndirect};
// Scans |array| for a contiguous range of blocks of at most |max_blocks|.
uint64_t Coalesce(const blk_t* array, uint64_t max_blocks) {
uint64_t count = 1;
if (*array == 0) {
// A sparse range.
while (count < max_blocks && array[count] == 0) {
++count;
}
} else {
while (count < max_blocks && array[count] == array[count - 1] + 1) {
++count;
}
}
return count;
}
// Converts blk_t to DeviceBlock. blk_t represents zeroed/sparse/unmapped blocks differently and are
// relative to |dat_block| in the super-block.
DeviceBlock ToDeviceBlock(VnodeMinfs* vnode, blk_t block) {
return block == 0 ? DeviceBlock::Unmapped() : DeviceBlock(block + vnode->Vfs()->Info().dat_block);
}
// Returns a flusher responsible for flushing updated block pointers in the inode.
BaseBufferView::Flusher GetDirectFlusher(VnodeMinfs* vnode, PendingWork* transaction) {
return [vnode, transaction](BaseBufferView* view) {
vnode->InodeSync(transaction, kMxFsSyncDefault);
return zx::ok();
};
}
// Returns a flusher responsible for flushing updated block pointers in indirect blocks;
BaseBufferView::Flusher GetIndirectFlusher(VnodeMinfs* vnode, LazyBuffer* file,
PendingWork* transaction) {
#ifdef __Fuchsia__
return [vnode, file, transaction](BaseBufferView* view) {
VnodeIndirectMapper mapper(vnode);
return file->Flush(
transaction, &mapper, view,
[transaction](ResizeableBufferType* buffer, BlockRange range, DeviceBlock device_block) {
transaction->EnqueueMetadata(
storage::Operation{
.type = storage::OperationType::kWrite,
.vmo_offset = range.Start(),
.dev_offset = device_block.block(),
.length = range.Length(),
},
buffer);
return zx::ok();
});
};
#else
// TODO(https://fxbug.dev/42124749): For the time being, host side code must immediately write data to the
// device, but we should be able to make this code the same as above if storage::BufferedOperation
// changes a bit (so that a pointer to the buffer is kept rather than a pointer to the address in
// memory).
return [vnode, file, transaction](BaseBufferView* view) {
VnodeIndirectMapper mapper(vnode);
auto status = file->Flush(
transaction, &mapper, view,
[vnode](ResizeableBufferType* buffer, BlockRange range, DeviceBlock device_block) {
return EnumerateBlocks(
range, [vnode, buffer, range, device_block](BlockRange r) -> zx::result<uint64_t> {
auto status = vnode->Vfs()->GetMutableBcache()->Writeblk(
static_cast<blk_t>(device_block.block() + (r.Start() - range.Start())),
buffer->Data(r.Start()));
if (status.is_ok()) {
return zx::ok(1);
} else {
return status.take_error();
}
});
});
return status;
};
#endif
}
// -- View Getters --
//
// These functions are helpers that set up BufferView objects for ranges of block pointers.
// The dnum block pointers.
zx::result<BufferView<blk_t>> GetInodeDirectView(PendingWork* transaction, VnodeMinfs* vnode,
BlockPointerRange range) {
ZX_ASSERT(range.End() <= kMinfsDirect);
return zx::ok(BufferView<blk_t>(BufferPtr::FromMemory(&vnode->GetMutableInode()->dnum),
range.Start(), range.End() - range.Start(),
transaction ? GetDirectFlusher(vnode, transaction) : nullptr));
}
// The inum block pointers.
zx::result<BufferView<blk_t>> GetInodeIndirectView(PendingWork* transaction, VnodeMinfs* vnode,
BlockPointerRange range) {
ZX_ASSERT(range.End() <= kMinfsIndirect);
return zx::ok(BufferView<blk_t>(BufferPtr::FromMemory(&vnode->GetMutableInode()->inum),
range.Start(), range.End() - range.Start(),
transaction ? GetDirectFlusher(vnode, transaction) : nullptr));
}
// The dinum block pointers.
zx::result<BufferView<blk_t>> GetInodeDoubleIndirectView(PendingWork* transaction,
VnodeMinfs* vnode,
BlockPointerRange range) {
ZX_ASSERT(range.End() <= kMinfsDoublyIndirect);
return zx::ok(BufferView<blk_t>(BufferPtr::FromMemory(&vnode->GetMutableInode()->dinum),
range.Start(), range.End() - range.Start(),
transaction ? GetDirectFlusher(vnode, transaction) : nullptr));
}
// The pointers in the indirect file. See diagram above to understand how these are laid out.
zx::result<BufferView<blk_t>> GetViewForIndirectFile(PendingWork* transaction, VnodeMinfs* vnode,
BlockPointerRange range) {
zx::result<LazyBuffer*> file = vnode->GetIndirectFile();
if (file.is_error())
return file.take_error();
VnodeIndirectMapper mapper(vnode);
LazyBuffer::Reader reader(vnode->Vfs()->GetMutableBcache(), &mapper, file.value());
return file.value()->GetView<blk_t>(
range.Start(), range.End() - range.Start(), &reader,
transaction ? GetIndirectFlusher(vnode, file.value(), transaction) : nullptr);
}
} // namespace
// -- VnodeIndirectMapper --
zx::result<DeviceBlockRange> VnodeIndirectMapper::Map(BlockRange range) {
return MapForWrite(/*transaction=*/nullptr, range, /*allocated=*/nullptr);
}
zx::result<BufferView<blk_t>> VnodeIndirectMapper::GetView(PendingWork* transaction,
BlockRange range) {
constexpr uint64_t kDoubleIndirectLeafStart{kMinfsIndirect + kMinfsDoublyIndirect};
constexpr uint64_t kMax{kDoubleIndirectLeafStart +
static_cast<uint64_t>(kMinfsDoublyIndirect) * kMinfsDirectPerIndirect};
if (range.Start() < kMinfsIndirect) {
// Region (a) -- Pointers are to be found in the inum field in the inode.
return GetInodeIndirectView(
transaction, &vnode_,
BlockPointerRange(range.Start(),
std::min(range.End(), static_cast<uint64_t>(kMinfsIndirect))));
} else if (range.Start() < kDoubleIndirectLeafStart) {
// Region (b) -- Pointers are to be found in the dinum field in the inode.
return GetInodeDoubleIndirectView(
transaction, &vnode_,
BlockPointerRange(range.Start() - kMinfsIndirect,
std::min(range.End(), kDoubleIndirectLeafStart) - kMinfsIndirect));
} else if (range.Start() < kMax) {
// Region (c) -- Pointers are to be found in region (b) of the virtual indirect file.
return GetViewForIndirectFile(
transaction, &vnode_,
BlockPointerRange(
range.Start() - kDoubleIndirectLeafStart + kDoubleIndirectViewStart,
std::min(range.End(), kMax) - kDoubleIndirectLeafStart + kDoubleIndirectViewStart));
} else {
return zx::error(ZX_ERR_OUT_OF_RANGE);
}
}
zx::result<DeviceBlockRange> VnodeIndirectMapper::MapForWrite(PendingWork* transaction,
BlockRange range, bool* allocated) {
BufferView<blk_t> view;
if (auto view_or = GetView(transaction, range); view_or.is_error()) {
return view_or.take_error();
} else {
view = std::move(view_or).value();
}
// If |allocated| is non-null, allocate an indirect block if necessary.
uint64_t block_count;
if (*view == 0 && allocated) {
block_count = 1;
blk_t new_block = 0;
vnode_.AllocateIndirect(transaction, &new_block);
ZX_ASSERT(new_block != 0);
view.mut_ref() = new_block;
*allocated = true;
auto status = view.Flush();
if (status.is_error())
return status.take_error();
} else {
if (allocated)
*allocated = false;
block_count = Coalesce(view.data(), view.count());
}
return zx::ok(DeviceBlockRange(ToDeviceBlock(&vnode_, *view), block_count));
}
// -- VnodeMapper --
zx::result<std::pair<blk_t, uint64_t>> VnodeMapper::MapToBlk(BlockRange range) {
VnodeIterator iterator;
auto status = iterator.Init(this, nullptr, range.Start());
if (status.is_error())
return status.take_error();
return zx::ok(std::make_pair(iterator.Blk(), iterator.GetContiguousBlockCount(range.Length())));
}
zx::result<DeviceBlockRange> VnodeMapper::Map(BlockRange range) {
zx::result<std::pair<blk_t, uint64_t>> status = MapToBlk(range);
if (status.is_error())
return status.take_error();
return zx::ok(
DeviceBlockRange(ToDeviceBlock(&vnode_, status.value().first), status.value().second));
}
// -- VnodeIterator --
zx::result<> VnodeIterator::Init(VnodeMapper* mapper, PendingWork* transaction,
uint64_t file_block) {
mapper_ = mapper;
transaction_ = transaction;
file_block_ = file_block;
contiguous_block_count_ = 0;
// The file block determines the number of levels of views that we need, and the view-getters
// that we need to use.
if (file_block < VnodeMapper::kIndirectFileStartBlock) {
// We only need the dnum pointers.
level_count_ = 1;
auto status =
InitializeLevel(0, BlockPointerRange(0, kMinfsDirect), file_block, &GetInodeDirectView);
if (status.is_error())
return status;
} else if (file_block < VnodeMapper::kDoubleIndirectFileStartBlock) {
// We need the inum pointers and the blocks they point to.
level_count_ = 2;
uint64_t relative_block = file_block - VnodeMapper::kIndirectFileStartBlock;
auto status = InitializeLevel(1, BlockPointerRange(0, kMinfsIndirect),
relative_block / kMinfsDirectPerIndirect, &GetInodeIndirectView);
if (status.is_error())
return status;
status = InitializeIndirectLevel(0, relative_block);
if (status.is_error())
return status;
} else if (file_block < VnodeMapper::kMaxBlocks) {
// We need the dinum pointers and two more levels.
level_count_ = 3;
uint64_t relative_block = file_block - VnodeMapper::kDoubleIndirectFileStartBlock;
auto status =
InitializeLevel(2, BlockPointerRange(0, kMinfsDoublyIndirect),
relative_block / kMinfsDirectPerIndirect / kMinfsDirectPerIndirect,
&GetInodeDoubleIndirectView);
if (status.is_error())
return status;
status = InitializeIndirectLevel(
1, relative_block / kMinfsDirectPerIndirect + kDoubleIndirectViewStart);
if (status.is_error())
return status;
status = InitializeIndirectLevel(0, relative_block + kDoubleIndirectLeafViewStart);
if (status.is_error())
return status;
} else if (file_block == VnodeMapper::kMaxBlocks) {
// Allow the iterator to point at the end.
level_count_ = 0;
} else {
return zx::error(ZX_ERR_OUT_OF_RANGE);
}
return zx::ok();
}
// Here |range| and |block| are blocks relative to the base of the pointers we are looking at, which
// could be the dnum, inum, dinum pointers or the pointers in the virtual indirect file. |block|
// must be contained within |range|.
zx::result<> VnodeIterator::InitializeLevel(int level, BlockPointerRange range, uint64_t block,
ViewGetter view_getter) {
ZX_ASSERT(block >= range.Start() && block < range.End());
levels_[level].range = range;
levels_[level].count = range.End() - range.Start();
levels_[level].index = block - range.Start();
// If the parent level is sparse, delay getting the view until we need to.
if (level + 1 < level_count_ && levels_[level + 1].blk() == 0) {
levels_[level].view = {};
levels_[level].view_getter = std::move(view_getter);
} else {
zx::result<BufferView<blk_t>> view = view_getter(transaction_, &mapper_->vnode(), range);
if (view.is_error())
return view.take_error();
levels_[level].view = std::move(view).value();
levels_[level].view_getter = nullptr;
}
return zx::ok();
}
// Convenience function for levels that point to the virtual indirect file. Here |relative_block| is
// the pointer offset within the virtual indirect file.
zx::result<> VnodeIterator::InitializeIndirectLevel(int level, uint64_t relative_block) {
uint64_t first_block = fbl::round_down(relative_block, kMinfsDirectPerIndirect);
return InitializeLevel(level,
BlockPointerRange(first_block, first_block + kMinfsDirectPerIndirect),
relative_block, &GetViewForIndirectFile);
}
zx::result<> VnodeIterator::SetBlk(Level* level, blk_t block) {
ZX_ASSERT(level_count_ > 0);
// If this level is sparse, try and get a view for it now.
if (level->IsSparse()) {
if (block == 0)
return zx::ok();
zx::result<BufferView<blk_t>> view =
level->view_getter(transaction_, &mapper_->vnode(), level->range);
if (view.is_error())
return view.take_error();
level->view = std::move(view).value();
}
level->view.mut_ref(level->index) = block;
return zx::ok();
}
uint64_t VnodeIterator::GetContiguousBlockCount(uint64_t max_blocks) const {
if (level_count_ == 0)
return 0;
if (contiguous_block_count_ == 0)
contiguous_block_count_ = ComputeContiguousBlockCount();
return std::min(contiguous_block_count_, max_blocks);
}
uint64_t VnodeIterator::ComputeContiguousBlockCount() const {
// For efficiency reasons, handle sparse ranges differently. This is so we can quickly scan the
// (typically) unallocated/sparse blocks from the end of the file.
if (Blk() == 0) {
// The number of blocks we have found so far.
uint64_t count = 0;
// The number of blocks a block pointer represents at the current level.
uint64_t multiplier = 1;
// The index into the view for the current level.
uint64_t index = levels_[0].index;
int level = 0;
// N.B. When we truncate blocks, we rely on the fact that we only go *up* the tree here, *not*
// down. To further explain, consider the case where the inode points to an indirect block, but
// the indirect block doesn't happen to have any blocks allocated. We could, in theory, coalesce
// those blocks and just say it's all sparse, but if we did that, we wouldn't free the indirect
// block. Instead, we'll coalesce as many blocks as we can at the lowest level, then move up a
// level and coalesce all the blocks at that level, but we'll stop as soon as we find an
// allocated block, even though that indirect block might not point to any allocated blocks.
for (;;) {
const uint64_t left = levels_[level].count - index;
if (left == 0 || levels_[level].IsSparse()) {
count += left * multiplier;
} else if (levels_[level].view[index] == 0) {
uint64_t contiguous = Coalesce(&levels_[level].view[index], left);
count += contiguous * multiplier;
if (contiguous < left)
return count;
} else {
// We've come to a block that isn't sparse.
return count;
}
if (++level >= level_count_)
return count;
multiplier *= kMinfsDirectPerIndirect;
index = levels_[level].index + 1;
}
} else {
return Coalesce(&levels_[0].view[levels_[0].index], levels_[0].remaining());
}
}
zx::result<> VnodeIterator::FlushLevel(int level) {
if (!transaction_)
return zx::ok();
if (level + 1 < level_count_) {
const blk_t parent_block = levels_[level + 1].blk();
// If this block is now empty and we have a parent, deallocate rather than writing block. As an
// optimisation, we quickly check that item current pointed at is zero before doing a full check
// of the whole block.
if (parent_block != 0 && levels_[level].blk() == 0) {
ZX_ASSERT(levels_[level].view.count() == kMinfsDirectPerIndirect);
bool empty = true;
for (unsigned i = 0; i < kMinfsDirectPerIndirect; ++i) {
if (levels_[level].view[i] != 0) {
empty = false;
break;
}
}
if (empty) {
// Delete the block and update the parent.
mapper_->vnode().DeleteBlock(transaction_, 0, parent_block, /*indirect=*/true);
[[maybe_unused]] auto status = SetBlk(&levels_[level + 1], 0);
levels_[level].view.set_dirty(false);
return zx::ok();
}
}
// If there are modifications and the parent doesn't have a block, allocate it now. This isn't
// strictly necessary because VnodeIndirectMapper will allocate if it needs to. However, it will
// immediately flush whereas if we do it here, we can delay the flush as there might be more
// changes to make later.
if (levels_[level].view.dirty() && parent_block == 0) {
blk_t new_block;
mapper_->vnode().AllocateIndirect(transaction_, &new_block);
ZX_ASSERT(new_block != 0);
[[maybe_unused]] auto status = SetBlk(&levels_[level + 1], new_block);
}
}
return levels_[level].view.Flush();
}
zx::result<> VnodeIterator::Flush() {
if (!transaction_)
return zx::ok(); // Iterator is read-only.
for (int level = 0; level < level_count_; ++level) {
auto status = FlushLevel(level);
if (status.is_error())
return status;
}
return zx::ok();
}
zx::result<> VnodeIterator::Advance(const uint64_t advance) {
if (level_count_ == 0) {
return advance == 0 ? zx::ok() : zx::make_result(ZX_ERR_BAD_STATE);
}
// Short circuit for the common case.
if (advance < levels_[0].remaining()) {
levels_[0].index += advance;
file_block_ += advance;
if (advance >= contiguous_block_count_) {
contiguous_block_count_ = 0;
} else {
contiguous_block_count_ -= advance;
}
return zx::ok();
}
// Get a new iterator for the new file block.
VnodeIterator iterator;
auto status = iterator.Init(mapper_, transaction_, file_block_ + advance);
if (status.is_error())
return status;
// Now see which of the old views need flushing.
if (iterator.level_count_ != level_count_) {
// The level count is different so we flush all of the levels.
status = Flush();
if (status.is_error())
return status;
} else {
// If the two iterators have the same view for a level, just move the view. This prevents us
// from over-flushing.
for (int level = 0; level < level_count_; ++level) {
if (levels_[level].range == iterator.levels_[level].range) {
// The ranges are the same and because the level count is the same, we know that the view
// must point to the same thing, so we can just move the views over and defer flushing this
// level.
iterator.levels_[level].view = std::move(levels_[level].view);
} else {
status = FlushLevel(level);
if (status.is_error())
return status;
}
}
}
*this = std::move(iterator);
return zx::ok();
}
} // namespace minfs