// Copyright 2016 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <lib/fit/defer.h>
#include <lib/syslog/cpp/macros.h>
#include <lib/zx/result.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
#include <zircon/assert.h>
#include <zircon/time.h>

#include <algorithm>
#include <memory>

#include <fbl/algorithm.h>
#include <safemath/checked_math.h>

#include "src/storage/lib/vfs/cpp/vfs_types.h"

#ifdef __Fuchsia__
#include <zircon/syscalls.h>

#include <utility>

#include <fbl/auto_lock.h>
#endif

#include "src/storage/minfs/file.h"
#include "src/storage/minfs/minfs_private.h"
#include "src/storage/minfs/unowned_vmo_buffer.h"
#include "src/storage/minfs/vnode.h"
namespace minfs {

File::File(Minfs* fs) : VnodeMinfs(fs) {}

#ifdef __Fuchsia__
// AllocateAndCommitData does the following operations:
// - Allocates data blocks,
// - Frees old data blocks (if existing data was overwritten),
// - Issues data and metadata writes,
// - Updates the inode to reflect the new size and modification time.
// A write, or a fragment of a write, may change the inode's size, block_count, or
// file block table (dnum, inum, dinum).
void File::AllocateAndCommitData(std::unique_ptr<Transaction> transaction) {
// Calculate the maximum number of data blocks we can update within one transaction. This is
// the smaller of half the capacity of the writeback buffer and the number of direct blocks
// needed to touch the maximum allowed number of indirect blocks.
const uint32_t max_direct_blocks =
kMinfsDirect + (kMinfsDirectPerIndirect * Vfs()->Limits().GetMaximumMetaDataBlocks());
const uint32_t max_writeback_blocks = static_cast<blk_t>(Vfs()->WritebackCapacity() / 2);
const uint32_t max_blocks = std::min(max_direct_blocks, max_writeback_blocks);
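// As a rough illustration (actual numbers depend on the filesystem configuration): with a
// writeback buffer of 2048 blocks and an indirect-block limit that permits 4096 direct
// blocks, max_blocks would be min(4096, 2048 / 2) = 1024.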
fbl::Array<blk_t> allocated_blocks(new blk_t[max_blocks], max_blocks);
// Iterate through all relative block ranges and acquire absolute blocks for each of them.
while (true) {
blk_t expected_blocks = allocation_state_.GetTotalPending();
ZX_ASSERT_MSG(expected_blocks <= max_blocks, "Pending blocks:%u exceed max blocks:%u",
expected_blocks, max_blocks);
if (expected_blocks == 0) {
if (GetInode()->size != allocation_state_.GetNodeSize()) {
GetMutableInode()->size = allocation_state_.GetNodeSize();
}
// Since we may have pending reservations from an expected update, reset the allocation
// state. This may happen if the same block range is allocated and de-allocated (e.g.
// written and then truncated) before the state is resolved.
blk_t alloc_node_size = allocation_state_.GetNodeSize();
ZX_ASSERT_MSG(alloc_node_size == GetInode()->size,
"Allocation node size:%u does not match actual node size:%u", alloc_node_size,
GetInode()->size);
allocation_state_.Reset(allocation_state_.GetNodeSize());
ZX_DEBUG_ASSERT(allocation_state_.IsEmpty());
break;
}
blk_t bno_start, bno_count;
ZX_ASSERT_MSG(allocation_state_.GetNextRange(&bno_start, &bno_count) == ZX_OK,
"Failed to get allocation range.");
ZX_ASSERT_MSG(bno_count <= max_blocks, "Block count:%u exceeds max block count:%u", bno_count,
max_blocks);
// Since we reserved enough space ahead of time, this should not fail due to lack of space. It
// can fail if there's an I/O error, but if that happens we will potentially be in an
// inconsistent state and fixing that is not worth the effort; crashing will lead to the same
// end result for the user.
ZX_ASSERT_MSG(
BlocksSwap(transaction.get(), bno_start, bno_count, allocated_blocks.data()).is_ok(),
"Failed to reserve blocks.");
// Enqueue each data block one at a time, as they may not be contiguous on disk.
UnownedVmoBuffer buffer(vmo());
for (blk_t i = 0; i < bno_count; i++) {
storage::Operation operation = {
.type = storage::OperationType::kWrite,
.vmo_offset = bno_start + i,
.dev_offset = allocated_blocks[i] + Vfs()->Info().dat_block,
.length = 1,
};
transaction->EnqueueData(operation, &buffer);
}
// Since we are updating the file in "chunks", only update the on-disk inode size
// with the portion we've written so far.
blk_t last_byte = static_cast<blk_t>((bno_start + bno_count) * Vfs()->BlockSize());
ZX_ASSERT_MSG(last_byte <= fbl::round_up(allocation_state_.GetNodeSize(), Vfs()->BlockSize()),
"Offset:%u to be updated is beyond the allowed node size:%lu", last_byte,
fbl::round_up(allocation_state_.GetNodeSize(), Vfs()->BlockSize()));
if (last_byte > GetInode()->size && last_byte < allocation_state_.GetNodeSize()) {
// If we have written past the end of the recorded size but have not yet reached the
// allocated size, update the recorded size to the last byte written.
GetMutableInode()->size = last_byte;
} else if (allocation_state_.GetNodeSize() <= last_byte) {
// If we have just written to the allocated inode size, update the recorded size
// accordingly.
GetMutableInode()->size = allocation_state_.GetNodeSize();
}
// In the future we could resolve on a per-state (i.e. per-reservation) basis, but since
// swaps are currently only made within a single thread, for now it is okay to resolve
// everything.
transaction->PinVnode(fbl::RefPtr(this));
}
// At this point there should not be any pending allocations. The following block logs any
// that remain and asserts that there are none.
if (allocation_state_.GetTotalPending() != 0) {
FX_LOGS(ERROR) << "Found modified blocks(" << allocation_state_.GetTotalPending()
<< ") after marking them clean";
for (auto modified_blocks = allocation_state_.cbegin();
modified_blocks != allocation_state_.cend(); ++modified_blocks) {
FX_LOGS(ERROR) << " bitoff:" << modified_blocks->bitoff
<< " bitlen:" << modified_blocks->bitlen;
}
ZX_ASSERT_MSG(allocation_state_.GetTotalPending() == 0, "Pending allocations are non-zero:%u",
allocation_state_.GetTotalPending());
}
InodeSync(transaction.get(), DirtyCacheEnabled() ? kMxFsSyncDefault : kMxFsSyncMtime);
Vfs()->CommitTransaction(std::move(transaction));
}
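
// Swaps the reserved (pending) file blocks in [start, start + count) into newly allocated
// on-disk blocks (copy-on-write), records the new block numbers in |bnos|, clears each
// block's pending state, and updates the dirty-byte accounting.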
zx::result<> File::BlocksSwap(Transaction* transaction, blk_t start, blk_t count, blk_t* bnos) {
if (count == 0)
return zx::ok();
VnodeMapper mapper(this);
VnodeIterator iterator;
auto status = iterator.Init(&mapper, transaction, start);
if (status.is_error())
return status.take_error();
while (count > 0) {
const blk_t file_block = static_cast<blk_t>(iterator.file_block());
ZX_DEBUG_ASSERT(allocation_state_.IsPending(file_block));
blk_t old_block = iterator.Blk();
// TODO(https://fxbug.dev/42128794): A value of zero for the block pointer has special meaning:
// the block is sparse or unmapped. We should add a named constant for this magic value and fix
// all places that currently hard-code zero.
if (old_block == 0) {
GetMutableInode()->block_count++;
}
// For copy-on-write, swap the block out if it's a data block.
blk_t new_block = old_block;
Vfs()->BlockSwap(transaction, old_block, &new_block);
status = iterator.SetBlk(new_block);
if (status.is_error())
return status.take_error();
*bnos++ = new_block;
bool cleared = allocation_state_.ClearPending(file_block, old_block != 0);
ZX_DEBUG_ASSERT(cleared);
// We have cleared the pending bit for the block. Update the dirty-block accounting.
Vfs()->InspectTree()->SubtractDirtyBytes(Vfs()->BlockSize());
--count;
status = iterator.Advance();
if (status.is_error())
return status.take_error();
}
return iterator.Flush();
}
#endif
void File::UpdateModificationTime() {
zx_time_t cur_time = GetTimeUTC();
GetMutableInode()->modify_time = cur_time;
}
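
// On Fuchsia, blocks with pending (reserved but not yet committed) allocations are counted in
// addition to the blocks already recorded in the inode.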
blk_t File::GetBlockCount() const {
#ifdef __Fuchsia__
return GetInode()->block_count + allocation_state_.GetNewPending();
#else
return GetInode()->block_count;
#endif
}
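
// On Fuchsia, the size tracked by the allocation state is returned, since it reflects writes
// that have been accepted but not yet committed to the inode; the host build reads the inode
// directly.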
uint64_t File::GetSize() const {
#ifdef __Fuchsia__
return allocation_state_.GetNodeSize();
#endif
return GetInode()->size;
}
void File::SetSize(uint32_t new_size) {
#ifdef __Fuchsia__
allocation_state_.SetNodeSize(new_size);
#else
GetMutableInode()->size = new_size;
#endif
}
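
// On Fuchsia, allocation is deferred: the block is only marked pending here (and the dirty-byte
// accounting updated), and it is swapped into place later by AllocateAndCommitData. On the
// host, a new block is allocated immediately when needed.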
void File::AcquireWritableBlock(Transaction* transaction, blk_t local_bno, blk_t old_bno,
blk_t* out_bno) {
bool using_new_block = (old_bno == 0);
#ifdef __Fuchsia__
allocation_state_.SetPending(local_bno, !using_new_block);
Vfs()->InspectTree()->AddDirtyBytes(Vfs()->BlockSize());
#else
if (using_new_block) {
Vfs()->BlockNew(transaction, out_bno);
GetMutableInode()->block_count++;
} else {
*out_bno = old_bno;
}
#endif
}
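
// |indirect| indicates an indirect (metadata) block; only direct data blocks participate in
// the pending allocation map below.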
void File::DeleteBlock(PendingWork* transaction, blk_t local_bno, blk_t old_bno, bool indirect) {
// If we found a block that was previously allocated, delete it.
if (old_bno != 0) {
transaction->DeallocateBlock(old_bno);
GetMutableInode()->block_count--;
}
#ifdef __Fuchsia__
if (!indirect) {
if (allocation_state_.IsPending(local_bno)) {
Vfs()->InspectTree()->SubtractDirtyBytes(Vfs()->BlockSize());
}
// Remove this block from the pending allocation map, if it is set, so we do not
// proceed to allocate a new block.
allocation_state_.ClearPending(local_bno, old_bno != 0);
}
#endif
}
#ifdef __Fuchsia__
void File::IssueWriteback(Transaction* transaction, blk_t vmo_offset, blk_t dev_offset,
blk_t block_count) {
// This is a no-op. The blocks are swapped later.
}
bool File::HasPendingAllocation(blk_t vmo_offset) {
return allocation_state_.IsPending(vmo_offset);
}
void File::CancelPendingWriteback() {
// Drop all pending writes and revert the inode's size to its "pre-pending-write" value.
allocation_state_.Reset(GetInode()->size);
}
#endif
zx::result<> File::CanUnlink() const { return zx::ok(); }
fuchsia_io::NodeProtocolKinds File::GetProtocols() const {
return fuchsia_io::NodeProtocolKinds::kFile;
}
bool File::ValidateRights(fuchsia_io::Rights rights) const {
// Minfs files can only be opened as readable/writable, not executable.
return !(rights & fuchsia_io::Rights::kExecute);
}
zx_status_t File::Read(void* data, size_t len, size_t off, size_t* out_actual) {
TRACE_DURATION("minfs", "File::Read", "ino", GetIno(), "len", len, "off", off);
FX_LOGS(DEBUG) << "minfs_read() vn=" << this << "(#" << GetIno() << ") len=" << len
<< " off=" << off;
return Vfs()->GetNodeOperations()->read.Track([&] {
Transaction transaction(Vfs());
return ReadInternal(&transaction, data, len, off, out_actual).status_value();
});
}
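
// Returns the number of blocks that must be reserved to write |length| bytes at |offset|,
// accounting for dirty-cache bookkeeping when the dirty cache is enabled.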
zx::result<uint32_t> File::GetRequiredBlockCount(size_t offset, size_t length) {
zx::result<blk_t> uncached = ::minfs::GetRequiredBlockCount(offset, length, Vfs()->BlockSize());
if (!DirtyCacheEnabled()) {
return uncached;
}
if (uncached.is_error()) {
return uncached;
}
return GetRequiredBlockCountForDirtyCache(offset, length, uncached.value());
}
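
// Flushes cached writes if ShouldFlush decides that the pending state or the requested
// operation warrants it; otherwise returns success without doing any work.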
zx::result<> File::CheckAndFlush(bool is_truncate, size_t length, size_t offset) {
auto status = ShouldFlush(is_truncate, length, offset);
if (status.is_error()) {
return status.take_error();
}
if (!status.value()) {
return zx::ok();
}
return FlushCachedWrites();
}
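
// Returns a transaction with |reserve_blocks| blocks reserved, either by continuing the
// cached transaction (when the dirty cache is enabled) or by beginning a new one. If
// continuing fails, pending writes are force-flushed before the error is returned.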
zx::result<std::unique_ptr<Transaction>> File::GetTransaction(uint32_t reserve_blocks) {
std::unique_ptr<Transaction> transaction;
std::unique_ptr<CachedBlockTransaction> cached_transaction;
{
std::lock_guard lock(mutex_);
cached_transaction = std::move(cached_transaction_);
if (!DirtyCacheEnabled()) {
ZX_ASSERT_MSG(cached_transaction == nullptr,
"Found a cached transaction while the dirty cache is disabled.");
}
}
if (!DirtyCacheEnabled() || cached_transaction == nullptr) {
auto transaction_or = Vfs()->BeginTransaction(0, reserve_blocks);
if (transaction_or.is_error()) {
return transaction_or.take_error();
}
transaction = std::move(transaction_or.value());
} else {
auto status =
Vfs()->ContinueTransaction(reserve_blocks, std::move(cached_transaction), &transaction);
if (status.is_error()) {
// Failure here means that we ran out of space. Force flush pending writes
// and return the failure.
if (transaction != nullptr) {
[[maybe_unused]] auto error =
FlushTransaction(std::move(transaction), /*force_flush=*/true).status_value();
}
return status.take_error();
}
}
return zx::ok(std::move(transaction));
}
zx_status_t File::Write(const void* data, size_t len, size_t offset, size_t* out_actual) {
TRACE_DURATION("minfs", "File::Write", "ino", GetIno(), "len", len, "off", offset);
FX_LOGS(DEBUG) << "minfs_write() vn=" << this << "(#" << GetIno() << ") len=" << len
<< " off=" << offset;
return Vfs()->GetNodeOperations()->write.Track([&] {
*out_actual = 0;
if (len == 0) {
return ZX_OK;
}
auto new_size_or = safemath::CheckAdd(offset, len);
if (!new_size_or.IsValid() || new_size_or.ValueOrDie() > kMinfsMaxFileSize) {
return ZX_ERR_FILE_BIG;
}
// If this file's pending blocks have crossed a limit or if there are no free blocks in the
// filesystem, try to flush before we proceed.
if (zx::result status = CheckAndFlush(false, len, offset); status.is_error()) {
return status.error_value();
}
// Calculate maximum number of blocks to reserve for this write operation.
zx::result<uint32_t> reserve_blocks_or = GetRequiredBlockCount(offset, len);
if (reserve_blocks_or.is_error()) {
return reserve_blocks_or.error_value();
}
auto transaction_or = GetTransaction(reserve_blocks_or.value());
if (transaction_or.is_error()) {
return transaction_or.error_value();
}
std::unique_ptr<Transaction> transaction = std::move(transaction_or.value());
// We mark blocks as having pending writes only after we have enough blocks reserved through
// BeginTransaction or ContinueTransaction.
if (DirtyCacheEnabled()) {
if (auto status = MarkRequiredBlocksPending(offset, len, *transaction); status.is_error()) {
return status.error_value();
}
}
if (auto status = WriteInternal(transaction.get(), static_cast<const uint8_t*>(data), len,
offset, out_actual);
status.is_error()) {
return status.error_value();
}
if (*out_actual == 0) {
return ZX_OK;
}
// If anything was written, enqueue operations allocated within WriteInternal.
UpdateModificationTime();
return FlushTransaction(std::move(transaction)).status_value();
});
}
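
// Appends by writing at the current end of the file; |out_end| reports the file size after
// the write completes.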
zx_status_t File::Append(const void* data, size_t len, size_t* out_end, size_t* out_actual) {
return Vfs()->GetNodeOperations()->append.Track([&] {
zx_status_t status = Write(data, len, GetSize(), out_actual);
*out_end = GetSize();
return status;
});
}
zx_status_t File::Truncate(size_t len) {
TRACE_DURATION("minfs", "File::Truncate");
return Vfs()->GetNodeOperations()->truncate.Track([&] {
if (len > kMinfsMaxFileSize) {
return ZX_ERR_INVALID_ARGS;
}
// TODO(unknown): The following can be optimized:
// - do not flush the part of the file that will be truncated.
// - conditionally flush the unaffected part if necessary.
if (auto status = FlushCachedWrites(); status.is_error()) {
return status.error_value();
}
// Due to file copy-on-write, up to 1 new (data) block may be required.
size_t reserve_blocks = 1;
auto transaction_or = Vfs()->BeginTransaction(0, reserve_blocks);
if (transaction_or.is_error()) {
return transaction_or.error_value();
}
if (auto status = TruncateInternal(transaction_or.value().get(), len); status.is_error()) {
return status.status_value();
}
// Force sync the inode to persistent storage: although our data blocks will be allocated
// later, the act of truncating may have allocated indirect blocks.
//
// Ensure our inode is consistent with that metadata.
UpdateModificationTime();
auto result = FlushTransaction(std::move(transaction_or.value()), true);
ZX_ASSERT_MSG(result.is_ok(), "Failed to force sync inode: %d", result.status_value());
return ZX_OK;
});
}
} // namespace minfs