blob: 7662ea948060c96876513345c363edcd3d4a906b [file] [log] [blame]
// Copyright 2016 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file describes the in-memory structures which construct
// a MinFS filesystem.
#include <inttypes.h>
#include <memory>
#include <utility>
#ifdef __Fuchsia__
#include <fidl/>
#include <lib/fzl/resizeable-vmo-mapper.h>
#include <lib/inspect/cpp/inspect.h>
#include <lib/sync/completion.h>
#include <lib/zx/result.h>
#include <lib/zx/time.h>
#include <lib/zx/vmo.h>
#include <zircon/compiler.h>
#include "src/lib/storage/vfs/cpp/journal/journal.h"
#include "src/lib/storage/vfs/cpp/managed_vfs.h"
#include "src/lib/storage/vfs/cpp/watcher.h"
#include "src/storage/minfs/minfs_inspect_tree.h"
#include <lib/fit/function.h>
#include <fbl/algorithm.h>
#include <fbl/intrusive_hash_table.h>
#include <fbl/intrusive_single_list.h>
#include <fbl/macros.h>
#include "src/lib/storage/vfs/cpp/inspect/node_operations.h"
#include "src/lib/storage/vfs/cpp/journal/inspector_journal.h"
#include "src/lib/storage/vfs/cpp/transaction/transaction_handler.h"
#include "src/lib/storage/vfs/cpp/vfs.h"
#include "src/lib/storage/vfs/cpp/vnode.h"
#include "src/storage/minfs/format.h"
#include "src/storage/minfs/minfs.h"
#include "src/storage/minfs/superblock.h"
#include "src/storage/minfs/transaction_limits.h"
#include "src/storage/minfs/writeback.h"
#ifdef __Fuchsia__
#include "src/storage/minfs/vnode_allocation.h"
#include "src/storage/minfs/allocator/allocator.h"
#include "src/storage/minfs/allocator/inode_manager.h"
#include "src/storage/minfs/bcache.h"
#include "src/storage/minfs/vnode.h"
constexpr uint32_t kExtentCount = 6;
namespace minfs {
#ifdef __Fuchsia__
// How frequently we synchronize the journal. Without this, the journal will only get flushed when
// there is no room for a new transaction, or it is explicitly asked to by some other mechanism.
constexpr zx::duration kJournalBackgroundSyncTime = zx::sec(30);
#endif // __Fuchsia__
// A async_dispatcher_t* is needed for some functions on Fuchsia only. In order to avoid ifdefs on
// every call that is compiled for host Fuchsia and Host, we define this as a nullptr_t type when
// compiling on host where callers should pass null and it's ignored.
// Prefer async_dispatcher_t* for Fuchsia-specific functions since it makes the intent more clear.
#ifdef __Fuchsia__
using FuchsiaDispatcher = async_dispatcher_t*;
using FuchsiaDispatcher = std::nullptr_t;
#endif // __Fuchsia__
// A reference to the vfs is needed for some functions. The specific vfs type is different between
// Fuchsia and Host, so define a PlatformVfs which represents the one for our current platform to
// avoid ifdefs on every call.
// Prefer using the appropriate vfs when the function is only used for one or the other.
#ifdef __Fuchsia__
using PlatformVfs = fs::ManagedVfs;
using PlatformVfs = fs::Vfs;
#endif // __Fuchsia__
// SyncVnode flags
constexpr uint32_t kMxFsSyncDefault = 0; // default: no implicit time update
constexpr uint32_t kMxFsSyncMtime = (1 << 0);
constexpr uint32_t kMxFsSyncCtime = (1 << 1);
constexpr uint32_t kMinfsBlockCacheSize = 64;
// Used by fsck
class VnodeMinfs;
using SyncCallback = fs::Vnode::SyncCallback;
#ifndef __Fuchsia__
// Store start block + length for all extents. These may differ from info block for
// sparse files.
class BlockOffsets {
BlockOffsets(const Bcache& bc, const SuperblockManager& sb);
blk_t IbmStartBlock() const { return ibm_start_block_; }
blk_t IbmBlockCount() const { return ibm_block_count_; }
blk_t AbmStartBlock() const { return abm_start_block_; }
blk_t AbmBlockCount() const { return abm_block_count_; }
blk_t InoStartBlock() const { return ino_start_block_; }
blk_t InoBlockCount() const { return ino_block_count_; }
blk_t IntegrityStartBlock() const { return integrity_start_block_; }
blk_t IntegrityBlockCount() const { return integrity_block_count_; }
blk_t JournalStartBlock() const { return integrity_start_block_ + kBackupSuperblockBlocks; }
blk_t DatStartBlock() const { return dat_start_block_; }
blk_t DatBlockCount() const { return dat_block_count_; }
blk_t ibm_start_block_;
blk_t ibm_block_count_;
blk_t abm_start_block_;
blk_t abm_block_count_;
blk_t ino_start_block_;
blk_t ino_block_count_;
blk_t integrity_start_block_;
blk_t integrity_block_count_;
blk_t dat_start_block_;
blk_t dat_block_count_;
class TransactionalFs {
virtual ~TransactionalFs() = default;
#ifdef __Fuchsia__
virtual fbl::Mutex* GetLock() const = 0;
virtual void EnqueueCallback(SyncCallback callback) = 0;
// Begin a transaction with |reserve_inodes| inodes and |reserve_blocks| blocks reserved.
[[nodiscard]] virtual zx::result<std::unique_ptr<Transaction>> BeginTransaction(
size_t reserve_inodes, size_t reserve_blocks) = 0;
// Enqueues a metadata transaction by persisting its contents to disk.
virtual void CommitTransaction(std::unique_ptr<Transaction> transaction) = 0;
virtual Bcache* GetMutableBcache() = 0;
virtual Allocator& GetBlockAllocator() = 0;
virtual Allocator& GetInodeAllocator() = 0;
// Marked final for cleaning up async thread references in destructor.
class Minfs final : public TransactionalFs {
// Not copyable or movable
Minfs(const Minfs&) = delete;
Minfs& operator=(const Minfs&) = delete;
Minfs(Minfs&&) = delete;
Minfs& operator=(Minfs&&) = delete;
~Minfs() override;
// Destroys a "minfs" object, but take back ownership of the bcache object.
static std::unique_ptr<Bcache> Destroy(std::unique_ptr<Minfs> minfs);
// Terminates Minfs and joins all background threads. This will not necessarily guarantee all
// all data has been flushed. Call Sync first if that matters.
void Terminate();
[[nodiscard]] static zx::result<std::unique_ptr<Minfs>> Create(FuchsiaDispatcher dispatcher,
std::unique_ptr<Bcache> bc,
const MountOptions& options,
PlatformVfs* vfs);
#ifdef __Fuchsia__
// Initializes the Minfs journal and writeback queue and resolves any pending disk state (e.g.,
// resolving unlinked nodes and existing journal entries).
[[nodiscard]] zx::result<> InitializeJournal(fs::JournalSuperblock journal_superblock);
// Initializes the Minfs writeback queue and resolves any pending disk state (e.g., resolving
// unlinked nodes and existing journal entries). Does not enable the journal.
[[nodiscard]] zx_status_t InitializeUnjournalledWriteback();
// instantiate a vnode from an inode
// the inode must exist in the file system
[[nodiscard]] zx::result<fbl::RefPtr<VnodeMinfs>> VnodeGet(ino_t ino);
// instantiate a vnode with a new inode
[[nodiscard]] zx::result<fbl::RefPtr<VnodeMinfs>> VnodeNew(Transaction* transaction,
uint32_t type);
// Insert, lookup, and remove vnode from hash map.
void VnodeInsert(VnodeMinfs* vn) __TA_EXCLUDES(hash_lock_);
fbl::RefPtr<VnodeMinfs> VnodeLookup(uint32_t ino) __TA_EXCLUDES(hash_lock_);
void VnodeRelease(VnodeMinfs* vn) __TA_EXCLUDES(hash_lock_);
// Opens the root inode. This is a special-case of VnodeGet for filesystem bootstrapping.
zx::result<fbl::RefPtr<VnodeMinfs>> OpenRootNode();
// Allocate a new data block.
void BlockNew(PendingWork* transaction, blk_t* out_bno) const;
// Set/Unset the flags.
void UpdateFlags(PendingWork* transaction, uint32_t flags, bool set);
// Mark |in_bno| for de-allocation (if it is > 0), and return a new block |*out_bno|.
// The swap will not be persisted until the transaction is commited.
void BlockSwap(Transaction* transaction, blk_t in_bno, blk_t* out_bno);
// Free ino in inode bitmap, release all blocks held by inode.
[[nodiscard]] zx::result<> InoFree(Transaction* transaction, VnodeMinfs* vn);
// Mark |vn| to be unlinked.
void AddUnlinked(PendingWork* transaction, VnodeMinfs* vn);
// Remove |vn| from the list of unlinked vnodes.
void RemoveUnlinked(PendingWork* transaction, VnodeMinfs* vn);
// Free resources of all vnodes marked unlinked.
[[nodiscard]] zx::result<> PurgeUnlinked();
// Writes back an inode into the inode table on persistent storage.
// Does not modify inode bitmap.
void InodeUpdate(PendingWork* transaction, ino_t ino, const Inode* inode) {
inodes_->Update(transaction, ino, inode);
// Reads an inode from the inode table into memory.
void InodeLoad(ino_t ino, Inode* out) const { inodes_->Load(ino, out); }
void ValidateBno(blk_t bno) const {
ZX_DEBUG_ASSERT(bno != 0);
ZX_DEBUG_ASSERT(bno < Info().block_count);
[[nodiscard]] zx::result<std::unique_ptr<Transaction>> BeginTransaction(
size_t reserve_inodes, size_t reserve_blocks) final;
// Converts a cached transaction into a Transaction. Extends block reservation
// by |reserve_blocks|.
// On failure to reserve blocks, returns error but |out| will have a transaction
// that was converted.
[[nodiscard]] zx::result<> ContinueTransaction(
size_t reserve_blocks, std::unique_ptr<CachedBlockTransaction> cached_transaction,
std::unique_ptr<Transaction>* out);
#ifdef __Fuchsia__
void EnqueueCallback(SyncCallback callback) final;
[[nodiscard]] bool IsJournalErrored();
void EnqueueAllocation(std::unique_ptr<PendingWork> transaction);
// Complete a transaction by enqueueing its WritebackWork to the WritebackQueue.
void CommitTransaction(std::unique_ptr<Transaction> transaction) final;
// Runs fsck at the end of a transaction, just after metadata has been written. Used for testing
// to be sure that all transactions leave the file system in a good state.
void FsckAtEndOfTransaction();
#ifdef __Fuchsia__
// Returns the capacity of the writeback buffer, in blocks.
size_t WritebackCapacity() const {
// Hardcoded to 10 MB; may be replaced by a more device-specific option
// in the future.
return 10 * (1 << 20) / kMinfsBlockSize;
zx::result<fs::FilesystemInfo> GetFilesystemInfo();
// Signals the completion object as soon as the journal has finished synchronizing.
void Sync(SyncCallback closure = {});
// The following methods are used to read one block from the specified extent,
// from relative block |bno|.
// |data| is an out parameter that must be a block in size, provided by the caller
// These functions are single-block and synchronous. On Fuchsia, using the batched read
// functions is preferred.
[[nodiscard]] zx::result<> ReadDat(blk_t bno, void* data);
#ifdef __Fuchsia__
// Return true if the all outstanding block reservations are backed by persistent storage
// (i.e. the volume has already been extended to cover all reservations).
// Requires a Transaction to ensure the transaction lock is held.
[[nodiscard]] bool AllReservationsBacked(const Transaction&) const;
// Record the location, size, and number of all non-free block regions.
fbl::Vector<BlockRegion> GetAllocatedRegions() const;
// Mutable pointer to the MinfsInspectTree this object uses for metrics. Guaranteed non-nullptr.
MinfsInspectTree* InspectTree() { return &inspect_tree_; }
fs_inspect::NodeOperations* GetNodeOperations() { return inspect_tree_.GetNodeOperations(); }
static fs_inspect::NodeOperations* GetNodeOperations() {
static fs_inspect::NodeOperations stub;
return &stub;
// Returns an immutable reference to the superblock.
const Superblock& Info() const { return sb_->Info(); }
uint64_t BlockSize() const {
// Either intentionally or unintentionally, we do not want to change block
// size to anything other than kMinfsBlockSize yet. This is because changing
// block size might lead to format change and also because anything other
// than 8k is not well tested. So assert when we find block size other
// than 8k.
ZX_ASSERT(Info().BlockSize() == kMinfsBlockSize);
return Info().BlockSize();
// Gets an immutable reference to the InodeManager.
const InspectableInodeManager* GetInodeManager() const { return inodes_.get(); }
// Gets an immutable reference to the block_allocator.
const Allocator& GetBlockAllocator() const { return *block_allocator_; }
// Returns number of blocks available.
size_t BlocksAvailable() const { return GetBlockAllocator().GetAvailable(); }
// Returns number of reserved blocks but are yet to be allocated.
// This helps to determine if we should fail incoming writes because we will
// run out of space.
size_t BlocksReserved() const { return GetBlockAllocator().GetReserved(); }
#ifndef __Fuchsia__
// Gets an immutable copy of offsets_.
BlockOffsets GetBlockOffsets() const { return offsets_; }
// Used by the disk inspector.
zx_status_t ReadBlock(blk_t start_block_num, void* out_data) const;
const TransactionLimits& Limits() const { return limits_; }
#ifdef __Fuchsia__
fbl::Mutex* GetLock() const final { return &txn_lock_; }
// Terminates all writeback queues, and flushes pending operations to the underlying device.
// If |!IsReadonly()|, also sets the dirty bit to a "clean" status.
void StopWriteback();
// Issues a sync to the journal's background thread and waits for it to complete.
zx::result<> BlockingJournalSync();
Bcache* GetMutableBcache() final { return bc_.get(); }
// TODO(rvargas): Make private.
std::unique_ptr<Bcache> bc_;
Allocator& GetBlockAllocator() final { return *block_allocator_; }
Allocator& GetInodeAllocator() final { return inodes_->inode_allocator(); }
const MountOptions& mount_options() { return mount_options_; }
PlatformVfs* vfs() { return vfs_; }
using HashTable = fbl::HashTable<ino_t, VnodeMinfs*>;
#ifdef __Fuchsia__
Minfs(async_dispatcher_t* dispatcher, std::unique_ptr<Bcache> bc,
std::unique_ptr<SuperblockManager> sb, std::unique_ptr<Allocator> block_allocator,
std::unique_ptr<InodeManager> inodes, const MountOptions& mount_options,
fs::ManagedVfs* vfs);
Minfs(std::unique_ptr<Bcache> bc, std::unique_ptr<SuperblockManager> sb,
std::unique_ptr<Allocator> block_allocator, std::unique_ptr<InodeManager> inodes,
BlockOffsets offsets, const MountOptions& mount_options, fs::Vfs* vfs);
// Internal version of VnodeLookup which may also return unlinked vnodes.
fbl::RefPtr<VnodeMinfs> VnodeLookupInternal(uint32_t ino) __TA_EXCLUDES(hash_lock_);
// Returns a vector of vnodes having one or more blocks that needs to be
// flushed.
std::vector<fbl::RefPtr<VnodeMinfs>> GetDirtyVnodes();
// Find a free inode, allocate it in the inode bitmap, and write it back to disk
void InoNew(Transaction* transaction, const Inode* inode, ino_t* out_ino);
// Find an unallocated and unreserved block in the block bitmap starting from block |start|
[[nodiscard]] zx_status_t FindBlock(size_t start, size_t* blkno_out);
// Reads blocks from disk. Only to be called during "construction".
static zx::result<std::pair<std::unique_ptr<Allocator>, std::unique_ptr<InodeManager>>>
ReadInitialBlocks(const Superblock& info, Bcache& bc, SuperblockManager& superblock_manager,
const MountOptions& mount_options);
// Updates the clean bit and oldest revision in the super block.
[[nodiscard]] zx::result<> UpdateCleanBitAndOldestRevision(bool is_clean);
#ifndef __Fuchsia__
[[nodiscard]] zx::result<> ReadBlk(blk_t bno, blk_t start, blk_t soft_max, blk_t hard_max,
void* data) const;
// Global information about the filesystem.
// While Allocator is thread-safe, it is recommended that a valid Transaction object be held
// while any metadata fields are modified until the time they are enqueued for writeback. This
// is to avoid modifications from other threads potentially jeopardizing the metadata integrity
// before it is safely persisted to disk.
std::unique_ptr<SuperblockManager> sb_;
std::unique_ptr<Allocator> block_allocator_;
std::unique_ptr<InodeManager> inodes_;
#ifdef __Fuchsia__
mutable fbl::Mutex txn_lock_; // Lock required to start a new Transaction.
mutable fbl::Mutex hash_lock_; // Lock required to access the vnode_hash_.
// Vnodes exist in the hash table as long as one or more reference exists;
// when the Vnode is deleted, it is immediately removed from the map.
HashTable vnode_hash_ __TA_GUARDED(hash_lock_){};
#ifdef __Fuchsia__
std::unique_ptr<fs::Journal> journal_;
// This event's koid is used as a unique identifier for this filesystem instance.
zx::event fs_id_;
// TODO( Stop accessing outside `dispatcher_` thread.
async::TaskClosure journal_sync_task_;
MinfsInspectTree inspect_tree_;
void InitializeInspectTree();
// Store start block + length for all extents. These may differ from info block for
// sparse files.
BlockOffsets offsets_;
TransactionLimits limits_;
MountOptions mount_options_;
#ifdef __Fuchsia__
async_dispatcher_t* dispatcher_;
PlatformVfs* vfs_;
#ifdef __Fuchsia__
// Replay the minfs journal, given the sizes provided within the superblock.
[[nodiscard]] zx::result<fs::JournalSuperblock> ReplayJournal(Bcache* bc, const Superblock& info);
// write the inode data of this vnode to disk (default does not update time values)
void SyncVnode(fbl::RefPtr<VnodeMinfs> vn, uint32_t flags);
void DumpInfo(const Superblock& info);
void DumpInode(const Inode* inode, ino_t ino);
zx_time_t GetTimeUTC();
void InitializeDirectory(void* bdata, ino_t ino_self, ino_t ino_parent);
} // namespace minfs