// Copyright 2016 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file describes the in-memory structures which construct
// a MinFS filesystem.

#ifndef ZIRCON_SYSTEM_ULIB_MINFS_MINFS_PRIVATE_H_
#define ZIRCON_SYSTEM_ULIB_MINFS_MINFS_PRIVATE_H_

#include <inttypes.h>

#include <memory>
#include <utility>

#ifdef __Fuchsia__
#include <fuchsia/io/llcpp/fidl.h>
#include <fuchsia/minfs/llcpp/fidl.h>
#include <lib/fzl/resizeable-vmo-mapper.h>
#include <lib/sync/completion.h>
#include <lib/zx/vmo.h>

#include <fs/journal/journal.h>
#include <fs/managed_vfs.h>
#include <fs/remote.h>
#include <fs/watcher.h>
#include <minfs/metrics.h>
#endif

#include <lib/zircon-internal/fnv1hash.h>

#include <fbl/algorithm.h>
#include <fbl/function.h>
#include <fbl/intrusive_hash_table.h>
#include <fbl/intrusive_single_list.h>
#include <fbl/macros.h>
#include <fbl/ref_ptr.h>
#include <fs/inspectable.h>
#include <fs/locking.h>
#include <fs/ticker.h>
#include <fs/trace.h>
#include <fs/transaction/transaction_handler.h>
#include <fs/vfs.h>
#include <fs/vnode.h>
#include <minfs/format.h>
#include <minfs/minfs.h>
#include <minfs/superblock.h>
#include <minfs/transaction_limits.h>
#include <minfs/writeback.h>

#ifdef __Fuchsia__
#include "vnode_allocation.h"
#endif

#include "allocator/allocator.h"
#include "allocator/inode_manager.h"
#include "vnode.h"

constexpr uint32_t kExtentCount = 6;

// A compile-time debug check, which, if enabled, causes
// inline functions to be expanded to error checking code.
// Since this may be expensive, it is typically turned
// off, except for debugging.
// #define MINFS_PARANOID_MODE

namespace minfs {
#ifdef __Fuchsia__
// Validate that |vmo| is large enough to access block |blk|,
// relative to the start of the vmo.
inline void ValidateVmoSize(zx_handle_t vmo, blk_t blk) {
  uint64_t size;
  size_t min = (blk + 1) * kMinfsBlockSize;
  ZX_ASSERT(zx_vmo_get_size(vmo, &size) == ZX_OK);
  ZX_ASSERT_MSG(size >= min, "VMO size %" PRIu64 " too small for access at block %u\n", size, blk);
}

using MountState = llcpp::fuchsia::minfs::MountState;

#endif  // __Fuchsia__

// SyncVnode flags
constexpr uint32_t kMxFsSyncDefault = 0;  // default: no implicit time update
constexpr uint32_t kMxFsSyncMtime = (1 << 0);
constexpr uint32_t kMxFsSyncCtime = (1 << 1);

constexpr uint32_t kMinfsBlockCacheSize = 64;

// Used by fsck
class MinfsChecker;
class VnodeMinfs;

using SyncCallback = fs::Vnode::SyncCallback;

#ifndef __Fuchsia__

// Store start block + length for all extents. These may differ from info block for
// sparse files.
class BlockOffsets {
 public:
  BlockOffsets(const Bcache& bc, const SuperblockManager& sb);

  blk_t IbmStartBlock() const { return ibm_start_block_; }
  blk_t IbmBlockCount() const { return ibm_block_count_; }

  blk_t AbmStartBlock() const { return abm_start_block_; }
  blk_t AbmBlockCount() const { return abm_block_count_; }

  blk_t InoStartBlock() const { return ino_start_block_; }
  blk_t InoBlockCount() const { return ino_block_count_; }

  blk_t IntegrityStartBlock() const { return integrity_start_block_; }
  blk_t IntegrityBlockCount() const { return integrity_block_count_; }

  blk_t JournalStartBlock() const { return integrity_start_block_ + kBackupSuperblockBlocks; }

  blk_t DatStartBlock() const { return dat_start_block_; }
  blk_t DatBlockCount() const { return dat_block_count_; }

 private:
  blk_t ibm_start_block_;
  blk_t ibm_block_count_;

  blk_t abm_start_block_;
  blk_t abm_block_count_;

  blk_t ino_start_block_;
  blk_t ino_block_count_;

  blk_t integrity_start_block_;
  blk_t integrity_block_count_;

  blk_t dat_start_block_;
  blk_t dat_block_count_;
};
#endif

class TransactionalFs {
 public:
  virtual ~TransactionalFs() = default;

#ifdef __Fuchsia__
  virtual fbl::Mutex* GetLock() const = 0;

  virtual void EnqueueCallback(SyncCallback callback) = 0;
#endif

  // Begin a transaction with |reserve_inodes| inodes and |reserve_blocks| blocks reserved.
  virtual zx_status_t BeginTransaction(size_t reserve_inodes, size_t reserve_blocks,
                                       std::unique_ptr<Transaction>* transaction_out) = 0;

  // Enqueues a metadata transaction by persisting its contents to disk.
  virtual void CommitTransaction(std::unique_ptr<Transaction> transaction) = 0;

  virtual Bcache* GetMutableBcache() = 0;

  virtual Allocator& GetBlockAllocator() = 0;
  virtual Allocator& GetInodeAllocator() = 0;
};

class InspectableMinfs : public fs::Inspectable {
 public:
  virtual ~InspectableMinfs() {}

  // Returns an immutable reference to the superblock.
  virtual const Superblock& Info() const = 0;

  // Gets an immutable reference to the InodeManager.
  virtual const InspectableInodeManager* GetInodeManager() const = 0;

  // Gets an immutable reference to the block_allocator.
  virtual const Allocator& GetBlockAllocator() const = 0;

#ifndef __Fuchsia__
  // Gets an immutable copy of offsets_.
  virtual const BlockOffsets GetBlockOffsets() const = 0;
#endif
};

class Minfs :
#ifdef __Fuchsia__
    public fs::ManagedVfs,
#else
    public fs::Vfs,
#endif
    public fbl::RefCounted<Minfs>,
    public TransactionalFs,
    public InspectableMinfs {
 public:
  DISALLOW_COPY_ASSIGN_AND_MOVE(Minfs);

  ~Minfs();

  // Destroys a "minfs" object, but take back ownership of the bcache object.
  static std::unique_ptr<Bcache> Destroy(std::unique_ptr<Minfs> minfs);

  static zx_status_t Create(std::unique_ptr<Bcache> bc, const MountOptions& options,
                            std::unique_ptr<Minfs>* out);

#ifdef __Fuchsia__
  // Initializes the Minfs journal and writeback queue and resolves any pending disk state (e.g.,
  // resolving unlinked nodes and existing journal entries).
  zx_status_t InitializeJournal(fs::JournalSuperblock journal_superblock);

  // Initializes the Minfs writeback queue and resolves any pending disk state (e.g., resolving
  // unlinked nodes and existing journal entries). Does not enable the journal.
  zx_status_t InitializeUnjournalledWriteback();

  // Queries the superblock flags for FVM as well as underlying FVM, if it exists.
  zx_status_t FVMQuery(fuchsia_hardware_block_volume_VolumeInfo* info) const;
#endif

  // instantiate a vnode from an inode
  // the inode must exist in the file system
  zx_status_t VnodeGet(fbl::RefPtr<VnodeMinfs>* out, ino_t ino);

  // instantiate a vnode with a new inode
  zx_status_t VnodeNew(Transaction* transaction, fbl::RefPtr<VnodeMinfs>* out, uint32_t type);

  // Insert, lookup, and remove vnode from hash map.
  void VnodeInsert(VnodeMinfs* vn) FS_TA_EXCLUDES(hash_lock_);
  fbl::RefPtr<VnodeMinfs> VnodeLookup(uint32_t ino) FS_TA_EXCLUDES(hash_lock_);
  void VnodeRelease(VnodeMinfs* vn) FS_TA_EXCLUDES(hash_lock_);

  // Allocate a new data block.
  void BlockNew(PendingWork* transaction, blk_t* out_bno);

  // Set/Unset the flags.
  void UpdateFlags(PendingWork* transaction, uint32_t flags, bool set);

  // Mark |in_bno| for de-allocation (if it is > 0), and return a new block |*out_bno|.
  // The swap will not be persisted until the transaction is commited.
  void BlockSwap(Transaction* transaction, blk_t in_bno, blk_t* out_bno);

  // Free ino in inode bitmap, release all blocks held by inode.
  zx_status_t InoFree(Transaction* transaction, VnodeMinfs* vn);

  // Mark |vn| to be unlinked.
  void AddUnlinked(PendingWork* transaction, VnodeMinfs* vn);

  // Remove |vn| from the list of unlinked vnodes.
  void RemoveUnlinked(PendingWork* transaction, VnodeMinfs* vn);

  // Free resources of all vnodes marked unlinked.
  zx_status_t PurgeUnlinked();

  // Writes back an inode into the inode table on persistent storage.
  // Does not modify inode bitmap.
  void InodeUpdate(PendingWork* transaction, ino_t ino, const Inode* inode) {
    inodes_->Update(transaction, ino, inode);
  }

  // Reads an inode from the inode table into memory.
  void InodeLoad(ino_t ino, Inode* out) const { inodes_->Load(ino, out); }

  void ValidateBno(blk_t bno) const {
    ZX_DEBUG_ASSERT(bno != 0);
    ZX_DEBUG_ASSERT(bno < Info().block_count);
  }

  zx_status_t BeginTransaction(size_t reserve_inodes, size_t reserve_blocks,
                               std::unique_ptr<Transaction>* transaction) final
      __WARN_UNUSED_RESULT;

#ifdef __Fuchsia__
  void EnqueueCallback(SyncCallback callback) final;
#endif

  void EnqueueAllocation(std::unique_ptr<PendingWork> transaction);

  // Complete a transaction by enqueueing its WritebackWork to the WritebackQueue.
  void CommitTransaction(std::unique_ptr<Transaction> transaction) final;

  void MaybeFsckAtEndOfTransaction(zx_status_t status);

#ifdef __Fuchsia__
  // Returns the capacity of the writeback buffer, in blocks.
  size_t WritebackCapacity() const {
    // Hardcoded to 10 MB; may be replaced by a more device-specific option
    // in the future.
    return 10 * (1 << 20) / kMinfsBlockSize;
  }

  void SetUnmountCallback(fbl::Closure closure) { on_unmount_ = std::move(closure); }
  void Shutdown(fs::Vfs::ShutdownCallback cb) final;

  // Returns a unique identifier for this instance.
  uint64_t GetFsId() const { return fs_id_; }

  // Signals the completion object as soon as...
  // (1) A sync probe has entered and exited the writeback queue, and
  // (2) The block cache has sync'd with the underlying block device.
  void Sync(SyncCallback closure);
#endif

  // The following methods are used to read one block from the specified extent,
  // from relative block |bno|.
  // |data| is an out parameter that must be a block in size, provided by the caller
  // These functions are single-block and synchronous. On Fuchsia, using the batched read
  // functions is preferred.
  zx_status_t ReadDat(blk_t bno, void* data);

  void SetMetrics(bool enable) {
#ifdef __Fuchsia__
    metrics_.SetEnable(enable);
#endif
  }
  fs::Ticker StartTicker() {
#ifdef __Fuchsia__
    return fs::Ticker(metrics_.Enabled());
#endif
    return fs::Ticker(true);
  }

  // Update aggregate information about VMO initialization.
  void UpdateInitMetrics(uint32_t dnum_count, uint32_t inum_count, uint32_t dinum_count,
                         uint64_t user_data_size, const fs::Duration& duration);
  // Update aggregate information about looking up vnodes by name.
  void UpdateLookupMetrics(bool success, const fs::Duration& duration);
  // Update aggregate information about looking up vnodes by inode.
  void UpdateOpenMetrics(bool cache_hit, const fs::Duration& duration);
  // Update aggregate information about inode creation.
  void UpdateCreateMetrics(bool success, const fs::Duration& duration);
  // Update aggregate information about reading from Vnodes.
  void UpdateReadMetrics(uint64_t size, const fs::Duration& duration);
  // Update aggregate information about writing to Vnodes.
  void UpdateWriteMetrics(uint64_t size, const fs::Duration& duration);
  // Update aggregate information about truncating Vnodes.
  void UpdateTruncateMetrics(const fs::Duration& duration);
  // Update aggregate information about unlinking Vnodes.
  void UpdateUnlinkMetrics(bool success, const fs::Duration& duration);
  // Update aggregate information about renaming Vnodes.
  void UpdateRenameMetrics(bool success, const fs::Duration& duration);

#ifdef __Fuchsia__
  // Acquire a copy of the collected metrics.
  zx_status_t GetMetrics(::llcpp::fuchsia::minfs::Metrics* out) const {
    if (metrics_.Enabled()) {
      metrics_.CopyToFidl(out);
      return ZX_OK;
    }
    return ZX_ERR_UNAVAILABLE;
  }

  // Record the location, size, and number of all non-free block regions.
  fbl::Vector<BlockRegion> GetAllocatedRegions() const;

  // Returns the current state of mounted filesystem.
  // "state" is intentionally losely defined to allow
  // adding more information in the near future.
  MountState GetMountState() const { return mount_state_; }
#endif

  // InspectableFilesystem interface.
  const Superblock& Info() const final { return sb_->Info(); }

  const InspectableInodeManager* GetInodeManager() const final { return inodes_.get(); }

  const Allocator& GetBlockAllocator() const final { return *block_allocator_; }

#ifndef __Fuchsia__
  const BlockOffsets GetBlockOffsets() const final { return offsets_; }
#endif

  zx_status_t ReadBlock(blk_t start_block_num, void* data) const final;

  const TransactionLimits& Limits() const { return limits_; }

#ifdef __Fuchsia__
  fbl::Mutex* GetLock() const final { return &txn_lock_; }

  // Terminates all writeback queues, and flushes pending operations to the underlying device.
  //
  // If |!IsReadonly()|, also sets the dirty bit to a "clean" status.
  void StopWriteback();
#endif

  Bcache* GetMutableBcache() final { return bc_.get(); }

  // TODO(rvargas): Make private.
  std::unique_ptr<Bcache> bc_;

  Allocator& GetBlockAllocator() final { return *block_allocator_; }
  Allocator& GetInodeAllocator() final { return inodes_->inode_allocator(); }

  const MountOptions& mount_options() { return mount_options_; }

 private:
  using HashTable = fbl::HashTable<ino_t, VnodeMinfs*>;

#ifdef __Fuchsia__
  Minfs(std::unique_ptr<Bcache> bc, std::unique_ptr<SuperblockManager> sb,
        std::unique_ptr<Allocator> block_allocator, std::unique_ptr<InodeManager> inodes,
        uint64_t fs_id, const MountOptions& mount_options);
#else
  Minfs(std::unique_ptr<Bcache> bc, std::unique_ptr<SuperblockManager> sb,
        std::unique_ptr<Allocator> block_allocator, std::unique_ptr<InodeManager> inodes,
        BlockOffsets offsets, const MountOptions& mount_options);
#endif

  // Internal version of VnodeLookup which may also return unlinked vnodes.
  fbl::RefPtr<VnodeMinfs> VnodeLookupInternal(uint32_t ino) FS_TA_EXCLUDES(hash_lock_);

  // Check if filesystem is readonly.
  bool IsReadonly() FS_TA_EXCLUDES(vfs_lock_);

  // Find a free inode, allocate it in the inode bitmap, and write it back to disk
  void InoNew(Transaction* transaction, const Inode* inode, ino_t* out_ino);

  // Find an unallocated and unreserved block in the block bitmap starting from block |start|
  zx_status_t FindBlock(size_t start, size_t* blkno_out);

  // Creates an unique identifier for this instance. This is to be called only during
  // "construction".
  static zx_status_t CreateFsId(uint64_t* out);

  // Reads blocks from disk. Only to be called during "construction".
  static zx_status_t ReadInitialBlocks(const Superblock& info, std::unique_ptr<Bcache> bc,
                                       std::unique_ptr<SuperblockManager> sb,
                                       const MountOptions& mount_options,
                                       std::unique_ptr<Minfs>* out_minfs);

  // Updates the clean bit and oldest revision in the super block.
  zx_status_t UpdateCleanBitAndOldestRevision(bool is_clean);

#ifndef __Fuchsia__
  zx_status_t ReadBlk(blk_t bno, blk_t start, blk_t soft_max, blk_t hard_max, void* data);
#endif

  // Global information about the filesystem.
  // While Allocator is thread-safe, it is recommended that a valid Transaction object be held
  // while any metadata fields are modified until the time they are enqueued for writeback. This
  // is to avoid modifications from other threads potentially jeopardizing the metadata integrity
  // before it is safely persisted to disk.
  std::unique_ptr<SuperblockManager> sb_;
  std::unique_ptr<Allocator> block_allocator_;
  std::unique_ptr<InodeManager> inodes_;

#ifdef __Fuchsia__
  mutable fbl::Mutex txn_lock_;  // Lock required to start a new Transaction.
  fbl::Mutex hash_lock_;         // Lock required to access the vnode_hash_.
#endif
  // Vnodes exist in the hash table as long as one or more reference exists;
  // when the Vnode is deleted, it is immediately removed from the map.
  HashTable vnode_hash_ FS_TA_GUARDED(hash_lock_){};

#ifdef __Fuchsia__
  fbl::Closure on_unmount_{};
  MinfsMetrics metrics_ = {};
  std::unique_ptr<fs::Journal> journal_;
  uint64_t fs_id_ = 0;
  // TODO(fxb/51057): Git rid of MountState.
  MountState mount_state_ = {};
#else
  // Store start block + length for all extents. These may differ from info block for
  // sparse files.
  BlockOffsets offsets_;
#endif

  TransactionLimits limits_;
  MountOptions mount_options_;
};

#ifdef __Fuchsia__
// Create and register a VMO for writes.
zx_status_t CreateAndRegisterVmo(block_client::BlockDevice* device, zx::vmo* out_vmo, size_t blocks,
                                 storage::Vmoid* out_vmoid);

// Writes |bytes| bytes of |data| to disk at block |block_num|. |bytes| must not exceed
// kMinfsBlockSize. If |bytes| < kMinfsBlockSize, kMinfsBlockSize bytes will still be
// written to disk with the remaining |kMinfsBlockSize - bytes| bytes set to 0.
zx_status_t WriteDataToDisk(fs::TransactionHandler* transaction_handler,
                            block_client::BlockDevice* device, void* data, size_t bytes,
                            blk_t block_num);

// Reads |bytes| bytes of |data| from disk at block |block_num|. |bytes| must not exceed
// kMinfsBlockSize.
zx_status_t ReadDataFromDisk(fs::TransactionHandler* transaction_handler,
                             block_client::BlockDevice* device, void* data, size_t bytes,
                             blk_t block_num);
#endif

#ifdef __Fuchsia__
// Replay the minfs journal, given the sizes provided within the superblock.
zx_status_t ReplayJournal(Bcache* bc, const Superblock& info, fs::JournalSuperblock* out);
#endif

// Return the required vmo size (in blocks) to store doubly indirect blocks in vmo_indirect_
// TODO(43519).
constexpr uint32_t GetVmoBlocksForDoublyIndirect() { return kMinfsIndirect + kMinfsDoublyIndirect; }
static_assert(GetVmoBlocksForDoublyIndirect() == kMinfsIndirect + kMinfsDoublyIndirect,
              "Vnode block map changed");

// Return the required vmo size (in bytes) to store doubly indirect blocks in vmo_indirect_
constexpr size_t GetVmoSizeForDoublyIndirect() {
  return GetVmoBlocksForDoublyIndirect() * kMinfsBlockSize;
}
static_assert(GetVmoSizeForDoublyIndirect() ==
                  (kMinfsIndirect + kMinfsDoublyIndirect) * kMinfsBlockSize,
              "Vnode layout changed");

// Return the block offset in vmo_indirect_ of indirect blocks pointed to by the doubly indirect
// block at dindex
constexpr uint32_t GetVmoOffsetForIndirect(uint32_t dibindex) {
  return GetVmoBlocksForDoublyIndirect() + (dibindex * kMinfsDirectPerIndirect);
}

// Return the required vmo size (in bytes) to store indirect blocks pointed to by doubly indirect
// block dibindex
constexpr size_t GetVmoSizeForIndirect(uint32_t dibindex) {
  // See comments for VnodeMinfs::vmo_indirect_.
  size_t size = GetVmoOffsetForIndirect(dibindex + 1);
  return size * kMinfsBlockSize;
}

// Return the block offset of doubly indirect blocks in vmo_indirect_
constexpr uint32_t GetVmoOffsetForDoublyIndirect(uint32_t dibindex) {
  ZX_DEBUG_ASSERT(dibindex < kMinfsDoublyIndirect);
  return kMinfsIndirect + dibindex;
}

// write the inode data of this vnode to disk (default does not update time values)
void SyncVnode(fbl::RefPtr<VnodeMinfs> vn, uint32_t flags);
void DumpInfo(const Superblock* info);
void DumpInode(const Inode* inode, ino_t ino);
zx_time_t GetTimeUTC();
void InitializeDirectory(void* bdata, ino_t ino_self, ino_t ino_parent);

// Given an input bcache, initialize the filesystem and return a reference to the
// root node.
zx_status_t Mount(std::unique_ptr<minfs::Bcache> bc, const MountOptions& options,
                  fbl::RefPtr<VnodeMinfs>* root_out);
}  // namespace minfs

#endif  // ZIRCON_SYSTEM_ULIB_MINFS_MINFS_PRIVATE_H_
