// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef SRC_DEVICES_SYSMEM_DRIVERS_SYSMEM_CONTIGUOUS_POOLED_MEMORY_ALLOCATOR_H_
#define SRC_DEVICES_SYSMEM_DRIVERS_SYSMEM_CONTIGUOUS_POOLED_MEMORY_ALLOCATOR_H_

#include <lib/async/wait.h>
#include <lib/inspect/cpp/inspect.h>
#include <lib/zx/bti.h>
#include <lib/zx/event.h>
#include <zircon/errors.h>
#include <zircon/limits.h>

#include <map>
#include <optional>
#include <string>
#include <vector>

#include <fbl/algorithm.h>
#include <fbl/vector.h>
#include <region-alloc/region-alloc.h>

#include "allocator.h"
#include "protected_ranges.h"
#include "utils.h"

namespace sysmem_driver {

// A MemoryAllocator that hands out sub-ranges of a single pool VMO (contiguous_vmo_), tracking the
// sub-ranges with a RegionAllocator (region_allocator_).
//
// Besides allocation, this class declares support for:
//   * guard regions and guard patterns in unused pages, checked periodically via async tasks;
//   * protected ranges (see protected_ranges_ and RangesControl);
//   * inspect properties describing pool usage, failures, and loanable-page metrics.
class ContiguousPooledMemoryAllocator : public MemoryAllocator {
 public:
  ContiguousPooledMemoryAllocator(Owner* parent_device, const char* allocation_name,
                                  inspect::Node* parent_node, fuchsia_sysmem2::Heap heap,
                                  uint64_t size, bool is_always_cpu_accessible,
                                  bool is_ever_cpu_accessible, bool is_ready, bool can_be_torn_down,
                                  async_dispatcher_t* dispatcher);

  ~ContiguousPooledMemoryAllocator();

  // Alignment gets rounded up to system page alignment, so any low number will default to system
  // page alignment.
  zx_status_t Init(uint32_t alignment_log2 = 0);

  // Initializes the guard regions. Must be called after Init. If internal_guard_regions is not
  // set, there will only be guard regions at the beginning and end of the buffer.
  void InitGuardRegion(size_t guard_region_size, bool unused_pages_guarded,
                       int64_t unused_guard_pattern_period_bytes,
                       zx::duration unused_page_check_cycle_period, bool internal_guard_regions,
                       bool crash_on_guard_failure, async_dispatcher_t* dispatcher);
  void FillUnusedRangeWithGuard(uint64_t start_offset, uint64_t size);

  // If is_cpu_accessible_, called after InitGuardRegion() (if any), but during the same dispatcher
  // call-out, before returning to the dispatcher, because is_ready_ is already true.
  //
  // If !is_cpu_accessible_, called during set_ready().
  //
  // NOTE(review): no member named is_cpu_accessible_ is declared in this class; this presumably
  // refers to is_always_cpu_accessible_ or is_ever_cpu_accessible_ - confirm against the
  // implementation.
  void SetupUnusedPages();

  // This uses a physical VMO as the parent VMO.  This is used for VDEC as we learn the physical
  // range of VDEC from the TEE.
  zx_status_t InitPhysical(zx_paddr_t paddr);

  zx_status_t Allocate(uint64_t size, const fuchsia_sysmem2::SingleBufferSettings& settings,
                       std::optional<std::string> name, uint64_t buffer_collection_id,
                       uint32_t buffer_index, zx::vmo* parent_vmo) override;

  void Delete(zx::vmo parent_vmo) override;

  // True only when there are no tracked regions and the allocator is either allowed to be torn
  // down or was never marked ready.
  bool is_empty() override {
    // If the contiguous VMO has been marked as secure there's no way to unmark it as secure, so
    // unbinding would never be safe.
    return regions_.empty() && (can_be_torn_down_ || !is_ready_);
  }

  // Writes phys_start_ to *base and size_ to *size, and always returns ZX_OK.  Both out-params are
  // dereferenced unconditionally, so neither may be null.
  zx_status_t GetPhysicalMemoryInfo(uint64_t* base, uint64_t* size) override {
    *base = phys_start_;
    *size = size_;
    return ZX_OK;
  }

  void set_ready() override;
  bool is_ready() override;

  // Test-only access to the pool VMO.
  const zx::vmo& GetPoolVmoForTest() const { return contiguous_vmo_; }
  // Gets the offset of a VMO from the beginning of a pool.
  uint64_t GetVmoRegionOffsetForTest(const zx::vmo& vmo);

  uint64_t failed_guard_region_checks() const { return failed_guard_region_checks_; }

  bool is_already_cleared_on_allocate() override;

  // When this is set from unit tests only, we skip any operation that's only allowed on contiguous
  // VMOs, since we don't have a real contiguous VMO, since a fake BTI can't be used to create one.
  // This ends up limiting the fidelity of the unit tests somewhat; in the long run we probably
  // should plumb a real BTI to the unit tests somehow.
  //
  // Asserts that the allocator is not yet ready.
  void SetBtiFakeForUnitTests() {
    ZX_ASSERT(!is_ready());
    is_bti_fake_ = true;
  }
  bool is_bti_fake() const { return is_bti_fake_; }

  const fuchsia_sysmem2::Heap& heap() const { return heap_; }

  // loanable pages / un-used pages
  //
  // We count pages we choose to pattern instead of loan as loanable, despite not actually loaning
  // those pages.  In other words we don't count patterned pages against efficiency.
  double GetLoanableEfficiency();

  // loanable pages / total pages
  //
  // We count pages we choose to pattern instead of loan as loanable, despite not actually loaning
  // those pages.  In other words we don't count patterned pages against the loaned ratio.
  double GetLoanableRatio();

  // loanable bytes
  //
  // We count pages we choose to pattern instead of loan as loanable, despite not actually loaning
  // those pages.  In other words we don't subtract patterned pages from the loanable bytes.
  uint64_t GetLoanableBytes();

  static constexpr zx::duration kDefaultUnusedPageCheckCyclePeriod = zx::sec(600);

  static constexpr zx::duration kUnusedRecentlyPageCheckPeriod = zx::sec(2);
  static constexpr zx::duration kUnusedRecentlyAgeThreshold = zx::sec(5);

  static constexpr zx::duration kStepTowardOptimalProtectedRangesPeriod = zx::msec(1000);

  // Keep < 1% of pages aside for being unused page guard pattern.  The rest get loaned back to
  // Zircon.
  //
  // This is effectively the default value of contiguous_guard_pages_unused_fraction_denominator.
  static constexpr uint64_t kUnusedGuardPatternPeriodPages = 128;

 private:
  // Per-allocation bookkeeping; stored as the mapped value in regions_.
  struct RegionData {
    std::string name;
    // Value-initialized so a default-constructed RegionData never carries an indeterminate koid.
    zx_koid_t koid{};
    inspect::Node node;
    inspect::UintProperty size_property;
    inspect::UintProperty koid_property;
    RegionAllocator::Region::UPtr ptr;
  };

  // Record of a region that was freed; see deleted_regions_ for how these are tracked and why.
  struct DeletedRegion {
    // Value-initialized so a default-constructed DeletedRegion has a well-defined (empty) range.
    ralloc_region_t region{};
    zx::time when_freed;
    std::string name;
  };

  // protected_ranges::ProtectedRangesControl implementation that holds a back-pointer to the
  // owning allocator.  See protected_ranges_control_ for how it is used.
  class RangesControl : public protected_ranges::ProtectedRangesControl {
   public:
    // explicit: a ContiguousPooledMemoryAllocator* must never silently convert to a RangesControl.
    explicit RangesControl(ContiguousPooledMemoryAllocator* parent) : parent_(parent) {}

    // protected_ranges::ProtectedRangesControl implementation
    bool IsDynamic() override;
    uint64_t MaxRangeCount() override;
    uint64_t GetRangeGranularity() override;
    bool HasModProtectedRange() override;
    void AddProtectedRange(const protected_ranges::Range& range) override;
    void DelProtectedRange(const protected_ranges::Range& range) override;
    void ModProtectedRange(const protected_ranges::Range& old_range,
                           const protected_ranges::Range& new_range) override;
    void ZeroProtectedSubRange(bool is_covering_range_explicit,
                               const protected_ranges::Range& range) override;
    uint64_t GetBase() override;
    uint64_t GetSize() override;
    bool UseRange(const protected_ranges::Range& range) override;
    void UnUseRange(const protected_ranges::Range& range) override;

   private:
    ContiguousPooledMemoryAllocator* parent_{};
  };

  zx_status_t InitCommon(zx::vmo local_contiguous_vmo);
  void TraceObserverCallback(async_dispatcher_t* dispatcher, async::WaitBase* wait,
                             zx_status_t status, const zx_packet_signal_t* signal);

  void CheckGuardPageCallback(async_dispatcher_t* dispatcher, async::TaskBase* task,
                              zx_status_t status);
  void CheckUnusedPagesCallback(async_dispatcher_t* dispatcher, async::TaskBase* task,
                                zx_status_t status);
  void CheckUnusedRecentlyPagesCallback(async_dispatcher_t* dispatcher, async::TaskBase* task,
                                        zx_status_t status);
  void CheckGuardRegion(const char* region_name, size_t region_size, bool pre,
                        uint64_t start_offset);
  void IncrementGuardRegionFailureInspectData();
  void CheckGuardRegionData(const RegionData& region);
  void CheckExternalGuardRegions();
  void CheckAnyUnusedPages(uint64_t start_offset, uint64_t end_offset);
  void CheckUnusedRange(uint64_t offset, uint64_t size, bool and_also_zero);
  void DumpPoolStats();
  void DumpPoolHighWaterMark();
  void TracePoolSize(bool initial_trace);
  uint64_t CalculateLargeContiguousRegionSize();
  void UpdateLoanableMetrics();

  // This method iterates over all the sub-regions of an unused region.  The sub-regions are regions
  // we need to pattern and keep, loan to zircon, or zero.  Any given page that's unused will always
  // (in any given boot) be pattern, loan, or zero, regardless of the alignment of the unused
  // region.  This way we'll know which pages are supposed to be patterned, loaned, or zeroed
  // despite unused regions getting merged/split.
  //
  // Depending on settings, some sub-region types won't exist, so their corresponding callable won't
  // be called.
  //
  // The pattern_func, loan_func, and zero_func take different actions depending on calling context,
  // but generally each func is supposed to handle the pages that are supposed to be patterned,
  // loaned, or zeroed.  For example, write the pattern or check the pattern, loan the page or
  // un-loan the page, zero the page or nop.
  //
  // If a page is protected it'll be skipped, and not processed by any of the passed-in funcs.
  //
  // All the funcs take const ralloc_region_t&.
  template <typename F1, typename F2, typename F3>
  void ForUnusedGuardPatternRanges(const ralloc_region_t& region, F1 pattern_func, F2 loan_func,
                                   F3 zero_func);

  // This handles the unprotected portions of the region passed to ForUnusedGuardPatternRanges().
  template <typename F1, typename F2, typename F3>
  void ForUnusedGuardPatternRangesInternal(const ralloc_region_t& region, F1 pattern_func,
                                           F2 loan_func, F3 zero_func);

  void StashDeletedRegion(const RegionData& region_data);
  DeletedRegion* FindMostRecentDeletedRegion(uint64_t offset);
  // Log DeletedRegion info and fairly detailed diff info for a range that's detected to differ from
  // the pattern that was previously written.
  //
  // TODO(dustingreen): With some refactoring we could have common code for diff reporting, for all
  // of per-reserved-range guard pages, per-allocation guard pages, and unused page guard pages.
  void ReportPatternCheckFailedRange(const ralloc_region_t& failed_range, const char* which_type);

  void OnRegionUnused(const ralloc_region_t& region);
  zx_status_t CommitRegion(const ralloc_region_t& region);

  void EnsureSteppingTowardOptimalProtectedRanges();
  void StepTowardOptimalProtectedRanges(async_dispatcher_t* dispatcher, async::TaskBase* task,
                                        zx_status_t status);

  protected_ranges::ProtectedRangesCoreControl& protected_ranges_core_control(
      const fuchsia_sysmem2::Heap& heap);

  void DumpRanges() const;

  Owner* const parent_device_{};
  async_dispatcher_t* dispatcher_{};
  const char* const allocation_name_{};
  const fuchsia_sysmem2::Heap heap_{};
  const uint64_t counter_id_{};
  char child_name_[ZX_MAX_NAME_LEN] = {};

  uint64_t guard_region_size_ = 0;
  // Holds the default data to be placed into the guard region.
  std::vector<uint8_t> guard_region_data_;
  // Holds a copy of the guard region data that's compared with the real value.
  std::vector<uint8_t> guard_region_copy_;

  bool crash_on_guard_failure_ = false;
  // Internal guard regions are around every allocation, and not just the beginning and end of the
  // contiguous VMO.
  bool has_internal_guard_regions_ = false;

  zx::vmo contiguous_vmo_;
  zx::pmt pool_pmt_;
  RegionAllocator region_allocator_;
  uint64_t allocated_bytes_ = 0;

  // We run protected_ranges_ in the same [0, size_) space as region_allocator_, and convert to
  // physical ranges in protected_ranges_control_ (adding phys_start_).
  std::optional<protected_ranges::ProtectedRanges> protected_ranges_;
  // When allocating/deallocating a buffer, we immediately make the necessary/possible changes via
  // protected_ranges_ to make that buffer space usable/best-effort-reclaimable, but to really
  // optimize the protection ranges we need to spread out the changes in time to avoid churning all
  // the loaned pages at once.  This timer does that.
  async::TaskMethod<ContiguousPooledMemoryAllocator,
                    &ContiguousPooledMemoryAllocator::StepTowardOptimalProtectedRanges>
      step_toward_optimal_protected_ranges_{this};
  // We effectively reset the timer any time there's new allocate/deallocate activity, since that
  // activity is also churn in some sense, so we avoid compounding that churn with optimizing
  // steps until more time has passed, even if the timer had previously been set to go off soon.
  zx::time step_toward_optimal_protected_ranges_min_time_ = zx::time::infinite_past();
  // The bottom edge of protected_ranges_ uses protected_ranges_control_ to effect actual changes.
  // This delegates to ContiguousPooledMemoryAllocator or Device (and then SecureMem) to do the
  // changes.
  std::optional<RangesControl> protected_ranges_control_;

  // From parent_vmo handle to the RegionData tracking that allocation.
  std::map<zx_handle_t, RegionData> regions_;
  zx_paddr_t phys_start_{};
  uint64_t size_{};
  // True if the CPU can always touch these pages.  False if these pages are under a HW protected
  // range at least sometimes.
  bool is_always_cpu_accessible_{};
  // True if the CPU can sometimes touch these pages.  False if these pages are under a HW protected
  // range 100% of the time.
  bool is_ever_cpu_accessible_{};
  // True if the VMO is a normal contiguous VMO.  False if the VMO is a physical VMO, which doesn't
  // support decommit (and we don't need it to, since a physical VMO is only used when
  // !is_ever_cpu_accessible_).
  bool can_decommit_{};
  bool is_ready_{};
  // True if the allocator can be deleted after it's marked ready.
  bool can_be_torn_down_{};
  bool is_setup_unused_pages_called_{};

  uint64_t failed_guard_region_checks_{};

  uint64_t high_water_mark_used_size_{};
  uint64_t max_free_size_at_high_water_mark_{};

  inspect::Node node_;
  inspect::ValueList properties_;
  inspect::UintProperty size_property_;
  inspect::UintProperty high_water_mark_property_;
  inspect::UintProperty used_size_property_;
  inspect::UintProperty allocations_failed_property_;
  inspect::UintProperty last_allocation_failed_timestamp_ns_property_;
  inspect::UintProperty commits_failed_property_;
  inspect::UintProperty last_commit_failed_timestamp_ns_property_;
  // Keeps track of how many allocations would have succeeded but failed due to fragmentation.
  inspect::UintProperty allocations_failed_fragmentation_property_;
  // This is the size of the largest free contiguous region when high_water_mark_property_ was
  // last modified. It can be used to determine how much space was wasted due to fragmentation.
  inspect::UintProperty max_free_at_high_water_property_;
  // size - high_water_mark. This is used for cobalt reporting.
  inspect::UintProperty free_at_high_water_mark_property_;
  inspect::BoolProperty is_ready_property_;
  inspect::UintProperty failed_guard_region_checks_property_;
  inspect::UintProperty last_failed_guard_region_check_timestamp_ns_property_;
  // This tracks the sum of the size of the 10 largest free regions.
  inspect::UintProperty large_contiguous_region_sum_property_;

  // CMM / PCMM properties regarding loaning of pages to Zircon.
  //
  // The minimum efficiency since this instance was created.
  double min_efficiency_ = 1.0;
  inspect::DoubleProperty loanable_efficiency_property_;
  inspect::DoubleProperty loanable_ratio_property_;
  inspect::UintProperty loanable_bytes_property_;
  inspect::UintProperty loanable_mebibytes_property_;

  zx::event trace_observer_event_;
  async::WaitMethod<ContiguousPooledMemoryAllocator,
                    &ContiguousPooledMemoryAllocator::TraceObserverCallback>
      wait_{this};

  async::TaskMethod<ContiguousPooledMemoryAllocator,
                    &ContiguousPooledMemoryAllocator::CheckGuardPageCallback>
      guard_checker_{this};

  // Split up the unused page check into relatively small pieces to avoid spiking the CPU or
  // causing latency spikes for normal sysmem requests.
  static constexpr uint32_t kUnusedCheckPartialCount = 64;
  // We do this one page at a time to hopefully stay within L1 on all devices, since in the allocate
  // path we're checking this amount of buffer space with memcmp(), then also zeroing the same space
  // with memset().  If we did so in chunks larger than L1, we'd be spilling cache lines to L2
  // or RAM during memcmp(), then pulling them back in during memset().  Cache sizes and tiers can
  // vary of course.  This also determines the granularity at which we report pattern mismatch
  // failures, so 1 page is best here for that also.
  const uint64_t unused_guard_data_size_ = zx_system_get_page_size();
  bool unused_pages_guarded_ = false;
  zx::duration unused_page_check_cycle_period_ = kDefaultUnusedPageCheckCyclePeriod;
  uint64_t unused_check_phase_ = 0;
  async::TaskMethod<ContiguousPooledMemoryAllocator,
                    &ContiguousPooledMemoryAllocator::CheckUnusedPagesCallback>
      unused_checker_{this};
  async::TaskMethod<ContiguousPooledMemoryAllocator,
                    &ContiguousPooledMemoryAllocator::CheckUnusedRecentlyPagesCallback>
      unused_recently_checker_{this};
  SysmemMetrics& metrics_;

  // Regardless of is_ever_cpu_accessible_, we create a mapping of the whole vmo.  When
  // is_always_cpu_accessible_ we can use the mapping to zero new buffers.  When
  // is_ever_cpu_accessible_ we can use the mapping to write and check patterns in unused pages.
  uint8_t* mapping_ = nullptr;

  // While we'll typically pattern only 1 page per pattern period and adjust the pattern period to
  // get the % we want, being able to vary this might potentially help catch a suspected problem
  // faster; in any case it's simple enough to allow this to be adjusted.
  static constexpr uint64_t kUnusedToPatternPages = 1;
  uint64_t unused_guard_pattern_period_bytes_ =
      kUnusedGuardPatternPeriodPages * zx_system_get_page_size();
  const uint64_t unused_to_pattern_bytes_ = kUnusedToPatternPages * zx_system_get_page_size();

  bool is_bti_fake_ = false;

  // We cap the number of DeletedRegion we're willing to track; otherwise the overhead could get a
  // bit excessive in pathological cases if we were to allow tracking a DeletedRegion per page for
  // example.  This is optimized for update, not (at all) for lookup, since we only do lookups if
  // a page just failed a pattern check, which should never happen.  If it does happen, we want to
  // know the paddr_t range and name of the most-recently-deleted region, and possibly the 2nd most
  // recently deleted region also, if it comes to that.
  static constexpr int32_t kNumDeletedRegions = 512;
  int32_t deleted_regions_count_ = 0;
  int32_t deleted_regions_next_ = 0;
  // Only allocate if we'll be checking unused pages.
  std::vector<DeletedRegion> deleted_regions_;

  // This is Zircon's zero page mapped a few times, read-only.
  uint64_t zero_page_vmo_size_ = fbl::round_up(64ull * 1024, zx_system_get_page_size());
  zx::vmo zero_page_vmo_;
  uint8_t* zero_page_vmo_base_ = nullptr;

  protected_ranges::ProtectedRangesCoreControl* protected_ranges_core_control_ = nullptr;
};

}  // namespace sysmem_driver

#endif  // SRC_DEVICES_SYSMEM_DRIVERS_SYSMEM_CONTIGUOUS_POOLED_MEMORY_ALLOCATOR_H_
