// Copyright 2021 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include "include/vm/physical_page_provider.h"
#include <lib/counters.h>
#include <lib/dump/depth_printer.h>
#include <lib/fit/result.h>
#include <trace.h>
#include <kernel/range_check.h>
#include <object/thread_dispatcher.h>
#define LOCAL_TRACE 0
KCOUNTER(physical_reclaim_total_requests, "physical.reclaim.total_requests")
KCOUNTER(physical_reclaim_succeeded_requests, "physical.reclaim.succeeded_requests")
KCOUNTER(physical_reclaim_failed_requests, "physical.reclaim.failed_requests")
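// |size| is the length, in bytes, of the contiguous physical range this provider manages; the
// base physical address is supplied later via Init().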
PhysicalPageProvider::PhysicalPageProvider(uint64_t size) : size_(size) { LTRACEF("\n"); }
PhysicalPageProvider::~PhysicalPageProvider() {
LTRACEF("%p\n", this);
// It is possible we were destructed without being initialized, and in error paths we can be
// destructed without detached_ or closed_ becoming true, so we cannot assert on those here.
if (phys_base_ == kInvalidPhysBase) {
return;
}
// To return our pages, first retrieve any pages that are loaned before returning everything to
// the PMM. This is inefficient, as instead of retrieving a loaned page only to put it back in
// the PMM we could unloan the page 'in place' where it is. However, contiguous VMOs that are
// loaning pages are not expected to be destructed often (or at all), so this optimization is not
// presently needed.
UnloanRange(0, size_, &free_list_);
ASSERT(list_length(&free_list_) == size_ / PAGE_SIZE);
Pmm::Node().FreeList(&free_list_);
}
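// Properties describing this provider: it is not a user pager, does not preserve page content,
// and supplies specific physical pages. Only READ requests are supported, matching the
// GetRequestType() asserts throughout this file.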
PageSourceProperties PhysicalPageProvider::properties() const {
return PageSourceProperties{
.is_user_pager = false,
.is_preserving_page_content = false,
.is_providing_specific_physical_pages = true,
.supports_request_type = {true, false, false},
};
}
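// Records the backing VmCowPages, the owning PageSource, and the base physical address of the
// contiguous range. Until Init() is called phys_base_ remains kInvalidPhysBase, which other
// methods use to detect the not-yet-initialized state.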
void PhysicalPageProvider::Init(VmCowPages* cow_pages, PageSource* page_source, paddr_t phys_base) {
DEBUG_ASSERT(cow_pages);
DEBUG_ASSERT(!IS_PAGE_ROUNDED(kInvalidPhysBase));
DEBUG_ASSERT(IS_PAGE_ROUNDED(phys_base));
DEBUG_ASSERT(!cow_pages_);
DEBUG_ASSERT(phys_base_ == kInvalidPhysBase);
Guard<Mutex> guard{&mtx_};
cow_pages_ = cow_pages;
page_source_ = page_source;
phys_base_ = phys_base;
}
// Called under lock of contiguous VMO that needs the pages. The request is later processed at the
// start of WaitOnEvent.
void PhysicalPageProvider::SendAsyncRequest(PageRequest* request) {
DEBUG_ASSERT(phys_base_ != kInvalidPhysBase);
DEBUG_ASSERT(GetRequestType(request) == page_request_type::READ);
Guard<Mutex> guard{&mtx_};
ASSERT(!closed_);
// PhysicalPageProvider always operates async (similar to PagerProxy), because we'd like to (in
// typical non-overlapping commit/decommit usage) have one batch that covers the entire commit,
// regardless of the fact that some of the pages may already be free and therefore could be
// immediately obtained. Quite often at least one page will be presently owned by a different
// VMO, so we may as well always do one big async batch that deals with all the presently
// non-FREE_LOANED pages.
//
// At this point the page may be FREE_LOANED, or in use by a different VMO.
//
// Allocation of a new page to a VMO has an interval during which the page is not free, but also
// isn't state == OBJECT yet. During processing we rely on that interval occurring only under the
// other VMO's lock, but we can't acquire the other VMO's lock here since we're already currently
// holding the underlying owning contiguous VMO's lock.
QueueRequestLocked(request);
}
void PhysicalPageProvider::QueueRequestLocked(PageRequest* request) {
DEBUG_ASSERT(phys_base_ != kInvalidPhysBase);
DEBUG_ASSERT(GetRequestType(request) == page_request_type::READ);
ASSERT(!closed_);
pending_requests_.push_back(request);
}
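// Removes |request| from the pending queue, if it has not already been dequeued for processing.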
void PhysicalPageProvider::ClearAsyncRequest(PageRequest* request) {
DEBUG_ASSERT(phys_base_ != kInvalidPhysBase);
DEBUG_ASSERT(GetRequestType(request) == page_request_type::READ);
Guard<Mutex> guard{&mtx_};
ASSERT(!closed_);
if (fbl::InContainer<PageProviderTag>(*request)) {
pending_requests_.erase(*request);
}
// No need to chase down any currently-processing request here, since before processing a request,
// we stash the values of all fields we need from the PageRequest under the lock. So any
// currently-processing request is independent from the PageRequest that started it.
}
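// Replaces |old| with |new_req| in the pending queue, if |old| is still queued.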
void PhysicalPageProvider::SwapAsyncRequest(PageRequest* old, PageRequest* new_req) {
DEBUG_ASSERT(phys_base_ != kInvalidPhysBase);
DEBUG_ASSERT(GetRequestType(old) == page_request_type::READ);
DEBUG_ASSERT(GetRequestType(new_req) == page_request_type::READ);
Guard<Mutex> guard{&mtx_};
ASSERT(!closed_);
if (fbl::InContainer<PageProviderTag>(*old)) {
pending_requests_.insert(*old, new_req);
pending_requests_.erase(*old);
}
}
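// Accepts pages being returned to this provider. Unless we have already detached, the pages are
// loaned back to the PMM so that other allocations may borrow them.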
void PhysicalPageProvider::FreePages(list_node* pages) {
{
// Check if we are detached, and if so put the pages straight in the free_list_ instead of
// loaning them out, since we will be closed soon and would otherwise only have to go and
// retrieve them again.
Guard<Mutex> guard{&mtx_};
if (DetachedLocked()) {
if (list_is_empty(&free_list_)) {
list_move(pages, &free_list_);
} else {
list_splice_after(pages, list_peek_tail(&free_list_));
}
return;
}
}
// This should always be called in a way that is serialized with other operations on our
// cow_pages_, otherwise there is a race where an operation on the cow_pages_ could observe an
// absence of pages, but then be unable to retrieve them because it is not synchronized with this
// FreePages call. Having both parties use the paged_vmo_lock provides the synchronization that
// avoids this scenario.
ASSERT(page_source_);
AssertHeld(*page_source_->paged_vmo_lock());
// This marks the pages loaned, and makes them FREE_LOANED for potential use by other clients that
// are ok with getting loaned pages when allocating. Must hold the loaned_state_lock_ as we are
// manipulating the loaned state of pages that could get inspected by UnloanRange due to
// interactions with cancelled page requests.
Guard<Mutex> guard{&loaned_state_lock_};
Pmm::Node().BeginLoan(pages);
}
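// Debug check that |page| is the exact physical page expected at |offset| within our contiguous
// range.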
bool PhysicalPageProvider::DebugIsPageOk(vm_page_t* page, uint64_t offset) {
Guard<Mutex> guard{&mtx_};
DEBUG_ASSERT((cow_pages_ != nullptr) == (phys_base_ != kInvalidPhysBase));
// Assume pages added before we know the cow_pages_ or phys_base_ are ok.
if (!cow_pages_) {
return true;
}
return (page->paddr() - phys_base_) == offset;
}
void PhysicalPageProvider::OnDetach() {
// It's possible for destruction to happen prior to initialization completing, and this can lead
// to OnDetach being called from what would be the cow_pages_, but prior to |Init| being called
// and the cow_pages_ pointer being set up. In this case we can safely ignore locking requirements
// and just set detached.
Guard<Mutex> guard{&mtx_};
ASSERT(!closed_);
if (phys_base_ == kInvalidPhysBase) {
ASSERT(!cow_pages_);
// As we cannot assert the cow_pages_ lock, due to it being a nullptr, we must temporarily
// disable analysis.
[&]() TA_NO_THREAD_SAFETY_ANALYSIS { detached_ = true; }();
} else {
// The current synchronization strategy relies on OnDetach being called with the VMO lock being
// held. This allows us to assume that the detached_ flag only transitions under the VmCowPages
// lock.
AssertHeld(cow_pages_->lock_ref());
detached_ = true;
}
}
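// Marks the provider closed. Once closed, pending_requests_ is expected to be empty (see
// DequeueRequest), so no further requests will be queued or processed.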
void PhysicalPageProvider::OnClose() {
Guard<Mutex> guard{&mtx_};
ASSERT(!closed_);
closed_ = true;
}
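// Pops the next pending request, if any, returning its offset and length via the out parameters.
// Returns false once there are no requests left to process (or we have detached).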
bool PhysicalPageProvider::DequeueRequest(uint64_t* request_offset, uint64_t* request_length) {
Guard<Mutex> guard{&mtx_};
// closed_ can be true here, but if it is then pending_requests_ is also empty, so we won't
// process any more requests once closed. There is also no point in processing requests if we
// have detached, as those requests will be cancelled anyhow.
DEBUG_ASSERT(!closed_ || pending_requests_.is_empty());
if (pending_requests_.is_empty() || DetachedLocked()) {
// Done with all requests (or remaining requests cancelled).
return false;
}
PageRequest* request = pending_requests_.pop_front();
DEBUG_ASSERT(request);
DEBUG_ASSERT(GetRequestType(request) == page_request_type::READ);
*request_offset = GetRequestOffset(request);
*request_length = GetRequestLen(request);
DEBUG_ASSERT(InRange(*request_offset, *request_length, size_));
return true;
}
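// Reclaims ownership of every currently-loaned page in [range_offset, range_offset + length) of
// our physical range, appending the reclaimed pages to |pages|. Pages that are not loaned are
// skipped, as they are already owned by our VmCowPages.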
void PhysicalPageProvider::UnloanRange(uint64_t range_offset, uint64_t length, list_node_t* pages) {
Guard<Mutex> guard{&loaned_state_lock_};
// Evict needed physical pages from other VMOs, so that needed physical pages become free. This
// is iterating over the destination offset in cow_pages_. The needed pages can be scattered
// around in various VMOs and offsets of those VMOs and, by the time we get to looking at them,
// could even already be returned to the PMM and free.
uint64_t range_end = range_offset + length;
for (uint64_t offset = range_offset; offset < range_end; offset += PAGE_SIZE) {
vm_page_t* page = paddr_to_vm_page(phys_base_ + offset);
DEBUG_ASSERT(page);
// The page should never have entered the regular FREE state, as it should either be loaned out
// to another VMO, in the FREE_LOANED state, or owned by us in our VmCowPages.
DEBUG_ASSERT(!page->is_free());
// If the page is not currently loaned out then skip it; our work for this page is done.
if (!page->is_loaned()) {
continue;
}
// Cancel the loan allowing us to track the page down.
Pmm::Node().CancelLoan(page);
// Cancelling the loan took the loaned pages lock, and so just prior to that completing we knew
// that the page was either:
// 1. Found in the FREE_LOANED state, and is therefore still in that state.
// 2. Completely installed in a VMO with a valid backlink.
if (!page->is_free_loaned()) {
// Between cancelling the loan and now, the page could be in the process of migrating back
// to the PMM. If we just perform GetCowForLoanedPage then we could observe a scenario where
// the page is still in the OBJECT state, but has its backlink cleared. To avoid this we
// perform the lookup under the loaned pages lock, ensuring we either see the page while it
// is still in the VMO, with a valid backlink, or after it has fully migrated back to the
// PMM.
ktl::optional<PageQueues::VmoBacklink> maybe_vmo_backlink;
Pmm::Node().WithLoanedPage(page, [&maybe_vmo_backlink](vm_page_t* page) {
maybe_vmo_backlink = pmm_page_queues()->GetCowForLoanedPage(page);
});
// Either we got a backlink, or the page is already back in the PMM.
DEBUG_ASSERT(maybe_vmo_backlink || page->is_free_loaned());
if (maybe_vmo_backlink) {
// As we will be calling back into the VMO we want to drop the loaned state lock to both
// avoid lock ordering issues, and to not excessively hold the lock. During this section we
// are not modifying or inspecting the loaned state so the lock is not needed. The VmCowPages
// code we call may inspect the loaned state, but it only does so when it knows it is the
// owner, and so it may safely do so.
guard.CallUnlocked([&] {
auto& vmo_backlink = maybe_vmo_backlink.value();
DEBUG_ASSERT(vmo_backlink.cow);
auto& cow_container = vmo_backlink.cow;
DEBUG_ASSERT(!page->object.always_need);
bool needs_evict = true;
// Check if we should attempt to replace the page to avoid eviction.
if (PhysicalPageBorrowingConfig::Get().is_replace_on_unloan_enabled()) {
__UNINITIALIZED AnonymousPageRequest page_request;
zx_status_t replace_result = cow_container->ReplacePage(page, vmo_backlink.offset,
false, nullptr, &page_request);
// If replacement failed for any reason, fall back to eviction. If replacement succeeded
// then the page got directly returned to the PMM.
needs_evict = replace_result != ZX_OK;
if (replace_result == ZX_ERR_SHOULD_WAIT) {
page_request.Cancel();
}
// If replacement succeeded, i.e. we are not going to fall back to eviction, then the
// page should be back in the PMM.
DEBUG_ASSERT(needs_evict || page->is_free_loaned());
}
if (needs_evict) {
[[maybe_unused]] VmCowReclaimResult reclaimed = cow_container->ReclaimPageForEviction(
page, vmo_backlink.offset, VmCowPages::EvictionAction::Require);
// Either we succeeded eviction, or another thread raced and did it first. If another
// thread did it first then it would have done so under the VMO lock, which we have
// since acquired, and so we know the page is either on the way (in a
// FreeLoanedPagesHolder) or in the PMM. We can ensure the page is fully migrated to the
// PMM by waiting for any holding to be concluded.
DEBUG_ASSERT(reclaimed.is_error() || page->is_free_loaned());
if (!page->is_free_loaned()) {
Pmm::Node().WithLoanedPage(page, [](vm_page_t* page) {});
DEBUG_ASSERT(page->is_free_loaned());
}
}
});
}
// In all scenarios (no backlink, successful replacement, or eviction) the page must have
// ended up back in the PMM.
ASSERT(page->is_free_loaned());
}
// Now that the page is definitely in the FREE_LOANED state, gain ownership from the PMM.
Pmm::Node().EndLoan(page);
DEBUG_ASSERT(page->state() == vm_page_state::ALLOC);
list_add_tail(pages, &page->queue_node);
} // for pages of request
}
// TODO(https://fxbug.dev/42084841): Reason about the use of |suspendable|, ignored for now.
zx_status_t PhysicalPageProvider::WaitOnEvent(Event* event,
bool suspendable) TA_NO_THREAD_SAFETY_ANALYSIS {
// Before processing any events we synchronize with OnDetach and retrieve a RefPtr to the
// cow_pages_. This ensures we can safely dereference the cow_pages_ later on, knowing we are not
// racing with its destructor.
fbl::RefPtr<VmCowPages> cow_ref;
{
Guard<Mutex> guard{&mtx_};
if (DetachedLocked()) {
return ZX_OK;
}
cow_ref = fbl::MakeRefPtrUpgradeFromRaw(cow_pages_, mtx_);
// As we hold the lock, and had not yet detached, the cow_pages_ must not have run
// OnDeadTransition, and so it must still be possible to get a ref to it.
ASSERT(cow_ref);
}
// When WaitOnEvent is called, we know that the event being waited on is associated with a request
// that's already been queued, so we can use this thread to process _all_ the queued requests
// first and then wait on the event. The wait then has no reason to block this thread, since
// every page of every request that existed on entry to this method has been succeeded or failed
// by the time we wait on the passed-in event.
uint64_t request_offset;
uint64_t request_length;
while (DequeueRequest(&request_offset, &request_length)) {
DEBUG_ASSERT(request_offset + request_length > request_offset);
// These are ordered by cow_pages_ offsets (destination offsets), but may have gaps due to not
// all the pages being loaned.
list_node unloaned_pages;
list_initialize(&unloaned_pages);
UnloanRange(request_offset, request_length, &unloaned_pages);
list_node contiguous_pages = LIST_INITIAL_VALUE(contiguous_pages);
// Process all the loaned pages by finding any contiguous runs and processing those as a batch.
// There is no correctness reason to process them in batches, but it is more efficient for the
// cow_pages_ to receive a run where possible instead of repeatedly being given single pages.
while (!list_is_empty(&unloaned_pages)) {
vm_page_t* page = list_peek_head_type(&unloaned_pages, vm_page_t, queue_node);
if (list_is_empty(&contiguous_pages) ||
list_peek_tail_type(&contiguous_pages, vm_page_t, queue_node)->paddr() + PAGE_SIZE ==
page->paddr()) {
list_delete(&page->queue_node);
list_add_tail(&contiguous_pages, &page->queue_node);
// Generally want to keep trying to find more contiguous pages, unless there are no more
// pages.
if (!list_is_empty(&unloaned_pages)) {
continue;
}
}
// An interfering decommit can occur after we've moved these pages into VmCowPages, but not
// yet moved the entire commit request into VmCowPages. If not all pages end up present in
// cow_pages_ on return to the user from the present commit, due to concurrent decommit,
// that's just normal commit semantics.
//
// Supply the pages we got to cow_pages_. Also tell it what range to claim is supplied now
// for convenience.
//
// If there's an interfering decommit, then that decommit can only interfere after we've added
// the pages to VmCowPages, so it isn't an immediate concern here.
//
// We want to use VmCowPages::SupplyPages() to avoid a proliferation of VmCowPages code that
// calls OnPagesSupplied() / OnPagesFailed(), so to call SupplyPages() we need a
// VmPageSpliceList. We put all the pages in the "head" portion of the VmPageSpliceList since
// there are no VmPageListNode(s) involved in this path. We also zero the pages here, since
// SupplyPages() doesn't do that.
//
// We can zero the pages before we supply them, which avoids holding the VmCowPages::lock_
// while zeroing, and also allows us to flush the zeroes to RAM here just in case any client
// is (incorrectly) assuming that non-pinned pages necessarily remain cache clean once they
// have been made cache clean.
uint64_t supply_length = 0;
list_for_every_entry (&contiguous_pages, page, vm_page, queue_node) {
void* ptr = paddr_to_physmap(page->paddr());
DEBUG_ASSERT(ptr);
arch_zero_page(ptr);
supply_length += PAGE_SIZE;
arch_clean_invalidate_cache_range(reinterpret_cast<vaddr_t>(ptr), PAGE_SIZE);
}
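// The destination offset in cow_pages_ is the physical offset of the first page in this run
// relative to phys_base_.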
uint64_t supply_offset =
list_peek_head_type(&contiguous_pages, vm_page_t, queue_node)->paddr() - phys_base_;
VmPageSpliceList splice_list;
// Any pages that do not get supplied for any reason, due to failures from supply or because
// we got detached, should be re-loaned.
auto loan_pages = fit::defer([&] {
while (!splice_list.IsProcessed()) {
VmPageOrMarker page_or_marker = splice_list.Pop();
if (page_or_marker.IsPage()) {
vm_page_t* p = page_or_marker.ReleasePage();
DEBUG_ASSERT(!list_in_list(&p->queue_node));
list_add_tail(&contiguous_pages, &p->queue_node);
}
}
if (!list_is_empty(&contiguous_pages)) {
Guard<Mutex> guard{&loaned_state_lock_};
Pmm::Node().BeginLoan(&contiguous_pages);
}
});
zx_status_t status =
VmPageSpliceList::CreateFromPageList(supply_length, &contiguous_pages, &splice_list);
if (status != ZX_OK) {
// The only possible error is out of memory.
ASSERT(status == ZX_ERR_NO_MEMORY);
DEBUG_ASSERT(PageSource::IsValidInternalFailureCode(status));
page_source_->OnPagesFailed(supply_offset, supply_length, status);
// Do not attempt to then supply the pages, move to the next range.
continue;
}
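// CreateFromPageList succeeded, so every page has been moved from contiguous_pages into the
// splice list.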
DEBUG_ASSERT(list_is_empty(&contiguous_pages));
uint64_t supplied_len = 0;
// First take the VMO lock before taking our lock to ensure lock ordering is correct. As we
// hold a RefPtr we know that even if racing with OnClose this is a valid object.
VmCowPages::DeferredOps deferred(cow_pages_);
Guard<CriticalMutex> cow_lock{cow_pages_->lock()};
bool detached;
// Now take our lock and check to see if we have been detached.
{
Guard<Mutex> guard{&mtx_};
detached = detached_;
}
// We can use the cached value of detached_ since OnDetach only runs with the VMO lock held,
// which we presently hold. This means that if detached is false we can safely call VMO
// methods knowing that we are not racing with any detach or close attempts.
if (!detached) {
// The splice_list being inserted has only true vm_page_t in it, and so SupplyPages will
// never need to allocate or otherwise perform a partial success that would generate a page
// request.
zx_status_t supply_result = cow_pages_->SupplyPagesLocked(
VmCowRange(supply_offset, supply_length), &splice_list,
SupplyOptions::PhysicalPageProvider, &supplied_len, deferred, nullptr);
ASSERT(supplied_len == supply_length || supply_result != ZX_OK);
if (supply_result != ZX_OK) {
// Supply can only fail due to being out of memory, as we currently hold the lock and know
// that it cannot be racing with a detach or close.
DEBUG_ASSERT(supply_result == ZX_ERR_NO_MEMORY);
DEBUG_ASSERT(PageSource::IsValidInternalFailureCode(supply_result));
page_source_->OnPagesFailed(supply_offset, supply_length, supply_result);
}
}
}
DEBUG_ASSERT(list_is_empty(&contiguous_pages) && list_is_empty(&unloaned_pages));
} // while have requests to process
kcounter_add(physical_reclaim_total_requests, 1);
// The event now should be in one of three states:
// 1. We processed the related request above in the loop, and the event got signaled as a
// consequence of the pages we supplied.
// 2. A different thread dequeued the request and either processed it, or is still processing it.
// 3. The request is in the process of being cancelled and the underlying request packet got
// dequeued, and the event has or will be signaled.
// In all cases it is a *kernel* thread under our control that should signal the event, and so
// although we may need to wait, only a kernel bug should cause this to block indefinitely.
// To attempt to detect such bugs we wait with a generous timeout before making some noise.
constexpr zx_duration_t kReportWaitTime = ZX_SEC(60);
zx_status_t wait_result = ZX_OK;
uint32_t waited = 0;
while ((wait_result = event->Wait(Deadline::after_mono(kReportWaitTime))) == ZX_ERR_TIMED_OUT) {
waited++;
printf("WARNING: PhysicalPageProvider has waited %" PRIi64 " seconds on event.\n",
(kReportWaitTime * waited) / ZX_SEC(1));
}
if (wait_result == ZX_OK) {
kcounter_add(physical_reclaim_succeeded_requests, 1);
} else {
kcounter_add(physical_reclaim_failed_requests, 1);
}
return wait_result;
}
void PhysicalPageProvider::Dump(uint depth, uint32_t max_items) {
Guard<Mutex> guard{&mtx_};
dump::DepthPrinter printer(depth);
printer.Emit("physical_page_provider %p cow_pages_ %p phys_base_ 0x%" PRIx64 " closed %d", this,
cow_pages_, phys_base_, closed_);
printer.BeginList(max_items);
for (auto& req : pending_requests_) {
DEBUG_ASSERT(GetRequestType(&req) == page_request_type::READ);
printer.Emit(" pending req [0x%lx, 0x%lx)", GetRequestOffset(&req), GetRequestLen(&req));
}
printer.EndList();
}