[msd-intel-gen] Retry on release when there are active mappings

Needed for upcoming Mesa/Anvil 20.0.

Multiply:msd_intel_gen_nonhardware_tests

Bug:13257

Change-Id: Ib2aaa7cf70bb47126f384facf87a516e5175613e
Reviewed-on: https://fuchsia-review.googlesource.com/c/fuchsia/+/402026
Reviewed-by: John Bauman <jbauman@google.com>
Testability-Review: John Bauman <jbauman@google.com>
Commit-Queue: Craig Stout <cstout@google.com>
diff --git a/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.cc b/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.cc
index ec57ede..d369a17 100644
--- a/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.cc
+++ b/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.cc
@@ -101,15 +101,48 @@
       ->ReleaseBuffer(MsdIntelAbiBuffer::cast(buffer)->ptr()->platform_buffer());
 }
 
-void MsdIntelConnection::ReleaseBuffer(magma::PlatformBuffer* buffer) {
+void MsdIntelConnection::ReleaseBuffer(magma::PlatformBuffer* buffer,
+                                       std::function<void(uint32_t delay_ms)> sleep_callback) {
   std::vector<std::shared_ptr<GpuMapping>> mappings;
   per_process_gtt()->ReleaseBuffer(buffer, &mappings);
 
   DLOG("ReleaseBuffer %lu\n", buffer->id());
 
-  bool killed = false;
   for (const auto& mapping : mappings) {
     uint32_t use_count = mapping.use_count();
+
+    if (use_count > 1) {
+      // It's an error to release a buffer while it has inflight mappings, as that can fault the
+      // GPU.  However Mesa/Anvil no longer exactly tracks the user buffers that are associated
+      // with each command buffer, instead it snapshots all user buffers currently allocated on
+      // the device, which can include buffers from other threads.
+      // We observe this happening in at least one multithreaded CTS case.
+      // Intel says their DRM system driver will stall to handle the unlikely case when it happens,
+      // so we do the same here.
+      DLOG("ReleaseBuffer %lu mapping has use count %u", mapping->BufferId(), use_count);
+
+      if (!sent_context_killed()) {
+        constexpr uint32_t kRetries = 10;
+        constexpr uint32_t kRetrySleepMs = 100;
+
+        for (uint32_t retry = 0; retry < kRetries; retry++) {
+          sleep_callback(kRetrySleepMs);
+
+          use_count = mapping.use_count();
+          if (use_count == 1)
+            break;
+        }
+
+        if (use_count > 1) {
+          MAGMA_LOG(
+              WARNING,
+              "ReleaseBuffer %lu mapping has use count %u after stall, sending context killed",
+              mapping->BufferId(), use_count);
+          SendContextKilled();
+        }
+      }
+    }
+
     if (use_count == 1) {
       // Bus mappings are held in the connection and passed through the command stream to
       // ensure the memory isn't released until the tlbs are invalidated, which happens
@@ -119,14 +152,6 @@
       for (uint32_t i = 0; i < bus_mappings.size(); i++) {
         mappings_to_release_.emplace_back(std::move(bus_mappings[i]));
       }
-    } else {
-      // It's an error to release a buffer while it has inflight mappings, as that
-      // can fault the gpu.
-      DLOG("buffer %lu mapping use_count %d", mapping->BufferId(), use_count);
-      if (!killed) {
-        SendContextKilled();
-        killed = true;
-      }
     }
   }
 }
diff --git a/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.h b/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.h
index 98810dd..deb9c89 100644
--- a/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.h
+++ b/src/graphics/drivers/msd-intel-gen/src/msd_intel_connection.h
@@ -50,14 +50,22 @@
     notifications_.SendBufferIds(buffer_ids);
   }
 
-  void SendContextKilled() { notifications_.SendContextKilled(); }
+  void SendContextKilled() {
+    notifications_.SendContextKilled();
+    sent_context_killed_ = true;
+  }
 
   // Maps |page_count| pages of the given |buffer| at |page_offset| to |gpu_addr| into the
   // GPU address space belonging to this connection.
   magma::Status MapBufferGpu(std::shared_ptr<MsdIntelBuffer> buffer, uint64_t gpu_addr,
                              uint64_t page_offset, uint64_t page_count);
 
-  void ReleaseBuffer(magma::PlatformBuffer* buffer);
+  void ReleaseBuffer(magma::PlatformBuffer* buffer) {
+    auto callback = [](uint32_t delay_ms) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
+    };
+    ReleaseBuffer(buffer, callback);
+  }
 
   // Submit pending release mappings on the given context
   bool SubmitPendingReleaseMappings(std::shared_ptr<MsdIntelContext> context);
@@ -71,12 +79,18 @@
     return mappings_to_release_;
   }
 
+  bool sent_context_killed() { return sent_context_killed_; }
+
+  void ReleaseBuffer(magma::PlatformBuffer* buffer,
+                     std::function<void(uint32_t delay_ms)> sleep_callback);
+
   static const uint32_t kMagic = 0x636f6e6e;  // "conn" (Connection)
 
   Owner* owner_;
   std::shared_ptr<PerProcessGtt> ppgtt_;
   msd_client_id_t client_id_;
   std::vector<std::unique_ptr<magma::PlatformBusMapper::BusMapping>> mappings_to_release_;
+  bool sent_context_killed_ = false;
 
   class Notifications {
    public:
diff --git a/src/graphics/drivers/msd-intel-gen/src/msd_intel_context.cc b/src/graphics/drivers/msd-intel-gen/src/msd_intel_context.cc
index 1642a0d..f4d930314 100644
--- a/src/graphics/drivers/msd-intel-gen/src/msd_intel_context.cc
+++ b/src/graphics/drivers/msd-intel-gen/src/msd_intel_context.cc
@@ -110,6 +110,11 @@
   }
 
   semaphore_port_.reset();
+
+  // Clear pending command buffers so buffer release doesn't see stuck mappings
+  while (pending_command_buffer_queue_.size()) {
+    pending_command_buffer_queue_.pop();
+  }
 }
 
 magma::Status ClientContext::SubmitCommandBuffer(std::unique_ptr<CommandBuffer> command_buffer) {
diff --git a/src/graphics/drivers/msd-intel-gen/tests/unit_tests/test_connection.cc b/src/graphics/drivers/msd-intel-gen/tests/unit_tests/test_connection.cc
index fbd3e70..2685662 100644
--- a/src/graphics/drivers/msd-intel-gen/tests/unit_tests/test_connection.cc
+++ b/src/graphics/drivers/msd-intel-gen/tests/unit_tests/test_connection.cc
@@ -94,14 +94,49 @@
 
     std::shared_ptr<MsdIntelBuffer> buffer = MsdIntelBuffer::Create(PAGE_SIZE, "test");
     std::shared_ptr<GpuMapping> mapping;
+
+    constexpr uint64_t kGpuAddr = 0x10000;
+    EXPECT_TRUE(AddressSpace::MapBufferGpu(connection->per_process_gtt(), buffer, kGpuAddr, 0, 1,
+                                           &mapping));
+    ASSERT_TRUE(mapping);
+    EXPECT_TRUE(connection->per_process_gtt()->AddMapping(mapping));
+
+    // Release the mapping during the retry.
+    auto sleep_callback = [&](uint32_t ms) { mapping.reset(); };
+
+    connection->ReleaseBuffer(buffer->platform_buffer(), sleep_callback);
+
+    EXPECT_EQ(0u, callback_count_);
+    EXPECT_FALSE(connection->sent_context_killed());
+
+    const std::vector<std::unique_ptr<magma::PlatformBusMapper::BusMapping>>& mappings =
+        connection->mappings_to_release();
+    EXPECT_EQ(1u, mappings.size());
+  }
+
+  void ReleaseBufferWhileMappedContextKilled() {
+    auto connection = MsdIntelConnection::Create(this, 0);
+    ASSERT_TRUE(connection);
+
+    connection->SetNotificationCallback(KillCallbackStatic, this);
+
+    std::shared_ptr<MsdIntelBuffer> buffer = MsdIntelBuffer::Create(PAGE_SIZE, "test");
+    std::shared_ptr<GpuMapping> mapping;
     EXPECT_TRUE(
         AddressSpace::MapBufferGpu(connection->per_process_gtt(), buffer, 0x10000, 0, 1, &mapping));
     ASSERT_TRUE(mapping);
     EXPECT_TRUE(connection->per_process_gtt()->AddMapping(mapping));
 
-    // Release the buffer while holding the mapping triggers the killed callback
-    connection->ReleaseBuffer(buffer->platform_buffer());
+    // Release the buffer while holding the mapping retries for a while then
+    // triggers the killed callback.
+    uint32_t sleep_callback_count = 0;
+    auto sleep_callback = [&](uint32_t ms) { sleep_callback_count += 1; };
+
+    connection->ReleaseBuffer(buffer->platform_buffer(), sleep_callback);
+
+    EXPECT_GT(sleep_callback_count, 0u);
     EXPECT_EQ(1u, callback_count_);
+    EXPECT_TRUE(connection->sent_context_killed());
 
     const std::vector<std::unique_ptr<magma::PlatformBusMapper::BusMapping>>& mappings =
         connection->mappings_to_release();
@@ -144,6 +179,10 @@
 
 TEST_F(TestMsdIntelConnection, ReleaseBufferWhileMapped) { ReleaseBufferWhileMapped(); }
 
+TEST_F(TestMsdIntelConnection, ReleaseBufferWhileMappedContextKilled) {
+  ReleaseBufferWhileMappedContextKilled();
+}
+
 TEST_F(TestMsdIntelConnection, ReuseGpuAddrWithoutRelease) { ReuseGpuAddrWithoutRelease(); }
 
 TEST_F(TestMsdIntelConnection, InheritanceCheck) {