[msd-vsl-gc] Modify ringbuffer helper functions.

Adds ringbuffer ReserveContiguous helper.

We will want to write blocks of instructions, such as EVENT WAIT LINK,
contiguously. Otherwise we would have to add additional LINK commands
in between to wrap back to the start of the ringbuffer.

Also changes Overwrite32 to take an offset into the ringbuffer rather
than a negative offset relative to the tail.

Next CL will use these.

BUG=43043

Change-Id: Ifbdba89fdf47e53642e40071b80488fc3fc3898e
diff --git a/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.cc b/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.cc
index 1f56709..f236824 100644
--- a/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.cc
+++ b/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.cc
@@ -285,9 +285,8 @@
     return true;
   }
   // Overwrite the last WAIT with an END.
-  bool res =
-      ringbuffer_->Overwrite32(kWaitLinkDwords /* dwords_before_tail */, MiEnd::kCommandType);
-  if (!res) {
+  uint32_t prev_wait_link = ringbuffer_->SubtractOffset(kWaitLinkDwords * sizeof(uint32_t));
+  if (!ringbuffer_->Overwrite32(prev_wait_link, MiEnd::kCommandType)) {
     return DRETF(false, "Failed to overwrite WAIT in ringbuffer");
   }
   return true;
@@ -439,21 +438,18 @@
   return true;
 }
 
-bool MsdVslDevice::LinkRingbuffer(uint32_t num_new_rb_instructions, uint32_t gpu_addr,
+bool MsdVslDevice::LinkRingbuffer(uint32_t wait_link_offset, uint32_t gpu_addr,
                                   uint32_t dest_prefetch) {
-  // Replace the penultimate WAIT (before the newly added one) with a LINK to the command buffer.
-  // We need to calculate the offset from the current tail, skipping past the new commands
-  // we wrote into the ringbuffer and also the WAIT-LINK that we are modifying.
-  uint32_t prev_wait_offset_dwords =
-     (num_new_rb_instructions * kInstructionDwords) + kWaitLinkDwords;
-  DASSERT(prev_wait_offset_dwords > 0);
+  DASSERT(ringbuffer_->IsOffsetPopulated(wait_link_offset));
+  // We can assume the instruction was written as 8 contiguous bytes.
+  DASSERT(ringbuffer_->IsOffsetPopulated(wait_link_offset + sizeof(uint32_t)));
 
-  // prev_wait_offset_dwords is pointing to the beginning of the WAIT instruction.
+  // Replace the WAIT at |wait_link_offset| (before the newly added one) with a LINK to |gpu_addr|.
   // We will first modify the second dword which specifies the address,
   // as the hardware may be executing at the address of the current WAIT.
-  ringbuffer_->Overwrite32(prev_wait_offset_dwords - 1 /* dwords_before_tail */, gpu_addr);
+  ringbuffer_->Overwrite32(wait_link_offset + sizeof(uint32_t), gpu_addr);
   magma::barriers::Barrier();
-  ringbuffer_->Overwrite32(prev_wait_offset_dwords, MiLink::kCommandType | dest_prefetch);
+  ringbuffer_->Overwrite32(wait_link_offset, MiLink::kCommandType | dest_prefetch);
   magma::barriers::Barrier();
   return true;
 }
@@ -520,6 +516,7 @@
 
   // Number of new commands to be added to the ringbuffer - EVENT WAIT LINK.
   const uint16_t kRbPrefetch = 3;
+  uint32_t prev_wait_link = ringbuffer_->SubtractOffset(kWaitLinkDwords * sizeof(uint32_t));
 
   if (buf) {
     // Write a LINK at the end of the command buffer that links back to the ringbuffer.
@@ -553,7 +550,7 @@
 
   DLOG("Submitting buffer at gpu addr 0x%x", gpu_addr);
 
-  if (!LinkRingbuffer(kRbPrefetch, gpu_addr, *prefetch_out)) {
+  if (!LinkRingbuffer(prev_wait_link, gpu_addr, *prefetch_out)) {
     return DRETF(false, "Failed to link ringbuffer");
   }
   return true;
diff --git a/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.h b/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.h
index 2a752c2..ccf748c 100644
--- a/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.h
+++ b/garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.h
@@ -77,10 +77,9 @@
   // Adds a WAIT-LINK to the end of the ringbuffer.
   bool AddRingbufferWaitLink();
   // Modifies the last WAIT in the ringbuffer to link to |gpu_addr|.
-  // |num_new_rb_instructions| is the number of ringbuffer instructions that have been written
-  // since the last WAIT.
+  // |wait_link_offset| is the byte offset into the ringbuffer of the WAIT-LINK to replace.
   // |dest_prefetch| is the prefetch of the buffer we are linking to.
-  bool LinkRingbuffer(uint32_t num_new_rb_instructions, uint32_t gpu_addr, uint32_t dest_prefetch);
+  bool LinkRingbuffer(uint32_t wait_link_offset, uint32_t gpu_addr, uint32_t dest_prefetch);
 
   // Writes a LINK command at the end of the given buffer.
   bool WriteLinkCommand(magma::PlatformBuffer* buf, uint32_t length,
diff --git a/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.cc b/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.cc
index 1b4e502..21a0592 100644
--- a/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.cc
+++ b/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.cc
@@ -4,24 +4,20 @@
 
 #include "ringbuffer.h"
 
-bool Ringbuffer::Overwrite32(uint32_t dwords_before_tail, uint32_t value) {
-  // The tail points past the last element in the ringbuffer, so 0 is an invalid offset.
-  if (dwords_before_tail == 0) {
-    return DRETF(false, "Cannot overwrite at zero offset from tail");
+bool Ringbuffer::IsOffsetPopulated(uint32_t offset) {
+  if (offset >= size()) {
+    return false;
   }
-  uint32_t offset_bytes = dwords_before_tail * sizeof(uint32_t);
-  uint32_t rb_bytes_stored = (tail() >= head())
-      ? tail() - head()
-      : size() - head() + tail();
+  return (head() <= tail()) ?
+    ((offset >= head()) && (offset < tail())) :
+    ((offset >= head()) || (offset < tail()));
+}
 
-  if (rb_bytes_stored < offset_bytes) {
-    return DRETF(false, "Invalid offset from tail 0x%x bytes, cur ringbuffer size 0x%x",
-                 offset_bytes, rb_bytes_stored);
+bool Ringbuffer::Overwrite32(uint32_t offset, uint32_t value) {
+  if (!IsOffsetPopulated(offset)) {
+    return DRETF(false, "Invalid rb offset %u, head %u tail %u", offset, head(), tail());
   }
-  uint32_t write_offset = SubtractOffset(offset_bytes);
-  DASSERT(write_offset < size());
-
-  vaddr()[write_offset >> 2] = value;
+  vaddr()[offset >> 2] = value;
   return true;
 }
 
@@ -30,3 +26,21 @@
     ? tail() - offset_bytes
     : size() - offset_bytes + tail();
 }
+
+bool Ringbuffer::ReserveContiguous(uint32_t reserve_bytes) {
+  if (!HasSpace(reserve_bytes)) {
+    return DRETF(false, "Ringbuffer does not have space for %u bytes", reserve_bytes);
+  }
+  // If fewer than |reserve_bytes| contiguous bytes remain before the end of the
+  // ringbuffer, we will need to wrap the tail around to the start of the ringbuffer.
+  uint32_t bytes_until_end = size() - tail();
+  if (bytes_until_end < reserve_bytes) {
+    if (!HasSpace(reserve_bytes + bytes_until_end)) {
+      return DRETF(false, "Ringbuffer does not have contiguous space for %u bytes",
+                   reserve_bytes);
+    }
+    update_tail(0);
+    DASSERT(tail() != head());
+  }
+  return true;
+}
diff --git a/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.h b/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.h
index 07d1181..458ed73 100644
--- a/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.h
+++ b/garnet/drivers/gpu/msd-vsl-gc/src/ringbuffer.h
@@ -14,15 +14,28 @@
   Ringbuffer(std::unique_ptr<MsdVslBuffer>&& buffer, uint32_t start_offset)
       : magma::Ringbuffer<GpuMapping>(std::move(buffer), start_offset) {}
 
-  // Replaces the value stored in the ringbuffer at offset |dwords_before_tail| with |value|.
-  // Returns false if |dwords_before_tail| is zero, or does not point to a currently stored
+  // Returns whether |offset| points to a currently stored value in the ringbuffer.
+  bool IsOffsetPopulated(uint32_t offset);
+
+  // Replaces the 32-bit value stored at byte |offset| in the ringbuffer with |value|.
+  // Returns false if |offset| does not point to a currently stored
   // value in the ringbuffer.
-  bool Overwrite32(uint32_t dwords_before_tail, uint32_t value);
+  bool Overwrite32(uint32_t offset, uint32_t value);
 
   // Returns the position corresponding to negative |offset| from the current tail.
   uint32_t SubtractOffset(uint32_t offset);
 
+  // If needed, moves the ringbuffer tail to offset 0 so that the next write(s) totalling
+  // |want_bytes| will be contiguous.
+  // Returns whether the requested number of contiguous bytes were available,
+  // and any required ringbuffer tail adjustment was made.
+  // If false, the caller should wait for an existing event to be removed
+  // from the ringbuffer before trying again.
+  bool ReserveContiguous(uint32_t want_bytes);
+
   friend class RingbufferTest;
+  friend class RingbufferTest_OffsetPopulatedHeadBeforeTail_Test;
+  friend class RingbufferTest_OffsetPopulatedTailBeforeHead_Test;
 };
 
 #endif  // RINGBUFFER_H
diff --git a/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_events.cc b/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_events.cc
index 7e8839b..c394984 100644
--- a/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_events.cc
+++ b/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_events.cc
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+#include "garnet/drivers/gpu/msd-vsl-gc/src/instructions.h"
 #include "garnet/drivers/gpu/msd-vsl-gc/src/msd_vsl_device.h"
 #include "gtest/gtest.h"
 #include "helper/platform_device_helper.h"
@@ -95,6 +96,7 @@
   }
 
   for (unsigned int i = 0; i < 2; i++) {
+    uint32_t prev_wait_link = ringbuffer->SubtractOffset(kWaitLinkDwords * sizeof(uint32_t));
     // We will link to the end of the ringbuffer, where we are adding new events.
     uint32_t rb_link_addr = rb_gpu_addr + ringbuffer->tail();
 
@@ -110,12 +112,12 @@
 
     // Link the ringbuffer to the newly written events.
     uint32_t num_new_rb_instructions = MsdVslDevice::kNumEvents + 2;  // Add 2 for WAIT-LINK.
-    device_->LinkRingbuffer(num_new_rb_instructions, rb_link_addr,
+    device_->LinkRingbuffer(prev_wait_link, rb_link_addr,
                             num_new_rb_instructions /* prefetch */);
 
     constexpr uint64_t kTimeoutMs = 5000;
     for (unsigned int j = 0; j < MsdVslDevice::kNumEvents; j++) {
-      EXPECT_EQ(MAGMA_STATUS_OK, semaphores[j]->Wait(kTimeoutMs).get());
+      ASSERT_EQ(MAGMA_STATUS_OK, semaphores[j]->Wait(kTimeoutMs).get());
     }
   }
 
@@ -139,7 +141,7 @@
                                              event_id, semaphore->Clone(), &prefetch_out));
 
     constexpr uint64_t kTimeoutMs = 1000;
-    EXPECT_EQ(MAGMA_STATUS_OK, semaphore->Wait(kTimeoutMs).get());
+    ASSERT_EQ(MAGMA_STATUS_OK, semaphore->Wait(kTimeoutMs).get());
 
     ASSERT_TRUE(device_->FreeInterruptEvent(event_id));
   }
diff --git a/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_ringbuffer.cc b/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_ringbuffer.cc
index 8794a2e..54d631f 100644
--- a/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_ringbuffer.cc
+++ b/garnet/drivers/gpu/msd-vsl-gc/tests/unit_tests/test_ringbuffer.cc
@@ -36,11 +36,59 @@
   EXPECT_TRUE(ringbuffer->Map(address_space));
 }
 
-TEST_F(RingbufferTest, Overwrite32) {
+TEST_F(RingbufferTest, OffsetPopulatedEmpty) {
+  const uint32_t kRingbufferSize = 4096;
+  const uint32_t kStartOffset = 0;
+
+  auto ringbuffer = std::make_unique<Ringbuffer>(
+      MsdVslBuffer::Create(kRingbufferSize, "ringbuffer"), kStartOffset);
+  ASSERT_NE(ringbuffer, nullptr);
+
+  EXPECT_FALSE(ringbuffer->IsOffsetPopulated(0));
+  EXPECT_FALSE(ringbuffer->IsOffsetPopulated(4096));
+}
+
+TEST_F(RingbufferTest, OffsetPopulatedHeadBeforeTail) {
+  const uint32_t kRingbufferSize = 4096;
+  const uint32_t kStartOffset = 40;
+
+  auto ringbuffer = std::make_unique<Ringbuffer>(
+      MsdVslBuffer::Create(kRingbufferSize, "ringbuffer"), kStartOffset);
+  ASSERT_NE(ringbuffer, nullptr);
+
+  ringbuffer->update_tail(100);
+
+  EXPECT_TRUE(ringbuffer->IsOffsetPopulated(40));
+  EXPECT_TRUE(ringbuffer->IsOffsetPopulated(60));
+  EXPECT_TRUE(ringbuffer->IsOffsetPopulated(96));
+
+  EXPECT_FALSE(ringbuffer->IsOffsetPopulated(100));
+}
+
+TEST_F(RingbufferTest, OffsetPopulatedTailBeforeHead) {
+  const uint32_t kRingbufferSize = 4096;
+  const uint32_t kStartOffset = 4000;
+
+  auto ringbuffer = std::make_unique<Ringbuffer>(
+      MsdVslBuffer::Create(kRingbufferSize, "ringbuffer"), kStartOffset);
+  ASSERT_NE(ringbuffer, nullptr);
+
+  ringbuffer->update_tail(100);
+
+  EXPECT_TRUE(ringbuffer->IsOffsetPopulated(4000));
+  EXPECT_TRUE(ringbuffer->IsOffsetPopulated(4092));
+
+  EXPECT_FALSE(ringbuffer->IsOffsetPopulated(4096));
+
+  EXPECT_TRUE(ringbuffer->IsOffsetPopulated(0));
+  EXPECT_TRUE(ringbuffer->IsOffsetPopulated(96));
+
+  EXPECT_FALSE(ringbuffer->IsOffsetPopulated(100));
+}
+
+TEST_F(RingbufferTest, ReserveContiguous) {
   const uint32_t kRingbufferSize = magma::page_size();
-  // Start near the end of the ringbuffer so we can test wrapping around.
-  const uint32_t kStartOffset = magma::page_size() - (3 * sizeof(uint32_t));
-  const uint32_t kStartIndex = kStartOffset / sizeof(uint32_t);
+  const uint32_t kStartOffset = 0;
 
   auto ringbuffer = std::make_unique<Ringbuffer>(
       MsdVslBuffer::Create(kRingbufferSize, "ringbuffer"), kStartOffset);
@@ -49,52 +97,52 @@
   MockAddressSpaceOwner owner;
   std::shared_ptr<AddressSpace> address_space = AddressSpace::Create(&owner);
   ASSERT_NE(nullptr, address_space);
-
   EXPECT_TRUE(ringbuffer->Map(address_space));
 
-  // Should not be able to overwrite anything if ringbuffer is empty.
-  EXPECT_FALSE(ringbuffer->Overwrite32(0 /* dwords_before_tail */, 0));
-  EXPECT_FALSE(ringbuffer->Overwrite32(1 /* dwords_before_tail */, 0));
+  // Cannot request the same number of bytes as the ringbuffer size,
+  // as the ringbuffer holds 4 bytes less.
+  EXPECT_FALSE(ringbuffer->ReserveContiguous(kRingbufferSize));
+  // Request all the space available.
+  EXPECT_TRUE(
+      ringbuffer->ReserveContiguous(kRingbufferSize - sizeof(uint32_t) /* reserve_bytes */));
+  EXPECT_EQ(ringbuffer->tail(), 0u);  // Tail should stay the same until we write something.
 
-  // Write a few values to the ringbuffer but don't wrap around.
-  uint32_t num_values = 2;
-  for (unsigned int i = 0; i < num_values; i++) {
+  // Partially fill the ringbuffer, leaving |available_bytes| free.
+  const uint32_t available_bytes = 5 * sizeof(uint32_t);
+  const uint32_t bytes_written = kRingbufferSize - available_bytes - sizeof(uint32_t);
+  for (unsigned int i = 0; i < bytes_written / sizeof(uint32_t); i++) {
     ringbuffer->Write32(0xFFFFFFFF /* value */);
   }
-  // Overwrite the values we just wrote with the expected ringbuffer offset.
-  EXPECT_TRUE(ringbuffer->Overwrite32(1 /* dwords_before_tail */, kStartIndex + 1 /* value */));
-  EXPECT_TRUE(ringbuffer->Overwrite32(2 /* dwords_before_tail */, kStartIndex));
-  // Only wrote 2 values, cannot overwrite at index 3.
-  EXPECT_FALSE(ringbuffer->Overwrite32(3 /* dwords_before_tail */, 0));
+  EXPECT_EQ(ringbuffer->tail(), bytes_written);
 
-  // Fill the rest of the ringbuffer. The ringbuffer holds 1 less than the ringbuffer size.
-  uint32_t size_dwords = kRingbufferSize / sizeof(uint32_t);
-  num_values = size_dwords - num_values - 1;
-  for (unsigned int i = 0; i < num_values; i++) {
-    ringbuffer->Write32(0xFFFFFFFF /* value */);
-  }
-  EXPECT_EQ(ringbuffer->tail(), kStartOffset - sizeof(uint32_t));
+  // Ringbuffer state (# = occupied, x = unusable)
+  //
+  // Contents:  | ####################################### |               |x|
+  // Offset:    HEAD (0)                                  TAIL (4072)       END
 
-  // Replace the values we just wrote.
-  // The first value we wrote is at the last physical index of the ringbuffer.
-  EXPECT_TRUE(ringbuffer->Overwrite32(num_values /* dwords_before_tail */,
-                                      kStartIndex + 2 /* value */));
-  // Start overwriting values starting from the tail.
-  for (unsigned int i = 1; i < num_values; i++) {
-    uint32_t expected_index = kStartIndex - 1 - i;
-    EXPECT_TRUE(ringbuffer->Overwrite32(i /* dwords_before_tail */, expected_index /* value */));
-  }
+  // Request slightly more space than is available.
+  EXPECT_FALSE(ringbuffer->ReserveContiguous(available_bytes + sizeof(uint32_t)));
+  // Request all the space available.
+  EXPECT_TRUE(ringbuffer->ReserveContiguous(available_bytes));
+  EXPECT_EQ(ringbuffer->tail(), bytes_written);
 
-  // Verify all the values in the ringbuffer have been correctly replaced.
-  uint32_t* addr = vaddr(ringbuffer.get());
-  ASSERT_NE(addr, nullptr);
+  // Free up some space in the ringbuffer.
+  const uint32_t head_offset = 40;
+  ringbuffer->update_head(head_offset);
 
-  for (unsigned int i = 0; i < size_dwords; i++) {
-    // The index before the start index won't be written, as the ringbuffer can only store
-    // 1 less than the ringbuffer size.
-    uint32_t next_index = (i + 1) % size_dwords;
-    if (next_index != kStartIndex) {
-      EXPECT_EQ(addr[i], i);
-    }
-  }
+  // Ringbuffer state
+  //
+  // Contents:  |           |x| ######################### |               |
+  // Offset:    START         HEAD (40)                   TAIL (4072)     END
+
+  // As the head is no longer at 0, we can write an additional 4 bytes contiguously.
+  EXPECT_TRUE(ringbuffer->ReserveContiguous(available_bytes + sizeof(uint32_t)));
+  EXPECT_EQ(ringbuffer->tail(), bytes_written);
+
+  // There are enough bytes, but not contiguously.
+  EXPECT_FALSE(ringbuffer->ReserveContiguous(head_offset));
+
+  // This will reset the tail to get enough contiguous bytes.
+  EXPECT_TRUE(ringbuffer->ReserveContiguous(head_offset - sizeof(uint32_t)));
+  EXPECT_EQ(ringbuffer->tail(), 0u);
 }
diff --git a/garnet/lib/magma/src/magma_util/ringbuffer.h b/garnet/lib/magma/src/magma_util/ringbuffer.h
index f72f51c..5d633d0 100644
--- a/garnet/lib/magma/src/magma_util/ringbuffer.h
+++ b/garnet/lib/magma/src/magma_util/ringbuffer.h
@@ -54,6 +54,13 @@
  protected:
   uint32_t* vaddr() { return vaddr_; }
 
+  void update_tail(uint32_t tail) {
+    DASSERT((tail & (sizeof(*vaddr_) - 1)) == 0);
+    DASSERT(tail < size_);
+    DLOG("updating tail 0x%x", tail);
+    tail_ = tail;
+  }
+
  private:
   std::shared_ptr<typename GpuMapping::BufferType> buffer_;
   std::unique_ptr<GpuMapping> gpu_mapping_;