// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include "raster_builder_impl.h"
#include <memory.h>
#include <stdlib.h>
#include "block_pool.h"
#include "common/macros.h"
#include "common/vk/assert.h"
#include "common/vk/barrier.h"
#include "flush.h"
#include "handle_pool.h"
#include "path_builder.h"
#include "path_builder_impl.h"
#include "queue_pool.h"
#include "ring.h"
#include "shaders/push.h"
#include "spinel/spinel_assert.h"
#include "weakref.h"
//
// The raster builder prepares fill commands, transforms and clips for the
// rasterization sub-pipeline.
//
// A simplifying assumption is that the maximum length of a single raster can't
// be larger than what fits in the raster builder ring.
//
// This would be a very long raster and is a legitimate size limitation.
//
// If a raster exceeds this limit then the raster builder instance is lost.
//
// Note that this restriction can be removed with added complexity to the
// builder and shaders.
//
// The general strategy that this particular Vulkan implementation uses is to
// allocate a large "HOST_COHERENT" buffer for the ring.
//
// Note that the maximum number of "in-flight" rasterization sub-pipelines is
// conveniently determined by the size of the fence pool.
//
// The size of the ring buffer is driven by the desired size limit of a single
// raster.
//
// The worst-case total storage per fill() invocation is:
//
//   coherent
//   - fills      :  4 dwords
//   - transforms :  8 dwords
//   - clips      :  4 dwords
//   host
//   - paths      :  1 dword
//   - rasters    :  1 dword  +
//                  ----------
//                   18 dwords
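//
// For illustration only (hypothetical ring size): a ring of 8192 fill
// commands implies, in this worst case, 8192 x 16 dwords = 512 KiB of
// coherent storage and 8192 x 2 dwords = 64 KiB of host storage.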
//
// There are at most (SPN_RASTER_COHORT_METAS_SIZE-1) rasters in a single
// cohort.
//
// A single raster will necessarily have a maximum number of
// paths/transforms/clips.
//
// Exceeding this limit terminates the raster builder.
//
// Note that the fills/paths count will always be 1:1 and potentially greater
// than the varying transforms/clips/rasters counts.
//
// Worst case is that the fills/transforms/clips/paths/rasters counts are all
// equal.
//
// Note that fill commands, transforms and clips may be read more than once by
// the rasterization sub-pipeline.
//
// Depending on the device architecture, it may be beneficial to copy the
// working region of the coherent buffer to a device-local buffer.
//
// If the Vulkan device is integrated or supports mapped write-through (AMD)
// then we don't need to copy. If the device is discrete and doesn't support
// write-through (NVIDIA) then we do.
//
// Note that the fill command can reduce its transform and clip fields to 13-16
// bits and fit into 3 dwords but it's easier to use a uint4 with GPUs.
//
// A non-affine transformation elevates a Bezier to a rational. For this
// reason, we indicate with a bit flag if the transform matrix has non-zero
// {w0,w1} elements.
//
//
//
// clang-format off
#define SPN_RASTER_BUILDER_RAST_TYPE_EXPAND() \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(proj_line, SPN_RAST_TYPE_PROJ_LINE, 4) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(proj_quad, SPN_RAST_TYPE_PROJ_QUAD, 6) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(proj_cubic, SPN_RAST_TYPE_PROJ_CUBIC, 8) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(line, SPN_RAST_TYPE_LINE, 4) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(quad, SPN_RAST_TYPE_QUAD, 6) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(cubic, SPN_RAST_TYPE_CUBIC, 8) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(rat_quad, SPN_RAST_TYPE_RAT_QUAD, 7) \
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(rat_cubic, SPN_RAST_TYPE_RAT_CUBIC, 10)
//
// TODO(allanmac): Unify the .cf and .tc rings since they're both quads.
//
//
// The fill command layout is the same on both the host and device.
//
struct spinel_cmd_fill
{
uint32_t path_h; // host id
uint32_t na0 : 16; // unused
uint32_t cohort : 15; // cohort is 8-11 bits
uint32_t transform_type : 1; // transform type: 0=affine,1=projective
uint32_t transform; // index of first quad of transform
uint32_t clip; // index of clip quad
};
// clang-format on
STATIC_ASSERT_MACRO_1(sizeof(struct spinel_cmd_fill) == sizeof(uint32_t[4]));
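//
// Dword layout sketch (assuming low-to-high bitfield allocation):
//
//   dword 0 : path_h
//   dword 1 : na0[15:0] | cohort[30:16] | transform_type[31]
//   dword 2 : transform
//   dword 3 : clip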
//
// Ring work span
//
struct spinel_rbi_head_span
{
uint32_t head;
uint32_t span;
};
//
// Dispatch states
//
typedef enum spinel_rbi_dispatch_state_e
{
SPN_RBI_DISPATCH_STATE_INVALID,
SPN_RBI_DISPATCH_STATE_RECORDING,
SPN_RBI_DISPATCH_STATE_PENDING,
SPN_RBI_DISPATCH_STATE_COMPLETE,
} spinel_rbi_dispatch_state_e;
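//
// State lifecycle, as driven by the code below:
//
//   INVALID   --(dispatch_head_init)----> RECORDING
//   RECORDING --(flush_submit)----------> PENDING
//   PENDING   --(flush_complete)--------> COMPLETE
//   COMPLETE  --(in-order ring release)-> INVALID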
//
// There are always as many dispatch records as there are fences in
// the fence pool. This simplifies reasoning about concurrency.
//
// FIXME(allanmac): We don't have to track tc/rc once submitted
// FIXME(allanmac): We probably can drop most of the dbi structs
//
struct spinel_rbi_dispatch
{
struct
{
struct
{
struct spinel_dbi_devaddr internal;
struct spinel_dbi_devaddr indirect;
} rs;
struct spinel_dbi_devaddr ttrks; // ttrks + ttrks_keyvals_even
struct spinel_dbi_devaddr fill_scan; // used before sorting
struct spinel_dbi_devaddr rast_cmds; // used before sorting
struct spinel_dbi_devaddr ttrk_keyvals_odd; // used by radix and post-sort
} vk;
struct spinel_rbi_head_span cf; // fills and paths are 1:1
struct spinel_rbi_head_span tc; // transform quads and clips
struct spinel_rbi_head_span rc; // rasters in cohort
spinel_deps_delayed_semaphore_t delayed;
spinel_rbi_dispatch_state_e state;
};
//
// The host-side rings share a single host-coherent buffer:
//
// |<--cmds(uvec4)-->|<--transform.lo/hi & clip(vec4)-->|<--raster_h(uint)-->|
//
// Each ring has a different access pattern:
//
//   ring       | reads
//   -----------+-------
//   cmd_fills  |  2
//   transforms |  1+
//   clips      |  1+
//   rasters    |  1
//
// For this reason, some Vulkan devices may benefit from copying the
// ring spans from the host-coherent buffer to a device-local buffer.
//
struct spinel_rbi_vk
{
struct
{
struct
{
struct spinel_dbi_dm_devaddr h;
struct spinel_dbi_dm_devaddr d;
} cf;
struct
{
struct spinel_dbi_dm_devaddr h;
struct spinel_dbi_dm_devaddr d;
} tc;
struct
{
struct spinel_dbi_dm_devaddr h;
struct spinel_dbi_dm_devaddr d;
} rc;
} rings;
struct
{
struct spinel_dbi_dm ttrks;
struct spinel_dbi_dm rfs_rrc_tko;
struct
{
struct spinel_dbi_dm internal;
struct spinel_dbi_dm indirect;
} rs;
} dispatch;
};
//
//
//
struct spinel_raster_builder_impl
{
struct spinel_raster_builder * raster_builder;
struct spinel_device * device;
struct spinel_rbi_vk vk;
//
// As noted above, the remaining slots in the fills ring are always
// greater than or equal to the remaining slots in the tc ring, so we
// use simpler accounting for the tc and rc rings.
//
struct
{
struct
{
struct spinel_cmd_fill * extent;
struct spinel_ring ring;
} cf; // fill commands
struct
{
SPN_TYPE_F32VEC4 * extent;
struct spinel_next next;
} tc; // transforms & clips
struct
{
spinel_handle_t * extent;
struct spinel_next next;
} rc; // rasters in cohort
} mapped;
//
// work in progress raster
//
struct
{
struct spinel_rbi_head_span cf; // fill commands
struct spinel_rbi_head_span tc; // transforms and clips
} wip;
//
// Resources released upon grid completion:
//
// - Path handles can be released after the rasterization stage.
//
// - Raster handles can be released after the entire rasterization
// sub-pipeline completes.
//
// - Dispatch records and associated mapped spans are released in
// ring order.
//
spinel_weakref_epoch_t epoch;
struct
{
spinel_handle_t * extent;
} paths;
struct
{
spinel_handle_t * extent;
} rasters;
struct
{
struct spinel_rbi_dispatch * extent;
struct spinel_ring ring;
} dispatches;
};
//
//
//
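// A raster builder is "staged" when its host-visible rings are not
// device-local and their spans must be copied to device-local buffers at
// flush time (see spinel_rbi_copy_ring).
//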
static bool
spinel_rbi_is_staged(struct spinel_device const * device)
{
return !spinel_allocator_is_device_local(&device->allocator.device.perm.hw_dr);
}
//
// These pfns are installed when the Spinel context is lost
//
static spinel_result_t
spinel_rbi_lost_begin(struct spinel_raster_builder_impl * impl)
{
return SPN_ERROR_RASTER_BUILDER_LOST;
}
static spinel_result_t
spinel_rbi_lost_end(struct spinel_raster_builder_impl * impl, spinel_raster_t * raster)
{
*raster = SPN_RASTER_INVALID; // FIXME -- SPN_TYPED_HANDLE_INVALID
return SPN_ERROR_RASTER_BUILDER_LOST;
}
static spinel_result_t
spinel_rbi_release(struct spinel_raster_builder_impl * impl);
static spinel_result_t
spinel_rbi_lost_release(struct spinel_raster_builder_impl * impl)
{
//
// FIXME -- releasing a lost raster builder might eventually require a
// specialized function. For now, just call the default release.
//
return spinel_rbi_release(impl);
}
static spinel_result_t
spinel_rbi_lost_flush(struct spinel_raster_builder_impl * impl)
{
return SPN_ERROR_RASTER_BUILDER_LOST;
}
static spinel_result_t
spinel_rbi_lost_add(struct spinel_raster_builder_impl * impl,
spinel_path_t const * paths,
spinel_transform_weakref_t * transform_weakrefs,
spinel_transform_t const * transforms,
spinel_clip_weakref_t * clip_weakrefs,
spinel_clip_t const * clips,
uint32_t count)
{
return SPN_ERROR_RASTER_BUILDER_LOST;
}
//
// If (wip.span == mapped.ring.size) then the raster is too long and
// the raster builder is terminally "lost". The raster builder should
// be released and a new one created.
//
static void
spinel_rbi_lost(struct spinel_raster_builder_impl * impl)
{
struct spinel_raster_builder * const rb = impl->raster_builder;
rb->begin = spinel_rbi_lost_begin;
rb->end = spinel_rbi_lost_end;
rb->release = spinel_rbi_lost_release;
rb->flush = spinel_rbi_lost_flush;
rb->add = spinel_rbi_lost_add;
}
static void
spinel_rbi_raster_append(struct spinel_raster_builder_impl * impl, spinel_handle_t handle)
{
uint32_t const idx = spinel_next_acquire_1(&impl->mapped.rc.next);
impl->mapped.rc.extent[idx] = handle; // device
impl->rasters.extent[idx] = handle; // host
}
//
// A dispatch captures the ring spans (cf/tc/rc) of either a dispatched or
// the work-in-progress compute grid.
//
static struct spinel_rbi_dispatch *
spinel_rbi_dispatch_head(struct spinel_raster_builder_impl * impl)
{
assert(!spinel_ring_is_empty(&impl->dispatches.ring));
return impl->dispatches.extent + impl->dispatches.ring.head;
}
static struct spinel_rbi_dispatch *
spinel_rbi_dispatch_tail(struct spinel_raster_builder_impl * impl)
{
assert(!spinel_ring_is_full(&impl->dispatches.ring));
return impl->dispatches.extent + impl->dispatches.ring.tail;
}
static void
spinel_rbi_dispatch_drop(struct spinel_raster_builder_impl * impl)
{
struct spinel_ring * const ring = &impl->dispatches.ring;
spinel_ring_drop_1(ring);
}
static void
spinel_rbi_dispatch_head_init(struct spinel_raster_builder_impl * impl)
{
//
// Don't initialize this with a spinel_rbi_dispatch struct and designated
// initializers because each dispatch structure has precalculated .dbi
// buffers.
//
// Per-member initializers are fine.
//
struct spinel_rbi_dispatch * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_INVALID);
dispatch->cf = impl->wip.cf;
dispatch->tc = impl->wip.tc;
dispatch->rc = (struct spinel_rbi_head_span){ .head = impl->mapped.rc.next.head, .span = 0 };
dispatch->delayed = SPN_DEPS_DELAYED_SEMAPHORE_INVALID;
dispatch->state = SPN_RBI_DISPATCH_STATE_RECORDING;
}
static void
spinel_rbi_dispatch_acquire(struct spinel_raster_builder_impl * impl)
{
struct spinel_ring * const ring = &impl->dispatches.ring;
struct spinel_device * const device = impl->device;
while (spinel_ring_is_empty(ring))
{
spinel_deps_drain_1(device->deps, &device->vk);
}
spinel_rbi_dispatch_head_init(impl);
}
//
//
//
static void
spinel_rbi_dispatch_append_wip(struct spinel_raster_builder_impl * impl,
struct spinel_rbi_dispatch * dispatch)
{
dispatch->cf.span += impl->wip.cf.span;
dispatch->tc.span += impl->wip.tc.span;
dispatch->rc.span += 1;
}
//
// We record where the *next* work-in-progress raster will start in the ring
// along with its rolling counter.
//
static void
spinel_rbi_wip_reset(struct spinel_raster_builder_impl * impl)
{
impl->wip.cf = (struct spinel_rbi_head_span){ .head = impl->mapped.cf.ring.head, .span = 0 };
impl->wip.tc = (struct spinel_rbi_head_span){ .head = impl->mapped.tc.next.head, .span = 0 };
}
//
//
//
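// Copy the used span of a host ring to its device-local counterpart,
// splitting the copy into two regions when the span wraps the end of the
// ring.
//
// Illustrative example: ring_size = 16, head = 14 and span = 5 copy 2
// elements at offset 14 and 3 elements at offset 0.
//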
static void
spinel_rbi_copy_ring(VkCommandBuffer cb,
VkDescriptorBufferInfo const * h,
VkDescriptorBufferInfo const * d,
VkDeviceSize elem_size,
uint32_t ring_size,
struct spinel_rbi_head_span const * head_span)
{
VkBufferCopy bcs[2];
uint32_t bc_count;
bool const is_wrap = (head_span->span + head_span->head) > ring_size;
uint32_t const span_hi = is_wrap ? (ring_size - head_span->head) : head_span->span;
VkDeviceSize offset_hi = elem_size * head_span->head;
bcs[0].srcOffset = h->offset + offset_hi;
bcs[0].dstOffset = d->offset + offset_hi;
bcs[0].size = elem_size * span_hi;
if (is_wrap)
{
uint32_t const span_lo = head_span->span - span_hi;
bcs[1].srcOffset = h->offset;
bcs[1].dstOffset = d->offset;
bcs[1].size = elem_size * span_lo;
bc_count = 2;
}
else
{
bc_count = 1;
}
vkCmdCopyBuffer(cb, h->buffer, d->buffer, bc_count, bcs);
}
//
//
//
static void
spinel_rbi_flush_complete(void * data0, void * data1)
{
struct spinel_raster_builder_impl * const impl = data0;
struct spinel_rbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
//
// These raster handles are now materialized so invalidate their dependencies.
//
spinel_deps_delayed_detach_ring(device->deps,
impl->rasters.extent,
impl->mapped.rc.next.size,
dispatch->rc.head,
dispatch->rc.span);
//
// Release paths -- may invoke wait()
//
// FIXME(allanmac): Paths could be released much earlier if we're willing to
// complicate the submission and launch an additional command buffer.
//
spinel_device_release_d_paths_ring(device,
impl->paths.extent,
impl->mapped.cf.ring.size,
dispatch->cf.head,
dispatch->cf.span);
//
// Release the rasters -- may invoke wait()
//
spinel_device_release_d_rasters_ring(device,
impl->rasters.extent,
impl->mapped.rc.next.size,
dispatch->rc.head,
dispatch->rc.span);
//
// If the dispatch is the tail of the ring then try to release as many
// dispatch records as possible...
//
// Note that dispatches can complete in any order but the ring releases need
// to occur in order.
//
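// For example, if dispatch 1 completes before dispatch 0, dispatch 1 is only
// marked COMPLETE below; its cf span and dispatch record are released later,
// when dispatch 0 completes and the tail walk reaches dispatch 1.
//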
assert(dispatch->state != SPN_RBI_DISPATCH_STATE_COMPLETE);
dispatch->state = SPN_RBI_DISPATCH_STATE_COMPLETE;
struct spinel_rbi_dispatch * tail = spinel_rbi_dispatch_tail(impl);
while (tail->state == SPN_RBI_DISPATCH_STATE_COMPLETE)
{
// will always be true
assert(impl->mapped.cf.ring.tail == tail->cf.head);
// release the blocks and cmds
spinel_ring_release_n(&impl->mapped.cf.ring, tail->cf.span);
// release the dispatch
spinel_ring_release_n(&impl->dispatches.ring, 1);
// mark as invalid
tail->state = SPN_RBI_DISPATCH_STATE_INVALID;
// any dispatches still in flight?
if (spinel_ring_is_full(&impl->dispatches.ring))
{
break;
}
// get new tail
tail = spinel_rbi_dispatch_tail(impl);
}
}
//
//
//
static VkPipelineStageFlags
spinel_rbi_flush_record(VkCommandBuffer cb, void * data0, void * data1)
{
//
// 0. ZEROES & COPY
//
// Prepares device-side data structures.
//
// 1. FILL_SCAN
//
// Compute the prefix sum of each path type in the fill's path.
//
// 2. FILL_DISPATCH
//
// Take the atomically updated count of rasterization commands and
// initialize a workgroup triple for vkCmdDispatchIndirect().
//
// 3. FILL_EXPAND
//
// Expand the fill command into rasterization commands and store them to
// a temporary buffer:
//
// |<lines><quads><cubics><rat_quads><rat_cubics>|
//
// 4. RASTERIZE_LINES/QUADS/CUBICS/RAT_QUADS/RAT_CUBICS
//
// For each path type, indirectly dispatch a rasterizer.
//
// 5. INDIRECT RADIX SORT TTRK KEYS
//
// 6. SEGMENT_TTRK_DISPATCH
//
// 7. SEGMENT_TTRK
//
// 8. RASTERS_ALLOC
//
// 9. RASTERS_PREFIX
//
struct spinel_raster_builder_impl * const impl = data0;
struct spinel_rbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
////////////////////////////////////////////////////////////////
//
// FILL: ZERO RASTER COHORT META TABLE
// FILL: ZERO RASTERIZE.FILL_SCAN_COUNTS
//
////////////////////////////////////////////////////////////////
{
//
// * Zero ttrks.meta SoA arrays *after* .alloc[]
// * Zero ttrks.count_dispatch
//
// NOTE(allanmac): The ttrks.meta SoA fills have no dependencies until step
// (7) so there may be an opportunity to delay this. But the
// ttrks.count_dispatch needs to be zeroed earlier. Since they're adjacent
// we go ahead and zero them together.
//
// clang-format off
VkDeviceSize const meta_offset = SPN_BUFFER_OFFSETOF(ttrks, meta.rk_off);
VkDeviceSize const count_dispatch_offset = SPN_BUFFER_OFFSETOF(ttrks, count_dispatch);
VkDeviceSize const count_dispatch_size = SPN_BUFFER_MEMBER_SIZE(ttrks, count_dispatch);
VkDeviceSize const span = count_dispatch_offset + count_dispatch_size - meta_offset;
// clang-format on
vkCmdFillBuffer(cb,
dispatch->vk.ttrks.dbi.buffer,
dispatch->vk.ttrks.dbi.offset + meta_offset,
span,
0);
}
////////////////////////////////////////////////////////////////
//
// FILL: ZERO RASTERIZE.FILL_SCAN_COUNTS
//
////////////////////////////////////////////////////////////////
{
VkDeviceSize const offset = SPN_BUFFER_OFFSETOF(rasterize_fill_scan, counts);
VkDeviceSize const size = SPN_BUFFER_MEMBER_SIZE(rasterize_fill_scan, counts);
vkCmdFillBuffer(cb,
dispatch->vk.fill_scan.dbi.buffer,
dispatch->vk.fill_scan.dbi.offset + offset,
size,
0);
}
////////////////////////////////////////////////////////////////
//
// COPY COMMAND RINGS
//
// On a discrete GPU, 1-2 regions of 3 rings are copied from H>D
//
// FIXME(allanmac): Only the .cf ring is used by fill_scan so the .tc and .rc
// copies could be delayed.
//
////////////////////////////////////////////////////////////////
struct spinel_target_config const * const config = &device->ti.config;
if (spinel_rbi_is_staged(device))
{
// CF
spinel_rbi_copy_ring(cb,
&impl->vk.rings.cf.h.dbi_dm.dbi,
&impl->vk.rings.cf.d.dbi_dm.dbi,
sizeof(*impl->mapped.cf.extent),
impl->mapped.cf.ring.size,
&dispatch->cf);
// TC
spinel_rbi_copy_ring(cb,
&impl->vk.rings.tc.h.dbi_dm.dbi,
&impl->vk.rings.tc.d.dbi_dm.dbi,
sizeof(*impl->mapped.tc.extent),
impl->mapped.tc.next.size,
&dispatch->tc);
// RC
spinel_rbi_copy_ring(cb,
&impl->vk.rings.rc.h.dbi_dm.dbi,
&impl->vk.rings.rc.d.dbi_dm.dbi,
sizeof(*impl->mapped.rc.extent),
impl->mapped.rc.next.size,
&dispatch->rc);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: FILLS & COPIES
//
////////////////////////////////////////////////////////////////
vk_barrier_transfer_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: FILL_SCAN
//
////////////////////////////////////////////////////////////////
struct spinel_push_fill_scan const push_fill_scan = {
.devaddr_rasterize_fill_scan = dispatch->vk.fill_scan.devaddr,
.devaddr_rasterize_fill_cmds = impl->vk.rings.cf.d.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_block_pool_host_map = device->block_pool.vk.dbi_devaddr.host_map.devaddr,
.cmd_head = dispatch->cf.head,
.cmd_size = impl->mapped.cf.ring.size,
.cmd_span = dispatch->cf.span
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.fill_scan,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_fill_scan),
&push_fill_scan);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.fill_scan);
{
//
// Each invocation processes multiple commands.
//
uint32_t const cmds_per_wg = config->raster_builder.fill_scan.rows * //
config->group_sizes.named.fill_scan.workgroup;
uint32_t const wg_count = (dispatch->cf.span + cmds_per_wg - 1) / cmds_per_wg;
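//
// Illustration with hypothetical config values: fill_scan.rows = 4 and a
// workgroup size of 256 give cmds_per_wg = 1024, so a cf span of 3000
// commands dispatches ceil(3000 / 1024) = 3 workgroups.
//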
vkCmdDispatch(cb, wg_count, 1, 1);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: FILL_DISPATCH
//
// NOTE: PUSH CONSTANTS ARE COMPATIBLE WITH FILL_SCAN
//
// A single workgroup initializes the indirect dispatches
//
// Either 4 or 8 invocations are required (SPN_RAST_TYPE_COUNT == 8)
//
////////////////////////////////////////////////////////////////
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.fill_dispatch);
vkCmdDispatch(cb, 1, 1, 1);
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
// Note that FILL_EXPAND reads the u32vec4 initialized by FILL_DISPATCH but
// only RASTERIZE_XXX indirectly dispatches off of the u32vec4.
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: FILL_EXPAND
//
// NOTE: PUSH CONSTANTS ARE MOSTLY COMPATIBLE WITH FILL_SCAN
//
////////////////////////////////////////////////////////////////
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.fill_expand,
VK_SHADER_STAGE_COMPUTE_BIT,
SPN_PUSH_OFFSETOF(fill_expand, devaddr_rasterize_rast_cmds),
SPN_PUSH_MEMBER_SIZE(fill_expand, devaddr_rasterize_rast_cmds),
&dispatch->vk.rast_cmds.devaddr);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.fill_expand);
{
//
// Dispatch one subgroup per command
//
uint32_t const sgs_per_wg = config->group_sizes.named.fill_expand.workgroup >> //
config->group_sizes.named.fill_expand.subgroup_log2;
uint32_t const wg_count = (dispatch->cf.span + sgs_per_wg - 1) / sgs_per_wg;
vkCmdDispatch(cb, wg_count, 1, 1);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>INDIRECT|COMPUTE
//
// Note that the indirect dispatch was initialized by `fill_dispatch`.
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_indirect_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// TODO(allanmac): PIPELINE RASTERIZE_DISPATCH
//
// The indirect dispatch of the rasterization *may* need to support workgroups
// larger than one subgroup if the device architecture doesn't achieve max
// residency when dispatching single subgroup workgroups. This would require
// another compute shader to adjust the dispatch counts. For now, just assume
// (workgroup_size == subgroup_size).
//
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
//
// PIPELINES: RASTERIZE_PROJ_LINE
// RASTERIZE_PROJ_QUAD
// RASTERIZE_PROJ_CUBIC
// RASTERIZE_LINE
// RASTERIZE_QUAD
// RASTERIZE_CUBIC
// RASTERIZE_RAT_QUAD
// RASTERIZE_RAT_CUBIC
//
////////////////////////////////////////////////////////////////
assert(config->group_sizes.named.rasterize_line.workgroup ==
(1u << config->group_sizes.named.rasterize_line.subgroup_log2));
struct spinel_push_rasterize const push_rasterize = {
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_rasterize_fill_quads = impl->vk.rings.tc.d.devaddr,
.devaddr_rasterize_fill_scan = dispatch->vk.fill_scan.devaddr,
.devaddr_rasterize_rast_cmds = dispatch->vk.rast_cmds.devaddr,
.devaddr_ttrks = dispatch->vk.ttrks.devaddr,
.bp_mask = device->block_pool.bp_mask,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.rasterize_line,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_rasterize),
&push_rasterize);
VkBuffer const rasterize_buffer = dispatch->vk.fill_scan.dbi.buffer;
VkDeviceSize const rasterize_offset = dispatch->vk.fill_scan.dbi.offset + //
SPN_BUFFER_OFFSETOF(rasterize_fill_scan, dispatch);
#undef SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X
#define SPN_RASTER_BUILDER_RAST_TYPE_EXPAND_X(_p, _i, _n) \
{ \
vkCmdBindPipeline(cb, \
VK_PIPELINE_BIND_POINT_COMPUTE, \
device->ti.pipelines.named.rasterize_##_p); \
\
vkCmdDispatchIndirect(cb, rasterize_buffer, rasterize_offset + _i * sizeof(SPN_TYPE_U32VEC4)); \
}
SPN_RASTER_BUILDER_RAST_TYPE_EXPAND()
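//
// The expansion above binds and indirectly dispatches one rasterizer per
// path segment type; each reads its own u32vec4 workgroup triple from the
// rasterize_fill_scan dispatch array initialized by FILL_DISPATCH.
//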
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>INDIRECT|COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_indirect_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// RADIX SORT INDIRECT
//
// The "rasterize_fill_scan" "rasterize_rast_cmds" extents are no longer used
// at this point.
//
////////////////////////////////////////////////////////////////
VkDescriptorBufferInfo const dbi_ttrks_count = {
.buffer = dispatch->vk.ttrks.dbi.buffer,
.offset = dispatch->vk.ttrks.dbi.offset + SPN_BUFFER_OFFSETOF(ttrks, count_dispatch.w),
.range = sizeof(uint32_t),
};
VkDescriptorBufferInfo const dbi_ttrk_keyvals_even = {
.buffer = dispatch->vk.ttrks.dbi.buffer,
.offset = dispatch->vk.ttrks.dbi.offset + SPN_BUFFER_OFFSETOF(ttrks, keyvals),
.range = dispatch->vk.ttrks.dbi.range - SPN_BUFFER_OFFSETOF(ttrks, keyvals),
};
struct radix_sort_vk_sort_indirect_info const info = {
.ext = NULL,
.key_bits = SPN_TTRK_BITS_XY_COHORT,
.count = dbi_ttrks_count,
.keyvals_even = dbi_ttrk_keyvals_even,
.keyvals_odd = dispatch->vk.ttrk_keyvals_odd.dbi,
.internal = dispatch->vk.rs.internal.dbi,
.indirect = dispatch->vk.rs.indirect.dbi,
};
VkDescriptorBufferInfo dbi_ttrk_keyvals_out;
radix_sort_vk_sort_indirect(device->ti.rs, &info, device->vk.d, cb, &dbi_ttrk_keyvals_out);
// Device address of extent output by radix sort
VkDeviceAddress const devaddr_ttrk_keyvals_out = spinel_dbi_to_devaddr(device->vk.d, //
&dbi_ttrk_keyvals_out);
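// Note: the sorted keyvals may land in either the "even" or "odd" keyvals
// buffer depending on the number of sorting passes, so the output extent is
// reported by the sort rather than assumed.
//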
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: TTRKS_SEGMENT_DISPATCH
//
// FIXME(allanmac): push_ttrks_segment_dispatch is "push compatible" with
// push_ttrks_segment
//
////////////////////////////////////////////////////////////////
struct spinel_push_ttrks_segment_dispatch const push_ttrks_segment_dispatch = {
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.ttrks_segment_dispatch,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_ttrks_segment_dispatch),
&push_ttrks_segment_dispatch);
vkCmdBindPipeline(cb,
VK_PIPELINE_BIND_POINT_COMPUTE,
device->ti.pipelines.named.ttrks_segment_dispatch);
vkCmdDispatch(cb, 1, 1, 1); // A single invocation initializes the indirect dispatches
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>INDIRECT|COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_indirect_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: TTRKS_SEGMENT
//
// FIXME(allanmac): push_ttrks_segment_dispatch is "push compatible" with
// push_ttrks_segment
//
////////////////////////////////////////////////////////////////
struct spinel_push_ttrks_segment const push_ttrks_segment = {
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
.devaddr_ttrk_keyvals = devaddr_ttrk_keyvals_out,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.ttrks_segment,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_ttrks_segment),
&push_ttrks_segment);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.ttrks_segment);
VkDeviceSize const ttrks_segment_count_dispatch_offset =
dispatch->vk.ttrks.dbi.offset + SPN_BUFFER_OFFSETOF(ttrks, count_dispatch);
vkCmdDispatchIndirect(cb, dispatch->vk.ttrks.dbi.buffer, ttrks_segment_count_dispatch_offset);
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: RASTERS_ALLOC
//
////////////////////////////////////////////////////////////////
struct spinel_push_rasters_alloc const push_rasters_alloc = {
.devaddr_raster_ids = impl->vk.rings.rc.d.devaddr,
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_block_pool_host_map = device->block_pool.vk.dbi_devaddr.host_map.devaddr,
.ids_size = impl->mapped.rc.next.size,
.ids_head = dispatch->rc.head,
.ids_span = dispatch->rc.span,
.bp_mask = device->block_pool.bp_mask,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.rasters_alloc,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_rasters_alloc),
&push_rasters_alloc);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.rasters_alloc);
{
//
// Dispatch one thread per raster rounded up to a workgroup
//
uint32_t const wg_size = config->group_sizes.named.rasters_alloc.workgroup;
uint32_t const wg_count = (dispatch->rc.span + wg_size - 1) / wg_size;
vkCmdDispatch(cb, wg_count, 1, 1);
}
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PIPELINE: RASTERS_PREFIX
//
////////////////////////////////////////////////////////////////
struct spinel_push_rasters_prefix const push_rasters_prefix = {
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_ttrks_header = dispatch->vk.ttrks.devaddr,
.devaddr_ttrk_keyvals = devaddr_ttrk_keyvals_out,
.ids_size = impl->mapped.rc.next.size,
.ids_head = dispatch->rc.head,
.ids_span = dispatch->rc.span,
.bp_mask = device->block_pool.bp_mask,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.rasters_prefix,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_rasters_prefix),
&push_rasters_prefix);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.rasters_prefix);
{
//
// Dispatch one subgroup per raster
//
uint32_t const sgs_per_wg = config->group_sizes.named.rasters_prefix.workgroup >>
config->group_sizes.named.rasters_prefix.subgroup_log2;
uint32_t const wg_count = (dispatch->rc.span + sgs_per_wg - 1) / sgs_per_wg;
vkCmdDispatch(cb, wg_count, 1, 1);
}
//
// NOTE(allanmac):
//
// The `deps` scheduler assumes that the command buffers associated with
// delayed semaphores always end with a compute shader
// (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT).
//
// Only the path builder and raster builder acquire delayed semaphores.
//
return VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
}
//
//
//
static void
spinel_rbi_flush_submit(void * data0, void * data1)
{
struct spinel_raster_builder_impl * const impl = data0;
struct spinel_rbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
assert(dispatch->cf.span > 0);
assert(dispatch == spinel_rbi_dispatch_head(impl));
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
//
// Flush if the ring is non-coherent
//
if (!spinel_allocator_is_coherent(&device->allocator.device.perm.hw_dr))
{
// Flush CF
spinel_ring_flush(&device->vk,
impl->vk.rings.cf.h.dbi_dm.dm,
0UL,
impl->mapped.cf.ring.size,
dispatch->cf.head,
dispatch->cf.span,
sizeof(*impl->mapped.cf.extent));
// Flush TC
spinel_ring_flush(&device->vk,
impl->vk.rings.tc.h.dbi_dm.dm,
0UL,
impl->mapped.tc.next.size,
dispatch->tc.head,
dispatch->tc.span,
sizeof(*impl->mapped.tc.extent));
// Flush RC
spinel_ring_flush(&device->vk,
impl->vk.rings.rc.h.dbi_dm.dm,
0UL,
impl->mapped.rc.next.size,
dispatch->rc.head,
dispatch->rc.span,
sizeof(*impl->mapped.rc.extent));
}
//
// Acquire an immediate semaphore
//
struct spinel_deps_immediate_submit_info const disi = {
.record = {
.pfn = spinel_rbi_flush_record,
.data0 = impl,
.data1 = dispatch,
},
.wait = {
.delayed = {
.handles = {
.extent = impl->paths.extent,
.size = impl->mapped.cf.ring.size,
.head = dispatch->cf.head,
.span = dispatch->cf.span,
},
},
},
.completion = {
.pfn = spinel_rbi_flush_complete,
.data0 = impl,
.data1 = dispatch,
},
.signal = {
.delayed = {
.count = 1,
.semaphores = {
dispatch->delayed,
},
},
},
};
//
// Invalidate all outstanding transform and clip weakrefs
//
spinel_weakref_epoch_increment(&impl->epoch);
//
// The current dispatch is in flight so drop it
//
spinel_rbi_dispatch_drop(impl);
//
// Move to pending state
//
dispatch->state = SPN_RBI_DISPATCH_STATE_PENDING;
//
// We don't need to save the immediate semaphore
//
spinel_deps_immediate_submit(device->deps, &device->vk, &disi, NULL);
//
// Acquire and initialize the next dispatch
//
spinel_rbi_dispatch_acquire(impl);
}
//
//
//
static spinel_result_t
spinel_rbi_flush(struct spinel_raster_builder_impl * impl)
{
//
// Anything to launch?
//
struct spinel_rbi_dispatch const * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
if (dispatch->rc.span == 0)
{
assert(dispatch->cf.span == 0);
return SPN_SUCCESS;
}
//
// Invoke the delayed submission action
//
spinel_deps_delayed_flush(impl->device->deps, dispatch->delayed);
return SPN_SUCCESS;
}
//
//
//
static spinel_result_t
spinel_rbi_begin(struct spinel_raster_builder_impl * impl)
{
return SPN_SUCCESS;
}
//
//
//
static spinel_result_t
spinel_rbi_end(struct spinel_raster_builder_impl * impl, spinel_raster_t * raster)
{
//
// acquire raster host id
//
struct spinel_device * const device = impl->device;
raster->handle = spinel_device_handle_acquire(device);
//
// get the head dispatch
//
struct spinel_rbi_dispatch * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
//
// do we need to acquire a delayed semaphore for the dispatch?
//
if (dispatch->rc.span == 0)
{
struct spinel_deps_acquire_delayed_info const dadi = {
.submission = { .pfn = spinel_rbi_flush_submit, //
.data0 = impl,
.data1 = dispatch },
};
dispatch->delayed = spinel_deps_delayed_acquire(device->deps, &device->vk, &dadi);
}
// associate delayed semaphore with handle
spinel_deps_delayed_attach(device->deps, raster->handle, dispatch->delayed);
// save raster handle to ring
spinel_rbi_raster_append(impl, raster->handle);
// update head dispatch's span
spinel_rbi_dispatch_append_wip(impl, dispatch);
// reset wip
spinel_rbi_wip_reset(impl);
//
// * flush if the raster cohort size limit has been reached
// * flush if the fill command "eager" limit has been reached
//
struct spinel_target_config const * const config = &device->ti.config;
if ((dispatch->rc.span >= config->raster_builder.size.cohort) || // "is_rc_full"
(dispatch->cf.span >= config->raster_builder.size.eager)) // "is_cf_eager"
{
return spinel_rbi_flush(impl);
}
return SPN_SUCCESS;
}
//
// If the raster builder is directly exposed as a public API then
// validate the transform and clip weakref indices.
//
// If a fuzzer alters the weakref epoch then the weakref is invalid --
// we don't need to check for this case since that's the purpose of
// the weakref.
//
// If a fuzzer alters the weakref's index but its epoch still matches
// the current epoch then we simply need to validate that its
// index is *potentially* valid -- the weakref might still be
// invalidated by an about-to-happen spinel_rbi_flush().
//
// clang-format off
static spinel_result_t
spinel_rbi_validate_transform_weakref_indices(struct spinel_ring const * cf_ring,
struct spinel_rbi_dispatch const * dispatch,
spinel_transform_weakref_t const * transform_weakrefs,
uint32_t const count)
// clang-format on
{
//
// FIXME(allanmac)
//
// For non-null weakrefs, check that the index is within the WIP span
//
return SPN_SUCCESS;
}
static spinel_result_t
spinel_rbi_validate_clip_weakref_indices(struct spinel_ring const * cf_ring,
struct spinel_rbi_dispatch const * dispatch,
spinel_clip_weakref_t const * clip_weakrefs,
uint32_t const count)
{
//
// FIXME(allanmac)
//
// For non-null weakrefs, check that the index is within the WIP span
//
return SPN_SUCCESS;
}
//
// Permute lo and hi transform
//
// src: { sx shx tx shy sy ty w0 w1 } // Row-ordered matrix
// dst: { sx shx shy sy tx ty w0 w1 } // GPU-friendly ordering
//
static void
spinel_rbi_transform_copy_lo(SPN_TYPE_F32VEC4 * dst, spinel_transform_t const * src)
{
dst->x = src->sx;
dst->y = src->shx;
dst->z = src->shy;
dst->w = src->sy;
}
static void
spinel_rbi_transform_copy_hi(SPN_TYPE_F32VEC4 * dst, spinel_transform_t const * src)
{
dst->x = src->tx;
dst->y = src->ty;
dst->z = src->w0;
dst->w = src->w1;
}
//
//
//
static spinel_result_t
spinel_rbi_add(struct spinel_raster_builder_impl * impl,
spinel_path_t const * paths,
spinel_transform_weakref_t * transform_weakrefs,
spinel_transform_t const * transforms,
spinel_clip_weakref_t * clip_weakrefs,
spinel_clip_t const * clips,
uint32_t count)
{
// Anything to do?
if (count == 0)
{
return SPN_SUCCESS;
}
//
// If the number of work-in-progress paths is larger than the ring then fail
// hard and lose the raster builder.
//
struct spinel_ring * const cf_ring = &impl->mapped.cf.ring;
if (impl->wip.cf.span + count > cf_ring->size)
{
spinel_rbi_lost(impl);
return SPN_ERROR_RASTER_BUILDER_LOST;
}
//
// If not enough entries are left in the command ring then flush now and wait
// for cf slots to be made available.
//
struct spinel_device * const device = impl->device;
if (count > cf_ring->rem)
{
//
// Launch whatever is in the ring and then wait for cf slots...
//
spinel_rbi_flush(impl);
// clang-format off
do {
spinel_deps_drain_1(device->deps, &device->vk);
} while (count > cf_ring->rem);
// clang-format on
}
//
// Validate the paths before we proceed
//
spinel_result_t result;
result = spinel_device_validate_d_paths(device, paths, count);
if (result != SPN_SUCCESS)
{
return result;
}
//
// Validate the transform and clip weakref indices -- this is cheap!
//
struct spinel_rbi_dispatch * const dispatch = spinel_rbi_dispatch_head(impl);
assert(dispatch->state == SPN_RBI_DISPATCH_STATE_RECORDING);
result = spinel_rbi_validate_transform_weakref_indices(cf_ring, //
dispatch,
transform_weakrefs,
count);
if (result != SPN_SUCCESS)
{
return result;
}
result = spinel_rbi_validate_clip_weakref_indices(cf_ring, //
dispatch,
clip_weakrefs,
count);
if (result != SPN_SUCCESS)
{
return result;
}
//
// Everything validates... retain the paths on the device
//
spinel_device_retain_d_paths(device, paths, count);
//
// Increment the cf span
//
impl->wip.cf.span += count;
//
// There will always be enough room in the TC ring so only its head
// needs to be tracked.
//
struct spinel_next * const tc_next = &impl->mapped.tc.next;
//
// The command's cohort id is the same for all commands
//
struct spinel_cmd_fill cf = { 0 };
cf.cohort = dispatch->rc.span & 0x7fff;
//
// Append commands to the cf ring and dependent quads to the tc ring
//
while (true)
{
uint32_t const cf_idx = spinel_ring_acquire_1(cf_ring);
//
// Get the path
//
uint32_t const handle = paths->handle;
impl->paths.extent[cf_idx] = handle;
cf.path_h = handle;
//
// Classify the transform
//
// if (w0==w1==0) then it's an affine matrix
//
cf.transform_type = ((transforms->w0 == 0.0f) && (transforms->w1 == 0.0f))
? SPN_CMD_FILL_TRANSFORM_TYPE_AFFINE
: SPN_CMD_FILL_TRANSFORM_TYPE_PROJECTIVE;
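//
// For example, a pure scale/rotate/translate transform has w0 == w1 == 0 and
// is classified as affine; a nonzero w0 or w1 marks it projective which, per
// the note at the top of this file, elevates Beziers to rationals.
//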
//
// If the weakref exists then reuse existing transform index
//
if (!spinel_transform_weakrefs_get_index(transform_weakrefs, 0, &impl->epoch, &cf.transform))
{
uint32_t span;
uint32_t const t_idx = spinel_next_acquire_2(tc_next, &span);
impl->wip.tc.span += span;
spinel_transform_weakrefs_init(transform_weakrefs, 0, &impl->epoch, t_idx);
spinel_rbi_transform_copy_lo(impl->mapped.tc.extent + t_idx + 0, transforms);
spinel_rbi_transform_copy_hi(impl->mapped.tc.extent + t_idx + 1, transforms);
cf.transform = t_idx;
}
//
// If the weakref exists then reuse existing clip index
//
if (!spinel_clip_weakrefs_get_index(clip_weakrefs, 0, &impl->epoch, &cf.clip))
{
uint32_t const c_idx = spinel_next_acquire_1(tc_next);
impl->wip.tc.span += 1;
spinel_clip_weakrefs_init(clip_weakrefs, 0, &impl->epoch, c_idx);
memcpy(impl->mapped.tc.extent + c_idx, clips, sizeof(*impl->mapped.tc.extent));
cf.clip = c_idx;
}
//
// Store the command to the ring
//
impl->mapped.cf.extent[cf_idx] = cf;
//
// No more paths?
//
if (--count == 0)
{
break;
}
//
// Otherwise, increment pointers
//
// FIXME(allanmac): This will be updated with an argument "template"
// struct.
//
paths += 1;
if (transform_weakrefs != NULL)
{
transform_weakrefs += 1;
}
transforms += 1;
if (clip_weakrefs != NULL)
{
clip_weakrefs += 1;
}
clips += 1;
}
return SPN_SUCCESS;
}
//
//
//
static spinel_result_t
spinel_rbi_release(struct spinel_raster_builder_impl * impl)
{
//
// launch any undispatched rasters
//
spinel_rbi_flush(impl);
//
// wait for all in-flight dispatches to complete
//
struct spinel_ring * const ring = &impl->dispatches.ring;
struct spinel_device * const device = impl->device;
while (!spinel_ring_is_full(ring))
{
spinel_deps_drain_1(device->deps, &device->vk);
}
//
// Dispatch extents
//
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.rfs_rrc_tko);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.ttrks);
//
// Radix Sort extents
//
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.rs.indirect);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.dispatch.rs.internal);
//
// Ring extents
//
vkUnmapMemory(device->vk.d, impl->vk.rings.rc.h.dbi_dm.dm); // not necessary
vkUnmapMemory(device->vk.d, impl->vk.rings.tc.h.dbi_dm.dm); // not necessary
vkUnmapMemory(device->vk.d, impl->vk.rings.cf.h.dbi_dm.dm); // not necessary
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr,
device->vk.d,
device->vk.ac,
&impl->vk.rings.rc.h.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr,
device->vk.d,
device->vk.ac,
&impl->vk.rings.tc.h.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr,
device->vk.d,
device->vk.ac,
&impl->vk.rings.cf.h.dbi_dm);
//
// Ring staging extents
//
if (spinel_rbi_is_staged(device))
{
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.rings.rc.d.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.rings.tc.d.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.rings.cf.d.dbi_dm);
}
//
// Free host allocations
//
free(impl->rasters.extent);
free(impl->paths.extent);
free(impl->dispatches.extent);
free(impl->raster_builder);
free(impl);
spinel_context_release(device->context);
return SPN_SUCCESS;
}
//
//
//
spinel_result_t
spinel_raster_builder_impl_create(struct spinel_device * device,
spinel_raster_builder_t * raster_builder)
{
spinel_context_retain(device->context);
//
// allocate impl
//
struct spinel_raster_builder_impl * const impl = MALLOC_MACRO(sizeof(*impl));
//
// allocate raster builder
//
struct spinel_raster_builder * const rb = MALLOC_MACRO(sizeof(*rb));
// init impl and rb back-pointers
*raster_builder = rb;
impl->raster_builder = rb;
rb->impl = impl;
// save device
impl->device = device;
//
// init raster builder pfns
//
rb->begin = spinel_rbi_begin;
rb->end = spinel_rbi_end;
rb->release = spinel_rbi_release;
rb->flush = spinel_rbi_flush;
rb->add = spinel_rbi_add;
//
// init refcount & state
//
rb->ref_count = 1;
SPN_ASSERT_STATE_INIT(SPN_RASTER_BUILDER_STATE_READY, rb);
//
// Allocate rings
//
struct spinel_target_config const * const config = &device->ti.config;
assert(config->raster_builder.size.eager <= config->raster_builder.size.ring);
//
// CF: 1 ring entry per command
//
uint32_t const cf_ring_size = config->raster_builder.size.ring;
spinel_ring_init(&impl->mapped.cf.ring, cf_ring_size);
//
// TC: 1 transform + 1 clip = 3 quads
//
// Worst case is 3 quads per command.
//
// NOTE(allanmac): One additional quad is required because transforms require
// 2 consecutive quads and the worst case would be a full ring of commands
// each with a transform and clip.
//
uint32_t const tc_ring_size = cf_ring_size * 3 + 1;
spinel_next_init(&impl->mapped.tc.next, tc_ring_size);
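//
// Illustration with a hypothetical ring size: cf_ring_size = 8192 gives
// tc_ring_size = 8192 * 3 + 1 = 24577 quads -- 2 transform quads plus 1 clip
// quad per command, plus the 1 extra quad noted above.
//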
//
// How many dispatches?
//
uint32_t const max_in_flight = config->raster_builder.size.dispatches;
//
// RC: Worst case is (cohort size * dispatches) rasters.
//
assert(config->raster_builder.size.cohort <= SPN_RASTER_COHORT_MAX_SIZE);
uint32_t const rc_ring_size = config->raster_builder.size.cohort * max_in_flight;
spinel_next_init(&impl->mapped.rc.next, rc_ring_size);
//
// FIXME(allanmac): Allocate one buffer for all rings and one buffer for all
// staging buffers.
//
//
// Allocate and map CF
//
{
VkDeviceSize const cf_size = sizeof(*impl->mapped.cf.extent) * cf_ring_size;
VkDeviceSize const cf_size_ru = ROUND_UP_POW2_MACRO(cf_size, //
device->vk.limits.noncoherent_atom_size);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr,
device->vk.pd,
device->vk.d,
device->vk.ac,
cf_size_ru,
NULL,
&impl->vk.rings.cf.h);
vk(MapMemory(device->vk.d,
impl->vk.rings.cf.h.dbi_dm.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.cf.extent));
//
// Allocate and map TC
//
VkDeviceSize const tc_size = sizeof(*impl->mapped.tc.extent) * tc_ring_size;
VkDeviceSize const tc_size_ru = ROUND_UP_POW2_MACRO(tc_size, //
device->vk.limits.noncoherent_atom_size);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr,
device->vk.pd,
device->vk.d,
device->vk.ac,
tc_size_ru,
NULL,
&impl->vk.rings.tc.h);
vk(MapMemory(device->vk.d,
impl->vk.rings.tc.h.dbi_dm.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.tc.extent));
//
// Allocate and map RC
//
VkDeviceSize const rc_size = sizeof(*impl->mapped.rc.extent) * rc_ring_size;
VkDeviceSize const rc_size_ru = ROUND_UP_POW2_MACRO(rc_size, //
device->vk.limits.noncoherent_atom_size);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr,
device->vk.pd,
device->vk.d,
device->vk.ac,
rc_size_ru,
NULL,
&impl->vk.rings.rc.h);
vk(MapMemory(device->vk.d,
impl->vk.rings.rc.h.dbi_dm.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.rc.extent));
//
// Allocate rasters
//
impl->rasters.extent = MALLOC_MACRO(rc_size_ru); // same size as the mapped rc extent
//
// Are host-writable rings used as staging buffers?
//
if (spinel_rbi_is_staged(device))
{
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
cf_size_ru,
NULL,
&impl->vk.rings.cf.d);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
tc_size_ru,
NULL,
&impl->vk.rings.tc.d);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
rc_size_ru,
NULL,
&impl->vk.rings.rc.d);
}
else
{
impl->vk.rings.cf.d = impl->vk.rings.cf.h;
impl->vk.rings.tc.d = impl->vk.rings.tc.h;
impl->vk.rings.rc.d = impl->vk.rings.rc.h;
}
}
//
// Allocate dispatches and path/raster release extents
//
// Implicitly sets state to SPN_RBI_DISPATCH_STATE_INVALID.
//
impl->dispatches.extent = CALLOC_MACRO(max_in_flight, sizeof(*impl->dispatches.extent));
//
// Allocate paths
//
size_t const paths_size = sizeof(*impl->paths.extent) * config->raster_builder.size.ring;
impl->paths.extent = MALLOC_MACRO(paths_size);
//
// Get radix sort memory requirements
//
struct radix_sort_vk_memory_requirements rs_mr;
radix_sort_vk_get_memory_requirements(device->ti.rs, config->raster_builder.size.ttrks, &rs_mr);
assert(SPN_MEMBER_ALIGN_LIMIT >= rs_mr.keyvals_alignment);
assert(SPN_MEMBER_ALIGN_LIMIT >= rs_mr.internal_alignment);
assert(SPN_MEMBER_ALIGN_LIMIT >= rs_mr.indirect_alignment);
//
// What is the rounded-up size of the ttrks buffer?
//
VkDeviceSize const ttrks_size = SPN_BUFFER_OFFSETOF(ttrks, keyvals) + rs_mr.keyvals_size;
VkDeviceSize const ttrks_size_ru = ROUND_UP_POW2_MACRO(ttrks_size, SPN_MEMBER_ALIGN_LIMIT);
//
// Allocate memory shared across dispatches:
//
// ttrks = max_in_flight * sizeof(ttrks)
//
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * ttrks_size_ru,
NULL,
&impl->vk.dispatch.ttrks);
//
// Assign bufrefs to dispatches
//
// FIXME(allanmac): Do all VK objects need both their .dbis and .devaddrs
// initialized?
//
for (uint32_t ii = 0; ii < max_in_flight; ii++)
{
struct spinel_rbi_dispatch * const dispatch = impl->dispatches.extent + ii;
// vk.ttrks
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.ttrks,
&impl->vk.dispatch.ttrks.dbi,
ii * ttrks_size_ru,
ttrks_size_ru);
}
//
// Allocate per-dispatch radix sort internal and indirect buffers
//
// internal = max_in_flight * rs_mr.internal_size
// indirect = max_in_flight * rs_mr.indirect_size
//
// clang-format off
VkDeviceSize const rs_internal_size_ru = ROUND_UP_POW2_MACRO(rs_mr.internal_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rs_indirect_size_ru = ROUND_UP_POW2_MACRO(rs_mr.indirect_size, SPN_MEMBER_ALIGN_LIMIT);
// clang-format on
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * rs_internal_size_ru,
NULL,
&impl->vk.dispatch.rs.internal);
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * rs_indirect_size_ru,
NULL,
&impl->vk.dispatch.rs.indirect);
//
// Assign bufrefs to dispatches
//
// FIXME(allanmac): Do all VK objects need both their .dbis and .devaddrs
// initialized?
//
for (uint32_t ii = 0; ii < max_in_flight; ii++)
{
struct spinel_rbi_dispatch * const dispatch = impl->dispatches.extent + ii;
// vk.rs.internal
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.rs.internal,
&impl->vk.dispatch.rs.internal.dbi,
ii * rs_internal_size_ru,
rs_internal_size_ru);
// vk.rs.indirect
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.rs.indirect,
&impl->vk.dispatch.rs.indirect.dbi,
ii * rs_indirect_size_ru,
rs_indirect_size_ru);
}
//
// More per-dispatch buffers:
//
// rfs: rasterize fill scan
// rrc: rasterize rast cmds
// tko: ttrk keys odd
//
// We conservatively round them up to ensure the buffers are properly aligned
// on the device.
//
// The rfs/rrc extents can be safely aliased by the later-used tko extent.
//
// TODO(allanmac): The rounding and aliasing can be refined.
//
// clang-format off
uint32_t const rfs_sg_size = 1u << config->group_sizes.named.fill_scan.subgroup_log2;
VkDeviceSize const rfs_block_size_pow2 = sizeof(SPN_TYPE_U32VEC4) * 2 * rfs_sg_size;
VkDeviceSize const rfs_prefix_size = config->raster_builder.size.ring * sizeof(SPN_TYPE_U32VEC4) * 2;
VkDeviceSize const rfs_prefix_size_ru = ROUND_UP_POW2_MACRO(rfs_prefix_size, rfs_block_size_pow2);
VkDeviceSize const rfs_size = SPN_BUFFER_OFFSETOF(rasterize_fill_scan, prefix) + rfs_prefix_size_ru;
VkDeviceSize const rfs_size_ru = ROUND_UP_POW2_MACRO(rfs_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rrc_size = config->raster_builder.size.cmds * sizeof(SPN_TYPE_U32VEC4);
VkDeviceSize const rrc_size_ru = ROUND_UP_POW2_MACRO(rrc_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rfs_rrc_size_ru = rfs_size_ru + rrc_size_ru;
VkDeviceSize const tko_size_ru = ROUND_UP_POW2_MACRO(rs_mr.keyvals_size, SPN_MEMBER_ALIGN_LIMIT);
VkDeviceSize const rfs_rrc_tko_size_ru = MAX_MACRO(VkDeviceSize, rfs_rrc_size_ru, tko_size_ru);
// clang-format on
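//
// Per-dispatch slot layout sketch: the |<-rfs->|<-rrc->| region is later
// aliased by |<-tko->|, so each slot is sized
// max(rfs_size_ru + rrc_size_ru, tko_size_ru) = rfs_rrc_tko_size_ru.
//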
spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
max_in_flight * rfs_rrc_tko_size_ru,
NULL,
&impl->vk.dispatch.rfs_rrc_tko);
//
// Assign bufrefs to dispatches
//
// FIXME(allanmac): Do all VK objects need both their .dbis and .devaddrs
// initialized?
//
for (uint32_t ii = 0; ii < max_in_flight; ii++)
{
struct spinel_rbi_dispatch * const dispatch = impl->dispatches.extent + ii;
// vk.fill_scan
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.fill_scan,
&impl->vk.dispatch.rfs_rrc_tko.dbi,
ii * rfs_rrc_tko_size_ru,
rfs_size_ru);
// vk.rast_cmds
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.rast_cmds,
&impl->vk.dispatch.rfs_rrc_tko.dbi,
ii * rfs_rrc_tko_size_ru + rfs_size_ru,
rrc_size_ru);
// vk.ttrk_keyvals_odd
spinel_dbi_devaddr_from_dbi(device->vk.d,
&dispatch->vk.ttrk_keyvals_odd,
&impl->vk.dispatch.rfs_rrc_tko.dbi,
ii * rfs_rrc_tko_size_ru,
tko_size_ru);
}
//
// Initialize rings and first dispatch
//
spinel_ring_init(&impl->dispatches.ring, max_in_flight);
spinel_rbi_wip_reset(impl);
spinel_rbi_dispatch_head_init(impl);
spinel_weakref_epoch_init(&impl->epoch);
return SPN_SUCCESS;
}
//
//
//