// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include "path_builder_impl.h"
#include <assert.h>
#include <float.h>
#include <stdlib.h>
#include <string.h>
#include "block_pool.h"
#include "common/macros.h"
#include "common/vk/assert.h"
#include "common/vk/barrier.h"
#include "core_c.h"
#include "device.h"
#include "flush.h"
#include "handle_pool.h"
#include "queue_pool.h"
#include "ring.h"
#include "shaders/push.h"
#include "spinel/spinel_assert.h"
//
// Verify the path header size matches core.h
//
STATIC_ASSERT_MACRO_1(sizeof(union spinel_path_header) == SPN_PATH_HEAD_DWORDS * sizeof(uint32_t));
//
// The path builder moves bulk path data, nodes and a single header from the
// host into the device-managed "block" memory pool. The data is arranged into
// a SIMT/SIMD-friendly data structure that can be efficiently read by the
// rasterizer.
//
// A simplifying assumption is that the maximum length of a single path can't be
// larger than what fits in the path builder ring.
//
// If a path is too long then the path builder instance is lost.
//
// Note that this restriction can be removed with added complexity to the
// builder and shader.
//
// Also note that for some systems it may be appropriate to never pull path
// data into the device-managed block pool and instead present the path data to
// the device in a temporarily allocated memory "zone" of paths that can be
// discarded all at once.
//
// For other systems, it may be appropriate to simply copy the path data from
// host to device.
//
// The general strategy that this particular Vulkan implementation uses is to
// allocate a large "HOST_COHERENT" bulk-data path buffer and an auxiliary
// mappable command buffer.
//
// The work-in-progress path's header and latest node are updated locally until
// full and then stored because the mapped HOST_COHERENT memory is likely
// uncached and read-modify-writes will be expensive.
//
// A line/quad/cubic/rat_quad/rat_cubic acquires 4/6/8/7/10 segments which may
// be spread across one or more contiguous blocks.
//
// If a flush() occurs, then the remaining columns of multi-segment paths are
// initialized with zero-length path primitives.
//
// Every block's command word encodes a type and a block id acquired from a
// rolling counter.
//
// Note that the maximum number of "in-flight" path copy grids is
// determined by the number of dispatch records.
//
//
// A dispatch record represents a contiguous region of the ring that can be
// copied to or read from the device.
//
// There should be enough dispatch records available so that if they're all in
// flight then either a PCIe or memory bandwidth "roofline" limit is reached.
//
// The expectation is that the path builder will *not* be CPU bound.
//
// The number of dispatch records is defined in the target's config data
// structure.
//
struct spinel_pbi_head_span
{
uint32_t head;
uint32_t span;
};
struct spinel_pbi_dispatch
{
struct spinel_pbi_head_span blocks;
struct spinel_pbi_head_span paths;
uint32_t rolling; // FIXME(allanmac): move to wip
struct
{
spinel_deps_delayed_semaphore_t delayed;
} signal;
};
//
//
//
struct spinel_pbi_vk
{
struct spinel_dbi_dm_devaddr alloc;
struct spinel_dbi_dm_devaddr ring;
};
//
//
//
struct spinel_path_builder_impl
{
struct spinel_path_builder * path_builder;
struct spinel_device * device;
struct spinel_pbi_vk vk;
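//
// device-specific constants captured from the target config:
//
//   block_dwords       - dwords per block pool block
//   block_subgroups    - subgroups per block
//   subgroup_dwords    - dwords per subgroup
//   subgroup_subblocks - subblocks per subgroup
//   rolling_one        - rolling block id increment for one block
//   eager_size         - dispatch block span that triggers an eager flush
//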
struct
{
uint32_t block_dwords;
uint32_t block_subgroups;
uint32_t subgroup_dwords;
uint32_t subgroup_subblocks;
uint32_t rolling_one;
uint32_t eager_size;
} config;
//
// block and cmd rings share a buffer
//
// [<--- blocks --->|<--- cmds --->]
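//
// For example (illustrative sizes only): with ring_size = 8192 and
// block_dwords = 64, the blocks region holds 8192 * 64 dwords and the
// one-dword-per-entry cmds region begins at dword offset
// ring_size * block_dwords.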
//
struct
{
struct spinel_ring ring;
uint32_t rolling;
struct
{
uint32_t rem;
float * f32;
} subgroups;
union
{
uint32_t * u32;
float * f32;
// add head and node structures
} blocks;
uint32_t * cmds;
} mapped;
//
// work in progress header
//
struct
{
union spinel_path_header header;
uint32_t * node;
struct
{
uint32_t idx;
uint32_t rolling;
} head;
struct
{
uint32_t rolling;
} segs;
uint32_t rem;
} wip;
//
// Resources released upon a grid completion:
//
// - Path handles are released immediately.
//
// - Dispatch records and associated mapped spans are released in
// ring order.
//
// Note that there can only be as many paths as there are blocks
// (even an empty path has a header block) so this resource is implicitly
// managed by the mapped.ring and dispatches.ring.
//
struct
{
spinel_handle_t * extent;
struct spinel_next next;
} paths;
struct
{
struct spinel_pbi_dispatch * extent;
struct spinel_ring ring;
} dispatches;
};
//
//
//
static spinel_result_t
spinel_pbi_lost_begin(struct spinel_path_builder_impl * impl)
{
return SPN_ERROR_PATH_BUILDER_LOST;
}
static spinel_result_t
spinel_pbi_lost_end(struct spinel_path_builder_impl * impl, spinel_path_t * path)
{
*path = SPN_PATH_INVALID;
return SPN_ERROR_PATH_BUILDER_LOST;
}
static spinel_result_t
spinel_pbi_release(struct spinel_path_builder_impl * impl);
static spinel_result_t
spinel_pbi_lost_release(struct spinel_path_builder_impl * impl)
{
//
// FIXME -- releasing a lost path builder might eventually require a
// specialized function. For now, just call the default release.
//
return spinel_pbi_release(impl);
}
static spinel_result_t
spinel_pbi_lost_flush(struct spinel_path_builder_impl * impl)
{
return SPN_ERROR_PATH_BUILDER_LOST;
}
//
// Define primitive geometry "lost" pfns
//
#define SPN_PBI_PFN_LOST_NAME(_p) spinel_pbi_lost_##_p
#undef SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X
#define SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X(_p, _i, _n) \
static spinel_result_t SPN_PBI_PFN_LOST_NAME(_p)(struct spinel_path_builder_impl * impl) \
{ \
return SPN_ERROR_PATH_BUILDER_LOST; \
}
SPN_PATH_BUILDER_PRIM_TYPE_EXPAND()
//
// If the work-in-progress path's block count reaches mapped.ring.size
// then the path is too long and the path builder is terminally "lost".
// The path builder should be released and a new one created.
//
static void
spinel_pbi_lost(struct spinel_path_builder_impl * impl)
{
struct spinel_path_builder * pb = impl->path_builder;
pb->begin = spinel_pbi_lost_begin;
pb->end = spinel_pbi_lost_end;
pb->release = spinel_pbi_lost_release;
pb->flush = spinel_pbi_lost_flush;
#undef SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X
#define SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X(_p, _i, _n) pb->_p = SPN_PBI_PFN_LOST_NAME(_p);
SPN_PATH_BUILDER_PRIM_TYPE_EXPAND()
}
//
// Append path to path release extent -- note that this resource is
// implicitly "clocked" by the mapped.ring.
//
static void
spinel_pbi_path_append(struct spinel_path_builder_impl * impl, spinel_path_t const * path)
{
uint32_t const idx = spinel_next_acquire_1(&impl->paths.next);
impl->paths.extent[idx] = path->handle;
}
//
// A dispatch record captures how many paths and blocks are in either a
// dispatched or the work-in-progress compute grid.
//
static struct spinel_pbi_dispatch *
spinel_pbi_dispatch_head(struct spinel_path_builder_impl * impl)
{
assert(!spinel_ring_is_empty(&impl->dispatches.ring));
return impl->dispatches.extent + impl->dispatches.ring.head;
}
static struct spinel_pbi_dispatch *
spinel_pbi_dispatch_tail(struct spinel_path_builder_impl * impl)
{
assert(!spinel_ring_is_full(&impl->dispatches.ring));
return impl->dispatches.extent + impl->dispatches.ring.tail;
}
static void
spinel_pbi_dispatch_head_init(struct spinel_path_builder_impl * impl)
{
*spinel_pbi_dispatch_head(impl) = (struct spinel_pbi_dispatch){
.blocks = { .head = impl->wip.head.idx, //
.span = 0 }, //
.paths = { .head = impl->paths.next.head, //
.span = 0 }, //
.rolling = impl->wip.head.rolling, //
.signal = { .delayed = SPN_DEPS_DELAYED_SEMAPHORE_INVALID },
};
}
static void
spinel_pbi_dispatch_drop(struct spinel_path_builder_impl * impl)
{
struct spinel_ring * const ring = &impl->dispatches.ring;
spinel_ring_drop_1(ring);
}
static void
spinel_pbi_dispatch_acquire(struct spinel_path_builder_impl * impl)
{
struct spinel_ring * const ring = &impl->dispatches.ring;
struct spinel_device * const device = impl->device;
while (spinel_ring_is_empty(ring))
{
spinel_deps_drain_1(device->deps, &device->vk);
}
spinel_pbi_dispatch_head_init(impl);
}
static void
spinel_pbi_dispatch_append(struct spinel_path_builder_impl * impl,
struct spinel_pbi_dispatch * dispatch,
spinel_path_t const * path)
{
spinel_pbi_path_append(impl, path);
// clang-format off
dispatch->blocks.span += impl->wip.header.named.blocks;
dispatch->paths.span += 1;
// clang-format on
}
//
//
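// Completion callback for a paths_copy dispatch: detaches the delayed
// semaphore from the dispatch's path handles, releases the handles, and
// retires completed dispatch records (and their mapped spans) in ring
// order.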
//
static void
spinel_pbi_flush_complete(void * data0, void * data1)
{
struct spinel_path_builder_impl * const impl = data0;
struct spinel_pbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
//
// These path handles are now materialized
//
spinel_deps_delayed_detach_ring(device->deps,
impl->paths.extent,
impl->paths.next.size,
dispatch->paths.head,
dispatch->paths.span);
//
// Release the paths -- may invoke wait()
//
spinel_device_release_d_paths_ring(device,
impl->paths.extent,
impl->paths.next.size,
dispatch->paths.head,
dispatch->paths.span);
//
// If the dispatch is the tail of the ring then try to release as
// many dispatch records as possible...
//
// Note that kernels can complete in any order, so the release
// records need to be applied to the mapped.ring.tail in order.
//
dispatch->signal.delayed = SPN_DEPS_DELAYED_SEMAPHORE_INVALID;
struct spinel_pbi_dispatch * tail = spinel_pbi_dispatch_tail(impl);
while (tail->signal.delayed == SPN_DEPS_DELAYED_SEMAPHORE_INVALID)
{
// release the blocks and cmds
spinel_ring_release_n(&impl->mapped.ring, tail->blocks.span);
// release the dispatch
spinel_ring_release_n(&impl->dispatches.ring, 1);
// any dispatches in flight?
if (spinel_ring_is_full(&impl->dispatches.ring))
{
break;
}
// get new tail
tail = spinel_pbi_dispatch_tail(impl);
}
}
//
//
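// Records the dispatch's device work into the command buffer: a
// single-workgroup "paths_alloc" dispatch, a COMPUTE>COMPUTE barrier,
// and a "paths_copy" dispatch with one subgroup per block.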
//
static VkPipelineStageFlags
spinel_pbi_flush_record(VkCommandBuffer cb, void * data0, void * data1)
{
struct spinel_path_builder_impl * const impl = data0;
struct spinel_pbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
////////////////////////////////////////////////////////////////
//
// PATHS ALLOC
//
////////////////////////////////////////////////////////////////
struct spinel_push_paths_alloc const push_paths_alloc = {
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_paths_copy_alloc = impl->vk.alloc.devaddr,
.pc_alloc_idx = impl->dispatches.ring.head,
.pc_span = dispatch->blocks.span
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.paths_alloc,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_paths_alloc),
&push_paths_alloc);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.paths_alloc);
vkCmdDispatch(cb, 1, 1, 1);
////////////////////////////////////////////////////////////////
//
// BARRIER: COMPUTE>COMPUTE
//
////////////////////////////////////////////////////////////////
vk_barrier_compute_w_to_compute_r(cb);
////////////////////////////////////////////////////////////////
//
// PATHS COPY
//
////////////////////////////////////////////////////////////////
struct spinel_push_paths_copy const push_paths_copy = {
.devaddr_block_pool_ids = device->block_pool.vk.dbi_devaddr.ids.devaddr,
.devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr,
.devaddr_block_pool_host_map = device->block_pool.vk.dbi_devaddr.host_map.devaddr,
.devaddr_paths_copy_alloc = impl->vk.alloc.devaddr,
.devaddr_paths_copy_ring = impl->vk.ring.devaddr,
.bp_mask = device->block_pool.bp_mask,
.pc_alloc_idx = impl->dispatches.ring.head,
.pc_span = dispatch->blocks.span,
.pc_head = dispatch->blocks.head,
.pc_rolling = dispatch->rolling,
.pc_size = impl->mapped.ring.size,
};
vkCmdPushConstants(cb,
device->ti.pipeline_layouts.named.paths_copy,
VK_SHADER_STAGE_COMPUTE_BIT,
0,
sizeof(push_paths_copy),
&push_paths_copy);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.paths_copy);
//
// Dispatch one subgroup per block
//
struct spinel_target_config const * const config = &device->ti.config;
uint32_t const sgs_per_wg = config->group_sizes.named.paths_copy.workgroup >>
config->group_sizes.named.paths_copy.subgroup_log2;
uint32_t const wg_count = (dispatch->blocks.span + sgs_per_wg - 1) / sgs_per_wg;
vkCmdDispatch(cb, wg_count, 1, 1);
//
// NOTE(allanmac):
//
// The `deps` scheduler assumes that the command buffers associated with
// delayed semaphores always end with a compute shader
// (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT).
//
// Only the path builder and raster builder acquire delayed semaphores.
//
return VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
}
//
//
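// Submits the sealed dispatch: flushes the mapped ring when the
// allocation is non-coherent, records and submits the work via the
// deps scheduler, and then acquires and initializes the next dispatch
// record.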
//
static void
spinel_pbi_flush_submit(void * data0, void * data1)
{
struct spinel_path_builder_impl * const impl = data0;
struct spinel_pbi_dispatch * const dispatch = data1;
struct spinel_device * const device = impl->device;
assert(dispatch->paths.span > 0);
//
// Flush if the ring is non-coherent
//
if (!spinel_allocator_is_coherent(&device->allocator.device.perm.hw_dr))
{
VkDeviceSize const block_size = sizeof(uint32_t) * impl->config.block_dwords;
VkDeviceSize const blocks_ring_size = block_size * impl->mapped.ring.size;
// Flush blocks
spinel_ring_flush(&device->vk,
impl->vk.ring.dbi_dm.dm,
0,
impl->mapped.ring.size,
dispatch->blocks.head,
dispatch->blocks.span,
block_size);
// Flush commands
spinel_ring_flush(&device->vk,
impl->vk.ring.dbi_dm.dm,
blocks_ring_size,
impl->mapped.ring.size,
dispatch->blocks.head,
dispatch->blocks.span,
sizeof(uint32_t));
}
//
// Acquire an immediate semaphore
//
// Doesn't wait on any handles.
//
struct spinel_deps_immediate_submit_info const disi = {
.record = {
.pfn = spinel_pbi_flush_record,
.data0 = impl,
.data1 = dispatch,
},
//
// Path builder has no delayed handle dependency
//
.completion = {
.pfn = spinel_pbi_flush_complete,
.data0 = impl,
.data1 = dispatch,
},
.signal = {
.delayed = {
.count = 1,
.semaphores = {
dispatch->signal.delayed,
},
},
},
};
//
// The current dispatch is now sealed so drop it
//
spinel_pbi_dispatch_drop(impl);
//
// We don't need to save the returned immediate semaphore.
//
spinel_deps_immediate_submit(device->deps, &device->vk, &disi, NULL);
//
// Acquire and initialize the next dispatch
//
spinel_pbi_dispatch_acquire(impl);
}
//
//
//
static spinel_result_t
spinel_pbi_flush(struct spinel_path_builder_impl * impl)
{
//
// Anything to launch?
//
struct spinel_pbi_dispatch * const dispatch = spinel_pbi_dispatch_head(impl);
if (dispatch->paths.span == 0)
{
return SPN_SUCCESS;
}
//
// Invoke the delayed submission action
//
spinel_deps_delayed_flush(impl->device->deps, dispatch->signal.delayed);
return SPN_SUCCESS;
}
//
// Before returning a path handle, any remaining coordinates in the
// subgroup(s) are finalized with zero-length primitives.
//
static void
spinel_pb_cn_coords_zero(float * coords, uint32_t rem)
{
do
{
*coords++ = 0.0f;
} while (--rem > 0);
}
static void
spinel_pb_cn_coords_finalize(float * coords[], uint32_t coords_len, uint32_t rem)
{
do
{
spinel_pb_cn_coords_zero(*coords++, rem);
} while (--coords_len > 0);
}
static void
spinel_pb_finalize_subgroups(struct spinel_path_builder_impl * impl)
{
struct spinel_path_builder * const pb = impl->path_builder;
//
// Note that this zeroes a cacheline / subblock at a time
//
#undef SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X
#define SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X(_p, _i, _n) \
{ \
uint32_t rem = pb->cn.rem._p; \
\
if (rem > 0) \
{ \
pb->cn.rem._p = 0; \
\
spinel_pb_cn_coords_finalize(pb->cn.coords._p, _n, rem); \
} \
}
SPN_PATH_BUILDER_PRIM_TYPE_EXPAND()
}
//
//
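// Appends a one-dword command for the acquired ring entry. The command
// packs the current rolling block id with a HEAD/NODE/SEGS type, e.g.:
//
//   cmd = rolling | SPN_PATHS_COPY_CMD_TYPE_SEGS;
//
// Because rolling advances by config.rolling_one -- the subblocks per
// block shifted above SPN_TAGGED_BLOCK_ID_BITS_TAG -- the type always
// fits in the low tag bits.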
//
static void
spinel_pbi_cmd_append(struct spinel_path_builder_impl * impl,
uint32_t const idx,
uint32_t const type)
{
uint32_t const rolling = impl->mapped.rolling;
uint32_t const cmd = rolling | type;
impl->mapped.cmds[idx] = cmd;
impl->mapped.rolling = rolling + impl->config.rolling_one;
impl->wip.header.named.blocks += 1;
}
//
//
//
static void
spinel_pbi_node_append_next(struct spinel_path_builder_impl * impl)
{
// no need to increment the node pointer
*impl->wip.node = impl->mapped.rolling | SPN_BLOCK_ID_TAG_PATH_NEXT;
}
//
//
//
static uint32_t
spinel_pbi_acquire_head_block(struct spinel_path_builder_impl * impl)
{
struct spinel_ring * const ring = &impl->mapped.ring;
// is the ring full -- i.e. no free entries left to acquire?
if (spinel_ring_is_empty(ring))
{
// launch any unlaunched dispatch
spinel_pbi_flush(impl);
struct spinel_device * const device = impl->device;
do
{
// wait for at least one dispatch to complete
spinel_deps_drain_1(device->deps, &device->vk);
} while (spinel_ring_is_empty(ring));
}
return spinel_ring_acquire_1(&impl->mapped.ring);
}
static spinel_result_t
spinel_pbi_acquire_node_segs_block(struct spinel_path_builder_impl * impl, uint32_t * idx)
{
struct spinel_ring * const ring = &impl->mapped.ring;
if (spinel_ring_is_empty(ring))
{
//
// If the work in progress is going to exceed the size of the ring
// then this is a fatal error. At this point, we can kill the path
// builder instead of the device.
//
if (impl->wip.header.named.blocks >= impl->mapped.ring.size)
{
spinel_pbi_lost(impl);
return SPN_ERROR_PATH_BUILDER_LOST; // FIXME(allanmac): return a "TOO_LONG" error?
}
//
// Otherwise, launch whatever is in the ring...
//
spinel_pbi_flush(impl);
//
// ... and wait for blocks to appear in the ring!
//
struct spinel_device * const device = impl->device;
do
{
// wait for at least one dispatch to complete
spinel_deps_drain_1(device->deps, &device->vk);
} while (spinel_ring_is_empty(ring));
}
*idx = spinel_ring_acquire_1(&impl->mapped.ring);
return SPN_SUCCESS;
}
//
//
//
static void
spinel_pbi_acquire_head(struct spinel_path_builder_impl * impl)
{
uint32_t const idx = spinel_pbi_acquire_head_block(impl);
spinel_pbi_cmd_append(impl, idx, SPN_PATHS_COPY_CMD_TYPE_HEAD);
uint32_t const offset = idx * impl->config.block_dwords;
uint32_t * const head = impl->mapped.blocks.u32 + offset;
impl->wip.node = head + SPN_PATH_HEAD_DWORDS;
impl->wip.rem = impl->config.block_dwords - SPN_PATH_HEAD_DWORDS;
}
static spinel_result_t
spinel_pbi_acquire_node(struct spinel_path_builder_impl * impl)
{
spinel_pbi_node_append_next(impl);
uint32_t idx;
spinel_result_t const err = spinel_pbi_acquire_node_segs_block(impl, &idx);
if (err != SPN_SUCCESS)
{
return err;
}
spinel_pbi_cmd_append(impl, idx, SPN_PATHS_COPY_CMD_TYPE_NODE);
impl->wip.header.named.nodes += 1;
uint32_t const offset = idx * impl->config.block_dwords;
impl->wip.node = impl->mapped.blocks.u32 + offset;
impl->wip.rem = impl->config.block_dwords;
return SPN_SUCCESS;
}
static spinel_result_t
spinel_pbi_acquire_segs(struct spinel_path_builder_impl * impl)
{
uint32_t idx;
spinel_result_t const err = spinel_pbi_acquire_node_segs_block(impl, &idx);
if (err != SPN_SUCCESS)
{
return err;
}
impl->wip.segs.rolling = impl->mapped.rolling;
spinel_pbi_cmd_append(impl, idx, SPN_PATHS_COPY_CMD_TYPE_SEGS);
uint32_t const offset = idx * impl->config.block_dwords;
impl->mapped.subgroups.f32 = impl->mapped.blocks.f32 + offset;
impl->mapped.subgroups.rem = impl->config.block_subgroups;
return SPN_SUCCESS;
}
//
//
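// Appends a tagged block id to the wip head/node. The id packs the
// SEGS block's rolling id, the subblock index of the next unused
// subgroup (shifted above the tag bits), and the primitive tag.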
//
static void
spinel_pbi_node_append_segs(struct spinel_path_builder_impl * impl, uint32_t const tag)
{
uint32_t const subgroup_idx = impl->config.block_subgroups - impl->mapped.subgroups.rem;
uint32_t const subblock_idx = subgroup_idx * impl->config.subgroup_subblocks;
uint32_t const subblock_shl = subblock_idx << SPN_TAGGED_BLOCK_ID_BITS_TAG;
uint32_t const tbid = (impl->wip.segs.rolling | subblock_shl | tag);
*impl->wip.node++ = tbid;
impl->wip.rem -= 1;
}
//
//
//
static spinel_result_t
spinel_pbi_prim_acquire_subgroups(struct spinel_path_builder_impl * impl,
uint32_t const tag,
float ** coords,
uint32_t coords_len)
{
//
// Write a tagged block id to the node that records:
//
// { block id, subblock idx, prim tag }
//
// If the path primitive spans more than one block then there will
// be a TAG_PATH_NEXT pointing to the next block.
//
// Note that a subgroup may be 1, 2, or a higher power-of-two number of
// subblocks.
//
uint32_t curr_tag = tag;
do
{
// is there only one tagged block id left in the node?
if (impl->wip.rem == 1)
{
spinel_result_t const err = spinel_pbi_acquire_node(impl);
if (err != SPN_SUCCESS)
return err;
}
// are there no subgroups left?
if (impl->mapped.subgroups.rem == 0)
{
spinel_result_t const err = spinel_pbi_acquire_segs(impl);
if (err != SPN_SUCCESS)
return err;
}
// record the tagged block id
spinel_pbi_node_append_segs(impl, curr_tag);
// any tag after this is a caboose
curr_tag = SPN_BLOCK_ID_TAG_PATH_NEXT;
// initialize path builder's pointers
uint32_t count = MIN_MACRO(uint32_t, coords_len, impl->mapped.subgroups.rem);
impl->mapped.subgroups.rem -= count;
coords_len -= count;
do
{
*coords++ = impl->mapped.subgroups.f32;
impl->mapped.subgroups.f32 += impl->config.subgroup_dwords;
} while (--count > 0);
} while (coords_len > 0);
// update path builder rem count
impl->path_builder->cn.rem.aN[tag] = impl->config.subgroup_dwords;
// the prims count tracks the number of tagged block ids
impl->wip.header.named.prims.array[tag] += 1;
return SPN_SUCCESS;
}
//
// Define primitive geometry pfns
//
#define SPN_PBI_PFN_NAME(_p) spinel_pbi_##_p
#undef SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X
#define SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X(_p, _i, _n) \
static spinel_result_t SPN_PBI_PFN_NAME(_p)(struct spinel_path_builder_impl * impl) \
{ \
return spinel_pbi_prim_acquire_subgroups(impl, _i, impl->path_builder->cn.coords._p, _n); \
}
SPN_PATH_BUILDER_PRIM_TYPE_EXPAND()
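//
// For illustration, assuming the X-macro's line entry is something
// like (line, 0, 4), the expansion above defines:
//
//   static spinel_result_t
//   spinel_pbi_line(struct spinel_path_builder_impl * impl)
//   {
//     return spinel_pbi_prim_acquire_subgroups(impl,
//                                              0,
//                                              impl->path_builder->cn.coords.line,
//                                              4);
//   }
//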
//
//
//
STATIC_ASSERT_MACRO_1(sizeof(union spinel_path_header) ==
MEMBER_SIZE_MACRO(union spinel_path_header, array));
static void
spinel_pbi_wip_reset(struct spinel_path_builder_impl * impl)
{
struct spinel_path_builder * const pb = impl->path_builder;
// init path builder counters
#undef SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X
#define SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X(_p, _i, _n) pb->cn.rem._p = 0;
SPN_PATH_BUILDER_PRIM_TYPE_EXPAND();
// save mapped head to wip
impl->wip.head.idx = impl->mapped.ring.head;
impl->wip.head.rolling = impl->mapped.rolling;
// there are no subblocks available
impl->mapped.subgroups.rem = 0;
// update header -- don't bother initializing .handle and .na
impl->wip.header.named.blocks = 0;
impl->wip.header.named.nodes = 0;
// reset prim counters
memset(impl->wip.header.named.prims.array, 0, sizeof(impl->wip.header.named.prims.array));
// reset bounds to an inverted box: initialize mins to +FLT_MAX and
// maxes to -FLT_MAX (not FLT_MIN, which is the smallest positive
// normal float)
impl->wip.header.named.bounds[0] = FLT_MAX;
impl->wip.header.named.bounds[1] = FLT_MAX;
impl->wip.header.named.bounds[2] = -FLT_MAX;
impl->wip.header.named.bounds[3] = -FLT_MAX;
}
//
//
//
static spinel_result_t
spinel_pbi_begin(struct spinel_path_builder_impl * impl)
{
// acquire head block
spinel_pbi_acquire_head(impl);
return SPN_SUCCESS;
}
//
//
//
STATIC_ASSERT_MACRO_1(SPN_TAGGED_BLOCK_ID_INVALID == UINT32_MAX);
static spinel_result_t
spinel_pbi_end(struct spinel_path_builder_impl * impl, spinel_path_t * path)
{
// finalize all incomplete active subgroups -- note that we don't
// care about unused remaining subblocks in a block
spinel_pb_finalize_subgroups(impl);
// mark remaining ids in the head or node as invalid
memset(impl->wip.node, 0xFF, sizeof(*impl->wip.node) * impl->wip.rem);
// device
struct spinel_device * const device = impl->device;
// get the head dispatch
struct spinel_pbi_dispatch * const dispatch = spinel_pbi_dispatch_head(impl);
// do we need to acquire a delayed semaphore?
if (dispatch->signal.delayed == SPN_DEPS_DELAYED_SEMAPHORE_INVALID)
{
struct spinel_deps_acquire_delayed_info const dadi = {
.submission = { .pfn = spinel_pbi_flush_submit, //
.data0 = impl,
.data1 = dispatch }
};
dispatch->signal.delayed = spinel_deps_delayed_acquire(device->deps, &device->vk, &dadi);
}
// acquire path host id
path->handle = spinel_device_handle_acquire(device);
// update device-side path header with host-side path handle
impl->wip.header.named.handle = path->handle;
// associate delayed semaphore with handle
spinel_deps_delayed_attach(device->deps, path->handle, dispatch->signal.delayed);
// append path to dispatch
spinel_pbi_dispatch_append(impl, dispatch, path);
uint32_t const offset = impl->wip.head.idx * impl->config.block_dwords;
uint32_t * const head = impl->mapped.blocks.u32 + offset;
// copy wip header to mapped coherent head block
memcpy(head, impl->wip.header.array, sizeof(impl->wip.header));
// reset wip header
spinel_pbi_wip_reset(impl);
// eagerly flush?
if (dispatch->blocks.span >= impl->config.eager_size)
{
spinel_deps_delayed_flush(device->deps, dispatch->signal.delayed);
}
return SPN_SUCCESS;
}
//
//
//
static spinel_result_t
spinel_pbi_release(struct spinel_path_builder_impl * impl)
{
//
// Launch any wip dispatch
//
spinel_pbi_flush(impl);
//
// Wait for all in-flight dispatches to complete
//
struct spinel_ring * const ring = &impl->dispatches.ring;
struct spinel_device * const device = impl->device;
while (!spinel_ring_is_full(ring))
{
spinel_deps_drain_1(device->deps, &device->vk);
}
//
// Free device allocations.
//
vkUnmapMemory(device->vk.d, impl->vk.ring.dbi_dm.dm); // not necessary
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr,
device->vk.d,
device->vk.ac,
&impl->vk.ring.dbi_dm);
spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw,
device->vk.d,
device->vk.ac,
&impl->vk.alloc.dbi_dm);
//
// Free host allocations
//
free(impl->dispatches.extent);
free(impl->paths.extent);
free(impl->path_builder);
free(impl);
spinel_context_release(device->context);
return SPN_SUCCESS;
}
//
//
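// Creates the path builder implementation: allocates the device-side
// "alloc" scratch and the mappable block/cmd ring, maps the ring,
// allocates the host-side paths and dispatches extents, and initializes
// the first dispatch record.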
//
spinel_result_t
spinel_path_builder_impl_create(struct spinel_device * device,
struct spinel_path_builder ** path_builder)
{
spinel_context_retain(device->context);
//
// allocate impl
//
struct spinel_path_builder_impl * const impl = MALLOC_MACRO(sizeof(*impl));
//
// allocate path builder
//
struct spinel_path_builder * const pb = MALLOC_MACRO(sizeof(*pb));
// init impl and pb back-pointers
*path_builder = pb;
impl->path_builder = pb;
pb->impl = impl;
// save device
impl->device = device;
//
// init path builder pfns and rem count
//
pb->begin = spinel_pbi_begin;
pb->end = spinel_pbi_end;
pb->release = spinel_pbi_release;
pb->flush = spinel_pbi_flush;
#undef SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X
#define SPN_PATH_BUILDER_PRIM_TYPE_EXPAND_X(_p, _i, _n) pb->_p = SPN_PBI_PFN_NAME(_p);
SPN_PATH_BUILDER_PRIM_TYPE_EXPAND()
//
// init refcount & state
//
pb->ref_count = 1;
SPN_ASSERT_STATE_INIT(SPN_PATH_BUILDER_STATE_READY, pb);
//
// get target config
//
struct spinel_target_config const * const config = &device->ti.config;
//
// FIXME(allanmac): Stop replicating these constants in the impl?
//
// stash device-specific params
uint32_t const block_dwords = 1u << config->block_pool.block_dwords_log2;
uint32_t const subblock_dwords = 1u << config->block_pool.subblock_dwords_log2;
uint32_t const subgroup_dwords = 1u << config->group_sizes.named.paths_copy.subgroup_log2;
impl->config.block_dwords = block_dwords;
impl->config.block_subgroups = block_dwords / subgroup_dwords;
impl->config.subgroup_dwords = subgroup_dwords;
impl->config.subgroup_subblocks = subgroup_dwords / subblock_dwords;
impl->config.rolling_one = (block_dwords / subblock_dwords) << SPN_TAGGED_BLOCK_ID_BITS_TAG;
impl->config.eager_size = config->path_builder.size.eager;
uint32_t const max_in_flight = config->path_builder.size.dispatches;
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw,
device->vk.pd,
device->vk.d,
device->vk.ac,
sizeof(uint32_t) * max_in_flight,
NULL,
&impl->vk.alloc);
uint32_t const ring_size = config->path_builder.size.ring;
//
// initialize mapped counters
//
spinel_ring_init(&impl->mapped.ring, ring_size);
impl->mapped.rolling = 0;
//
// each ring entry is a block of dwords and a one dword cmd
//
// round up to coherent atom whether allocator is coherent or not
//
uint32_t const extent_dwords = ring_size * (block_dwords + 1);
size_t const extent_size = extent_dwords * sizeof(uint32_t);
VkDeviceSize const extent_size_ru = ROUND_UP_POW2_MACRO(extent_size, //
device->vk.limits.noncoherent_atom_size);
spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr,
device->vk.pd,
device->vk.d,
device->vk.ac,
extent_size_ru,
NULL,
&impl->vk.ring);
//
// map and initialize blocks and cmds
//
vk(MapMemory(device->vk.d,
impl->vk.ring.dbi_dm.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.blocks.u32));
//
// cmds are offset from blocks
//
uint32_t const cmds_offset = ring_size * block_dwords;
impl->mapped.cmds = impl->mapped.blocks.u32 + cmds_offset;
//
// allocate path release extent
//
size_t const paths_size = sizeof(*impl->paths.extent) * ring_size;
impl->paths.extent = MALLOC_MACRO(paths_size);
spinel_next_init(&impl->paths.next, ring_size);
//
// reset wip after mapped counters and path release extent
//
spinel_pbi_wip_reset(impl);
//
// allocate dispatches ring
//
size_t const dispatches_size = sizeof(*impl->dispatches.extent) * max_in_flight;
impl->dispatches.extent = MALLOC_MACRO(dispatches_size);
spinel_ring_init(&impl->dispatches.ring, max_in_flight);
spinel_pbi_dispatch_head_init(impl);
return SPN_SUCCESS;
}
//
//
//