| // Copyright 2019 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // |
| // |
| // |
| |
| #include "composition_impl.h" |
| |
| #include <assert.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "block_pool.h" |
| #include "common/vk/assert.h" |
| #include "common/vk/barrier.h" |
| #include "device.h" |
| #include "flush.h" |
| #include "handle_pool.h" |
| #include "queue_pool.h" |
| #include "raster_builder_impl.h" |
| #include "ring.h" |
| #include "shaders/push.h" |
| #include "spinel/spinel_assert.h" |
| #include "state_assert.h" |
| |
| // |
| // The composition launches a number of dependent command buffers: |
| // |
| // 1. RESET TTCK atomic count |
| // 2. PLACE shaders |
| // 3. SORT INDIRECT keyvals |
| // 4. SEGMENT keyvals |
| // |
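| // These steps map onto the record callbacks below: RESET is recorded by |
| // spinel_ci_unsealed_reset_record(), PLACE by |
| // spinel_ci_place_flush_record(), and SORT/SEGMENT by |
| // spinel_ci_unsealed_to_sealed_record(). |
| // |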
| |
| // |
| // Per-dispatch state |
| // |
| struct spinel_ci_dispatch |
| { |
| struct |
| { |
| uint32_t head; |
| uint32_t span; |
| } cp; // place commands |
| |
| struct |
| { |
| uint32_t head; |
| } rd; // raster handles are 1:1 with place commands |
| |
| struct |
| { |
| spinel_deps_immediate_semaphore_t immediate; // "invalid" once drained |
| } signal; |
| }; |
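| |
| // |
| // A dispatch accumulates place commands (cp.head/cp.span) and their 1:1 |
| // raster handles (rd.head) until it is flushed. The signal.immediate |
| // member is only meaningful while the dispatch is in flight. |
| // |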
| |
| // |
| // Vulkan objects |
| // |
| struct spinel_ci_vk |
| { |
| struct |
| { |
| struct spinel_dbi_dm_devaddr h; |
| struct spinel_dbi_dm_devaddr d; |
| } rings; |
| |
| struct spinel_dbi_dm_devaddr ttcks; |
| struct spinel_dbi_dm ttck_keyvals_odd; |
| struct spinel_dbi_devaddr ttck_keyvals_out; |
| |
| struct |
| { |
| struct spinel_dbi_dm internal; |
| struct spinel_dbi_dm indirect; |
| } rs; |
| }; |
| |
| // |
| // Valid states |
| // |
| typedef enum spinel_ci_state_e |
| { |
| SPN_CI_STATE_RESETTING, // unsealed and resetting |
| SPN_CI_STATE_UNSEALED, // unsealed and ready to place rasters |
| SPN_CI_STATE_SEALING, // waiting for PLACE and TTCK_SORT |
| SPN_CI_STATE_SEALED // sort & segment complete |
| |
| } spinel_ci_state_e; |
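| |
| // |
| // State transitions (see the correspondingly named functions below): |
| // |
| //   RESETTING          -> UNSEALED   spinel_ci_unsealed_reset_complete() |
| //   UNSEALED           -> RESETTING  spinel_ci_unsealed_reset() |
| //   UNSEALED/RESETTING -> SEALING    spinel_ci_unsealed_to_sealed() |
| //   SEALING            -> SEALED     spinel_ci_unsealed_to_sealed_complete() |
| //   SEALED             -> UNSEALED   spinel_ci_sealed_unseal() |
| // |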
| |
| // |
| // |
| // |
| struct spinel_composition_impl |
| { |
| struct spinel_composition * composition; |
| struct spinel_device * device; |
| |
| // |
| // Vulkan resources |
| // |
| struct spinel_ci_vk vk; |
| |
| // |
| // composition clip |
| // |
| SPN_TYPE_I32VEC4 clip; |
| |
| // |
| // host-mapped place command ring |
| // |
| struct |
| { |
| struct |
| { |
| struct spinel_cmd_place * extent; |
| struct spinel_ring ring; |
| } cp; // place commands |
| } mapped; |
| |
| // |
| // records of work-in-progress and work-in-flight |
| // |
| struct |
| { |
| struct spinel_ci_dispatch * extent; |
| struct spinel_ring ring; |
| } dispatches; |
| |
| // |
| // all rasters are retained until reset or release |
| // |
| struct |
| { |
| spinel_handle_t * extent; |
| uint32_t size; |
| uint32_t count; |
| } rasters; |
| |
| uint32_t lock_count; // # of wip renders |
| spinel_ci_state_e state; // state of composition |
| |
| // |
| // signalling timelines |
| // |
| struct |
| { |
| struct |
| { |
| spinel_deps_immediate_semaphore_t immediate; |
| } resetting; |
| struct |
| { |
| spinel_deps_immediate_semaphore_t immediate; |
| } sealing; |
| } signal; |
| }; |
| |
| // |
| // The place command ring is staged through a separate device-local |
| // buffer when the host-visible ring allocation is not device-local. |
| // |
| static bool |
| spinel_ci_is_staged(struct spinel_device const * device) |
| { |
| return !spinel_allocator_is_device_local(&device->allocator.device.perm.hw_dr); |
| } |
| |
| // |
| // A dispatch captures the place commands and raster handles belonging |
| // to a dispatched or work-in-progress compute grid. |
| // |
| static struct spinel_ci_dispatch * |
| spinel_ci_dispatch_head(struct spinel_composition_impl * impl) |
| { |
| assert(!spinel_ring_is_empty(&impl->dispatches.ring)); |
| |
| return impl->dispatches.extent + impl->dispatches.ring.head; |
| } |
| |
| static struct spinel_ci_dispatch * |
| spinel_ci_dispatch_tail(struct spinel_composition_impl * impl) |
| { |
| assert(!spinel_ring_is_full(&impl->dispatches.ring)); |
| |
| return impl->dispatches.extent + impl->dispatches.ring.tail; |
| } |
| |
| static bool |
| spinel_ci_dispatch_is_empty(struct spinel_ci_dispatch const * dispatch) |
| { |
| return (dispatch->cp.span == 0); |
| } |
| |
| static void |
| spinel_ci_dispatch_init(struct spinel_composition_impl * impl, struct spinel_ci_dispatch * dispatch) |
| { |
| // .signal doesn't need initialization |
| *dispatch = (struct spinel_ci_dispatch){ |
| .cp = { |
| .head = impl->mapped.cp.ring.head, |
| .span = 0, |
| }, |
| .rd = { |
| .head = impl->rasters.count, |
| }, |
| }; |
| } |
| |
| static void |
| spinel_ci_dispatch_drop(struct spinel_composition_impl * impl) |
| { |
| struct spinel_ring * const ring = &impl->dispatches.ring; |
| |
| spinel_ring_drop_1(ring); |
| } |
| |
| static void |
| spinel_ci_dispatch_acquire(struct spinel_composition_impl * impl) |
| { |
| struct spinel_ring * const ring = &impl->dispatches.ring; |
| struct spinel_device * const device = impl->device; |
| |
| while (spinel_ring_is_empty(ring)) |
| { |
| spinel_deps_drain_1(device->deps, &device->vk); |
| } |
| |
| struct spinel_ci_dispatch * const dispatch = spinel_ci_dispatch_head(impl); |
| |
| spinel_ci_dispatch_init(impl, dispatch); |
| } |
| |
| // |
| // |
| // |
| static void |
| spinel_ci_place_flush_complete(void * data0, void * data1) |
| { |
| struct spinel_composition_impl * impl = data0; |
| struct spinel_ci_dispatch * dispatch = data1; |
| |
| // |
| // If this dispatch is the tail of the ring then release as many |
| // completed dispatch records as possible... |
| // |
| // Note that submissions can complete in any order, but spans must be |
| // released to the command ring's tail in order. |
| // |
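| // For example (illustrative): if dispatches A, B, and C are in flight |
| // and B completes first, B is only marked invalid; once A completes, |
| // the loop below releases both A's and B's spans in tail order. |
| // |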
| dispatch->signal.immediate = SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID; |
| |
| dispatch = spinel_ci_dispatch_tail(impl); |
| |
| while (dispatch->signal.immediate == SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID) |
| { |
| // release ring span |
| spinel_ring_release_n(&impl->mapped.cp.ring, dispatch->cp.span); |
| |
| // release the dispatch |
| spinel_ring_release_n(&impl->dispatches.ring, 1); |
| |
| // any dispatches in flight? |
| if (spinel_ring_is_full(&impl->dispatches.ring)) |
| { |
| break; |
| } |
| |
| // get new tail |
| dispatch = spinel_ci_dispatch_tail(impl); |
| } |
| } |
| |
| // |
| // |
| // |
| static VkPipelineStageFlags |
| spinel_ci_place_flush_record(VkCommandBuffer cb, void * data0, void * data1) |
| { |
| struct spinel_composition_impl * const impl = data0; |
| struct spinel_device * const device = impl->device; |
| struct spinel_target_config const * const config = &device->ti.config; |
| struct spinel_ci_dispatch * const dispatch = data1; |
| |
| if (spinel_ci_is_staged(device)) |
| { |
| VkDeviceSize const head_offset = dispatch->cp.head * sizeof(struct spinel_cmd_place); |
| |
| if (dispatch->cp.head + dispatch->cp.span <= impl->mapped.cp.ring.size) |
| { |
| VkBufferCopy bcs[1]; |
| |
| bcs[0].srcOffset = impl->vk.rings.h.dbi_dm.dbi.offset + head_offset; |
| bcs[0].dstOffset = impl->vk.rings.d.dbi_dm.dbi.offset + head_offset; |
| bcs[0].size = dispatch->cp.span * sizeof(struct spinel_cmd_place); |
| |
| vkCmdCopyBuffer(cb, |
| impl->vk.rings.h.dbi_dm.dbi.buffer, |
| impl->vk.rings.d.dbi_dm.dbi.buffer, |
| 1, |
| bcs); |
| } |
| else // wraps around ring |
| { |
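| // Illustrative numbers: with ring.size == 8, cp.head == 6, and |
| // cp.span == 4, the copy splits into hi == 2 trailing slots (6..7) and |
| // lo == 2 wrapped slots (0..1). |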
| VkBufferCopy bcs[2]; |
| |
| uint32_t const hi = impl->mapped.cp.ring.size - dispatch->cp.head; |
| bcs[0].srcOffset = impl->vk.rings.h.dbi_dm.dbi.offset + head_offset; |
| bcs[0].dstOffset = impl->vk.rings.d.dbi_dm.dbi.offset + head_offset; |
| bcs[0].size = hi * sizeof(struct spinel_cmd_place); |
| |
| uint32_t const lo = dispatch->cp.head + dispatch->cp.span - impl->mapped.cp.ring.size; |
| bcs[1].srcOffset = impl->vk.rings.h.dbi_dm.dbi.offset; |
| bcs[1].dstOffset = impl->vk.rings.d.dbi_dm.dbi.offset; |
| bcs[1].size = lo * sizeof(struct spinel_cmd_place); |
| |
| vkCmdCopyBuffer(cb, |
| impl->vk.rings.h.dbi_dm.dbi.buffer, |
| impl->vk.rings.d.dbi_dm.dbi.buffer, |
| 2, |
| bcs); |
| } |
| |
| vk_barrier_transfer_w_to_compute_r(cb); |
| } |
| |
| // |
| // PLACE |
| // |
| // NOTE(allanmac): PLACE_TTPK and PLACE_TTSK have compatible push constants. |
| // |
| struct spinel_push_place const push_place = { |
| |
| .place_clip = impl->clip, |
| .devaddr_block_pool_blocks = device->block_pool.vk.dbi_devaddr.blocks.devaddr, |
| .devaddr_block_pool_host_map = device->block_pool.vk.dbi_devaddr.host_map.devaddr, |
| .devaddr_ttcks = impl->vk.ttcks.devaddr, |
| .devaddr_place = impl->vk.rings.d.devaddr, |
| .place_head = dispatch->cp.head, |
| .place_span = dispatch->cp.span, |
| .place_size = impl->mapped.cp.ring.size |
| }; |
| |
| vkCmdPushConstants(cb, |
| device->ti.pipeline_layouts.named.place_ttpk, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, |
| sizeof(push_place), |
| &push_place); |
| |
| // |
| // dispatch one subgroup per command -- PLACE_TTPK and PLACE_TTSK share |
| // the same group sizes |
| // |
| // clang-format off |
| uint32_t const place_wg_size = config->group_sizes.named.place_ttpk.workgroup; |
| uint32_t const place_sg_size_log2 = config->group_sizes.named.place_ttpk.subgroup_log2; |
| uint32_t const place_cmds_per_wg = place_wg_size >> place_sg_size_log2; |
| uint32_t const place_wgs = (dispatch->cp.span + place_cmds_per_wg - 1) / place_cmds_per_wg; |
| // clang-format on |
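| // |
| // For example (assumed sizes): a 256-invocation workgroup with 32-wide |
| // subgroups (subgroup_log2 == 5) places 8 commands per workgroup, so a |
| // span of 100 commands dispatches (100 + 7) / 8 == 13 workgroups. |
| // |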
| |
| // bind PLACE_TTPK |
| vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.place_ttpk); |
| |
| // dispatch PLACE_TTPK |
| vkCmdDispatch(cb, place_wgs, 1, 1); |
| |
| // bind PLACE_TTSK |
| vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.place_ttsk); |
| |
| // dispatch PLACE_TTSK |
| vkCmdDispatch(cb, place_wgs, 1, 1); |
| |
| // |
| // This command buffer ends with a compute shader |
| // |
| return VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| } |
| |
| // |
| // |
| // |
| static void |
| spinel_ci_place_flush(struct spinel_composition_impl * impl) |
| { |
| struct spinel_ci_dispatch * const dispatch = spinel_ci_dispatch_head(impl); |
| struct spinel_device * const device = impl->device; |
| |
| // Is this a dispatch with no commands? |
| if (spinel_ci_dispatch_is_empty(dispatch)) |
| { |
| return; |
| } |
| |
| // |
| // Flush if the ring is non-coherent |
| // |
| if (!spinel_allocator_is_coherent(&device->allocator.device.perm.hw_dr)) |
| { |
| spinel_ring_flush(&device->vk, |
| impl->vk.rings.h.dbi_dm.dm, |
| 0UL, |
| impl->mapped.cp.ring.size, |
| dispatch->cp.head, |
| dispatch->cp.span, |
| sizeof(*impl->mapped.cp.extent)); |
| } |
| |
| // |
| // Submit dispatch |
| // |
| // Waits upon: |
| // |
| // * Composition reset |
| // * Materialization of raster handles |
| // |
| bool const is_resetting = (impl->signal.resetting.immediate != // |
| SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID); |
| |
| struct spinel_deps_immediate_submit_info const disi = { |
| .record = { |
| .pfn = spinel_ci_place_flush_record, |
| .data0 = impl, |
| .data1 = dispatch, |
| }, |
| .wait = { |
| .immediate = { |
| .count = (is_resetting ? 1 : 0), |
| .semaphores = { impl->signal.resetting.immediate }, |
| }, |
| .delayed = { |
| .handles = { |
| .extent = impl->rasters.extent, |
| .size = impl->rasters.size, |
| .head = dispatch->rd.head, |
| .span = dispatch->cp.span, |
| }, |
| }, |
| }, |
| .completion = { |
| .pfn = spinel_ci_place_flush_complete, |
| .data0 = impl, |
| .data1 = dispatch, |
| }, |
| }; |
| |
| // |
| // The current dispatch is now sealed so drop it |
| // |
| spinel_ci_dispatch_drop(impl); |
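| |
| // NOTE: `dispatch` still points at valid storage after the drop -- the |
| // drop only advances the ring's head; the record is recycled later by |
| // the completion callback. |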
| |
| // |
| // Submit! |
| // |
| spinel_deps_immediate_submit(device->deps, &device->vk, &disi, &dispatch->signal.immediate); |
| |
| // |
| // Acquire and initialize the next dispatch |
| // |
| spinel_ci_dispatch_acquire(impl); |
| } |
| |
| // |
| // COMPLETION: SEALING |
| // |
| // PHASE 1: COPYBACK |
| // PHASE 2: SORT & SEGMENT |
| // |
| // The same payload is used for both phases |
| // |
| static void |
| spinel_ci_unsealed_to_sealed_complete(void * data0, void * data1) |
| { |
| struct spinel_composition_impl * const impl = data0; |
| |
| impl->state = SPN_CI_STATE_SEALED; |
| |
| impl->signal.sealing.immediate = SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID; |
| } |
| |
| // |
| // |
| // |
| static VkPipelineStageFlags |
| spinel_ci_unsealed_to_sealed_record(VkCommandBuffer cb, void * data0, void * data1) |
| { |
| struct spinel_composition_impl * impl = data0; |
| struct spinel_device * const device = impl->device; |
| |
| // |
| // Sort the TTCK keyvals |
| // |
| VkDescriptorBufferInfo const ttck_count_dbi = { |
| .buffer = impl->vk.ttcks.dbi_dm.dbi.buffer, |
| .offset = impl->vk.ttcks.dbi_dm.dbi.offset + SPN_BUFFER_OFFSETOF(ttcks, segment_dispatch.w), |
| .range = sizeof(uint32_t), |
| }; |
| |
| VkDescriptorBufferInfo const ttck_keyvals_even_dbi = { |
| .buffer = impl->vk.ttcks.dbi_dm.dbi.buffer, |
| .offset = impl->vk.ttcks.dbi_dm.dbi.offset + SPN_BUFFER_OFFSETOF(ttcks, ttck_keyvals), |
| .range = impl->vk.ttcks.dbi_dm.dbi.range - SPN_BUFFER_OFFSETOF(ttcks, ttck_keyvals), |
| }; |
| |
| struct radix_sort_vk_sort_indirect_info const info = { |
| .ext = NULL, |
| .key_bits = SPN_TTCK_HI_BITS_LXY, |
| .count = ttck_count_dbi, |
| .keyvals_even = ttck_keyvals_even_dbi, |
| .keyvals_odd = impl->vk.ttck_keyvals_odd.dbi, |
| .internal = impl->vk.rs.internal.dbi, |
| .indirect = impl->vk.rs.indirect.dbi, |
| }; |
| |
| radix_sort_vk_sort_indirect(device->ti.rs, |
| &info, |
| device->vk.d, |
| cb, |
| &impl->vk.ttck_keyvals_out.dbi); |
| |
| // |
| // Init ttck_keyvals_out.devaddr |
| // |
| spinel_dbi_devaddr_init_devaddr(impl->device->vk.d, &impl->vk.ttck_keyvals_out); |
| |
| // |
| // COMPUTE>COMPUTE |
| // |
| vk_barrier_compute_w_to_compute_r(cb); |
| |
| // |
| // Dispatch TTCKS_SEGMENT_DISPATCH |
| // |
| struct spinel_push_ttcks_segment_dispatch const push_ttcks_segment_dispatch = { |
| |
| .devaddr_ttcks_header = impl->vk.ttcks.devaddr, |
| }; |
| |
| vkCmdPushConstants(cb, |
| device->ti.pipeline_layouts.named.ttcks_segment_dispatch, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, |
| sizeof(push_ttcks_segment_dispatch), |
| &push_ttcks_segment_dispatch); |
| |
| vkCmdBindPipeline(cb, |
| VK_PIPELINE_BIND_POINT_COMPUTE, |
| device->ti.pipelines.named.ttcks_segment_dispatch); |
| |
| vkCmdDispatch(cb, 1, 1, 1); |
| |
| // |
| // COMPUTE>INDIRECT|COMPUTE |
| // |
| vk_barrier_compute_w_to_indirect_compute_r(cb); |
| |
| // |
| // Dispatch TTCKS_SEGMENT |
| // |
| struct spinel_push_ttcks_segment const push_ttcks_segment = { |
| |
| .devaddr_ttcks_header = impl->vk.ttcks.devaddr, |
| .devaddr_ttck_keyvals = impl->vk.ttck_keyvals_out.devaddr |
| }; |
| |
| vkCmdPushConstants(cb, |
| device->ti.pipeline_layouts.named.ttcks_segment, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, |
| sizeof(push_ttcks_segment), |
| &push_ttcks_segment); |
| |
| vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.ttcks_segment); |
| |
| // |
| // Dispatch segmentation pipeline indirectly |
| // |
| VkDeviceSize const ttcks_segment_dispatch_offset = SPN_BUFFER_OFFSETOF(ttcks, segment_dispatch); |
| |
| vkCmdDispatchIndirect(cb, impl->vk.ttcks.dbi_dm.dbi.buffer, ttcks_segment_dispatch_offset); |
| |
| // |
| // This command buffer ends with a compute shader |
| // |
| return VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; |
| } |
| |
| // |
| // Accumulate wait.immediate semaphores: |
| // |
| // 1. Resetting signalling timeline |
| // 2. All in-flight "PLACE" signalling timelines |
| // |
| static void |
| spinel_ci_unsealed_to_sealed_accumulate_waits(struct spinel_composition_impl * impl, |
| struct spinel_deps_immediate_submit_info * disi) |
| { |
| // |
| // We also wait on resetting here in case there were zero dispatches |
| // |
| bool const is_resetting = (impl->signal.resetting.immediate != // |
| SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID); |
| |
| if (is_resetting) |
| { |
| disi->wait.immediate.semaphores[disi->wait.immediate.count++] = |
| impl->signal.resetting.immediate; |
| } |
| |
| // |
| // In-flight dispatches |
| // |
| struct spinel_ring const * const ring = &impl->dispatches.ring; |
| uint32_t const in_flight = spinel_ring_dropped(ring); |
| uint32_t tail = ring->tail; |
| struct spinel_ci_dispatch const * const dispatches = impl->dispatches.extent; |
| |
| for (uint32_t ii = 0; ii < in_flight; ii++) |
| { |
| struct spinel_ci_dispatch const * const dispatch = dispatches + tail++; |
| |
| if (dispatch->signal.immediate != SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID) |
| { |
| disi->wait.immediate.semaphores[disi->wait.immediate.count++] = |
| dispatch->signal.immediate; |
| } |
| |
| if (tail == ring->size) |
| { |
| tail = 0; |
| } |
| } |
| } |
| |
| // |
| // 1. Flush work-in-progress place dispatch |
| // 2. Indirect radix sort |
| // 3. Segment ttck keys |
| // |
| static void |
| spinel_ci_unsealed_to_sealed(struct spinel_composition_impl * impl) |
| { |
| // |
| // Move to SEALING state... |
| // |
| impl->state = SPN_CI_STATE_SEALING; |
| |
| // |
| // Flush any work-in-progress place |
| // |
| spinel_ci_place_flush(impl); |
| |
| // |
| // Prepare the deps submit info |
| // |
| struct spinel_device * const device = impl->device; |
| |
| struct spinel_deps_immediate_submit_info disi = { |
| .record = { |
| .pfn = spinel_ci_unsealed_to_sealed_record, |
| .data0 = impl, |
| }, |
| .completion = { |
| .pfn = spinel_ci_unsealed_to_sealed_complete, // |
| .data0 = impl, |
| }, |
| }; |
| |
| // |
| // Gather place dependencies |
| // |
| spinel_ci_unsealed_to_sealed_accumulate_waits(impl, &disi); |
| |
| // |
| // Acquire an immediate semaphore |
| // |
| spinel_deps_immediate_submit(device->deps, &device->vk, &disi, &impl->signal.sealing.immediate); |
| } |
| |
| // |
| // |
| // |
| static void |
| spinel_ci_unsealed_reset_complete(void * data0, void * data1) |
| { |
| struct spinel_composition_impl * const impl = data0; |
| |
| // |
| // move to UNSEALED state and invalidate timeline |
| // |
| impl->state = SPN_CI_STATE_UNSEALED; |
| impl->signal.resetting.immediate = SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID; |
| |
| // |
| // are there retained rasters? |
| // |
| if (impl->rasters.count > 0) |
| { |
| // |
| // release retained rasters |
| // |
| spinel_device_release_d_rasters(impl->device, impl->rasters.extent, impl->rasters.count); |
| |
| // |
| // zero the count |
| // |
| impl->rasters.count = 0; |
| |
| // |
| // reset the WIP dispatch |
| // |
| struct spinel_ci_dispatch * const dispatch = spinel_ci_dispatch_head(impl); |
| |
| spinel_ci_dispatch_init(impl, dispatch); |
| } |
| } |
| |
| // |
| // Zero the leading two u32vec4s of the ttcks header |
| // |
| static VkPipelineStageFlags |
| spinel_ci_unsealed_reset_record(VkCommandBuffer cb, void * data0, void * data1) |
| { |
| struct spinel_composition_impl * const impl = data0; |
| |
| vkCmdFillBuffer(cb, |
| impl->vk.ttcks.dbi_dm.dbi.buffer, |
| impl->vk.ttcks.dbi_dm.dbi.offset, |
| sizeof(SPN_TYPE_U32VEC4) * 2, |
| 0); |
| |
| // |
| // This command buffer ends with a transfer |
| // |
| return VK_PIPELINE_STAGE_TRANSFER_BIT; |
| } |
| |
| // |
| // |
| // |
| static void |
| spinel_ci_unsealed_reset(struct spinel_composition_impl * impl) |
| { |
| // |
| // Move to the RESETTING state and kick off a zeroing fill |
| // |
| impl->state = SPN_CI_STATE_RESETTING; |
| |
| // |
| // acquire a signalling timeline |
| // |
| struct spinel_device * const device = impl->device; |
| |
| struct spinel_deps_immediate_submit_info disi = { |
| .record = { |
| .pfn = spinel_ci_unsealed_reset_record, |
| .data0 = impl, |
| }, |
| .completion = { |
| .pfn = spinel_ci_unsealed_reset_complete, |
| .data0 = impl, |
| }, |
| }; |
| |
| spinel_deps_immediate_submit(device->deps, &device->vk, &disi, &impl->signal.resetting.immediate); |
| } |
| |
| // |
| // |
| // |
| static void |
| spinel_ci_block_until_sealed(struct spinel_composition_impl * impl) |
| { |
| struct spinel_device * const device = impl->device; |
| |
| while (impl->state != SPN_CI_STATE_SEALED) |
| { |
| spinel_deps_drain_1(device->deps, &device->vk); |
| } |
| } |
| |
| // |
| // |
| // |
| static void |
| spinel_ci_block_while_resetting(struct spinel_composition_impl * impl) |
| { |
| struct spinel_device * const device = impl->device; |
| |
| while (impl->state == SPN_CI_STATE_RESETTING) |
| { |
| spinel_deps_drain_1(device->deps, &device->vk); |
| } |
| } |
| |
| // |
| // wait for any in-flight renders to complete |
| // |
| static void |
| spinel_ci_sealed_unseal(struct spinel_composition_impl * impl) |
| { |
| struct spinel_device * const device = impl->device; |
| |
| while (impl->lock_count > 0) |
| { |
| spinel_deps_drain_1(device->deps, &device->vk); |
| } |
| |
| impl->state = SPN_CI_STATE_UNSEALED; |
| } |
| |
| // |
| // FIXME(allanmac): add UNSEALING state |
| // |
| static spinel_result_t |
| spinel_ci_seal(struct spinel_composition_impl * impl) |
| { |
| switch (impl->state) |
| { |
| case SPN_CI_STATE_RESETTING: |
| case SPN_CI_STATE_UNSEALED: |
| spinel_ci_unsealed_to_sealed(impl); |
| return SPN_SUCCESS; |
| |
| case SPN_CI_STATE_SEALING: |
| return SPN_SUCCESS; |
| |
| case SPN_CI_STATE_SEALED: |
| // default: |
| return SPN_SUCCESS; |
| } |
| } |
| |
| static spinel_result_t |
| spinel_ci_unseal(struct spinel_composition_impl * impl) |
| { |
| switch (impl->state) |
| { |
| case SPN_CI_STATE_RESETTING: |
| case SPN_CI_STATE_UNSEALED: |
| return SPN_SUCCESS; |
| |
| case SPN_CI_STATE_SEALING: |
| spinel_ci_block_until_sealed(impl); |
| __attribute__((fallthrough)); |
| |
| case SPN_CI_STATE_SEALED: |
| // default: |
| spinel_ci_sealed_unseal(impl); |
| return SPN_SUCCESS; |
| } |
| } |
| |
| static spinel_result_t |
| spinel_ci_reset(struct spinel_composition_impl * impl) |
| { |
| switch (impl->state) |
| { |
| case SPN_CI_STATE_RESETTING: |
| return SPN_SUCCESS; |
| |
| case SPN_CI_STATE_UNSEALED: |
| spinel_ci_unsealed_reset(impl); |
| return SPN_SUCCESS; |
| |
| case SPN_CI_STATE_SEALING: |
| return SPN_ERROR_COMPOSITION_SEALED; |
| |
| case SPN_CI_STATE_SEALED: |
| // default: |
| return SPN_ERROR_COMPOSITION_SEALED; |
| } |
| } |
| |
| // |
| // |
| // |
| static spinel_result_t |
| spinel_ci_set_clip(struct spinel_composition_impl * impl, spinel_pixel_clip_t const * clip) |
| { |
| switch (impl->state) |
| { |
| case SPN_CI_STATE_RESETTING: |
| case SPN_CI_STATE_UNSEALED: |
| break; |
| |
| case SPN_CI_STATE_SEALING: |
| case SPN_CI_STATE_SEALED: |
| default: |
| return SPN_ERROR_COMPOSITION_SEALED; |
| } |
| |
| // |
| // Set up the place clip |
| // |
| struct spinel_target_config const * const config = &impl->device->ti.config; |
| |
| uint32_t const tile_w = 1 << config->tile.width_log2; |
| uint32_t const tile_h = 1 << config->tile.height_log2; |
| |
| uint32_t const surf_w = tile_w << SPN_TTCK_HI_BITS_X; |
| uint32_t const surf_h = tile_h << SPN_TTCK_HI_BITS_Y; |
| |
| uint32_t const clip_x0 = MIN_MACRO(uint32_t, clip->x0, surf_w); |
| uint32_t const clip_y0 = MIN_MACRO(uint32_t, clip->y0, surf_h); |
| |
| uint32_t const tile_w_mask = tile_w - 1; |
| uint32_t const tile_h_mask = tile_h - 1; |
| |
| uint32_t const clip_x1 = MIN_MACRO(uint32_t, clip->x1, surf_w) + tile_w_mask; |
| uint32_t const clip_y1 = MIN_MACRO(uint32_t, clip->y1, surf_h) + tile_h_mask; |
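| |
| // For example (assumed config): with 16x16 tiles (width_log2 == 4), a |
| // pixel clip of x0 == 10 and x1 == 100 yields clip_x0 == 10 and |
| // clip_x1 == 115 -- i.e. tile coords 0 and 7, so tiles 0..7 cover |
| // pixels 0..127 and contain the requested clip. |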
| |
| // |
| // Note that impl->clip is an i32vec4 |
| // |
| impl->clip.x = clip_x0 >> config->tile.width_log2; |
| impl->clip.y = clip_y0 >> config->tile.height_log2; |
| impl->clip.z = clip_x1 >> config->tile.width_log2; |
| impl->clip.w = clip_y1 >> config->tile.height_log2; |
| |
| return SPN_SUCCESS; |
| } |
| |
| // |
| // |
| // |
| static spinel_result_t |
| spinel_ci_place(struct spinel_composition_impl * impl, |
| spinel_raster_t const * rasters, |
| spinel_layer_id const * layer_ids, |
| spinel_txty_t const * txtys, |
| uint32_t count) |
| { |
| struct spinel_device * const device = impl->device; |
| |
| switch (impl->state) |
| { |
| case SPN_CI_STATE_RESETTING: |
| spinel_ci_block_while_resetting(impl); |
| break; |
| |
| case SPN_CI_STATE_UNSEALED: |
| break; |
| |
| case SPN_CI_STATE_SEALING: |
| case SPN_CI_STATE_SEALED: |
| default: |
| return SPN_ERROR_COMPOSITION_SEALED; |
| } |
| |
| // |
| // Nothing to do? |
| // |
| if (count == 0) |
| { |
| return SPN_SUCCESS; |
| } |
| |
| // |
| // Validate there is enough room for retained rasters |
| // |
| // Note that this is why we have to block if RESETTING. |
| // |
| if (impl->rasters.count + count > impl->rasters.size) |
| { |
| return SPN_ERROR_COMPOSITION_TOO_MANY_RASTERS; |
| } |
| |
| #ifndef NDEBUG |
| // |
| // NOTE(allanmac): We should never need to perform this test -- the |
| // layer invoking Spinel should ensure that layer ids remain below |
| // LAYER_MAX. |
| // |
| // Furthermore, the styling layer range is almost always far smaller than the |
| // LAYER_MAX. |
| // |
| // Validate range of layer ids |
| // |
| for (uint32_t ii = 0; ii < count; ii++) |
| { |
| if (layer_ids[ii] > SPN_TTCK_LAYER_MAX) |
| { |
| return SPN_ERROR_LAYER_ID_INVALID; |
| } |
| } |
| #endif |
| |
| // |
| // Validate first and then retain the rasters before we proceed |
| // |
| spinel_result_t result = spinel_device_validate_d_rasters(device, rasters, count); |
| |
| if (result != SPN_SUCCESS) |
| { |
| return result; |
| } |
| |
| // |
| // No survivable errors from here onward... any failure beyond here will be |
| // fatal to the context! |
| // |
| spinel_device_retain_d_rasters(device, rasters, count); |
| |
| // |
| // Save the raster handles -- rasters.count and the dispatch span are |
| // updated incrementally in the loop below |
| // |
| spinel_handle_t * const rasters_base = impl->rasters.extent + impl->rasters.count; |
| |
| for (uint32_t ii = 0; ii < count; ii++) |
| { |
| rasters_base[ii] = rasters[ii].handle; |
| } |
| |
| // |
| // copy place commands into the ring |
| // |
| struct spinel_ring * const ring = &impl->mapped.cp.ring; |
| |
| while (true) |
| { |
| // |
| // how many slots left in ring? |
| // |
| uint32_t const head_nowrap = spinel_ring_head_nowrap(ring); |
| uint32_t avail = MIN_MACRO(uint32_t, count, head_nowrap); |
| |
| // |
| // if ring is full then this implies we're already waiting on |
| // dispatches because an eager launch would've occurred |
| // |
| if (avail == 0) |
| { |
| spinel_deps_drain_1(device->deps, &device->vk); |
| continue; |
| } |
| |
| // |
| // update rasters count incrementally |
| // |
| impl->rasters.count += avail; |
| |
| // |
| // update dispatch |
| // |
| struct spinel_ci_dispatch * const dispatch = spinel_ci_dispatch_head(impl); |
| |
| dispatch->cp.span += avail; |
| |
| count -= avail; |
| |
| // |
| // append commands to ring |
| // |
| struct spinel_cmd_place * cmds = impl->mapped.cp.extent + ring->head; |
| |
| spinel_ring_drop_n(ring, avail); |
| |
| if (txtys == NULL) |
| { |
| while (avail-- > 0) |
| { |
| cmds->raster_h = rasters->handle; |
| cmds->layer_id = *layer_ids; |
| cmds->txty[0] = 0; |
| cmds->txty[1] = 0; |
| |
| ++rasters; |
| ++layer_ids; |
| ++cmds; |
| } |
| } |
| else |
| { |
| while (avail-- > 0) |
| { |
| cmds->raster_h = rasters->handle; |
| cmds->layer_id = *layer_ids; |
| cmds->txty[0] = txtys->tx; |
| cmds->txty[1] = txtys->ty; |
| |
| ++rasters; |
| ++layer_ids; |
| ++txtys; |
| ++cmds; |
| } |
| } |
| |
| // |
| // launch place kernel? |
| // |
| struct spinel_target_config const * const config = &device->ti.config; |
| |
| if (dispatch->cp.span >= config->composition.size.eager) |
| { |
| spinel_ci_place_flush(impl); |
| } |
| |
| // |
| // anything left? |
| // |
| if (count == 0) |
| { |
| return SPN_SUCCESS; |
| } |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static spinel_result_t |
| spinel_ci_release(struct spinel_composition_impl * impl) |
| { |
| // |
| // wait for resetting to complete |
| // |
| struct spinel_device * const device = impl->device; |
| |
| spinel_ci_block_while_resetting(impl); |
| |
| // |
| // wait for any in-flight PLACE dispatches to complete |
| // |
| while (!spinel_ring_is_full(&impl->dispatches.ring)) |
| { |
| spinel_deps_drain_1(device->deps, &device->vk); |
| } |
| |
| // |
| // wait for any in-flight renders to complete |
| // |
| while (impl->lock_count > 0) |
| { |
| spinel_deps_drain_1(device->deps, &device->vk); |
| } |
| |
| // |
| // release any retained rasters |
| // |
| if (impl->rasters.count > 0) |
| { |
| spinel_device_release_d_rasters(impl->device, impl->rasters.extent, impl->rasters.count); |
| } |
| |
| // |
| // free Radix Sort extents |
| // |
| spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.d, |
| device->vk.ac, |
| &impl->vk.rs.indirect); |
| |
| spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.d, |
| device->vk.ac, |
| &impl->vk.rs.internal); |
| |
| // |
| // free ttck_keyvals |
| // |
| spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.d, |
| device->vk.ac, |
| &impl->vk.ttck_keyvals_odd); |
| |
| // |
| // free ttcks |
| // |
| spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.d, |
| device->vk.ac, |
| &impl->vk.ttcks.dbi_dm); |
| // |
| // free rings |
| // |
| vkUnmapMemory(device->vk.d, impl->vk.rings.h.dbi_dm.dm); // not necessary |
| |
| spinel_allocator_free_dbi_dm(&device->allocator.device.perm.hw_dr, |
| device->vk.d, |
| device->vk.ac, |
| &impl->vk.rings.h.dbi_dm); |
| |
| if (spinel_ci_is_staged(device)) |
| { |
| spinel_allocator_free_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.d, |
| device->vk.ac, |
| &impl->vk.rings.d.dbi_dm); |
| } |
| |
| // |
| // free host allocations |
| // |
| free(impl->rasters.extent); |
| free(impl->dispatches.extent); |
| free(impl->composition); |
| free(impl); |
| |
| spinel_context_release(device->context); |
| |
| return SPN_SUCCESS; |
| } |
| |
| // |
| // |
| // |
| |
| spinel_result_t |
| spinel_composition_impl_create(struct spinel_device * device, |
| struct spinel_composition ** composition) |
| { |
| spinel_context_retain(device->context); |
| |
| // |
| // Allocate impl |
| // |
| struct spinel_composition_impl * const impl = MALLOC_MACRO(sizeof(*impl)); |
| |
| // |
| // Allocate composition |
| // |
| struct spinel_composition * const c = *composition = MALLOC_MACRO(sizeof(*c)); |
| |
| // |
| // Init back-pointers |
| // |
| impl->composition = c; |
| c->impl = impl; |
| |
| // Save device |
| impl->device = device; |
| |
| // No locks |
| impl->lock_count = 0; |
| |
| // Start in an unsealed state |
| impl->state = SPN_CI_STATE_UNSEALED; |
| |
| // |
| // initialize composition |
| // |
| c->release = spinel_ci_release; |
| c->place = spinel_ci_place; |
| c->seal = spinel_ci_seal; |
| c->unseal = spinel_ci_unseal; |
| c->reset = spinel_ci_reset; |
| c->set_clip = spinel_ci_set_clip; |
| c->ref_count = 1; |
| |
| // |
| // Default to max clip (in tile coordinates -- see spinel_ci_set_clip()) |
| // |
| impl->clip = (SPN_TYPE_I32VEC4){ .x = 0, |
| .y = 0, |
| .z = 1 << SPN_TTCK_HI_BITS_X, |
| .w = 1 << SPN_TTCK_HI_BITS_Y }; |
| |
| // |
| // Get config |
| // |
| struct spinel_target_config const * const config = &device->ti.config; |
| |
| // |
| // Init ring |
| // |
| spinel_ring_init(&impl->mapped.cp.ring, config->composition.size.ring); |
| |
| // |
| // Allocate and map ring |
| // |
| VkDeviceSize const ring_size = config->composition.size.ring * sizeof(*impl->mapped.cp.extent); |
| VkDeviceSize const ring_size_ru = ROUND_UP_POW2_MACRO(ring_size, // |
| device->vk.limits.noncoherent_atom_size); |
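| |
| // The round-up keeps the flush range in spinel_ci_place_flush() legal |
| // when the ring allocation is non-coherent. |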
| |
| spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.hw_dr, |
| device->vk.pd, |
| device->vk.d, |
| device->vk.ac, |
| ring_size_ru, |
| NULL, |
| &impl->vk.rings.h); |
| |
| vk(MapMemory(device->vk.d, |
| impl->vk.rings.h.dbi_dm.dm, |
| 0, |
| VK_WHOLE_SIZE, |
| 0, |
| (void **)&impl->mapped.cp.extent)); |
| |
| if (spinel_ci_is_staged(device)) |
| { |
| spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw, |
| device->vk.pd, |
| device->vk.d, |
| device->vk.ac, |
| ring_size_ru, |
| NULL, |
| &impl->vk.rings.d); |
| } |
| else |
| { |
| impl->vk.rings.d = impl->vk.rings.h; // no staging -- the device reads the host ring directly |
| } |
| |
| // |
| // Get radix sort memory requirements |
| // |
| struct radix_sort_vk_memory_requirements rs_mr; |
| |
| radix_sort_vk_get_memory_requirements(device->ti.rs, config->composition.size.ttcks, &rs_mr); |
| |
| assert(SPN_MEMBER_ALIGN_LIMIT >= rs_mr.keyvals_alignment); |
| |
| // |
| // Allocate ttcks extent |
| // |
| VkDeviceSize const ttcks_size = sizeof(SPN_BUFFER_TYPE(ttcks)) + rs_mr.keyvals_size; |
| |
| spinel_allocator_alloc_dbi_dm_devaddr(&device->allocator.device.perm.drw, |
| device->vk.pd, |
| device->vk.d, |
| device->vk.ac, |
| ttcks_size, |
| NULL, |
| &impl->vk.ttcks); |
| |
| // |
| // Allocate ttck_keyvals_odd extent |
| // |
| spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.pd, |
| device->vk.d, |
| device->vk.ac, |
| rs_mr.keyvals_size, |
| NULL, |
| &impl->vk.ttck_keyvals_odd); |
| |
| // |
| // Allocate radix sort internal and indirect buffers |
| // |
| spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.pd, |
| device->vk.d, |
| device->vk.ac, |
| rs_mr.internal_size, |
| NULL, |
| &impl->vk.rs.internal); |
| |
| spinel_allocator_alloc_dbi_dm(&device->allocator.device.perm.drw, |
| device->vk.pd, |
| device->vk.d, |
| device->vk.ac, |
| rs_mr.indirect_size, |
| NULL, |
| &impl->vk.rs.indirect); |
| |
| // |
| // How many dispatches? |
| // |
| uint32_t const max_in_flight = config->composition.size.dispatches; |
| |
| // |
| // Check worst case number of immediates is supported: |
| // |
| // max_in_flight + resetting |
| // |
| assert(max_in_flight + 1 <= SPN_DEPS_IMMEDIATE_SUBMIT_SIZE_WAIT_IMMEDIATE); |
| |
| // |
| // Allocate handle retention extent |
| // |
| size_t const d_size = sizeof(*impl->dispatches.extent) * max_in_flight; |
| size_t const r_size = sizeof(*impl->rasters.extent) * config->composition.size.rasters; |
| |
| impl->dispatches.extent = MALLOC_MACRO(d_size); |
| impl->rasters.extent = MALLOC_MACRO(r_size); |
| impl->rasters.size = config->composition.size.rasters; |
| impl->rasters.count = 0; |
| |
| spinel_ring_init(&impl->dispatches.ring, max_in_flight); |
| |
| // |
| // Initialize the first dispatch |
| // |
| spinel_ci_dispatch_init(impl, impl->dispatches.extent); |
| |
| // |
| // Kick off resetting... |
| // |
| spinel_ci_unsealed_reset(impl); |
| |
| return SPN_SUCCESS; |
| } |
| |
| // |
| // |
| // |
| spinel_deps_immediate_semaphore_t |
| spinel_composition_retain_and_lock(struct spinel_composition * composition) |
| { |
| struct spinel_composition_impl * const impl = composition->impl; |
| |
| assert(impl->state >= SPN_CI_STATE_SEALING); |
| |
| spinel_composition_retain(composition); |
| |
| composition->impl->lock_count += 1; |
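| |
| // NOTE: SPN_DEPS_IMMEDIATE_SEMAPHORE_INVALID is returned if sealing |
| // has already completed |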
| |
| return impl->signal.sealing.immediate; |
| } |
| |
| // |
| // |
| // |
| void |
| spinel_composition_unlock_and_release(struct spinel_composition * composition) |
| { |
| composition->impl->lock_count -= 1; |
| |
| spinel_composition_release(composition); |
| } |
| |
| // |
| // |
| // |
| void |
| spinel_composition_push_render_dispatch_record(struct spinel_composition * composition, |
| VkCommandBuffer cb) |
| { |
| struct spinel_composition_impl * const impl = composition->impl; |
| struct spinel_device * const device = impl->device; |
| |
| struct spinel_push_render_dispatch push_render_dispatch = { |
| |
| .devaddr_ttcks_header = impl->vk.ttcks.devaddr |
| }; |
| |
| vkCmdPushConstants(cb, |
| device->ti.pipeline_layouts.named.render_dispatch, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, |
| sizeof(push_render_dispatch), |
| &push_render_dispatch); |
| |
| vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.render_dispatch); |
| |
| vkCmdDispatch(cb, 1, 1, 1); |
| } |
| |
| // |
| // 1. Initialize RENDER push constants with composition bufrefs |
| // 2. Record composition-driven indirect dispatch command |
| // |
| void |
| spinel_composition_push_render_init_record(struct spinel_composition * composition, |
| struct spinel_push_render * push_render, |
| VkCommandBuffer cb) |
| { |
| struct spinel_composition_impl * const impl = composition->impl; |
| struct spinel_device * const device = impl->device; |
| |
| push_render->devaddr_ttcks_header = impl->vk.ttcks.devaddr; |
| push_render->devaddr_ttck_keyvals = impl->vk.ttck_keyvals_out.devaddr; |
| |
| vkCmdPushConstants(cb, |
| device->ti.pipeline_layouts.named.render, |
| VK_SHADER_STAGE_COMPUTE_BIT, |
| 0, |
| sizeof(*push_render), |
| push_render); |
| |
| vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, device->ti.pipelines.named.render); |
| |
| VkDeviceSize const ttcks_offset_render_dispatch = SPN_BUFFER_OFFSETOF(ttcks, render_dispatch); |
| |
| vkCmdDispatchIndirect(cb, impl->vk.ttcks.dbi_dm.dbi.buffer, ttcks_offset_render_dispatch); |
| } |
| |
| // |
| // |
| // |