// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include "composition_impl.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "block_pool.h"
#include "common/vk/assert.h"
#include "common/vk/barrier.h"
#include "core_vk.h"
#include "device.h"
#include "dispatch.h"
#include "handle_pool.h"
#include "hotsort/platforms/vk/hotsort_vk.h"
#include "queue_pool.h"
#include "raster_builder_impl.h"
#include "ring.h"
#include "spinel_assert.h"
#include "state_assert.h"
#include "status.h"
#include "vk_target.h"
//
// composition states
//
typedef enum spn_ci_state_e
{
SPN_CI_STATE_RESET, // unsealed and was reset
SPN_CI_STATE_RESETTING, // unsealed and resetting
SPN_CI_STATE_UNSEALED, // ready to place rasters
SPN_CI_STATE_SEALING, // waiting for PLACE and TTCK_SORT
SPN_CI_STATE_SEALED // sort & segment complete
} spn_ci_state_e;
//
// The composition launches a number of dependent command buffers:
//
// 1. reset TTCK atomic count
// 2. PLACE shaders -- happens-after (1)
// 3. COPYBACK -- happens-after (2)
// 4. SORT -- happens-after (3)
//
//
// FIXME(allanmac): The scheduling logic has changed.
//
// There are always as many dispatch records as there are fences in
// the fence pool. This simplifies reasoning about concurrency.
//
// The dispatch record in the composition tracks resources associated
// with wip and in-flight PLACE submissions.
//
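// PLACING: the dispatch is being recorded or is in flight
// PLACED: the dispatch's completion callback has fired
//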
typedef enum spn_ci_dispatch_state_e
{
SPN_CI_DISPATCH_STATE_PLACING,
SPN_CI_DISPATCH_STATE_PLACED
} spn_ci_dispatch_state_e;
//
//
//
struct spn_ci_dispatch
{
struct
{
uint32_t head;
uint32_t span;
} cp; // place commands
struct
{
uint32_t head;
} rd; // raster handles are 1:1 with place commands
spn_ci_dispatch_state_e state;
bool complete;
spn_dispatch_id_t id;
};
//
//
//
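// Vulkan allocations: host and device place command rings (aliased
// when the ring isn't staged), the TTCKS extent, and a tiny
// host-readable copyback buffer.
//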
struct spn_ci_vk
{
struct
{
struct
{
VkDescriptorBufferInfo dbi;
VkDeviceMemory dm;
} h;
struct
{
VkDescriptorBufferInfo dbi;
VkDeviceMemory dm;
} d;
} rings;
struct
{
VkDescriptorBufferInfo dbi;
VkDeviceMemory dm;
} ttcks;
struct
{
VkDescriptorBufferInfo dbi;
VkDeviceMemory dm;
} copyback;
};
//
// This structure *partially* matches `struct spn_vk_buf_ttcks_ttcks`
//
// FIXME(allanmac): hoist this so that we always have a compatible C and
// GLSL structure instead of partially redefining it here.
//
struct spn_ci_copyback
{
uint32_t ttcks_count[4]; // only first dword is used
#ifndef NDEBUG
uint32_t offsets_count[4]; // first 3 dwords are used
#endif
};
//
//
//
struct spn_composition_impl
{
struct spn_composition * composition;
struct spn_device * device;
struct spn_vk_target_config const * config; // FIXME(allanmac): we don't need to duplicate this
struct spn_ci_vk vk;
//
// composition clip
//
struct spn_ivec4 clip;
//
// mapped command ring and copyback counts
//
struct
{
struct
{
struct spn_cmd_place * extent;
struct spn_ring ring;
} cp; // place commands
struct
{
struct spn_ci_copyback * extent;
} cb;
} mapped;
//
// records of work-in-progress and work-in-flight
//
struct
{
struct spn_ci_dispatch * extent;
struct spn_ring ring;
} dispatches;
//
// all rasters are retained until reset or release
//
struct
{
spn_handle_t * extent;
uint32_t size;
uint32_t count;
} rasters;
uint32_t lock_count; // # of wip renders
SPN_ASSERT_STATE_DECLARE(spn_ci_state_e);
//
// dispatch ids
//
spn_dispatch_id_t id_sealing;
spn_dispatch_id_t id_resetting;
};
//
//
//
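// The place command ring is staged through a separate device-local
// ring when the host-visible ring memory isn't device-local and the
// target hasn't disabled staging.
//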
static bool
spn_ci_is_staged(struct spn_vk_target_config const * const config)
{
return ((config->allocator.device.hw_dr.properties & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == 0) &&
(config->composition.no_staging == 0);
}
//
// A dispatch captures the span of place commands and the raster
// handles associated with a dispatched or work-in-progress compute
// grid.
//
static struct spn_ci_dispatch *
spn_ci_dispatch_idx(struct spn_composition_impl * const impl, uint32_t const idx)
{
return impl->dispatches.extent + idx;
}
static struct spn_ci_dispatch *
spn_ci_dispatch_head(struct spn_composition_impl * const impl)
{
return spn_ci_dispatch_idx(impl, impl->dispatches.ring.head);
}
static struct spn_ci_dispatch *
spn_ci_dispatch_tail(struct spn_composition_impl * const impl)
{
return spn_ci_dispatch_idx(impl, impl->dispatches.ring.tail);
}
static bool
spn_ci_dispatch_is_empty(struct spn_ci_dispatch const * const dispatch)
{
return dispatch->cp.span == 0;
}
static void
spn_ci_dispatch_init(struct spn_composition_impl * const impl,
struct spn_ci_dispatch * const dispatch)
{
dispatch->cp.head = impl->mapped.cp.ring.head;
dispatch->cp.span = 0;
dispatch->rd.head = impl->rasters.count;
dispatch->state = SPN_CI_DISPATCH_STATE_PLACING;
dispatch->complete = false;
spn(device_dispatch_acquire(impl->device, SPN_DISPATCH_STAGE_COMPOSITION_PLACE, &dispatch->id));
}
static void
spn_ci_dispatch_drop(struct spn_composition_impl * const impl)
{
struct spn_ring * const ring = &impl->dispatches.ring;
spn_ring_drop_1(ring);
}
static void
spn_ci_dispatch_acquire(struct spn_composition_impl * const impl)
{
struct spn_ring * const ring = &impl->dispatches.ring;
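// spin until a previously dispatched PLACE completes and releases a
// dispatch record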
while (spn_ring_is_empty(ring))
{
spn(device_wait(impl->device, __func__));
}
struct spn_ci_dispatch * const dispatch = spn_ci_dispatch_idx(impl, ring->head);
spn_ci_dispatch_init(impl, dispatch);
}
//
// Wait on all in-flight PLACE before SEALING_1
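//
// Walk the dropped (in-flight) dispatch records from the tail and make
// `id` happen-after any dispatch still in the PLACING state.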
//
static void
spn_ci_dispatch_wait(struct spn_composition_impl * const impl, spn_dispatch_id_t const id)
{
struct spn_ring * const ring = &impl->dispatches.ring;
uint32_t in_flight = spn_ring_dropped(ring);
// anything to do?
if (in_flight == 0)
return;
uint32_t tail = ring->tail;
uint32_t const size = ring->size;
struct spn_ci_dispatch const * const dispatches = impl->dispatches.extent;
struct spn_device * const device = impl->device;
for (uint32_t ii = 0; ii < in_flight; ii++)
{
struct spn_ci_dispatch const * const dispatch = dispatches + tail++;
if (dispatch->state == SPN_CI_DISPATCH_STATE_PLACING)
{
spn_device_dispatch_happens_after(device, id, dispatch->id);
}
if (tail == size)
{
tail = 0;
}
}
}
//
// COMPLETION: PLACE
//
struct spn_ci_complete_payload_place
{
struct spn_composition_impl * impl;
struct
{
struct spn_vk_ds_ttcks_t ttcks;
struct spn_vk_ds_place_t place;
} ds;
uint32_t dispatch_idx; // index of this dispatch in the dispatches ring
};
//
//
//
static void
spn_ci_complete_place(void * pfn_payload)
{
struct spn_ci_complete_payload_place const * const payload = pfn_payload;
struct spn_composition_impl * const impl = payload->impl;
struct spn_device * const device = impl->device;
struct spn_vk * const instance = device->instance;
// release descriptor sets
spn_vk_ds_release_ttcks(instance, payload->ds.ttcks);
spn_vk_ds_release_place(instance, payload->ds.place);
//
// If the dispatch is the tail of the ring then try to release as
// many dispatch records as possible...
//
// Note that dispatches can complete in any order, so releases must
// be applied to the mapped ring's tail in submission order.
//
uint32_t const dispatch_idx = payload->dispatch_idx;
struct spn_ci_dispatch * dispatch = spn_ci_dispatch_idx(impl, dispatch_idx);
dispatch->state = SPN_CI_DISPATCH_STATE_PLACED;
if (spn_ring_is_tail(&impl->dispatches.ring, dispatch_idx))
{
while (true)
{
spn_ring_release_n(&impl->mapped.cp.ring, dispatch->cp.span);
spn_ring_release_n(&impl->dispatches.ring, 1);
// any dispatches in flight?
if (spn_ring_is_full(&impl->dispatches.ring))
break;
dispatch = spn_ci_dispatch_tail(impl);
if (!dispatch->complete)
break;
}
}
else
{
dispatch->complete = true;
}
}
//
//
//
static void
spn_ci_flush(struct spn_composition_impl * const impl)
{
struct spn_ci_dispatch * const dispatch = spn_ci_dispatch_head(impl);
// is this a dispatch with no commands?
if (spn_ci_dispatch_is_empty(dispatch))
return;
//
// We're go for launch...
//
struct spn_device * const device = impl->device;
//
// get the cb associated with the wip dispatch
//
VkCommandBuffer cb = spn_device_dispatch_get_cb(device, dispatch->id);
//
// COPY COMMANDS
//
// If the place command ring is staged, copy the host ring to the
// device ring.
//
if (spn_ci_is_staged(impl->config))
{
VkDeviceSize const head_offset = dispatch->cp.head * sizeof(struct spn_cmd_place);
if (dispatch->cp.head + dispatch->cp.span <= impl->mapped.cp.ring.size)
{
VkBufferCopy bcs[1];
bcs[0].srcOffset = impl->vk.rings.h.dbi.offset + head_offset;
bcs[0].dstOffset = impl->vk.rings.d.dbi.offset + head_offset;
bcs[0].size = dispatch->cp.span * sizeof(struct spn_cmd_place);
vkCmdCopyBuffer(cb, impl->vk.rings.h.dbi.buffer, impl->vk.rings.d.dbi.buffer, 1, bcs);
}
else // wraps around ring
{
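// e.g. with ring.size = 8, head = 6 and span = 5: copy 2 commands
// from [6,8) and 3 commands from [0,3)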
VkBufferCopy bcs[2];
uint32_t const hi = impl->mapped.cp.ring.size - dispatch->cp.head;
bcs[0].srcOffset = impl->vk.rings.h.dbi.offset + head_offset;
bcs[0].dstOffset = impl->vk.rings.d.dbi.offset + head_offset;
bcs[0].size = hi * sizeof(struct spn_cmd_place);
uint32_t const lo = dispatch->cp.head + dispatch->cp.span - impl->mapped.cp.ring.size;
bcs[1].srcOffset = impl->vk.rings.h.dbi.offset;
bcs[1].dstOffset = impl->vk.rings.d.dbi.offset;
bcs[1].size = lo * sizeof(struct spn_cmd_place);
vkCmdCopyBuffer(cb, impl->vk.rings.h.dbi.buffer, impl->vk.rings.d.dbi.buffer, 2, bcs);
}
vk_barrier_transfer_w_to_compute_r(cb);
}
//
// DS: BLOCK POOL
//
struct spn_vk * const instance = device->instance;
spn_vk_ds_bind_place_ttpk_block_pool(instance, cb, spn_device_block_pool_get_ds(device));
//
// DS: TTCKS
//
// acquire TTCKS descriptor set
struct spn_vk_ds_ttcks_t ds_ttcks;
spn_vk_ds_acquire_ttcks(instance, device, &ds_ttcks);
// copy the dbi structs
*spn_vk_ds_get_ttcks_ttcks(instance, ds_ttcks) = impl->vk.ttcks.dbi;
// update TTCKS descriptor set
spn_vk_ds_update_ttcks(instance, &device->environment, ds_ttcks);
// bind the TTCKS descriptor set
spn_vk_ds_bind_place_ttpk_ttcks(instance, cb, ds_ttcks);
//
// DS: PLACE
//
// acquire PLACE descriptor set
struct spn_vk_ds_place_t ds_place;
spn_vk_ds_acquire_place(instance, device, &ds_place);
// copy the dbi struct
*spn_vk_ds_get_place_place(instance, ds_place) = impl->vk.rings.d.dbi;
// update PLACE descriptor set
spn_vk_ds_update_place(instance, &device->environment, ds_place);
// bind PLACE descriptor set
spn_vk_ds_bind_place_ttpk_place(instance, cb, ds_place);
//
// set a completion payload
//
struct spn_ci_complete_payload_place * const payload =
spn_device_dispatch_set_completion(device,
dispatch->id,
spn_ci_complete_place,
sizeof(*payload));
payload->impl = impl;
payload->ds.ttcks = ds_ttcks;
payload->ds.place = ds_place;
payload->dispatch_idx = impl->dispatches.ring.head;
//
// PIPELINE: PLACE
//
// Set up push constants -- note that the PLACE_TTPK and PLACE_TTSK
// pipelines share the same push constant layout.
//
// This means we can push the constants once for both dispatches.
//
struct spn_vk_push_place_ttpk const push = {
.place_clip = impl->clip,
.place_head = dispatch->cp.head,
.place_span = dispatch->cp.span,
.place_size = impl->mapped.cp.ring.size
};
spn_vk_p_push_place_ttpk(instance, cb, &push);
// dispatch one subgroup per command -- place_ttpk and place_ttsk use the same group sizes
uint32_t const place_wg_size = impl->config->p.group_sizes.named.place_ttpk.workgroup;
uint32_t const place_sg_size_log2 = impl->config->p.group_sizes.named.place_ttpk.subgroup_log2;
uint32_t const place_cmds_per_wg = place_wg_size >> place_sg_size_log2;
uint32_t const place_wgs = (dispatch->cp.span + place_cmds_per_wg - 1) / place_cmds_per_wg;
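// e.g. a 256-wide workgroup with 32-wide subgroups covers 8 commands
// per workgroup, so a span of 100 commands rounds up to 13 workgroups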
// bind PLACE_TTPK
spn_vk_p_bind_place_ttpk(instance, cb);
// dispatch PLACE_TTPK
vkCmdDispatch(cb, place_wgs, 1, 1);
// bind PLACE_TTSK
spn_vk_p_bind_place_ttsk(instance, cb);
// dispatch PLACE_TTSK
vkCmdDispatch(cb, place_wgs, 1, 1);
//
// Wait for reset
//
if (impl->state == SPN_CI_STATE_RESETTING)
{
spn_device_dispatch_happens_after(device, dispatch->id, impl->id_resetting);
}
//
// the current dispatch is now sealed so drop it
//
spn_ci_dispatch_drop(impl);
//
// wait for rasters associated with this dispatch to materialize
//
spn_device_dispatch_happens_after_handles_and_submit(device,
(spn_dispatch_flush_pfn_t)spn_rbi_flush,
dispatch->id,
impl->rasters.extent + dispatch->rd.head,
UINT32_MAX,
0,
dispatch->cp.span);
//
// acquire and initialize the next dispatch
//
spn_ci_dispatch_acquire(impl);
}
//
// COMPLETION: SEALING
//
// PHASE 1: COPYBACK
// PHASE 2: SORT & SEGMENT
//
// The same payload is used for both phases
//
struct spn_ci_complete_payload_sealing
{
struct spn_composition_impl * impl;
struct
{
struct spn_vk_ds_ttcks_t ttcks;
} ds;
};
//
//
//
static void
spn_ci_complete_sealing_2(void * pfn_payload)
{
struct spn_ci_complete_payload_sealing const * const payload = pfn_payload;
struct spn_composition_impl * const impl = payload->impl;
struct spn_device * const device = impl->device;
struct spn_vk * const instance = device->instance;
// release the ttcks ds -- will never wait()
spn_vk_ds_release_ttcks(instance, payload->ds.ttcks);
// move to sealed state
impl->state = SPN_CI_STATE_SEALED;
}
//
//
//
static void
spn_ci_complete_sealing_1(void * pfn_payload)
{
struct spn_ci_complete_payload_sealing const * const payload = pfn_payload;
struct spn_composition_impl * const impl = payload->impl;
struct spn_device * const device = impl->device;
struct spn_vk * const instance = device->instance;
//
// duplicate the completion payload
//
struct spn_ci_complete_payload_sealing * const payload_copy =
spn_device_dispatch_set_completion(device,
impl->id_sealing,
spn_ci_complete_sealing_2,
sizeof(*payload_copy));
*payload_copy = *payload;
//
// invalidate if necessary
//
if ((impl->config->allocator.device.hr_dw.properties & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) == 0)
{
VkMappedMemoryRange const mmr = { .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
.pNext = NULL,
.memory = impl->vk.copyback.dm,
.offset = 0,
.size = VK_WHOLE_SIZE };
vk(InvalidateMappedMemoryRanges(device->environment.d, 1, &mmr));
}
//
// get a cb
//
VkCommandBuffer cb = spn_device_dispatch_get_cb(device, impl->id_sealing);
//
// DEBUG ONLY
//
// This DS only needs to be bound if we're debugging
//
#ifdef SPN_BP_DEBUG
//
// BLOCK POOL
//
// bind global BLOCK_POOL descriptor set
spn_vk_ds_bind_segment_ttck_block_pool(instance, cb, spn_device_block_pool_get_ds(device));
#endif
//
// DS: TTCKS
//
spn_vk_ds_bind_segment_ttck_ttcks(instance, cb, payload->ds.ttcks);
////////////////////////////////////////////////////////////////
//
// HOTSORT
//
////////////////////////////////////////////////////////////////
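//
// Pad the TTCK key count up to HotSort's slab granularity -- the
// padded input/output sizes are what hotsort_vk_sort() expects.
//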
uint32_t const keys_count = impl->mapped.cb.extent->ttcks_count[0];
uint32_t slabs_in;
uint32_t padded_in;
uint32_t padded_out;
hotsort_vk_pad(device->hs, keys_count, &slabs_in, &padded_in, &padded_out);
struct hotsort_vk_ds_offsets const keys_offsets = {
.in = SPN_VK_BUFFER_OFFSETOF(ttcks, ttcks, ttcks_keys),
.out = SPN_VK_BUFFER_OFFSETOF(ttcks, ttcks, ttcks_keys)
};
#if !defined(NDEBUG) && 0
fprintf(stderr,
"Composition:\n"
" keys_count = %u\n"
" slabs_in = %u\n"
" padded_in = %u\n"
" padded_out = %u\n"
" keys_offsets.in = %zu\n"
" keys_offsets.out = %zu\n",
keys_count,
slabs_in,
padded_in,
padded_out,
keys_offsets.in,
keys_offsets.out);
#endif
hotsort_vk_sort(cb, device->hs, &keys_offsets, keys_count, padded_in, padded_out, false);
////////////////////////////////////////////////////////////////
//
// PIPELINE: SEGMENT_TTCK
//
////////////////////////////////////////////////////////////////
//
// TODO(allanmac): evaluate whether or not to remove this conditional
// once fxb:50840 is resolved.
//
if (slabs_in > 0)
{
//
vk_barrier_compute_w_to_compute_r(cb);
// bind the pipeline
spn_vk_p_bind_segment_ttck(instance, cb);
// dispatch one workgroup per slab
vkCmdDispatch(cb, slabs_in, 1, 1);
}
//
// Note that the render stage later dispatches *indirectly* off of the output of SEGMENT_TTCK
//
//
// submit the dispatch
//
spn_device_dispatch_submit(device, impl->id_sealing);
}
//
//
//
static void
spn_ci_unsealed_to_sealing(struct spn_composition_impl * const impl)
{
//
// update the state
//
impl->state = SPN_CI_STATE_SEALING;
//
// acquire the sealing dispatch id ahead of time
//
struct spn_device * const device = impl->device;
spn(device_dispatch_acquire(device, SPN_DISPATCH_STAGE_COMPOSITION_SEAL_2, &impl->id_sealing));
//
// flush the current dispatch
//
spn_ci_flush(impl);
struct spn_vk * const instance = device->instance;
//
// acquire a dispatch to kick off phase 1 of sealing
//
spn_dispatch_id_t id;
spn(device_dispatch_acquire(device, SPN_DISPATCH_STAGE_COMPOSITION_SEAL_1, &id));
//
// wait on any in-flight PLACE dispatches
//
spn_ci_dispatch_wait(impl, id);
// get a cb
VkCommandBuffer cb = spn_device_dispatch_get_cb(device, id);
//
// set a completion payload
//
struct spn_ci_complete_payload_sealing * const payload_sealing =
spn_device_dispatch_set_completion(device,
id,
spn_ci_complete_sealing_1,
sizeof(*payload_sealing));
payload_sealing->impl = impl;
//
// DS: TTCKS
//
// FIXME(allanmac): do we need to acquire this DS here and so early?
//
// acquire TTCKS descriptor set
spn_vk_ds_acquire_ttcks(instance, device, &payload_sealing->ds.ttcks);
// copy the dbi structs
*spn_vk_ds_get_ttcks_ttcks(instance, payload_sealing->ds.ttcks) = impl->vk.ttcks.dbi;
// update TTCKS descriptor set
spn_vk_ds_update_ttcks(instance, &device->environment, payload_sealing->ds.ttcks);
//
// INITIALIZE DISPATCH INDIRECT BUFFER
//
// FIXME(allanmac): This could be done much earlier but it probably doesn't
// matter. Evaluate once we can measure and visualize queue submissions.
//
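// Seed the indirect dispatch args stored in offsets_count with
// { x = 0, y = 1, z = 1 } plus a trailing dword -- the render later
// dispatches indirectly off of this field (see
// spn_composition_pre_render_dispatch_indirect()) once the x
// dimension has been updated on the device.
//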
VkDeviceSize const dbi_ttcks_offset = impl->vk.ttcks.dbi.offset;
uint32_t const dispatch_indirect[] = { 0, 1, 1, 0 };
vkCmdUpdateBuffer(cb,
impl->vk.ttcks.dbi.buffer,
dbi_ttcks_offset + SPN_VK_BUFFER_OFFSETOF(ttcks, ttcks, offsets_count),
sizeof(dispatch_indirect),
dispatch_indirect);
//
// COPYBACK TTCKS_COUNT
//
VkDeviceSize const dbi_copyback_offset = impl->vk.copyback.dbi.offset;
VkBufferCopy const bc = {
.srcOffset = dbi_ttcks_offset + SPN_VK_BUFFER_OFFSETOF(ttcks, ttcks, ttcks_count),
.dstOffset = dbi_copyback_offset + OFFSETOF_MACRO(struct spn_ci_copyback, ttcks_count),
.size = sizeof(impl->mapped.cb.extent->ttcks_count)
};
// copyback the key count
vkCmdCopyBuffer(cb, impl->vk.ttcks.dbi.buffer, impl->vk.copyback.dbi.buffer, 1, &bc);
//
// make the copyback visible to the host
//
vk_barrier_transfer_w_to_host_r(cb);
//
// submit the dispatch
//
spn_device_dispatch_submit(device, id);
}
//
//
//
struct spn_ci_complete_reset_payload
{
struct spn_composition_impl * impl;
};
static void
spn_ci_complete_reset(void * pfn_payload)
{
struct spn_ci_complete_reset_payload const * const payload = pfn_payload;
struct spn_composition_impl * const impl = payload->impl;
if (impl->rasters.count > 0)
{
//
// release any retained rasters
//
spn_device_handle_pool_release_d_rasters(impl->device,
impl->rasters.extent,
impl->rasters.count);
//
// zero the count
//
impl->rasters.count = 0;
//
// reset the WIP dispatch
//
struct spn_ci_dispatch * const dispatch = spn_ci_dispatch_head(impl);
dispatch->cp.span = 0;
dispatch->rd.head = 0;
}
//
// move to RESET state
//
payload->impl->state = SPN_CI_STATE_RESET;
}
//
//
//
static void
spn_ci_unsealed_reset(struct spn_composition_impl * const impl)
{
//
// kick off a zeroing fill of the TTCK count
//
impl->state = SPN_CI_STATE_RESETTING;
//
// acquire a dispatch
//
struct spn_device * const device = impl->device;
spn(device_dispatch_acquire(device, SPN_DISPATCH_STAGE_COMPOSITION_RESET, &impl->id_resetting));
// get a cb
VkCommandBuffer cb = spn_device_dispatch_get_cb(device, impl->id_resetting);
//
// zero ttcks_count
//
VkDeviceSize const dbi_src_offset = impl->vk.ttcks.dbi.offset;
vkCmdFillBuffer(cb,
impl->vk.ttcks.dbi.buffer,
dbi_src_offset + SPN_VK_BUFFER_OFFSETOF(ttcks, ttcks, ttcks_count),
SPN_VK_BUFFER_MEMBER_SIZE(ttcks, ttcks, ttcks_count),
0);
//
// set a completion payload
//
struct spn_ci_complete_reset_payload * const payload =
spn_device_dispatch_set_completion(device,
impl->id_resetting,
spn_ci_complete_reset,
sizeof(*payload));
payload->impl = impl;
//
// submit the dispatch
//
spn_device_dispatch_submit(device, impl->id_resetting);
}
//
//
//
static void
spn_ci_block_until_sealed(struct spn_composition_impl * const impl)
{
struct spn_device * const device = impl->device;
while (impl->state != SPN_CI_STATE_SEALED)
{
spn(device_wait(device, __func__));
}
}
static void
spn_ci_sealed_unseal(struct spn_composition_impl * const impl)
{
//
// wait for any in-flight renders to complete
//
struct spn_device * const device = impl->device;
while (impl->lock_count > 0)
{
spn(device_wait(device, __func__));
}
impl->state = SPN_CI_STATE_UNSEALED;
}
//
// FIXME(allanmac): add UNSEALING state
//
static spn_result_t
spn_ci_seal(struct spn_composition_impl * const impl)
{
switch (impl->state)
{
case SPN_CI_STATE_RESET:
case SPN_CI_STATE_RESETTING:
case SPN_CI_STATE_UNSEALED:
spn_ci_unsealed_to_sealing(impl);
return SPN_SUCCESS;
case SPN_CI_STATE_SEALING:
return SPN_SUCCESS;
case SPN_CI_STATE_SEALED:
// default:
return SPN_SUCCESS;
}
}
static spn_result_t
spn_ci_unseal(struct spn_composition_impl * const impl)
{
switch (impl->state)
{
case SPN_CI_STATE_RESET:
case SPN_CI_STATE_RESETTING:
case SPN_CI_STATE_UNSEALED:
return SPN_SUCCESS;
case SPN_CI_STATE_SEALING:
spn_ci_block_until_sealed(impl);
// [[fallthrough]];
case SPN_CI_STATE_SEALED:
// default:
spn_ci_sealed_unseal(impl);
return SPN_SUCCESS;
}
}
static spn_result_t
spn_ci_reset(struct spn_composition_impl * const impl)
{
switch (impl->state)
{
case SPN_CI_STATE_RESET:
case SPN_CI_STATE_RESETTING:
return SPN_SUCCESS;
case SPN_CI_STATE_UNSEALED:
spn_ci_unsealed_reset(impl);
return SPN_SUCCESS;
case SPN_CI_STATE_SEALING:
return SPN_ERROR_COMPOSITION_SEALED;
case SPN_CI_STATE_SEALED:
// default:
return SPN_ERROR_COMPOSITION_SEALED;
}
}
//
//
//
static spn_result_t
spn_ci_clone(struct spn_composition_impl * const impl, struct spn_composition ** const clone)
{
return SPN_ERROR_NOT_IMPLEMENTED;
}
//
//
//
static spn_result_t
spn_ci_get_bounds(struct spn_composition_impl * const impl, uint32_t bounds[4])
{
return SPN_ERROR_NOT_IMPLEMENTED;
}
//
// Initialize clip to max tile clip for the target
//
static void
spn_ci_get_max_clip(struct spn_composition_impl * const impl, struct spn_ivec4 * const clip)
{
*clip = (struct spn_ivec4){ .x = 0,
.y = 0,
.z = 1 << SPN_TTCK_HI_BITS_X,
.w = 1 << SPN_TTCK_HI_BITS_Y };
}
//
//
//
static spn_result_t
spn_ci_set_clip(struct spn_composition_impl * const impl, uint32_t const clip[4])
{
switch (impl->state)
{
case SPN_CI_STATE_RESET:
break;
case SPN_CI_STATE_RESETTING:
do
{
spn(device_wait(impl->device, __func__));
}
while (impl->state == SPN_CI_STATE_RESETTING);
break;
case SPN_CI_STATE_UNSEALED:
break;
case SPN_CI_STATE_SEALING:
case SPN_CI_STATE_SEALED:
default:
return SPN_ERROR_COMPOSITION_SEALED;
}
//
// convert pixel clip coords to tile coords
//
// FIXME(allanmac): use the signed SIMD4 trick
//
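// e.g. with 16x16 tiles (width_log2 = height_log2 = 4), a pixel clip
// of { 0, 0, 100, 100 } maps to a tile clip of { 0, 0, 7, 7 }
//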
uint32_t const tile_w = 1 << impl->config->tile.width_log2;
uint32_t const tile_h = 1 << impl->config->tile.height_log2;
uint32_t const surf_w_max = tile_w << SPN_TTCK_HI_BITS_X;
uint32_t const surf_h_max = tile_h << SPN_TTCK_HI_BITS_Y;
struct spn_uvec4 const tile_clip = {
clip[0] >> impl->config->tile.width_log2,
clip[1] >> impl->config->tile.height_log2,
(MIN_MACRO(uint32_t, clip[2], surf_w_max) + tile_w - 1) >> impl->config->tile.width_log2,
(MIN_MACRO(uint32_t, clip[3], surf_h_max) + tile_h - 1) >> impl->config->tile.height_log2
};
// clip to max
impl->clip.x = MIN_MACRO(uint32_t, tile_clip.x, 1 << SPN_TTCK_HI_BITS_X);
impl->clip.y = MIN_MACRO(uint32_t, tile_clip.y, 1 << SPN_TTCK_HI_BITS_Y);
impl->clip.z = MIN_MACRO(uint32_t, tile_clip.z, 1 << SPN_TTCK_HI_BITS_X);
impl->clip.w = MIN_MACRO(uint32_t, tile_clip.w, 1 << SPN_TTCK_HI_BITS_Y);
return SPN_SUCCESS;
}
//
//
//
static spn_result_t
spn_ci_place(struct spn_composition_impl * const impl,
spn_raster_t const * rasters,
spn_layer_id const * layer_ids,
spn_txty_t const * txtys,
uint32_t count)
{
switch (impl->state)
{
case SPN_CI_STATE_RESET:
break;
case SPN_CI_STATE_RESETTING:
do
{
spn(device_wait(impl->device, __func__));
}
while (impl->state == SPN_CI_STATE_RESETTING);
break;
case SPN_CI_STATE_UNSEALED:
break;
case SPN_CI_STATE_SEALING:
case SPN_CI_STATE_SEALED:
default:
return SPN_ERROR_COMPOSITION_SEALED;
}
// nothing to do?
if (count == 0)
{
return SPN_SUCCESS;
}
// validate there is enough room for retained rasters
if (impl->rasters.count + count > impl->rasters.size)
{
return SPN_ERROR_COMPOSITION_TOO_MANY_RASTERS;
}
#if 0
//
// FIXME -- No, we should NEVER need to do this. The layer invoking
// Spinel should ensure that layer ids remain in range. Do not
// enable this.
//
// validate layer ids
//
for (uint32_t ii=0; ii<count; ii++) {
if (layer_ids[ii] > SPN_TTCK_LAYER_MAX) {
return SPN_ERROR_LAYER_ID_INVALID;
}
}
#endif
//
// validate first and then retain the rasters before we proceed
//
struct spn_device * const device = impl->device;
spn_result_t result;
result = spn_device_handle_pool_validate_d_rasters(device, rasters, count);
if (result != SPN_SUCCESS)
return result;
spn_device_handle_pool_retain_d_rasters(device, rasters, count);
//
// No survivable errors from here onward... any failure beyond here
// will be fatal to the context -- most likely too many ttcks.
//
//
// block if resetting...
//
while (impl->state == SPN_CI_STATE_RESETTING)
{
// FIXME(allanmac): wait on resetting id
spn(device_wait(device, "spn_ci_place: (impl->state == SPN_CI_STATE_RESETTING)"));
}
//
// save the untyped raster handles
//
spn_handle_t * const saved = impl->rasters.extent + impl->rasters.count;
impl->rasters.count += count;
for (uint32_t ii = 0; ii < count; ii++)
{
saved[ii] = rasters[ii].handle;
}
//
// copy place commands into the ring
//
struct spn_ring * const ring = &impl->mapped.cp.ring;
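// Copy commands in contiguous chunks bounded by the ring's unwrapped
// headroom, waiting for an in-flight dispatch to retire whenever the
// ring has no room.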
while (true)
{
//
// how many slots left in ring?
//
uint32_t const head_nowrap = spn_ring_head_nowrap(ring);
uint32_t avail = MIN_MACRO(uint32_t, count, head_nowrap);
//
// if there's no room in the ring then dispatches must already be in
// flight because an eager flush would've occurred
//
if (avail == 0)
{
spn(device_wait(device, "spn_ci_place: (avail == 0)"));
continue;
}
//
// increment dispatch span
//
struct spn_ci_dispatch * const dispatch = spn_ci_dispatch_head(impl);
dispatch->cp.span += avail;
//
// append commands
//
struct spn_cmd_place * cmds = impl->mapped.cp.extent + ring->head;
spn_ring_drop_n(ring, avail);
count -= avail;
if (txtys == NULL)
{
while (avail-- > 0)
{
cmds->raster_h = rasters->handle;
cmds->layer_id = *layer_ids;
cmds->txty[0] = 0;
cmds->txty[1] = 0;
++rasters;
++layer_ids;
++cmds;
}
}
else
{
while (avail-- > 0)
{
cmds->raster_h = rasters->handle;
cmds->layer_id = *layer_ids;
cmds->txty[0] = txtys->tx;
cmds->txty[1] = txtys->ty;
++rasters;
++layer_ids;
++txtys;
++cmds;
}
}
//
// launch place kernel?
//
if (dispatch->cp.span >= impl->config->composition.size.eager)
{
spn_ci_flush(impl);
}
//
// anything left?
//
if (count == 0)
{
return SPN_SUCCESS;
}
}
}
//
//
//
static spn_result_t
spn_ci_release(struct spn_composition_impl * const impl)
{
//
// was this the last reference?
//
if (--impl->composition->ref_count != 0)
{
return SPN_SUCCESS;
}
struct spn_device * const device = impl->device;
//
// wait for any in-flight PLACE dispatches to complete
//
while (!spn_ring_is_full(&impl->dispatches.ring))
{
spn(device_wait(device, "spn_ci_release: (!spn_ring_is_full(&impl->dispatches.ring))"));
}
//
// wait for any in-flight renders to complete
//
while (impl->lock_count > 0)
{
spn(device_wait(device, "spn_ci_release: (impl->lock_count > 0)"));
}
//
// release any retained rasters
//
if (impl->rasters.count > 0)
{
spn_device_handle_pool_release_d_rasters(impl->device,
impl->rasters.extent,
impl->rasters.count);
}
//
// note that we don't have to unmap before freeing
//
//
// free copyback
//
spn_allocator_device_perm_free(&device->allocator.device.perm.hr_dw,
&device->environment,
&impl->vk.copyback.dbi,
impl->vk.copyback.dm);
//
// free ttcks
//
spn_allocator_device_perm_free(&device->allocator.device.perm.drw,
&device->environment,
&impl->vk.ttcks.dbi,
impl->vk.ttcks.dm);
//
// free ring
//
if (spn_ci_is_staged(impl->config))
{
spn_allocator_device_perm_free(&device->allocator.device.perm.drw,
&device->environment,
&impl->vk.rings.d.dbi,
impl->vk.rings.d.dm);
}
spn_allocator_device_perm_free(&device->allocator.device.perm.hw_dr,
&device->environment,
&impl->vk.rings.h.dbi,
impl->vk.rings.h.dm);
//
// free host allocations
//
struct spn_allocator_host_perm * const perm = &impl->device->allocator.host.perm;
spn_allocator_host_perm_free(perm, impl->rasters.extent);
spn_allocator_host_perm_free(perm, impl->dispatches.extent);
spn_allocator_host_perm_free(perm, impl->composition);
spn_allocator_host_perm_free(perm, impl);
return SPN_SUCCESS;
}
//
//
//
spn_result_t
spn_composition_impl_create(struct spn_device * const device,
struct spn_composition ** const composition)
{
//
// FIXME(allanmac): retain the context
//
// spn_context_retain(context);
//
struct spn_allocator_host_perm * const perm = &device->allocator.host.perm;
//
// allocate impl
//
struct spn_composition_impl * const impl =
spn_allocator_host_perm_alloc(perm, SPN_MEM_FLAGS_READ_WRITE, sizeof(*impl));
//
// allocate composition
//
struct spn_composition * const c =
spn_allocator_host_perm_alloc(perm, SPN_MEM_FLAGS_READ_WRITE, sizeof(*c));
// init impl and composition back-pointers
*composition = c;
impl->composition = c;
c->impl = impl;
// save device
impl->device = device;
struct spn_vk_target_config const * const config = spn_vk_get_config(device->instance);
impl->config = config;
impl->lock_count = 0;
// the composition impl starts out unsealed
SPN_ASSERT_STATE_INIT(SPN_CI_STATE_UNSEALED, impl);
//
// initialize composition
//
c->release = spn_ci_release;
c->place = spn_ci_place;
c->seal = spn_ci_seal;
c->unseal = spn_ci_unseal;
c->reset = spn_ci_reset;
c->clone = spn_ci_clone;
c->get_bounds = spn_ci_get_bounds;
c->set_clip = spn_ci_set_clip;
c->ref_count = 1;
// set max clip
spn_ci_get_max_clip(impl, &impl->clip);
//
// allocate and map ring
//
size_t const ring_size = config->composition.size.ring * sizeof(*impl->mapped.cp.extent);
spn_ring_init(&impl->mapped.cp.ring, config->composition.size.ring);
spn_allocator_device_perm_alloc(&device->allocator.device.perm.hw_dr,
&device->environment,
ring_size,
NULL,
&impl->vk.rings.h.dbi,
&impl->vk.rings.h.dm);
vk(MapMemory(device->environment.d,
impl->vk.rings.h.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.cp.extent));
if (spn_ci_is_staged(impl->config))
{
spn_allocator_device_perm_alloc(&device->allocator.device.perm.drw,
&device->environment,
ring_size,
NULL,
&impl->vk.rings.d.dbi,
&impl->vk.rings.d.dm);
}
else
{
impl->vk.rings.d.dbi = impl->vk.rings.h.dbi;
impl->vk.rings.d.dm = impl->vk.rings.h.dm;
}
//
// allocate ttck descriptor set
//
size_t const ttcks_size = SPN_VK_BUFFER_OFFSETOF(ttcks, ttcks, ttcks_keys) +
config->composition.size.ttcks * sizeof(SPN_TYPE_UVEC2);
#if 0
fprintf(stderr, "ttcks_size = %zu\n", ttcks_size);
#endif
spn_allocator_device_perm_alloc(&device->allocator.device.perm.drw,
&device->environment,
ttcks_size,
NULL,
&impl->vk.ttcks.dbi,
&impl->vk.ttcks.dm);
//
// allocate and map tiny copyback buffer
//
size_t const copyback_size = sizeof(*impl->mapped.cb.extent);
spn_allocator_device_perm_alloc(&device->allocator.device.perm.hr_dw,
&device->environment,
copyback_size,
NULL,
&impl->vk.copyback.dbi,
&impl->vk.copyback.dm);
vk(MapMemory(device->environment.d,
impl->vk.copyback.dm,
0,
VK_WHOLE_SIZE,
0,
(void **)&impl->mapped.cb.extent));
//
// allocate release resources
//
uint32_t const max_in_flight = config->composition.size.dispatches;
size_t const d_size = sizeof(*impl->dispatches.extent) * max_in_flight;
size_t const r_size = sizeof(*impl->rasters.extent) * config->composition.size.rasters;
impl->dispatches.extent = spn_allocator_host_perm_alloc(perm, SPN_MEM_FLAGS_READ_WRITE, d_size);
impl->rasters.extent = spn_allocator_host_perm_alloc(perm, SPN_MEM_FLAGS_READ_WRITE, r_size);
impl->rasters.size = config->composition.size.rasters;
impl->rasters.count = 0;
spn_ring_init(&impl->dispatches.ring, max_in_flight);
// initialize the first dispatch
spn_ci_dispatch_init(impl, impl->dispatches.extent);
// start in the resetting state
spn_ci_unsealed_reset(impl);
return SPN_SUCCESS;
}
//
//
//
static void
spn_ci_retain_and_lock(struct spn_composition_impl * const impl)
{
impl->composition->ref_count += 1;
impl->lock_count += 1;
}
static void
spn_composition_unlock_and_release(struct spn_composition_impl * const impl)
{
impl->lock_count -= 1;
spn_ci_release(impl);
}
//
//
//
void
spn_composition_happens_before(struct spn_composition * const composition,
spn_dispatch_id_t const id)
{
struct spn_composition_impl * const impl = composition->impl;
assert(impl->state >= SPN_CI_STATE_SEALING);
//
// retain the composition
//
spn_ci_retain_and_lock(impl);
//
// already sealed?
//
if (impl->state == SPN_CI_STATE_SEALED)
return;
//
// otherwise... composition happens before render
//
spn_device_dispatch_happens_after(impl->device,
id, // after
impl->id_sealing); // before
}
//
//
//
void
spn_composition_pre_render_bind_ds(struct spn_composition * const composition,
struct spn_vk_ds_ttcks_t * const ds,
VkCommandBuffer cb)
{
struct spn_composition_impl * const impl = composition->impl;
struct spn_device * const device = impl->device;
struct spn_vk * const instance = device->instance;
assert(impl->state >= SPN_CI_STATE_SEALING);
//
// acquire TTCKS descriptor set
//
spn_vk_ds_acquire_ttcks(instance, device, ds);
// copy the dbi structs
*spn_vk_ds_get_ttcks_ttcks(instance, *ds) = impl->vk.ttcks.dbi;
// update ds
spn_vk_ds_update_ttcks(instance, &device->environment, *ds);
// bind
spn_vk_ds_bind_render_ttcks(instance, cb, *ds);
}
//
//
//
void
spn_composition_pre_render_dispatch_indirect(struct spn_composition * const composition,
VkCommandBuffer cb)
{
struct spn_composition_impl * const impl = composition->impl;
VkDeviceSize const dbi_offset = impl->vk.ttcks.dbi.offset + //
SPN_VK_BUFFER_OFFSETOF(ttcks, ttcks, offsets_count);
vkCmdDispatchIndirect(cb, impl->vk.ttcks.dbi.buffer, dbi_offset);
}
//
//
//
void
spn_composition_post_render(struct spn_composition * const composition)
{
spn_composition_unlock_and_release(composition->impl);
}
//
//
//