// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include "deps.h"
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "common/vk/assert.h"
#include "device.h"
#include "queue_pool.h"
#include "ring.h"
//
// Utility struct and functions for accumulating a wait set from a bag of
// delayed semaphore indices.
//
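// Per the waitset totals below, SPN_DEPS_DELAYED_SEMAPHORE_MAX is 128, so the
// bitmap occupies (128 + 31) / 32 = 4 dwords.
//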
#define SPN_DEPS_WAITSET_DELAYED_BITMAP_DWORDS ((SPN_DEPS_DELAYED_SEMAPHORE_MAX + 31) / 32)
struct spinel_deps_waitset_gather
{
struct
{
uint32_t bitmap[SPN_DEPS_WAITSET_DELAYED_BITMAP_DWORDS];
} delayed;
};
//
// clang-format off
//
//
// Stack-allocated store of waiting semaphores. Totals:
//
// * Every in-flight delayed semaphore | 128
// * In-flight immediate semaphores | 33
// * Internal transfer waiting timelines | 1
// * Imported waiting timelines | 1
//
#define SPN_DEPS_WAITSET_SIZE (SPN_DEPS_DELAYED_SEMAPHORE_MAX + \
SPN_DEPS_IMMEDIATE_SUBMIT_SIZE_WAIT_IMMEDIATE + \
SPN_DEPS_TRANSFER_WAIT_SIZE + \
SPN_VK_SEMAPHORE_IMPORT_WAIT_SIZE)
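//
// Per the totals above, SPN_DEPS_WAITSET_SIZE evaluates to 163 entries
// (128 + 33 + 1 + 1) in each of the stack-allocated arrays below.
//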
struct spinel_deps_waitset
{
uint32_t count;
VkPipelineStageFlags stages [SPN_DEPS_WAITSET_SIZE];
VkSemaphore semaphores[SPN_DEPS_WAITSET_SIZE];
uint64_t values [SPN_DEPS_WAITSET_SIZE];
};
//
// Stack-allocated store of signalling semaphores. Totals:
//
// * Delayed signal semaphores
// * One just-acquired immediate semaphore
// * Internal transfer signalling timelines
// * Imported signalling timelines
//
#define SPN_DEPS_SIGNALSET_SIZE (SPN_DEPS_IMMEDIATE_SUBMIT_SIZE_SIGNAL_DELAYED + \
1 + \
SPN_DEPS_TRANSFER_SIGNAL_SIZE + \
SPN_VK_SEMAPHORE_IMPORT_SIGNAL_SIZE)
struct spinel_deps_signalset
{
uint32_t count;
VkSemaphore semaphores[SPN_DEPS_SIGNALSET_SIZE];
uint64_t values [SPN_DEPS_SIGNALSET_SIZE];
};
//
// clang-format on
//
//
// Deps instance
//
struct spinel_deps
{
//
// A new path or raster builder dispatch immediately acquires a "delayed"
// timeline.
//
struct
{
struct spinel_next next;
VkSemaphore * semaphores;
uint64_t * values;
struct spinel_deps_action * submissions;
spinel_deps_delayed_semaphore_t * handle_map;
} delayed;
//
// Immediately acquire a timeline and command buffer and submit to the
// VkDevice.
//
struct
{
struct
{
uint32_t size; // pool.size - number of cbs per pool
uint32_t count; // pool.count - number of pools
VkCommandPool * extent; // extent[pool.count] - extent of pools
} pool;
struct spinel_ring ring;
VkPipelineStageFlags * stages;
VkSemaphore * semaphores;
uint64_t * values;
VkCommandBuffer * cbs;
struct spinel_deps_action * completions;
} immediate;
//
// Completed submission actions are only executed:
//
// * After an immediate timeline has been acquired.
// * Or when waiting for submitted dispatches to complete.
//
struct
{
struct spinel_ring ring;
struct spinel_deps_action * extent;
} completion;
};
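//
// Lifecycle sketch: a builder acquires a delayed timeline, its flushed work
// is submitted against a freshly acquired immediate timeline, and the
// immediate submission's completion action runs once that timeline drains.
//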
//
//
//
struct spinel_deps *
spinel_deps_create(struct spinel_deps_create_info const * info, struct spinel_device_vk const * vk)
{
assert(info->semaphores.delayed.size <= SPN_DEPS_DELAYED_SEMAPHORE_MAX);
struct spinel_deps * deps = MALLOC_MACRO(sizeof(*deps));
//////////////////////////////////////////////////////////////////////////////
//
// Delayed timelines and submission actions.
//
size_t const handle_map_size = info->handle_count * sizeof(*deps->delayed.handle_map);
spinel_next_init(&deps->delayed.next, info->semaphores.delayed.size);
// clang-format off
deps->delayed.semaphores = MALLOC_MACRO(info->semaphores.delayed.size * sizeof(*deps->delayed.semaphores));
deps->delayed.values = CALLOC_MACRO(info->semaphores.delayed.size, sizeof(*deps->delayed.values));
deps->delayed.submissions = CALLOC_MACRO(info->semaphores.delayed.size, sizeof(*deps->delayed.submissions));
deps->delayed.handle_map = MALLOC_MACRO(handle_map_size);
// clang-format on
// invalidate handle map
memset(deps->delayed.handle_map, 0xFF, handle_map_size);
//////////////////////////////////////////////////////////////////////////////
//
// Immediate command pools, command buffers, timelines and completion actions.
//
uint32_t const immediate_size = info->semaphores.immediate.pool.size * //
info->semaphores.immediate.pool.count;
assert(immediate_size <= SPN_DEPS_IMMEDIATE_SEMAPHORE_MAX);
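//
// Example: pool.size == 8 and pool.count == 4 yields immediate_size == 32
// command buffers, immediate timelines, and completion slots.
//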
// clang-format off
deps->immediate.pool.size = info->semaphores.immediate.pool.size;
deps->immediate.pool.count = info->semaphores.immediate.pool.count;
deps->immediate.pool.extent = MALLOC_MACRO(info->semaphores.immediate.pool.count * sizeof(*deps->immediate.pool.extent));
spinel_ring_init(&deps->immediate.ring, immediate_size);
deps->immediate.stages = MALLOC_MACRO(immediate_size * sizeof(*deps->immediate.stages));
deps->immediate.semaphores = MALLOC_MACRO(immediate_size * sizeof(*deps->immediate.semaphores));
deps->immediate.values = CALLOC_MACRO(immediate_size, sizeof(*deps->immediate.values)); // zeroed
deps->immediate.cbs = MALLOC_MACRO(immediate_size * sizeof(*deps->immediate.cbs));
deps->immediate.completions = MALLOC_MACRO(immediate_size * sizeof(*deps->immediate.completions));
// clang-format on
//////////////////////////////////////////////////////////////////////////////
//
// Completion ring
//
spinel_ring_init(&deps->completion.ring, immediate_size);
deps->completion.extent = MALLOC_MACRO(immediate_size * sizeof(*deps->completion.extent));
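//
// The completion ring matches the immediate ring's size so every in-flight
// immediate submission is guaranteed a completion slot when it drains.
//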
//////////////////////////////////////////////////////////////////////////////
//
// Create Vulkan objects: command pools, command buffers, timelines.
//
VkCommandPoolCreateInfo const cpci = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.pNext = NULL,
.flags = 0,
.queueFamilyIndex = vk->q.compute.create_info.family_index
};
VkCommandBufferAllocateInfo cbai = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
.pNext = NULL,
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = info->semaphores.immediate.pool.size,
.commandPool = VK_NULL_HANDLE // updated on each iteration
};
for (uint32_t ii = 0; ii < deps->immediate.pool.count; ii++)
{
vk(CreateCommandPool(vk->d, &cpci, vk->ac, deps->immediate.pool.extent + ii));
//
// Allocate command buffers
//
uint32_t const cmds_base = info->semaphores.immediate.pool.size * ii;
cbai.commandPool = deps->immediate.pool.extent[ii];
vk(AllocateCommandBuffers(vk->d, &cbai, deps->immediate.cbs + cmds_base));
}
//
// Create timeline semaphores
//
VkSemaphoreTypeCreateInfo const stci = {
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR,
.pNext = NULL,
.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
.initialValue = 0UL
};
VkSemaphoreCreateInfo const sci = {
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
.pNext = &stci,
.flags = 0
};
//
// Create immediate timeline semaphores initialized to 0
//
for (uint32_t ii = 0; ii < immediate_size; ii++)
{
vk(CreateSemaphore(vk->d, &sci, vk->ac, deps->immediate.semaphores + ii));
}
//
// Create delayed timeline semaphores initialized to 0
//
for (uint32_t ii = 0; ii < info->semaphores.delayed.size; ii++)
{
vk(CreateSemaphore(vk->d, &sci, vk->ac, deps->delayed.semaphores + ii));
}
return deps;
}
//
//
//
void
spinel_deps_dispose(struct spinel_deps * deps, struct spinel_device_vk const * vk)
{
//
// Destroy semaphores
//
for (uint32_t ii = 0; ii < deps->immediate.ring.size; ii++)
{
vkDestroySemaphore(vk->d, deps->immediate.semaphores[ii], vk->ac);
}
for (uint32_t ii = 0; ii < deps->delayed.next.size; ii++)
{
vkDestroySemaphore(vk->d, deps->delayed.semaphores[ii], vk->ac);
}
//
// Free command buffers
//
for (uint32_t ii = 0; ii < deps->immediate.pool.count; ii++)
{
uint32_t const pool_base = ii * deps->immediate.pool.size;
vkFreeCommandBuffers(vk->d,
deps->immediate.pool.extent[ii],
deps->immediate.pool.size,
deps->immediate.cbs + pool_base);
}
//
// Destroy command pools
//
for (uint32_t ii = 0; ii < deps->immediate.pool.count; ii++)
{
vkDestroyCommandPool(vk->d, deps->immediate.pool.extent[ii], vk->ac);
}
//
// Arrays
//
free(deps->completion.extent);
free(deps->immediate.completions);
free(deps->immediate.cbs);
free(deps->immediate.values);
free(deps->immediate.semaphores);
free(deps->immediate.stages);
free(deps->immediate.pool.extent);
free(deps->delayed.handle_map);
free(deps->delayed.submissions);
free(deps->delayed.values);
free(deps->delayed.semaphores);
free(deps);
}
//
// Attach a semaphore to a handle
//
void
spinel_deps_delayed_attach(struct spinel_deps * deps,
spinel_handle_t handle,
spinel_deps_delayed_semaphore_t semaphore)
{
deps->delayed.handle_map[handle] = semaphore;
}
//
// Detach a semaphore from an extent of handles
//
void
spinel_deps_delayed_detach(struct spinel_deps * deps,
spinel_handle_t const * handles,
uint32_t count)
{
for (uint32_t ii = 0; ii < count; ii++)
{
spinel_handle_t const handle = handles[ii];
deps->delayed.handle_map[handle] = SPN_DEPS_DELAYED_SEMAPHORE_INVALID;
}
}
//
// Detach a semaphore from a ring of handles
//
void
spinel_deps_delayed_detach_ring(struct spinel_deps * deps,
spinel_handle_t const * handles,
uint32_t size,
uint32_t head,
uint32_t span)
{
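//
// Example: size == 8, head == 6, span == 4 detaches handles[6..7]
// (count_lo == 2) and then wraps to detach handles[0..1] (count_hi == 2).
//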
uint32_t const head_max = head + span;
uint32_t const head_clamp = MIN_MACRO(uint32_t, head_max, size);
uint32_t const count_lo = head_clamp - head;
spinel_deps_delayed_detach(deps, handles + head, count_lo);
if (span > count_lo)
{
uint32_t const count_hi = span - count_lo;
spinel_deps_delayed_detach(deps, handles, count_hi);
}
}
//
// Invoke an action. Actions only need two args.
//
// Note that we clear the action's pfn to keep delayed semaphore actions from
// being re-executed.
//
static void
spinel_deps_action_invoke(struct spinel_deps_action * action)
{
if (action->pfn != NULL)
{
spinel_deps_pfn_t pfn = action->pfn;
action->pfn = NULL;
pfn(action->data0, action->data1);
}
}
//
// Flush a delayed semaphore
//
void
spinel_deps_delayed_flush(struct spinel_deps * deps, spinel_deps_delayed_semaphore_t delayed)
{
spinel_deps_action_invoke(deps->delayed.submissions + delayed);
}
//
//
//
static void
spinel_deps_waitset_gather_set(struct spinel_deps_waitset_gather * gather,
spinel_deps_delayed_semaphore_t const delayed)
{
if (delayed != SPN_DEPS_DELAYED_SEMAPHORE_INVALID)
{
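//
// Example: delayed == 37 sets bit 5 (37 & 0x1F) of dword 1 (37 >> 5).
//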
uint32_t const delayed_base = (delayed >> 5);
uint32_t const delayed_bit = (1u << (delayed & 0x1F));
gather->delayed.bitmap[delayed_base] |= delayed_bit;
}
}
//
// Gather the delayed semaphores of a linear span of handles
//
static void
spinel_deps_waitset_gather_init(struct spinel_deps const * deps,
spinel_handle_t const * handles,
uint32_t const count,
struct spinel_deps_waitset_gather * gather)
{
for (uint32_t ii = 0; ii < count; ii++)
{
spinel_handle_t const handle = handles[ii];
spinel_deps_delayed_semaphore_t const delayed = deps->delayed.handle_map[handle];
spinel_deps_waitset_gather_set(gather, delayed);
}
}
//
// 1. Gather delayed semaphores
// 2. For all delayed semaphores:
// 1. Invoke .submission action
// 2. Save semaphore handle and signalling value
//
static void
spinel_deps_waitset_init(struct spinel_deps const * deps,
struct spinel_deps_waitset * waitset,
struct spinel_deps_immediate_submit_info const * info)
{
uint32_t wait_count = 0;
//
// First append the info->wait.immediate[] entries because their latest
// signal values are known to be valid.
//
for (uint32_t ii = 0; ii < info->wait.immediate.count; ii++)
{
spinel_deps_immediate_semaphore_t immediate = info->wait.immediate.semaphores[ii];
waitset->stages[wait_count] = deps->immediate.stages[immediate];
waitset->semaphores[wait_count] = deps->immediate.semaphores[immediate];
waitset->values[wait_count] = deps->immediate.values[immediate];
wait_count += 1;
}
//
// Which delayed semaphores need to be waited upon?
//
bool const is_wait_delayed_handles = (info->wait.delayed.handles.span > 0);
if (is_wait_delayed_handles)
{
//
// Gather bitmap of delayed semaphores
//
struct spinel_deps_waitset_gather gather = { 0 };
//
// Gather the delayed semaphores of a ring of handles
//
// clang-format off
uint32_t const head_max = info->wait.delayed.handles.head + info->wait.delayed.handles.span;
uint32_t const head_clamp = MIN_MACRO(uint32_t, head_max, info->wait.delayed.handles.size);
uint32_t count_lo = head_clamp - info->wait.delayed.handles.head;
// clang-format on
spinel_handle_t const * handle_head = info->wait.delayed.handles.extent + //
info->wait.delayed.handles.head;
spinel_deps_waitset_gather_init(deps, handle_head, count_lo, &gather);
if (info->wait.delayed.handles.span > count_lo)
{
uint32_t count_hi = info->wait.delayed.handles.span - count_lo;
spinel_deps_waitset_gather_init(deps,
info->wait.delayed.handles.extent,
count_hi,
&gather);
}
//
// Dispatch each delayed semaphore and save the semaphore and its
// signalling value
//
for (uint32_t ii = 0; ii < SPN_DEPS_WAITSET_DELAYED_BITMAP_DWORDS; ii++)
{
uint32_t dword = gather.delayed.bitmap[ii];
if (dword == 0)
{
continue;
}
uint32_t const delayed_base = ii * 32;
do
{
//
// The dword is non-zero so __builtin_ffs() returns [1,32].
//
// TODO(allanmac): Support _MSC_VER compiler.
//
uint32_t const lsb_plus_1 = __builtin_ffs(dword);
dword ^= (1u << (lsb_plus_1 - 1));
uint32_t const delayed = delayed_base + lsb_plus_1 - 1;
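//
// Example: dword == 0x0000000A yields lsb_plus_1 == 2, clears bit 1 to
// leave 0x00000008, and selects delayed == delayed_base + 1.
//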
spinel_deps_action_invoke(deps->delayed.submissions + delayed);
waitset->stages[wait_count] = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
waitset->semaphores[wait_count] = deps->delayed.semaphores[delayed];
waitset->values[wait_count] = deps->delayed.values[delayed];
wait_count += 1;
} while (dword != 0);
}
}
waitset->count = wait_count;
}
//
//
//
static void
spinel_deps_waitset_append_transfer(struct spinel_deps const * deps,
struct spinel_deps_waitset * waitset,
struct spinel_deps_immediate_submit_info const * info)
{
for (uint32_t ii = 0; ii < info->wait.transfer.count; ii++)
{
waitset->stages[waitset->count] = info->wait.transfer.stages[ii];
waitset->semaphores[waitset->count] = info->wait.transfer.semaphores[ii];
waitset->values[waitset->count] = info->wait.transfer.values[ii];
waitset->count += 1;
}
}
//
//
//
static void
spinel_deps_waitset_append_import(struct spinel_deps const * deps,
struct spinel_deps_waitset * waitset,
struct spinel_deps_immediate_submit_info const * info)
{
for (uint32_t ii = 0; ii < info->wait.import.count; ii++)
{
waitset->stages[waitset->count] = info->wait.import.stages[ii];
waitset->semaphores[waitset->count] = info->wait.import.semaphores[ii];
waitset->values[waitset->count] = info->wait.import.values[ii];
waitset->count += 1;
}
}
//
// Drain all completion actions
//
static bool
spinel_deps_completion_drain_all(struct spinel_deps * deps)
{
if (spinel_ring_is_full(&deps->completion.ring))
{
return false;
}
do
{
uint32_t const tail = deps->completion.ring.tail;
spinel_ring_release_n(&deps->completion.ring, 1);
spinel_deps_action_invoke(deps->completion.extent + tail);
} while (!spinel_ring_is_full(&deps->completion.ring));
return true;
}
//
// Drains the submission at deps->immediate.ring.tail.
//
// NOTE: Assumes there are submissions.
//
// FIXME(allanmac): Refactor to support VK_ERROR_DEVICE_LOST
//
static bool
spinel_deps_immediate_drain_tail(struct spinel_deps * deps,
struct spinel_device_vk const * vk,
uint64_t timeout)
{
assert(!spinel_ring_is_full(&deps->immediate.ring));
//
// Wait for this timeline to complete...
//
// NOTE(allanmac): This assumes the wait never times out. If the device is
// lost then we fail. The proper way to handle this is to replace all context
// pfns with device lost operations.
//
uint32_t const immediate = deps->immediate.ring.tail;
VkSemaphoreWaitInfo const swi = {
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
.pNext = NULL,
.flags = 0, // flag doesn't matter when there is 1 semaphore
.semaphoreCount = 1,
.pSemaphores = deps->immediate.semaphores + immediate,
.pValues = deps->immediate.values + immediate,
};
//
// Wait for semaphore to complete...
//
if (vkWaitSemaphores(vk->d, &swi, timeout) != VK_SUCCESS)
{
return false;
}
//
// Copy the immediate's completion action to the completions ring.
//
uint32_t const completions_idx = spinel_ring_acquire_1(&deps->completion.ring);
deps->completion.extent[completions_idx] = deps->immediate.completions[immediate];
//
// Release the semaphore.
//
spinel_ring_release_n(&deps->immediate.ring, 1);
return true;
}
//
// Acquire an immediate semaphore and its associated resources.
//
static uint32_t
spinel_deps_immediate_acquire(struct spinel_deps * deps, struct spinel_device_vk const * vk)
{
//
// Opportunistically drain all completed submissions and append their
// completion actions to the completion ring.
//
// clang-format off
//
while (!spinel_ring_is_full(&deps->immediate.ring) && spinel_deps_immediate_drain_tail(deps, vk, 0UL))
{
;
}
//
// clang-format on
//
// If head is the first entry of a pool and there are active submissions in
// the same pool then drain the tail entries until the pool has no active
// submissions.
//
while (true)
{
div_t const pool_quot_rem = div(deps->immediate.ring.head, deps->immediate.pool.size);
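//
// quot selects the command pool and rem is the entry's offset within it:
// e.g. with pool.size == 4, a ring head of 8 is entry 0 of pool 2.
//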
if (pool_quot_rem.rem == 0)
{
bool const is_active = (deps->immediate.ring.rem < deps->immediate.pool.size);
if (is_active)
{
//
// This command pool is active so block and drain the oldest
// submitted command buffer.
//
(void)spinel_deps_immediate_drain_tail(deps, vk, UINT64_MAX);
//
// ... and try again!
//
continue;
}
else
{
//
// This command pool isn't active so reset and proceed.
//
vk(ResetCommandPool(vk->d, deps->immediate.pool.extent[pool_quot_rem.quot], 0));
}
}
//
// Return the head entry.
//
return spinel_ring_acquire_1(&deps->immediate.ring);
}
}
//
// Acquire a "delayed" semaphore
//
spinel_deps_delayed_semaphore_t
spinel_deps_delayed_acquire(struct spinel_deps * deps,
struct spinel_device_vk const * vk,
struct spinel_deps_acquire_delayed_info const * info)
{
//
// Acquire the next delayed index -- the index wraps to zero.
//
uint32_t const delayed = spinel_next_acquire_1(&deps->delayed.next);
//
// If the reacquired slot's previous submission action hasn't run yet,
// invoke it now.
//
// Invoking the action implicitly:
//
// 1. Invokes and clears the action.
// 2. Submits along with a paired immediate semaphore.
// 3. Increments the delayed semaphore's timeline signal value.
//
spinel_deps_action_invoke(deps->delayed.submissions + delayed);
//
// There is a bug with Mesa 21.x when ANV_QUEUE_THREAD_DISABLE is defined.
//
// See: https://bugs.fuchsia.dev/p/fuchsia/issues/detail?id=92433
//
// FIXME(allanmac): This workaround exacts some performance. Remove it as soon
// as it's feasible.
//
if (vk->workarounds.mesa_21_anv)
{
VkSemaphoreWaitInfo const swi = {
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
.pNext = NULL,
.flags = 0, // flag doesn't matter when there is 1 semaphore
.semaphoreCount = 1,
.pSemaphores = deps->delayed.semaphores + delayed,
.pValues = deps->delayed.values + delayed,
};
//
// Wait for semaphore to complete...
//
if (vkWaitSemaphores(vk->d, &swi, UINT64_MAX) != VK_SUCCESS)
{
exit(EXIT_FAILURE);
}
}
//
// Save the new submission action
//
deps->delayed.submissions[delayed] = info->submission;
//
// Return the delayed semaphore
//
return (spinel_deps_delayed_semaphore_t)delayed;
}
//
// Note that this is the only place delayed semaphores are incremented.
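//
// Each submission that signals a delayed timeline bumps its target value by
// one, so waits built from deps->delayed.values[] track the most recently
// submitted signal value.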
//
static void
spinel_deps_signalset_init_delayed(struct spinel_deps const * deps,
struct spinel_deps_signalset * signalset,
struct spinel_deps_immediate_submit_info const * info)
{
for (uint32_t ii = 0; ii < info->signal.delayed.count; ii++)
{
spinel_deps_delayed_semaphore_t const delayed = info->signal.delayed.semaphores[ii];
signalset->semaphores[ii] = deps->delayed.semaphores[delayed];
signalset->values[ii] = ++deps->delayed.values[delayed];
}
signalset->count = info->signal.delayed.count;
}
//
//
//
static void
spinel_deps_signalset_append_immediate(struct spinel_deps const * deps,
struct spinel_deps_signalset * signalset,
struct spinel_deps_immediate_submit_info const * info,
uint32_t immediate)
{
signalset->semaphores[signalset->count] = deps->immediate.semaphores[immediate];
signalset->values[signalset->count] = ++deps->immediate.values[immediate];
signalset->count += 1;
deps->immediate.completions[immediate] = info->completion;
}
//
//
//
static void
spinel_deps_signalset_append_transfer(struct spinel_deps const * deps,
struct spinel_deps_signalset * signalset,
struct spinel_deps_immediate_submit_info const * info)
{
for (uint32_t ii = 0; ii < info->signal.transfer.count; ii++)
{
signalset->semaphores[signalset->count] = info->signal.transfer.semaphores[ii];
signalset->values[signalset->count] = info->signal.transfer.values[ii];
signalset->count += 1;
}
}
//
//
//
static void
spinel_deps_signalset_append_import(struct spinel_deps const * deps,
struct spinel_deps_signalset * signalset,
struct spinel_deps_immediate_submit_info const * info)
{
for (uint32_t ii = 0; ii < info->signal.import.count; ii++)
{
signalset->semaphores[signalset->count] = info->signal.import.semaphores[ii];
signalset->values[signalset->count] = info->signal.import.values[ii];
signalset->count += 1;
}
}
//
// Acquire an "immediate" semaphore
//
// Immediate wait semaphores (info->wait.immediate) have already been
// submitted.
//
// But delayed semaphores associated with handles (info->wait.delayed.handles)
// may not have been submitted.
//
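// A minimal caller sketch -- `my_record` and `impl` are hypothetical, and it
// assumes the traced definition below is invoked as a plain call. `my_record`
// records compute work into the command buffer and returns the pipeline
// stages it reaches; all unnamed members are zero-initialized:
//
//   struct spinel_deps_immediate_submit_info const info = {
//     .record = { .pfn = my_record, .data0 = impl, .data1 = NULL },
//   };
//
//   spinel_deps_immediate_semaphore_t immediate;
//
//   spinel_deps_immediate_submit(deps, vk, &info, &immediate);
//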
void
SPN_VK_TRACE_DEFINE(spinel_deps_immediate_submit,
struct spinel_deps * deps,
struct spinel_device_vk * vk,
struct spinel_deps_immediate_submit_info const * info,
spinel_deps_immediate_semaphore_t * p_immediate)
{
assert(info->wait.immediate.count <= SPN_DEPS_IMMEDIATE_SUBMIT_SIZE_WAIT_IMMEDIATE);
assert(info->signal.delayed.count <= SPN_DEPS_IMMEDIATE_SUBMIT_SIZE_SIGNAL_DELAYED);
SPN_VK_TRACE_HOST_DURATION_BEGIN();
//
// Gather immediate semaphores as well as delayed semaphores associated with a
// ring span of handles. Ensure all are submitted before continuing.
//
struct spinel_deps_waitset waitset; // Do not zero-initialize
spinel_deps_waitset_init(deps, &waitset, info);
//
// Append transfer and import wait timelines
//
spinel_deps_waitset_append_transfer(deps, &waitset, info);
spinel_deps_waitset_append_import(deps, &waitset, info);
//
// Gather delayed signalling semaphores and their incremented values.
//
struct spinel_deps_signalset signalset; // Do not zero-initialize
spinel_deps_signalset_init_delayed(deps, &signalset, info);
//
// Acquire immediate semaphore
//
uint32_t const immediate = spinel_deps_immediate_acquire(deps, vk);
if (p_immediate != NULL)
{
*p_immediate = (spinel_deps_immediate_semaphore_t)immediate;
}
//
// Append signalling acquired immediate semaphore, its new value, and
// completion action.
//
spinel_deps_signalset_append_immediate(deps, &signalset, info, immediate);
//
// Append transfer and import signal timelines
//
spinel_deps_signalset_append_transfer(deps, &signalset, info);
spinel_deps_signalset_append_import(deps, &signalset, info);
//
// Record commands
//
VkCommandBuffer cb = deps->immediate.cbs[immediate];
VkCommandBufferBeginInfo const cbbi = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
.pNext = NULL,
.flags = 0,
.pInheritanceInfo = NULL
};
vk(BeginCommandBuffer(cb, &cbbi));
SPN_VK_TRACE_DEVICE_BEGIN_COMMAND_BUFFER(cb);
if (info->record.pfn != NULL)
{
SPN_VK_TRACE_HOST_DURATION_BEGIN_REGION("Recording");
deps->immediate.stages[immediate] = info->record.pfn(cb, //
info->record.data0, //
info->record.data1); //
SPN_VK_TRACE_HOST_DURATION_END_REGION();
}
else
{
deps->immediate.stages[immediate] = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
}
SPN_VK_TRACE_DEVICE_END_COMMAND_BUFFER(cb);
vk(EndCommandBuffer(cb));
//
// Submit the command buffer with its associated wait and signal timelines.
//
VkTimelineSemaphoreSubmitInfo const tssi = {
.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
.pNext = NULL,
.waitSemaphoreValueCount = waitset.count,
.pWaitSemaphoreValues = waitset.values,
.signalSemaphoreValueCount = signalset.count,
.pSignalSemaphoreValues = signalset.values,
};
VkSubmitInfo const submit_info = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.pNext = &tssi,
.waitSemaphoreCount = waitset.count,
.pWaitSemaphores = waitset.semaphores,
.pWaitDstStageMask = waitset.stages,
.commandBufferCount = 1,
.pCommandBuffers = &cb,
.signalSemaphoreCount = signalset.count,
.pSignalSemaphores = signalset.semaphores,
};
VkQueue q = spinel_queue_pool_get_next(&vk->q.compute);
vk(QueueSubmit(q, 1, &submit_info, VK_NULL_HANDLE));
//
// Drain enqueued completion actions.
//
// Ignore return value.
//
(void)spinel_deps_completion_drain_all(deps);
SPN_VK_TRACE_HOST_DURATION_END();
}
//
//
//
VkPipelineStageFlags
spinel_deps_immediate_get_stage(struct spinel_deps * deps,
spinel_deps_immediate_semaphore_t immediate)
{
return deps->immediate.stages[immediate];
}
//
// Blocks until:
//
// * At least one completion action is executed
// * Or a submission is completed and its action is executed.
//
// Returns true if either case is true.
//
// FIXME(allanmac): Refactor to support VK_ERROR_DEVICE_LOST
//
bool
SPN_VK_TRACE_DEFINE(spinel_deps_drain_1,
struct spinel_deps * deps,
struct spinel_device_vk const * vk)
{
SPN_VK_TRACE_HOST_DURATION_BEGIN();
bool const is_complete = spinel_deps_completion_drain_all(deps) ||
(!spinel_ring_is_full(&deps->immediate.ring) &&
spinel_deps_immediate_drain_tail(deps, vk, UINT64_MAX) &&
spinel_deps_completion_drain_all(deps));
SPN_VK_TRACE_HOST_DURATION_END();
return is_complete;
}
//
// Blocks until all submissions and actions are drained.
//
// FIXME(allanmac): Refactor to support VK_ERROR_DEVICE_LOST
//
void
SPN_VK_TRACE_DEFINE(spinel_deps_drain_all,
struct spinel_deps * deps,
struct spinel_device_vk const * vk)
{
SPN_VK_TRACE_HOST_DURATION_BEGIN();
spinel_deps_completion_drain_all(deps);
while ((!spinel_ring_is_full(&deps->immediate.ring) &&
spinel_deps_immediate_drain_tail(deps, vk, UINT64_MAX) &&
spinel_deps_completion_drain_all(deps)))
{
;
}
SPN_VK_TRACE_HOST_DURATION_END();
}
//
//
//