// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// SEGMENT TTRK
//
// FIXME(allanmac): The OpenCL and CUDA versions of this kernel are more
// sophisticated but let's see how this performs. Optimize this later
// using CUDA & OpenCL techniques.
//
// FIXME(allanmac): Transition to a split lo/hi sort.
//
// FIXME(allanmac): Add a "SKIP" bit to the TTRK.
//
//
//
//
#extension GL_EXT_debug_printf : enable
//
//
//
#extension GL_GOOGLE_include_directive : require
#extension GL_KHR_shader_subgroup_shuffle_relative : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_EXT_control_flow_attributes : require
//
//
//
#include "config.h"
#include "push.h"
//
//
//
layout(local_size_x = SPN_DEVICE_TTRKS_SEGMENT_WORKGROUP_SIZE) in;
//
// Push constants
//
SPN_PUSH_LAYOUT_TTRKS_SEGMENT();
//
// Buffer references
//
SPN_BUFFER_DEFINE_TTRKS_HEADER(readwrite, readonly);
SPN_BUFFER_DEFINE_TTRK_KEYVALS(readonly);
//
// MACROS
//
// clang-format off
#define SPN_SUBGROUP_SIZE (1 << SPN_DEVICE_TTRKS_SEGMENT_SUBGROUP_SIZE_LOG2)
#define SPN_SUBGROUPS (SPN_DEVICE_TTRKS_SEGMENT_WORKGROUP_SIZE / SPN_SUBGROUP_SIZE)
#define SPN_SUBGROUP_TTRKS (SPN_DEVICE_TTRKS_SEGMENT_ROWS * SPN_SUBGROUP_SIZE)
#define SPN_TTRK_COUNT_BITS_TTPK_COUNT 16
#define SPN_TTRK_COUNT_BITS_BLOCK_COUNT 16
#define SPN_TTRK_COUNT_OFFSET_TTPK_COUNT 0
#define SPN_TTRK_COUNT_OFFSET_BLOCK_COUNT (SPN_TTRK_COUNT_BITS_TTPK_COUNT)
#define SPN_TTRK_COUNT_ONE_TTPK_COUNT (1 << SPN_TTRK_COUNT_OFFSET_TTPK_COUNT)
#define SPN_TTRK_COUNT_ONE_BLOCK_COUNT (1 << SPN_TTRK_COUNT_OFFSET_BLOCK_COUNT)
#define SPN_TTRK_COUNT_GET_TTPK_COUNT(s_) SPN_BITFIELD_EXTRACT(s_,SPN_TTRK_COUNT_OFFSET_TTPK_COUNT,SPN_TTRK_COUNT_BITS_TTPK_COUNT)
#define SPN_TTRK_COUNT_GET_BLOCK_COUNT(s_) SPN_BITFIELD_EXTRACT(s_,SPN_TTRK_COUNT_OFFSET_BLOCK_COUNT,SPN_TTRK_COUNT_BITS_BLOCK_COUNT)
#define SPN_TTRK_IS_NEW_COHORT(curr_, prev_) (((curr_.y ^ prev_.y) & SPN_TTRK_HI_MASK_COHORT) != 0)
#define SPN_TTRK_ROW_LAST (SPN_DEVICE_TTRKS_SEGMENT_ROWS - 1)
// clang-format on
//
// Make sure the scan can't overflow.
//
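// Each keyval contributes at most one TTPK and one BLOCK count, so the
// subgroup-wide scan total is bounded by SPN_SUBGROUP_TTRKS per 16-bit
// field.  For example, assuming 8 rows and a 32-wide subgroup, the
// bound is 256, which fits comfortably in either field.
//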
#if (SPN_SUBGROUP_TTRKS >= (1 << SPN_TTRK_COUNT_BITS_TTPK_COUNT))
#error "Error: Too many TTRK keyvals in subgroup. Reduce row count."
#endif
#if (SPN_SUBGROUP_TTRKS >= (1 << SPN_TTRK_COUNT_BITS_BLOCK_COUNT))
#error "Error: Too many TTRK keyvals in subgroup. Reduce row count."
#endif
//
// Update curr TTRK.NEW_Y/NEW_X flags
//
// Also emit a `ttrk_count` for later scanning:
//
//   0                          31
//   | TTPK COUNT | BLOCK COUNT |
//   +------------+-------------+
//   |     16     |     16      |
//
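// For example (illustrative values only), a ttrk_count with both fields
// set to one packs as
//
//   SPN_TTRK_COUNT_ONE_TTPK_COUNT | SPN_TTRK_COUNT_ONE_BLOCK_COUNT == 0x00010001
//
// and SPN_TTRK_COUNT_GET_TTPK_COUNT() / SPN_TTRK_COUNT_GET_BLOCK_COUNT()
// each recover 1.
//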
uint32_t
spn_ttrk_update_and_compare(inout u32vec2 curr, const u32vec2 prev)
{
//
// Update curr TTRK.NEW_Y/NEW_X
//
// clang-format off
const uint32_t xor_lo = (curr.x ^ prev.x); // XOR TTRK.X_LO
const bool is_new_x_lo = ((xor_lo & SPN_TTRK_LO_MASK_X) != 0); // XOR.X_LO != 0
uint32_t new_xy = is_new_x_lo ? SPN_TTRK_LO_MASK_NEW_X : 0; // init NEW_XY
const uint32_t xor_hi = (curr.y ^ prev.y); // XOR TTRK.HI
const bool is_new_x_hi = ((xor_hi & SPN_TTRK_HI_MASK_X) != 0); // XOR.X_HI != 0
new_xy = is_new_x_hi ? SPN_TTRK_LO_MASK_NEW_X : new_xy; // update NEW_XY
const bool is_new_y = ((xor_hi & SPN_TTRK_HI_MASK_Y) != 0); // XOR.Y_HI != 0
new_xy = is_new_y ? SPN_TTRK_LO_MASK_NEW_Y : new_xy; // update NEW_XY
const bool is_new_cohort = ((xor_hi & SPN_TTRK_HI_MASK_COHORT) != 0); // XOR.COHORT != 0
new_xy = is_new_cohort ? SPN_TTRK_LO_MASK_NEW_Y : new_xy; // update NEW_XY
curr.x |= new_xy; // update curr.NEW_XY
// clang-format on
//
// Set TTPK count
//
const bool is_new_ttpk = (new_xy == SPN_TTRK_LO_MASK_NEW_X); // new_xy == NEW_X
const uint32_t ttpk_count = is_new_ttpk ? SPN_TTRK_COUNT_ONE_TTPK_COUNT : 0; // set .TTPK_COUNT
//
// Set BLOCK count
//
const uint32_t block_count = SPN_BLOCK_ID_IS_BLOCK(curr.x) ? SPN_TTRK_COUNT_ONE_BLOCK_COUNT : 0;
return (ttpk_count | block_count);
}
//
//
//
void
main()
{
//
// Define ttrks header bufref
//
SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(ttrks_header), ttrks, push.devaddr_ttrks_header);
//
// Get ttrks.count
//
SPN_SUBGROUP_UNIFORM const uint32_t ttrks_count = ttrks.count_dispatch.w;
//
// Every subgroup processes a block of ttrks
//
#if (SPN_SUBGROUPS == 1)
// clang-format off
SPN_SUBGROUP_UNIFORM const uint32_t ttrks_base = (gl_WorkGroupID.x * SPN_SUBGROUP_TTRKS);
#else
SPN_SUBGROUP_UNIFORM const uint32_t ttrks_base = (gl_WorkGroupID.x * SPN_SUBGROUPS + gl_SubgroupID) * SPN_SUBGROUP_TTRKS;
// clang-format on
// Does this subgroup have work?
if (ttrks_base >= ttrks_count)
{
return;
}
#endif
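//
// For example, assuming SPN_SUBGROUPS == 4 and SPN_SUBGROUP_TTRKS == 256,
// workgroup 2 / subgroup 1 starts at ttrks_base == (2 * 4 + 1) * 256 == 2304.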
//
// Keyvals base index
//
const uint32_t ttrks_idx_row0 = ttrks_base + gl_SubgroupInvocationID;
//
// Define ttrk_keyvals bufref
//
u32vec2 ttrks_curr_offset;
umulExtended(ttrks_idx_row0,
8, // sizeof(ttrk)
ttrks_curr_offset.y, // msb
ttrks_curr_offset.x); // lsb
SPN_BUFREF_DEFINE_AT_OFFSET_U32VEC2(SPN_BUFFER_TYPE(ttrk_keyvals),
ttrks_curr,
push.devaddr_ttrk_keyvals,
ttrks_curr_offset);
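//
// The keyval byte offset is the full 64-bit product ttrks_idx_row0 * 8,
// split into (lsb, msb) by umulExtended(), since the byte offset can
// exceed 32 bits.  For example, an index of 0x20000000 yields the byte
// offset 0x100000000, i.e. lsb == 0 and msb == 1.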
//
// Load ttrk keyvals
//
u32vec2 curr[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
const uint32_t idx = ttrks_idx_row0 + ii * SPN_SUBGROUP_SIZE;
if (idx < ttrks_count)
{
curr[ii] = ttrks_curr.extent[ii * SPN_SUBGROUP_SIZE];
}
else
{
curr[ii] = u32vec2(0, SPN_TTRK_HI_MASK_COHORT); // invalid cohort id
}
}
//
// Get prev keyval
//
u32vec2 prev[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
prev[ii] = subgroupShuffleUp(curr[ii], 1);
}
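//
// subgroupShuffleUp() does not define lane 0's result, and in this
// striped layout the keyval preceding row ii's lane 0 is the last lane
// of row ii-1.  For example, assuming a 4-wide subgroup:
//
//   row 0: k0 k1 k2 k3
//   row 1: k4 k5 k6 k7   -> prev of k4 is k3 (row 0, last lane)
//
// Those slots are patched below.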
//
// Fix lane 0 prev[] for rows [1,ROWS-1]
//
const bool is_lane0 = (gl_SubgroupInvocationID == 0);
[[unroll]] for (uint32_t ii = 1, jj = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++, jj++)
{
const u32vec2 last = subgroupBroadcast(curr[jj], SPN_SUBGROUP_SIZE - 1);
if (is_lane0)
{
prev[ii] = last;
}
}
//
// Fix lane 0 prev[] for row 0
//
if (is_lane0)
{
if (ttrks_idx_row0 > 0)
{
//
// If this is the first key in any block other than the first,
// then load the last key in the previous block.
//
// NOTE: This keyval may have already had its NEW_X/NEW_Y
// bits updated by another subgroup.
//
u32vec2 ttrks_prev_offset;
umulExtended(ttrks_idx_row0 - 1,
8, // sizeof(ttrk)
ttrks_prev_offset.y, // msb
ttrks_prev_offset.x); // lsb
SPN_BUFREF_DEFINE_AT_OFFSET_U32VEC2(SPN_BUFFER_TYPE(ttrk_keyvals),
ttrks_prev,
push.devaddr_ttrk_keyvals,
ttrks_prev_offset);
prev[0] = ttrks_prev.extent[0];
}
else
{
//
// This is the first block and first lane so we want to
// force recording of a new y in order to clear the prefix
// accumulator.
//
prev[0].x = curr[0].x;
prev[0].y = curr[0].y ^ SPN_TTRK_HI_MASK_Y;
}
}
//
// Update curr ttrks and get counts
//
uint32_t ttrk_count[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
ttrk_count[ii] = spn_ttrk_update_and_compare(curr[ii], prev[ii]);
}
//
// Store updated ttrk keyvals back to extent
//
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
const uint32_t idx = ttrks_idx_row0 + ii * SPN_SUBGROUP_SIZE;
if (idx < ttrks_count)
{
ttrks_curr.extent[ii * SPN_SUBGROUP_SIZE] = curr[ii];
}
}
//
// Exclusive add all TTPK and BLOCK counts
//
uint32_t ttrk_count_exc[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
uint32_t prev_last = 0;
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
ttrk_count_exc[ii] = prev_last + subgroupInclusiveAdd(ttrk_count[ii]);
prev_last = subgroupBroadcast(ttrk_count_exc[ii], SPN_SUBGROUP_SIZE - 1);
ttrk_count_exc[ii] = ttrk_count_exc[ii] - ttrk_count[ii];
}
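//
// For example, assuming a 4-wide subgroup, two rows, and treating each
// packed count word as a scalar: per-lane counts {1,0,1,0} and
// {0,1,1,1} scan to exclusive sums {0,1,1,2} and {2,2,3,4}, with
// prev_last carrying 2 and then 5 across the rows.  The #error guards
// above guarantee neither 16-bit field can carry into the other.
//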
//////////////////////////////////////////////////////////////////////
//
// At this point every row has:
//
// * curr ttrk
// * prev ttrk
// * exclusive prefix-sum of block and ttpk counts
//
// An invalid cohort id identifies an invalid keyval.
//
//////////////////////////////////////////////////////////////////////
//
// Atomically accumulate meta data for the raster cohort:
//
// * meta.rk_off
// * meta.blocks
// * meta.ttpks
// * meta.ttrks
//
// Note that the explicit "+" signs in the atomicAdd() difference
// operations are for clarity.
//
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
const uint32_t idx = ttrks_idx_row0 + ii * SPN_SUBGROUP_SIZE;
if (SPN_TTRK_IS_NEW_COHORT(curr[ii], prev[ii]))
{
const uint32_t curr_cohort_id = SPN_TTRK_GET_COHORT(curr[ii]);
const uint32_t prev_cohort_id = SPN_TTRK_GET_COHORT(prev[ii]);
// meta.rk_off
ttrks.meta.rk_off[curr_cohort_id] = idx;
// meta.ttrks
atomicAdd(ttrks.meta.ttrks[curr_cohort_id], -idx);
atomicAdd(ttrks.meta.ttrks[prev_cohort_id], +idx);
// meta.blocks
const uint32_t block_count = SPN_TTRK_COUNT_GET_BLOCK_COUNT(ttrk_count_exc[ii]);
atomicAdd(ttrks.meta.blocks[curr_cohort_id], -block_count);
atomicAdd(ttrks.meta.blocks[prev_cohort_id], +block_count);
// meta.ttpks
const uint32_t ttpk_count = SPN_TTRK_COUNT_GET_TTPK_COUNT(ttrk_count_exc[ii]);
atomicAdd(ttrks.meta.ttpks[curr_cohort_id], -ttpk_count);
atomicAdd(ttrks.meta.ttpks[prev_cohort_id], +ttpk_count);
}
}
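//
// For example, if cohort A occupies keyvals [0,100) and cohort B begins
// at idx 100, the lane detecting the A->B transition performs
// meta.ttrks[B] -= 100 and meta.ttrks[A] += 100, while meta.rk_off[B]
// records the starting idx 100.  A later transition out of B at idx 250
// then adds +250 to meta.ttrks[B], leaving meta.ttrks[B] == 150, the
// number of B keyvals.  meta.blocks and meta.ttpks accumulate the same
// way using the exclusive prefix sums.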
//
// Finalize by accumulating the contributions of the very last
// keyval in the block.
//
// Note that `prev_last` contains the exclusive prefix-sum of the
// row *following* the last row in the block.
//
if (gl_SubgroupInvocationID == SPN_SUBGROUP_SIZE - 1)
{
const uint32_t curr_cohort_id = SPN_TTRK_GET_COHORT(curr[SPN_TTRK_ROW_LAST]);
//
// If the number of keyvals is a multiple of SPN_SUBGROUP_TTRKS
// then the final keyval has to contribute its position to the
// ttrk count.
//
if ((ttrks_idx_row0 + SPN_TTRK_ROW_LAST * SPN_SUBGROUP_SIZE + 1) == ttrks_count)
{
atomicAdd(ttrks.meta.ttrks[curr_cohort_id], +ttrks_count);
}
// meta.blocks
const uint32_t block_count = SPN_TTRK_COUNT_GET_BLOCK_COUNT(prev_last);
atomicAdd(ttrks.meta.blocks[curr_cohort_id], +block_count);
// meta.ttpks
const uint32_t ttpk_count = SPN_TTRK_COUNT_GET_TTPK_COUNT(prev_last);
atomicAdd(ttrks.meta.ttpks[curr_cohort_id], +ttpk_count);
}
}
//
//
//