src/graphics/lib/compute/spinel/core.h - fuchsia - Git at Google

 // Copyright 2019 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef SRC_GRAPHICS_LIB_COMPUTE_SPINEL_CORE_H_
 #define SRC_GRAPHICS_LIB_COMPUTE_SPINEL_CORE_H_

 //
 // clang-format off
 //

 // Expands to no tokens.
 #define SPN_EMPTY
 // All-ones 32-bit value; the "invalid" id sentinels in this header alias it.
 #define SPN_UINT_MAX                            0xFFFFFFFF

 //
 // TILE SIZE
 //
 // Width is a power-of-2 of height
 //

 // Tile dimensions computed as powers of 2 from SPN_DEVICE_TILE_WIDTH_LOG2 and
 // SPN_DEVICE_TILE_HEIGHT_LOG2.  Those two macros are expected to be supplied
 // externally and must be visible wherever these are expanded.
 #define SPN_TILE_WIDTH                          (1<<SPN_DEVICE_TILE_WIDTH_LOG2)
 #define SPN_TILE_HEIGHT                         (1<<SPN_DEVICE_TILE_HEIGHT_LOG2)
 // Mask of the low SPN_DEVICE_TILE_HEIGHT_LOG2 bits (height is a power of 2).
 #define SPN_TILE_HEIGHT_MASK                    (SPN_TILE_HEIGHT - 1)

 //
 // TAGGED BLOCK ID
 //
 //   0     5                    31
 //   | TAG |       BLOCK ID      |
 //   |     | SUBBLOCK |   BLOCK  |
 //   +-----+----------+----------+
 //   |  5  |    SUB   | 27 - SUB |
 //
 // BLOCK ID
 //
 //   0                    27    31
 //   |       BLOCK ID      |     |
 //   | SUBBLOCK |   BLOCK  | N/A |
 //   +----------+----------+-----+
 //   |    SUB   | 27 - SUB |  5  |
 //
 //
 // There are 27 bits of subblocks and 5 bits of tag.
 //
 // The block pool vends block ids.
 //
 // There are (2^S) subblocks in a block.
 //
 // There are at least 2 subblocks per block.
 //

 // Bit widths of the two fields of a 32-bit tagged block id.
 #define SPN_TAGGED_BLOCK_ID_BITS_ID             27 // this size is cast in stone
 #define SPN_TAGGED_BLOCK_ID_BITS_TAG            5  // which leaves 5 bits of tag

 // All-ones sentinel for an invalid tagged block id, and the mask that selects
 // the tag field (the low 5 bits, i.e. 0x1F).
 #define SPN_TAGGED_BLOCK_ID_INVALID             SPN_UINT_MAX
 #define SPN_TAGGED_BLOCK_ID_MASK_TAG            SPN_BITS_TO_MASK(SPN_TAGGED_BLOCK_ID_BITS_TAG)

 // Field accessors: the tag is the low 5 bits and the id is the 27 bits that
 // start at bit offset SPN_TAGGED_BLOCK_ID_BITS_TAG.
 #define SPN_TAGGED_BLOCK_ID_GET_TAG(tbid_)      ((tbid_) & SPN_TAGGED_BLOCK_ID_MASK_TAG)
 #define SPN_TAGGED_BLOCK_ID_GET_ID(tbid_)       SPN_BITFIELD_EXTRACT(tbid_,SPN_TAGGED_BLOCK_ID_BITS_TAG,SPN_TAGGED_BLOCK_ID_BITS_ID)

 // Largest value that fits in the 27-bit id field, and the all-ones sentinel
 // for an invalid (untagged) block id.
 #define SPN_BLOCK_ID_MAX                        SPN_BITS_TO_MASK(SPN_TAGGED_BLOCK_ID_BITS_ID)
 #define SPN_BLOCK_ID_INVALID                    SPN_UINT_MAX

 // Values stored in the 5-bit tag field of a tagged block id.  Tags 0-4 name
 // the path primitive types; the same values are added to
 // SPN_PATH_HEAD_OFFSET_PRIMS to locate each primitive's counter in the path
 // header.
 #define SPN_BLOCK_ID_TAG_PATH_LINE              0  // 0 -- 4  segments
 #define SPN_BLOCK_ID_TAG_PATH_QUAD              1  // 1 -- 6  segments
 #define SPN_BLOCK_ID_TAG_PATH_CUBIC             2  // 2 -- 8  segments
 #define SPN_BLOCK_ID_TAG_PATH_RAT_QUAD          3  // 3 -- 7  segments : 6 + w1      -- w0 = w2 = 1
 #define SPN_BLOCK_ID_TAG_PATH_RAT_CUBIC         4  // 4 -- 10 segments : 8 + w1 + w2 -- w0 = w3 = 1
 #define SPN_BLOCK_ID_TAG_PATH_RESERVED_5        5
 #define SPN_BLOCK_ID_TAG_PATH_RESERVED_6        6
 #define SPN_BLOCK_ID_TAG_PATH_RESERVED_7        7
 // ...
 // tags 8-29 are available
 // ...
 #define SPN_BLOCK_ID_TAG_PATH_COUNT             5  // how many path types?  can share same value with PATH_NEXT
 // The two highest tag values are reserved: 30 for PATH_NEXT and 31 (all tag
 // bits set) for INVALID.
 #define SPN_BLOCK_ID_TAG_PATH_NEXT              (SPN_TAGGED_BLOCK_ID_MASK_TAG - 1) // 30 : 0x1E
 #define SPN_BLOCK_ID_TAG_INVALID                SPN_TAGGED_BLOCK_ID_MASK_TAG       // 31 : 0x1F

 //
 // BLOCK POOL
 //

 // Block and subblock sizes in dwords, derived from the
 // SPN_DEVICE_BLOCK_POOL_*_DWORDS_LOG2 values (expected to be supplied
 // externally).
 #define SPN_BLOCK_POOL_BLOCK_DWORDS             (1<<SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCK_DWORDS          (1<<SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2)

 // Masks covering a dword index within a block / within a subblock.
 #define SPN_BLOCK_POOL_BLOCK_DWORDS_MASK        SPN_BITS_TO_MASK(SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCK_DWORDS_MASK     SPN_BITS_TO_MASK(SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2)

 // Number of subblocks per block: log2, count and index mask.
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2 (SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2 - SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK      (1<<SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_MASK SPN_BITS_TO_MASK(SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2)

 // Block size in qwords: half the dword count, hence LOG2 - 1.
 #define SPN_BLOCK_POOL_BLOCK_QWORDS_LOG2        (SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2-1)
 #define SPN_BLOCK_POOL_BLOCK_QWORDS             (1<<SPN_BLOCK_POOL_BLOCK_QWORDS_LOG2)
 #define SPN_BLOCK_POOL_BLOCK_QWORDS_MASK        SPN_BITS_TO_MASK(SPN_BLOCK_POOL_BLOCK_QWORDS_LOG2)

 // Subblock size in qwords: half the dword count, hence LOG2 - 1.
 #define SPN_BLOCK_POOL_SUBBLOCK_QWORDS_LOG2     (SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2-1)
 #define SPN_BLOCK_POOL_SUBBLOCK_QWORDS          (1<<SPN_BLOCK_POOL_SUBBLOCK_QWORDS_LOG2)

 // Subblock size in owords: a quarter of the dword count, hence LOG2 - 2.
 #define SPN_BLOCK_POOL_SUBBLOCK_OWORDS_LOG2     (SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2-2)
 #define SPN_BLOCK_POOL_SUBBLOCK_OWORDS          (1<<SPN_BLOCK_POOL_SUBBLOCK_OWORDS_LOG2)

 // NOTE(review): presumably the indices of the block pool's "reads" and
 // "writes" atomic counters -- confirm against the code that uses them.
 #define SPN_BLOCK_POOL_ATOMICS_READS            0
 #define SPN_BLOCK_POOL_ATOMICS_WRITES           1

 //
 //
 //

 //
 // Subgroup-relative block pool geometry.
 //
 //   SUBBLOCKS_PER_SUBGROUP(size)      : subgroup size (in dwords/lanes)
 //                                       divided by the subblock dword count
 //   SUBGROUPS_PER_BLOCK_LOG2(log2)    : block dwords log2 minus subgroup log2
 //   SUBGROUPS_PER_BLOCK(log2)         : 1 << SUBGROUPS_PER_BLOCK_LOG2
 //   SUBGROUPS_PER_BLOCK_MASK(log2)    : mask of SUBGROUPS_PER_BLOCK_LOG2 bits
 //
 // Macro arguments are parenthesized so that expression arguments such as
 // `a + b` expand with the intended precedence.
 //
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_SUBGROUP(subgroup_size_)           \
   ((subgroup_size_) / SPN_BLOCK_POOL_SUBBLOCK_DWORDS)

 #define SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_LOG2(subgroup_size_log2_)    \
   (SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2 - (subgroup_size_log2_))

 #define SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK(subgroup_size_log2_)         \
   (1 << SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_LOG2(subgroup_size_log2_))

 #define SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_MASK(subgroup_size_log2_)    \
   SPN_BITS_TO_MASK(SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_LOG2(subgroup_size_log2_))

 //
 //
 //

 // True when none of the subblock-index bits of bid_ are set, i.e. the id is
 // aligned to the start of a block rather than to an interior subblock.
 #define SPN_BLOCK_ID_IS_BLOCK(bid_)             (((bid_) & SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_MASK) == 0)

 //
 // PATH HEAD
 //
 //   struct spinel_path_header
 //   {
 //     uint32_t   handle;     // host handle
 //     uint32_t   blocks;     // total number of blocks in entire path object -- includes nodes and segments
 //     uint32_t   nodes;      // number of trailing path node blocks -- not including head
 //
 //     struct {
 //       uint32_t lines;      // count of segments
 //       uint32_t quads;      // count of segments
 //       uint32_t cubics;     // count of segments
 //       uint32_t rat_quads;  // count of segments
 //       uint32_t rat_cubics; // count of segments
 //     } prims;
 //
 //     struct {
 //       float    x0;
 //       float    y0;
 //       float    x1;
 //       float    y1;
 //     } bounds;              // float4: bounds
 //   };
 //

 // Header size: 3 scalar dwords + 5 primitive counters + 4 bounds floats = 12
 // dwords; POW2_RU is that size rounded up to the next power of 2.
 #define SPN_PATH_HEAD_DWORDS                    12
 #define SPN_PATH_HEAD_QWORDS                    (SPN_PATH_HEAD_DWORDS / 2)
 #define SPN_PATH_HEAD_DWORDS_POW2_RU            16

 // Dword offsets of the leading scalar fields and of the start of the `prims`
 // counter array.
 #define SPN_PATH_HEAD_OFFSET_HANDLE             0
 #define SPN_PATH_HEAD_OFFSET_BLOCKS             1
 #define SPN_PATH_HEAD_OFFSET_NODES              2
 #define SPN_PATH_HEAD_OFFSET_PRIMS              3

 // Dword offset of each primitive counter: the `prims` base plus the
 // primitive's block id tag value (0..4).
 #define SPN_PATH_HEAD_OFFSET_LINES              (SPN_PATH_HEAD_OFFSET_PRIMS + SPN_BLOCK_ID_TAG_PATH_LINE)
 #define SPN_PATH_HEAD_OFFSET_QUADS              (SPN_PATH_HEAD_OFFSET_PRIMS + SPN_BLOCK_ID_TAG_PATH_QUAD)
 #define SPN_PATH_HEAD_OFFSET_CUBICS             (SPN_PATH_HEAD_OFFSET_PRIMS + SPN_BLOCK_ID_TAG_PATH_CUBIC)
 #define SPN_PATH_HEAD_OFFSET_RAT_QUADS          (SPN_PATH_HEAD_OFFSET_PRIMS + SPN_BLOCK_ID_TAG_PATH_RAT_QUAD)
 #define SPN_PATH_HEAD_OFFSET_RAT_CUBICS         (SPN_PATH_HEAD_OFFSET_PRIMS + SPN_BLOCK_ID_TAG_PATH_RAT_CUBIC)

 // Dword offset of the float4 bounds, directly after the 5 primitive counters.
 #define SPN_PATH_HEAD_OFFSET_BOUNDS             8
 //
 // PATH HEAD COMPILE-TIME PREDICATES
 //

 //
 // Predicates over "row" i_ of a subgroup-wide load, where a row spans
 // sgsz_ consecutive dwords starting at dword (i_ * sgsz_).
 //
 //   ELEM_GTE(sgsz_,x_,i_)        : x_ >= i_ * sgsz_
 //   ELEM_IN_RANGE(sgsz_,x_,i_)   : i_ * sgsz_ <= x_ < (i_+1) * sgsz_
 //   ENTIRELY_HEADER(sgsz_,i_)    : row i_ lies completely inside the header
 //   PARTIALLY_HEADER(sgsz_,i_)   : the header ends inside row i_
 //   IS_HEADER(sgsz_,i_)          : this invocation's dword in row i_ is a
 //                                  header dword; reads the GLSL built-in
 //                                  gl_SubgroupInvocationID
 //
 // Every argument is parenthesized so that expression arguments expand with
 // the intended precedence.
 //
 #define SPN_PATH_HEAD_ELEM_GTE(sgsz_,x_,i_)                     \
   ((x_) >= (i_) * (sgsz_))

 #define SPN_PATH_HEAD_ELEM_IN_RANGE(sgsz_,x_,i_)                \
   (SPN_PATH_HEAD_ELEM_GTE(sgsz_,x_,i_) && !SPN_PATH_HEAD_ELEM_GTE(sgsz_,x_,(i_)+1))

 #define SPN_PATH_HEAD_ENTIRELY_HEADER(sgsz_,i_)                 \
   SPN_PATH_HEAD_ELEM_GTE(sgsz_,SPN_PATH_HEAD_DWORDS,(i_)+1)

 #define SPN_PATH_HEAD_PARTIALLY_HEADER(sgsz_,i_)                \
   SPN_PATH_HEAD_ELEM_IN_RANGE(sgsz_,SPN_PATH_HEAD_DWORDS,i_)

 #define SPN_PATH_HEAD_IS_HEADER(sgsz_,i_)                       \
   (gl_SubgroupInvocationID + (i_) * (sgsz_) < SPN_PATH_HEAD_DWORDS)

 //
 // RASTERIZATION TYPES
 //
 // Note that the projective rasterization types precede the integral and
 // rational path primitives in order to exploit a coalesced uvec4[2] path header
 // load.
 //

 // Rasterization type ids.  The three projective types come first, so the
 // non-projective ids (3..7) equal SPN_PATH_HEAD_OFFSET_PRIMS plus the
 // matching block id tag.
 #define SPN_RAST_TYPE_PROJ_LINE                 0 // Lines and integral beziers with
 #define SPN_RAST_TYPE_PROJ_QUAD                 1 // projective transforms applied
 #define SPN_RAST_TYPE_PROJ_CUBIC                2 // are rationals without weights.
 #define SPN_RAST_TYPE_LINE                      3
 #define SPN_RAST_TYPE_QUAD                      4
 #define SPN_RAST_TYPE_CUBIC                     5
 #define SPN_RAST_TYPE_RAT_QUAD                  6
 #define SPN_RAST_TYPE_RAT_CUBIC                 7

 // Total number of rasterization types defined above.
 #define SPN_RAST_TYPE_COUNT                     8

 //
 // FILL COMMANDS
 //
 //
 // A fill command is expanded into one or more rasterize commands.
 //
 // The rasterize command points to a specific dword of a block.
 //
 // For GLSL we will use a uvec4 laid out as follows:
 //
 //  union {
 //
 //    uvec4 u32v4;
 //
 //    struct spinel_cmd_fill {
 //      uint32_t path_h;               // host id
 //      uint32_t na              : 16; // unused
 //      uint32_t cohort          : 15; // cohort is 8-11 bits
 //      uint32_t transform_type  : 1;  // transform type: 0=affine,1=projective
 //      uint32_t transform;            // transform index
 //      uint32_t clip;                 // clip index
 //    } fill;
 //
 //    struct spinel_cmd_rast {
 //      uint32_t node_id;              // device block id
 //      uint32_t node_dword      : 16; // block dword offset
 //      uint32_t cohort          : 15; // cohort is 8-11 bits
 //      uint32_t transform_type  : 1;  // transform type: 0=affine,1=projective
 //      uint32_t transform;            // transform index
 //      uint32_t clip;                 // clip index
 //    } rast;
 //
 //  };
 //
 // NOTE(allanmac): We can pack the transform and clip indices down to a
 // more practical 16 bits in case we want to add additional
 // rasterization command indices or flags.
 //

 // Values of the 1-bit transform_type field of a fill/rasterize command.
 #define SPN_CMD_FILL_TRANSFORM_TYPE_AFFINE                 0
 #define SPN_CMD_FILL_TRANSFORM_TYPE_PROJECTIVE             1

 // Field accessors for a fill command `c_`, indexed as four 32-bit words:
 //   [0] path handle, [1] bits 16..30 cohort and bit 31 transform type,
 //   [2] transform index, [3] clip index.
 // `c_` is parenthesized before indexing so that pointer or other expression
 // arguments expand correctly.
 #define SPN_CMD_FILL_GET_PATH_H(c_)                        (c_)[0]
 #define SPN_CMD_FILL_GET_COHORT(c_)                        SPN_BITFIELD_EXTRACT((c_)[1],16,15)
 #define SPN_CMD_FILL_GET_TRANSFORM_TYPE(c_)                SPN_BITFIELD_EXTRACT((c_)[1],31,1)
 #define SPN_CMD_FILL_GET_TRANSFORM(c_)                     (c_)[2]
 #define SPN_CMD_FILL_GET_CLIP(c_)                          (c_)[3]

 // Transform type tests: bit 31 of word [1] is clear for affine and set for
 // projective.
 #define SPN_CMD_FILL_IS_TRANSFORM_TYPE_AFFINE(c_)          (((c_)[1] & SPN_BITS_TO_MASK_AT(31,1)) == 0)
 #define SPN_CMD_FILL_IS_TRANSFORM_TYPE_PROJECTIVE(c_)      (((c_)[1] & SPN_BITS_TO_MASK_AT(31,1)) != 0)

 //
 //
 //

 // A rasterize command shares the cohort, transform type, transform and clip
 // fields with the fill command, so these accessors forward to the fill ones.
 #define SPN_CMD_RASTERIZE_GET_COHORT(c_)                   SPN_CMD_FILL_GET_COHORT(c_)
 #define SPN_CMD_RASTERIZE_GET_TRANSFORM_TYPE(c_)           SPN_CMD_FILL_GET_TRANSFORM_TYPE(c_)
 #define SPN_CMD_RASTERIZE_GET_TRANSFORM(c_)                SPN_CMD_FILL_GET_TRANSFORM(c_)
 #define SPN_CMD_RASTERIZE_GET_CLIP(c_)                     SPN_CMD_FILL_GET_CLIP(c_)

 #define SPN_CMD_RASTERIZE_IS_TRANSFORM_TYPE_AFFINE(c_)     SPN_CMD_FILL_IS_TRANSFORM_TYPE_AFFINE(c_)
 #define SPN_CMD_RASTERIZE_IS_TRANSFORM_TYPE_PROJECTIVE(c_) SPN_CMD_FILL_IS_TRANSFORM_TYPE_PROJECTIVE(c_)

 // Rasterize-only fields: word [0] is the node block id and the low 16 bits of
 // word [1] are the dword offset within that block.  `c_` is parenthesized
 // before indexing so that pointer or other expression arguments work.
 #define SPN_CMD_RASTERIZE_GET_NODE_ID(c_)                  (c_)[0]
 #define SPN_CMD_RASTERIZE_GET_NODE_DWORD(c_)               SPN_BITFIELD_EXTRACT((c_)[1],0,16)

 // Setters expand to an assignment; SET_NODE_DWORD rewrites only the low 16
 // bits of word [1] and keeps the cohort/transform-type bits.
 #define SPN_CMD_RASTERIZE_SET_NODE_ID(c_,n_id_)            (c_)[0] = (n_id_)
 #define SPN_CMD_RASTERIZE_SET_NODE_DWORD(c_,n_lo_)         (c_)[1] = SPN_BITFIELD_INSERT((c_)[1],n_lo_,0,16)

 //
 // Spinel supports a projective transformation matrix with the
 // requirement that w2 is implicitly 1.0.
 //
 //   A---------B----+
 //   | sx  shx | tx |
 //   | shy sy  | ty |
 //   C---------D----+
 //   | w0  w1  | 1  |
 //   +---------+----+
 //
 // The transformation matrix can be initialized with the array:
 //
 //   { sx shx shy sy tx ty w0 w1 }
 //
 // struct spinel_transform
 // {
 //   SPN_TYPE_MAT2X2 a; //  { { sx shx } {shy sy } } -- rotate
 //   SPN_TYPE_VEC2   b; //  { tx ty }                -- translate
 //   SPN_TYPE_VEC2   c; //  { w0 w1 }                -- project
 // };
 //
 // struct spinel_transform_lo
 // {
 //   SPN_TYPE_MAT2X2 a; //  { { sx shx } {shy sy } } -- rotate
 // };
 //
 // struct spinel_transform_hi
 // {
 //   SPN_TYPE_VEC2   b; //  { tx ty }                -- translate
 //   SPN_TYPE_VEC2   c; //  { w0 w1 }                -- project
 // };
 //
 //
 // Note that the raster builder is storing the transform as two
 // float[4] quads.
 //
 // The rasterization shaders then load these vec4 quads as mat2
 // matrices.
 //

 // Component indices within the "lo" float[4] quad: the 2x2 matrix
 // { sx shx shy sy }.
 #define SPN_TRANSFORM_LO_INDEX_SX                 0
 #define SPN_TRANSFORM_LO_INDEX_SHX                1
 #define SPN_TRANSFORM_LO_INDEX_SHY                2
 #define SPN_TRANSFORM_LO_INDEX_SY                 3

 // Component indices within the "hi" float[4] quad: the translation { tx ty }
 // followed by the projective terms { w0 w1 }.
 #define SPN_TRANSFORM_HI_INDEX_TX                 0
 #define SPN_TRANSFORM_HI_INDEX_TY                 1
 #define SPN_TRANSFORM_HI_INDEX_W0                 2
 #define SPN_TRANSFORM_HI_INDEX_W1                 3

 //
 // PATHS COPY COMMANDS
 //
 // The PATH COPY command is simply a 32-bit tagged block id with a
 // host-controlled rolling counter stuffed into the id field.
 //

 // Command types, carried in the tag field of the 32-bit copy command.
 #define SPN_PATHS_COPY_CMD_TYPE_SEGS              0
 #define SPN_PATHS_COPY_CMD_TYPE_NODE              1
 #define SPN_PATHS_COPY_CMD_TYPE_HEAD              2

 // Reads the command type by extracting the tagged block id's tag bits.
 #define SPN_PATHS_COPY_CMD_GET_TYPE(cmd)          SPN_TAGGED_BLOCK_ID_GET_TAG(cmd)

 //
 // RASTER HEAD
 //
 // The raster header and nodes use a strided layout so that the block is
 // split in two with the low dword of the 64-bit keys stored in the
 // first half of the block and the high dword in the second half.
 //
 // Note: a simple 32-bit .pkidx implies a 16 GB limit to the block pool.
 //
 // Note: we could interpret the 32-bit .pkidx as the low bits indexing
 // the dwords in the low half of the block and the high bits indexing
 // qwords.  This will index a 32 GB block pool.
 //
 //   raster head block
 //   {
 //     struct spinel_raster_header.lo
 //     {
 //       uint32_t nodes;   // # of nodes  -- not including header
 //       uint32_t ttsks;   // # of ttsks
 //       uint32_t ttpks;   // # of ttpks
 //       uint32_t pkidx;   // block pool dword of first ttpk.lo
 //       uint32_t blocks;  // # of blocks -- head+node+skb+pkb
 //
 //       ... TTXK.lo ...
 //     };
 //
 //     struct spinel_raster_header.hi
 //     {
 //       int32_t  x0;      // axis-aligned bounding box
 //       int32_t  x1;      // axis-aligned bounding box
 //       int32_t  y0;      // axis-aligned bounding box
 //       int32_t  y1;      // axis-aligned bounding box
 //       uint32_t na0;     // reserved
 //
 //       ... TTXK.hi ...
 //     };
 //   }
 //
 // Usage:
 //
 //   - RASTERS_RECLAIM: this shader only needs to load the low dwords of
 //     each block because only the block and node counts and the TTXB id
 //     of each key are required.
 //
 //   - RASTERS_PREFIX: this shader needs to vector load the values
 //     calculated by RASTERS_ALLOC and write them back to the block.
 //
 //   - PLACE_TT*K: these shaders need to efficiently load the raster
 //     header.
 //

 // A raster node holds one block's worth of 64-bit keys.
 #define SPN_RASTER_NODE_QWORDS                   SPN_BLOCK_POOL_BLOCK_QWORDS

 // Header size: 5 "lo" dwords plus 5 "hi" dwords, i.e. 5 qwords.
 #define SPN_RASTER_HEAD_DWORDS                   10
 #define SPN_RASTER_HEAD_QWORDS                   (SPN_RASTER_HEAD_DWORDS / 2)

 // Dword offsets of the fields of spinel_raster_header.lo.
 #define SPN_RASTER_HEAD_LO_OFFSET_NODES          0
 #define SPN_RASTER_HEAD_LO_OFFSET_TTSKS          1
 #define SPN_RASTER_HEAD_LO_OFFSET_TTPKS          2
 #define SPN_RASTER_HEAD_LO_OFFSET_PKIDX          3
 #define SPN_RASTER_HEAD_LO_OFFSET_BLOCKS         4

 // Dword offsets of the fields of spinel_raster_header.hi.
 #define SPN_RASTER_HEAD_HI_OFFSET_X0             0
 #define SPN_RASTER_HEAD_HI_OFFSET_X1             1
 #define SPN_RASTER_HEAD_HI_OFFSET_Y0             2
 #define SPN_RASTER_HEAD_HI_OFFSET_Y1             3
 #define SPN_RASTER_HEAD_HI_OFFSET_NA0            4

 //
 // RASTER HEAD COMPILE-TIME PREDICATES
 //

 //
 // Predicates over "row" i_ of a subgroup-wide load, where a row spans
 // sgsz_ consecutive qwords starting at qword (i_ * sgsz_).
 //
 //   ELEM_GTE(sgsz_,x_,i_)        : x_ >= i_ * sgsz_
 //   ELEM_IN_RANGE(sgsz_,x_,i_)   : i_ * sgsz_ <= x_ < (i_+1) * sgsz_
 //   ENTIRELY_HEADER(sgsz_,i_)    : row i_ lies completely inside the header
 //   PARTIALLY_HEADER(sgsz_,i_)   : the header ends inside row i_
 //   IS_HEADER(sgsz_,i_)          : this invocation's qword in row i_ is a
 //                                  header qword; reads the GLSL built-in
 //                                  gl_SubgroupInvocationID
 //
 // Every argument is parenthesized so that expression arguments expand with
 // the intended precedence.
 //
 #define SPN_RASTER_HEAD_ELEM_GTE(sgsz_,x_,i_)      \
   ((x_) >= (i_) * (sgsz_))

 #define SPN_RASTER_HEAD_ELEM_IN_RANGE(sgsz_,x_,i_) \
   (SPN_RASTER_HEAD_ELEM_GTE(sgsz_,x_,i_) &&        \
    !SPN_RASTER_HEAD_ELEM_GTE(sgsz_,x_,(i_)+1))

 #define SPN_RASTER_HEAD_ENTIRELY_HEADER(sgsz_,i_)                 \
   SPN_RASTER_HEAD_ELEM_GTE(sgsz_,SPN_RASTER_HEAD_QWORDS,(i_)+1)

 #define SPN_RASTER_HEAD_PARTIALLY_HEADER(sgsz_,i_)                \
   SPN_RASTER_HEAD_ELEM_IN_RANGE(sgsz_,SPN_RASTER_HEAD_QWORDS,i_)

 #define SPN_RASTER_HEAD_IS_HEADER(sgsz_,i_)                       \
   (gl_SubgroupInvocationID + (i_) * (sgsz_) < SPN_RASTER_HEAD_QWORDS)

 //
 // Hard requirements:
 //
 //   - A TTXB "block pool" extent that is at least 1GB.
 //
 //   - A virtual surface of at least 8K x 8K
 //
 //   - A very large physical surface because it's advantageous to tile the
 //     physical surface since it's likely to shrink the post-place TTCK sorting
 //     step.
 //
 //                             TTXB BITS
 //    EXTENT     +------------------------------------+
 //   SIZE (MB)   |  22    23    24    25    26    27  |
 //          +----+------------------------------------+
 //    TTXB  |  8 |  128   256   512  1024  2048  4096 |
 //   DWORDS | 16 |  256   512  1024  2048  4096  8192 |
 //          +----+------------------------------------+
 //
 //
 //                                     X/Y BITS
 //   SURFACE DIM +------------------------------------------------------+
 //               |   5     6     7     8*    9*   10    11    12    13  |
 //          +----+------------------------------------------------------+
 //    TILE  |  2 |  128   256   512  1024  2048  4096  8192 16384 32768 |
 //    AXIS  |  3 |  256   512  1024  2048  4096  8192 16384 32768 65536 |
 //    LOG2  |  4 |  512  1024  2048  4096  8192 16384 32768 65536  128K |
 //          +----+------------------------------------------------------+
 //   TILES^2     | 1024  4096 16384 65536  256K    1M    4M   16M   64M |
 //               +------------------------------------------------------+
 //
 // The following values should be pretty future-proof across all GPUs:
 //
 //   - The minimum addressable subblock size is 16 dwords (64 bytes)
 //     to ensure there is enough space for a path or raster header and
 //     its payload.
 //
 //   - Blocks are power-of-2 multiples of subblocks. Larger blocks can
 //     reduce allocation activity (fewer atomic adds).
 //
 //   - 27 bits of TTXB_ID space implies a max of 4GB-32GB of
 //     rasterized paths depending on the size of the TTXB block.
 //     This could enable interesting use cases.
 //
 //   - A virtual rasterization surface that's from +/-16K to +/-128K
 //     depending on the size of the TTXB block.
 //
 //   - Keys that (optionally) only require a 32-bit high word
 //     comparison.
 //
 //   - Support for a minimum of 256K layers. This can be practically
 //     raised to 1m or 2m layers.
 //

 //
 // The size of the cohort determines the max number of rasters that can
 // be submitted to the GPU in a single dispatch.  We want this number to
 // be as large as possible.  A dispatch of 2048 subgroups is very large
 // but there is potential to push this to 8192 with modifications to the
 // segmenter and possibly an auxiliary extent.
 //
 // The max cohort id is reserved as it indicates an invalid TTRK.
 //
 // Each cohort member launches one subgroup per block of common path
 // geometry.
 //
 // The rasterizer produces TTRK keys:
 //
 // TTRK (64-BIT COMPARE)
 //
 //  0                                                                  63
 //  | TTSB_ID | NEW_Y  | NEW_X  | X_LO | X_HI |   Y  | RASTER COHORT ID |
 //  +---------+--------+--------+------+------+------+------------------+
 //  |    27   | 1 (=0) | 1 (=0) |   3  |   9  |  12  |        11        |
 //
 // After segmentation the cohort id can be ignored as we've gathered
 // enough statistics on the cohort to execute the prefix kernel.
 //

 // TTRK field widths.  "LO"/"HI" name the low/high dword of the 64-bit key;
 // X straddles the dword boundary (3 bits in LO, 9 bits in HI).
 #define SPN_TTRK_LO_BITS_TTSB_ID                 SPN_TAGGED_BLOCK_ID_BITS_ID
 #define SPN_TTRK_LO_HI_BITS_X                    12
 #define SPN_TTRK_LO_BITS_X                       3
 #define SPN_TTRK_HI_BITS_X                       9
 #define SPN_TTRK_HI_BITS_Y                       12
 #define SPN_TTRK_HI_BITS_COHORT                  11

 // Combined XY width over the whole key, and the part in each dword.
 #define SPN_TTRK_BITS_XY                         (SPN_TTRK_LO_HI_BITS_X + SPN_TTRK_HI_BITS_Y)
 #define SPN_TTRK_LO_BITS_XY                      SPN_TTRK_LO_BITS_X
 #define SPN_TTRK_HI_BITS_XY                      (SPN_TTRK_HI_BITS_X + SPN_TTRK_HI_BITS_Y)

 #define SPN_TTRK_BITS_XY_COHORT                  (SPN_TTRK_BITS_XY + SPN_TTRK_HI_BITS_COHORT)

 // Bit offsets of each field within its own dword.
 #define SPN_TTRK_LO_OFFSET_NEW_Y                 SPN_TTRK_LO_BITS_TTSB_ID
 #define SPN_TTRK_LO_OFFSET_NEW_X                 (SPN_TTRK_LO_OFFSET_NEW_Y + 1)
 #define SPN_TTRK_LO_OFFSET_X                     (SPN_TTRK_LO_OFFSET_NEW_X + 1)
 #define SPN_TTRK_HI_OFFSET_Y                     SPN_TTRK_HI_BITS_X
 #define SPN_TTRK_HI_OFFSET_COHORT                (32 - SPN_TTRK_HI_BITS_COHORT)

 // In-place masks for the fields of the low and high dwords.
 #define SPN_TTRK_LO_MASK_NEW_Y                   SPN_BITS_TO_MASK_AT(SPN_TTRK_LO_OFFSET_NEW_Y,1)
 #define SPN_TTRK_LO_MASK_NEW_X                   SPN_BITS_TO_MASK_AT(SPN_TTRK_LO_OFFSET_NEW_X,1)

 #define SPN_TTRK_LO_MASK_X                       SPN_BITS_TO_MASK_AT(SPN_TTRK_LO_OFFSET_X,SPN_TTRK_LO_BITS_X)
 #define SPN_TTRK_HI_MASK_X                       SPN_BITS_TO_MASK(SPN_TTRK_HI_BITS_X)
 #define SPN_TTRK_HI_MASK_Y                       SPN_BITS_TO_MASK_AT(SPN_TTRK_HI_OFFSET_Y,SPN_TTRK_HI_BITS_Y)

 #define SPN_TTRK_HI_MASK_Y_COHORT                SPN_BITS_TO_MASK_AT(SPN_TTRK_HI_OFFSET_Y,SPN_TTRK_HI_BITS_Y + SPN_TTRK_HI_BITS_COHORT)
 #define SPN_TTRK_HI_MASK_COHORT                  SPN_BITS_TO_MASK_AT(SPN_TTRK_HI_OFFSET_COHORT,SPN_TTRK_HI_BITS_COHORT)

 // Accessors.  The LO_/HI_ forms take a single dword; the others take the
 // whole two-component key.  GET_COHORT parenthesizes `t_` before selecting
 // `.y` so that expression arguments expand correctly.
 #define SPN_TTRK_LO_GET_TTSB_ID(t_lo_)           SPN_BITFIELD_EXTRACT(t_lo_,0,SPN_TTRK_LO_BITS_TTSB_ID)
 #define SPN_TTRK_HI_GET_COHORT(t_hi_)            SPN_BITFIELD_EXTRACT(t_hi_,SPN_TTRK_HI_OFFSET_COHORT,SPN_TTRK_HI_BITS_COHORT)
 #define SPN_TTRK_GET_COHORT(t_)                  SPN_TTRK_HI_GET_COHORT((t_).y)

 #define SPN_TTRK_SET_XY(t_,xy_)                  SPN_GLSL_INSERT_UVEC2_UINT(t_,xy_,SPN_TTRK_LO_OFFSET_X,SPN_TTRK_BITS_XY)
 #define SPN_TTRK_SET_COHORT(t_,c_)               (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],c_,SPN_TTRK_HI_OFFSET_COHORT,SPN_TTRK_HI_BITS_COHORT)

 #define SPN_TTRK_IS_NEW_X(t_)                    (SPN_BITFIELD_EXTRACT((t_)[0],SPN_TTRK_LO_OFFSET_NEW_X,1) != 0)
 #define SPN_TTRK_IS_NEW_Y(t_)                    (SPN_BITFIELD_EXTRACT((t_)[0],SPN_TTRK_LO_OFFSET_NEW_Y,1) != 0)

 // True when every cohort bit is set (the max cohort id marks an invalid TTRK).
 // NOTE(review): unlike the other whole-key macros this masks `t_` directly,
 // so it appears to expect the high dword rather than the two-component key --
 // confirm against callers.
 #define SPN_TTRK_IS_INVALID_COHORT(t_)           (((t_) & SPN_TTRK_HI_MASK_COHORT) == SPN_TTRK_HI_MASK_COHORT)

 //
 // TTSK v1 ( DEFAULT )
 //
 //  0                            63
 //  | TTSB_ID |   SPAN  |  X |  Y |
 //  +---------+---------+----+----+
 //  |    27   | 13 [<0] | 12 | 12 |
 //
 //
 // TTPK v2 ( DEFAULT )
 //
 //  0                                  63
 //  | TTPB_ID |      SPAN     |  X |  Y |
 //  +---------+---------------+----+----+
 //  |    27   | 13 [+1,+4095] | 12 | 12 |
 //
 //
 // A TTSK.SPAN inherits the TTRK[0] dword unmodified (in flux).
 //
 // A TTPK.SPAN has a range of [+1,+4095].
 //
 // A TTXK.SPAN of 0 indicates either:
 //
 //   - an invalid key
 //   - a TTXK key pointing to all TTS_INVALID values
 //   - a TTPK key pointing to all zero values
 //
 // In all cases, this key can be skipped during rendering.
 //
 // TTXK.Y and TTXK.X are signed but stored as biased unsigned.
 //
 // An invalid TTXK has a span of zero and a TTXB_ID of all 1's.
 //

 // A TTSK reuses the TTRK "new x/y" flag tests unchanged.
 #define SPN_TTSK_IS_NEW_X(t_)                    SPN_TTRK_IS_NEW_X(t_)
 #define SPN_TTSK_IS_NEW_Y(t_)                    SPN_TTRK_IS_NEW_Y(t_)

 // TTXK field widths.  SPAN is 13 bits and straddles the dword boundary:
 // 5 bits in the low dword and 8 bits in the high dword.
 #define SPN_TTXK_LO_BITS_TTXB_ID                 SPN_TTRK_LO_BITS_TTSB_ID
 #define SPN_TTXK_LO_HI_BITS_SPAN                 13
 #define SPN_TTXK_LO_BITS_SPAN                    5  // straddles a
 #define SPN_TTXK_HI_BITS_SPAN                    8  // word boundary
 #define SPN_TTXK_HI_BITS_X                       SPN_TTRK_LO_HI_BITS_X
 #define SPN_TTXK_HI_BITS_Y                       SPN_TTRK_HI_BITS_Y
 #define SPN_TTXK_HI_BITS_XY                      (SPN_TTXK_HI_BITS_Y + SPN_TTXK_HI_BITS_X)

 // Bit offsets of each field within its own dword.
 #define SPN_TTXK_LO_OFFSET_SPAN                  SPN_TTXK_LO_BITS_TTXB_ID
 #define SPN_TTXK_HI_OFFSET_X                     (32 - SPN_TTXK_HI_BITS_XY) // 8
 #define SPN_TTXK_HI_OFFSET_Y                     (32 - SPN_TTXK_HI_BITS_Y)  // 20
 #define SPN_TTXK_HI_OFFSET_XY                    (32 - SPN_TTXK_HI_BITS_XY) // 8

 // In-place masks for the fields of the low and high dwords.
 #define SPN_TTXK_LO_MASK_TTXB_ID                 SPN_BITS_TO_MASK(SPN_TTXK_LO_BITS_TTXB_ID)
 #define SPN_TTXK_LO_MASK_SPAN                    SPN_BITS_TO_MASK_AT(SPN_TTXK_LO_OFFSET_SPAN,SPN_TTXK_LO_BITS_SPAN)
 #define SPN_TTXK_HI_MASK_SPAN                    SPN_BITS_TO_MASK(SPN_TTXK_HI_BITS_SPAN)
 #define SPN_TTXK_HI_MASK_X                       SPN_BITS_TO_MASK_AT(SPN_TTXK_HI_OFFSET_X,SPN_TTXK_HI_BITS_X)
 #define SPN_TTXK_HI_MASK_Y                       SPN_BITS_TO_MASK_AT(SPN_TTXK_HI_OFFSET_Y,SPN_TTXK_HI_BITS_Y)
 #define SPN_TTXK_HI_MASK_XY                      SPN_BITS_TO_MASK_AT(SPN_TTXK_HI_OFFSET_X,SPN_TTXK_HI_BITS_XY)

 // The value 1 positioned at the X field of the high dword.
 #define SPN_TTXK_HI_ONE_X                        (1u << SPN_TTXK_HI_OFFSET_X)

 // Single-dword accessors.
 #define SPN_TTXK_LO_GET_TTXB_ID(t_lo_)           SPN_BITFIELD_EXTRACT(t_lo_,0,SPN_TTXK_LO_BITS_TTXB_ID)
 #define SPN_TTXK_HI_GET_XY(t_hi_)                SPN_BITFIELD_EXTRACT(t_hi_,SPN_TTXK_HI_OFFSET_XY,SPN_TTXK_HI_BITS_XY)

 // XY bits of the high dword left in place (not shifted down).
 #define SPN_TTXK_GET_MASKED_XY(t_)               ((t_)[1] & SPN_TTXK_HI_MASK_XY)

 // Whole-key getters; GET_SPAN uses the signed extract across both dwords.
 #define SPN_TTXK_GET_TTXB_ID(t_)                 SPN_TTXK_LO_GET_TTXB_ID((t_)[0])
 #define SPN_TTXK_GET_SPAN(t_)                    SPN_GLSL_EXTRACT_UVEC2_INT((t_),SPN_TTXK_LO_OFFSET_SPAN,SPN_TTXK_LO_HI_BITS_SPAN)
 #define SPN_TTXK_GET_X(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTXK_HI_OFFSET_X,SPN_TTXK_HI_BITS_X)
 #define SPN_TTXK_GET_Y(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTXK_HI_OFFSET_Y,SPN_TTXK_HI_BITS_Y)
 #define SPN_TTXK_GET_XY(t_)                      SPN_TTXK_HI_GET_XY((t_)[1])

 // Whole-key setters.  `t_` is parenthesized before indexing, matching the
 // getters, so that pointer or other expression arguments expand correctly.
 #define SPN_TTXK_SET_TTXB_ID(t_,i_)              (t_)[0] = SPN_BITFIELD_INSERT((t_)[0],(i_),0,SPN_TTXK_LO_BITS_TTXB_ID)
 #define SPN_TTXK_SET_SPAN(t_,s_)                 SPN_GLSL_INSERT_UVEC2_UINT((t_),(s_),SPN_TTXK_LO_OFFSET_SPAN,SPN_TTXK_LO_HI_BITS_SPAN)
 #define SPN_TTXK_SET_XY(t_,i_)                   (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],(i_),SPN_TTXK_HI_OFFSET_XY,SPN_TTXK_HI_BITS_XY)

 // Invalid key: TTXB_ID bits all set, every other bit (including SPAN) zero.
 #define SPN_TTXK_INVALID                         uvec2(SPN_TTXK_LO_MASK_TTXB_ID,0)

 //
 // XY
 //
 //  0        31
 //  |  X |  Y |
 //  +----+----+
 //  | 12 | 20 |
 //
 // A few shaders probe the XY value.
 //
 // The max value of X is 4095.
 //

 // Y is everything above the X field: the (32 - SPN_TTXK_HI_BITS_X) bits that
 // start at bit offset SPN_TTXK_HI_BITS_X.
 #define SPN_XY_GET_Y(xy_)                        SPN_BITFIELD_EXTRACT(xy_,SPN_TTXK_HI_BITS_X,32-SPN_TTXK_HI_BITS_X)
 // Mask selecting the X field in the low SPN_TTXK_HI_BITS_X bits.
 #define SPN_XY_X_MASK                            SPN_BITS_TO_MASK(SPN_TTXK_HI_BITS_X)

 //
 // TTCK (64-BIT COMPARE) -- DEFAULT
 //
 // TODO(https://fxbug.dev/42064168): The TTSB encoding should encode TTP values between
 // -1073741824 to 1073741823 (signed 31 bit integer). LAYER bits rise to 19.
 //
 //  0                                                  63
 //  | PAYLOAD/TTSB/TTPB_ID | PREFIX | LAYER |  X  |  Y  |
 //  +----------------------+--------+-------+-----+-----+
 //  |          27          |    1   |   18  |  9  |  9  |
 //
 //  0                                         31                     63
 //  | PAYLOAD/TTSB/TTPB_ID | PREFIX | LAYER_LO | LAYER_HI |  X  |  Y  |
 //  +----------------------+--------+----------+----------+-----+-----+
 //  |          27          |    1   |     4    |    14    |  9  |  9  |
 //
 //
 // TTCK.X and TTCK.Y are unsigned
 //
 //  +-----------+-------------+
 //  | TILE SIZE | MAX SURFACE |
 //  +-----------+-------------+
 //  |   16x16   |   8K x 8K   | NVIDIA, AMD
 //  |    8x8    |   4K x 4K   | INTEL GEN+, Mali G52+
 //  |    4x4    |   2K x 2K   | Mali G31, SwiftShader
 //  +-----------+-------------+
 //

 // TTCK field widths.  "LO"/"HI" name the low/high dword of the 64-bit key.
 #define SPN_TTCK_LO_BITS_TTXB_ID                 SPN_TAGGED_BLOCK_ID_BITS_ID
 #define SPN_TTCK_LO_BITS_PREFIX                  1

 // LAYER is 18 bits and straddles the dword boundary: 4 bits in the low dword
 // and 14 bits in the high dword.
 #define SPN_TTCK_LO_HI_BITS_LAYER                18
 #define SPN_TTCK_LO_BITS_LAYER                   4
 #define SPN_TTCK_HI_BITS_LAYER                   14

 #define SPN_TTCK_HI_BITS_X                       9
 #define SPN_TTCK_HI_BITS_Y                       9
 #define SPN_TTCK_HI_BITS_XY                      (SPN_TTCK_HI_BITS_X + SPN_TTCK_HI_BITS_Y)
 // LAYER+X+Y bits held in the high dword only (14 + 9 + 9 = 32), so this sums
 // the HI part of LAYER, not the full 18-bit LAYER width.
 #define SPN_TTCK_HI_BITS_LXY                     (SPN_TTCK_HI_BITS_LAYER + SPN_TTCK_HI_BITS_X + SPN_TTCK_HI_BITS_Y)
 // LAYER+X+Y bits over the whole key (4 + 32 = 36).
 #define SPN_TTCK_BITS_LXY                        (SPN_TTCK_LO_BITS_LAYER + SPN_TTCK_HI_BITS_LXY)

 // Bit offsets of each field within its own dword.
 #define SPN_TTCK_LO_OFFSET_PREFIX                SPN_TTCK_LO_BITS_TTXB_ID
 #define SPN_TTCK_LO_OFFSET_LAYER                 (SPN_TTCK_LO_OFFSET_PREFIX + SPN_TTCK_LO_BITS_PREFIX)

 #define SPN_TTCK_HI_OFFSET_X                     (32 - SPN_TTCK_HI_BITS_XY)
 #define SPN_TTCK_HI_OFFSET_Y                     (32 - SPN_TTCK_HI_BITS_Y)
 #define SPN_TTCK_HI_OFFSET_XY                    (32 - SPN_TTCK_HI_BITS_XY)

 // In-place masks for the fields of the low and high dwords.
 #define SPN_TTCK_LO_MASK_TTXB_ID                 SPN_BITS_TO_MASK(SPN_TTCK_LO_BITS_TTXB_ID)
 #define SPN_TTCK_LO_MASK_PREFIX                  SPN_BITS_TO_MASK_AT(SPN_TTCK_LO_OFFSET_PREFIX,SPN_TTCK_LO_BITS_PREFIX)
 #define SPN_TTCK_LO_MASK_LAYER                   SPN_BITS_TO_MASK_AT(SPN_TTCK_LO_OFFSET_LAYER,SPN_TTCK_LO_BITS_LAYER)

 #define SPN_TTCK_HI_MASK_LAYER                   SPN_BITS_TO_MASK(SPN_TTCK_HI_BITS_LAYER)
 #define SPN_TTCK_HI_MASK_XY                      SPN_BITS_TO_MASK_AT(SPN_TTCK_HI_OFFSET_XY,SPN_TTCK_HI_BITS_XY)

 // Accessors.  The LO_ forms take the low dword; the others take the whole
 // two-component key.  Arguments are parenthesized so that expressions such
 // as `a | b` or `keys + i` expand with the intended precedence.
 #define SPN_TTCK_GET_TTXB_ID(t_)                 ((t_)[0] & SPN_TTCK_LO_MASK_TTXB_ID)
 #define SPN_TTCK_LO_GET_TTXB_ID(t_lo_)           ((t_lo_) & SPN_TTCK_LO_MASK_TTXB_ID)

 #define SPN_TTCK_IS_PREFIX(t_)                   (((t_)[0] & SPN_TTCK_LO_MASK_PREFIX) != 0)
 #define SPN_TTCK_LO_IS_PREFIX(t_lo_)             (((t_lo_) & SPN_TTCK_LO_MASK_PREFIX) != 0)

 #define SPN_TTCK_GET_LAYER(t_)                   SPN_GLSL_EXTRACT_UVEC2_UINT(t_,SPN_TTCK_LO_OFFSET_LAYER,SPN_TTCK_LO_HI_BITS_LAYER)
 #define SPN_TTCK_SET_LAYER(t_,l_)                SPN_GLSL_INSERT_UVEC2_UINT(t_,l_,SPN_TTCK_LO_OFFSET_LAYER,SPN_TTCK_LO_HI_BITS_LAYER)

 #define SPN_TTCK_GET_Y(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTCK_HI_OFFSET_Y,SPN_TTCK_HI_BITS_Y)
 #define SPN_TTCK_GET_X(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTCK_HI_OFFSET_X,SPN_TTCK_HI_BITS_X)

 // Adds d_ to the X field in place by adding it, shifted to X's offset, to the
 // high dword; no masking is applied, so a carry out of X propagates into Y.
 #define SPN_TTCK_ADD_X(t_,d_)                    ((t_)[1] += ((d_) << SPN_TTCK_HI_OFFSET_X))

 // Largest layer value representable in the 18-bit LAYER field.
 #define SPN_TTCK_LAYER_MAX                       SPN_BITS_TO_MASK(SPN_TTCK_LO_HI_BITS_LAYER)

 //
 // 16x16 TILE TRACE SUBPIXEL
 //
 //  0                  31
 //  | TX | DX | TY | DY |
 //  +----+----+----+----+
 //  |  9 |  7 |  9 |  7 |
 //
 // 8x8 TILE TRACE SUBPIXEL (BIAS X)
 //
 //  0                  31
 //  | TX | DX | TY | DY |
 //  +----+----+----+----+
 //  |  9 |  8 |  8 |  7 |
 //
 // 8x8 TILE TRACE SUBPIXEL (BIAS Y)
 //
 //  0                  31
 //  | TX | DX | TY | DY |
 //  +----+----+----+----+
 //  |  8 |  7 |  9 |  8 |
 //
 // 4x4 TILE TRACE SUBPIXEL
 //
 //  0                  31
 //  | TX | DX | TY | DY |
 //  +----+----+----+----+
 //  |  8 |  8 |  8 |  8 |
 //
 // A 32-bit encoding of a subpixel-resolution line segment in a tile up to 16x16
 // (WxH) pixels.
 //
 // Subpixel resolution is 5 bits.
 //
 // We're using this representation across all target
 // architectures.
 //
 // A 16x16 tile X is encoded as:
 //
 //   TX : 9 : unsigned min(x0,x1) tile subpixel coordinate with a range of
 //             [0,511].
 //
 //   DX : 7 : signed subpixel delta x1-x0. The range of the delta is [-32,32]
 //            including 0.  Note that with 7 signed bits the range of the
 //            bitfield is [-64,63].  An "invalid" TTS relies on DX being an
 //            infeasible value.
 //
 // A 16x16 tile Y is encoded as:
 //
 //   TY : 9 : unsigned min(y0,y1) tile subpixel coordinate with a range of
 //            [0,511].
 //
 //   DY : 7 : signed subpixel delta (y1-y0). The range of delta is [-32,-1] or
 //            [+1,+32] because horizontal lines are not encoded.
 //
 // NOTE(allanmac): There are assumptions in the shaders that the X and Y
 // subpixel resolutions are the same.  Despite this, let's keep the X and Y
 // definitions separated.
 //
 // NOTE(allanmac): The subpixel resolution *could* be increased on devices with
 // tiles smaller than 16x16.  Just beware of the floating point multiplication
 // using the SPN_TTS_FILL_MAX_AREA_RCP_F32 reciprocal -- other approaches can be
 // considered.
 //

 #if defined(SPN_DEVICE_TILE_WIDTH_LOG2) && defined(SPN_DEVICE_TILE_HEIGHT_LOG2)

 //
 // As tile sizes shrink, bits become available.
 //
 // On the 8x8 tile, arbitrarily default to providing more "Y" resolution because
 // the virtual workspace is reflected.
 //
 // Define SPN_DEVICE_TILE_HEIGHT_BIAS or SPN_DEVICE_TILE_WIDTH_BIAS to
 // explicitly steer where the extra bits are assigned.
 //

 #if (SPN_DEVICE_TILE_WIDTH_LOG2 == 4) && (SPN_DEVICE_TILE_HEIGHT_LOG2 == 4)

 #define SPN_TTS_BITS_TX                          9  // [   0, 511]
 #define SPN_TTS_BITS_DX                          7  // [ -64,  63] -> [-32,32]
 #define SPN_TTS_BITS_TY                          9  // [   0, 511]
 #define SPN_TTS_BITS_DY                          7  // [ -64,  63] -> [-32,32]

 #define SPN_TTS_SUBPIXEL_X_LOG2                  5
 #define SPN_TTS_SUBPIXEL_Y_LOG2                  5

 #elif (SPN_DEVICE_TILE_WIDTH_LOG2 == 3) && (SPN_DEVICE_TILE_HEIGHT_LOG2 == 3) && !defined(SPN_DEVICE_TILE_WIDTH_BIAS) \
                                                                               && !defined(SPN_DEVICE_TILE_HEIGHT_BIAS)
 #define SPN_TTS_BITS_TX                          9  // [   0, 511]
 #define SPN_TTS_BITS_DX                          7  // [ -64,  63] -> [-32,32]
 #define SPN_TTS_BITS_TY                          9  // [   0, 511]
 #define SPN_TTS_BITS_DY                          7  // [ -64,  63] -> [-32,32]

 #define SPN_TTS_SUBPIXEL_X_LOG2                  5
 #define SPN_TTS_SUBPIXEL_Y_LOG2                  5

 #elif (SPN_DEVICE_TILE_WIDTH_LOG2 == 3) && (SPN_DEVICE_TILE_HEIGHT_LOG2 == 3) && defined(SPN_DEVICE_TILE_WIDTH_BIAS) // (BIAS X)

 #error "No support for SPN_DEVICE_TILE_WIDTH_BIAS or SPN_DEVICE_TILE_HEIGHT_BIAS until the octant logic is updated."

 #define SPN_TTS_BITS_TX                          9  // [   0, 511]
 #define SPN_TTS_BITS_DX                          8  // [-128, 127] -> [-64,64]
 #define SPN_TTS_BITS_TY                          8  // [   0, 255]
 #define SPN_TTS_BITS_DY                          7  // [ -64,  63] -> [-32,32]

 #define SPN_TTS_SUBPIXEL_X_LOG2                  6
 #define SPN_TTS_SUBPIXEL_Y_LOG2                  5

 #elif (SPN_DEVICE_TILE_WIDTH_LOG2 == 3) && (SPN_DEVICE_TILE_HEIGHT_LOG2 == 3) && defined(SPN_DEVICE_TILE_HEIGHT_BIAS) // (BIAS Y)

 #error "No support for SPN_DEVICE_TILE_WIDTH_BIAS or SPN_DEVICE_TILE_HEIGHT_BIAS until the octant logic is updated."

 #define SPN_TTS_BITS_TX                          8  // [   0, 255]
 #define SPN_TTS_BITS_DX                          7  // [ -64,  63] -> [-32,32]
 #define SPN_TTS_BITS_TY                          9  // [   0, 511]
 #define SPN_TTS_BITS_DY                          8  // [-128, 127] -> [-64,64]

 #define SPN_TTS_SUBPIXEL_X_LOG2                  5
 #define SPN_TTS_SUBPIXEL_Y_LOG2                  6

 #elif (SPN_DEVICE_TILE_WIDTH_LOG2 == 2) && (SPN_DEVICE_TILE_HEIGHT_LOG2 == 2)

 #define SPN_TTS_BITS_TX                          8  // [   0, 255]
 #define SPN_TTS_BITS_DX                          8  // [-128, 127] -> [-64,64]
 #define SPN_TTS_BITS_TY                          8  // [   0, 255]
 #define SPN_TTS_BITS_DY                          8  // [-128, 127] -> [-64,64]

 #define SPN_TTS_SUBPIXEL_X_LOG2                  6
 #define SPN_TTS_SUBPIXEL_Y_LOG2                  6

 #else
 #error "SPN_DEVICE_TILE_WIDTH_LOG2 or SPN_DEVICE_TILE_HEIGHT_LOG2 not defined!"
 #endif

 //
 //
 //

 //
 // X axis: subpixels per pixel, number of whole-pixel bits in TX, and the
 // float factors that convert between pixel and subpixel units.
 //
 #define SPN_TTS_SUBPIXEL_X_SIZE                  (1 << SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_PIXEL_X_LOG2                     (SPN_TTS_BITS_TX - SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_SUBPIXEL_X_RESL                  float(SPN_TTS_SUBPIXEL_X_SIZE)
 #define SPN_TTS_SUBPIXEL_X_SCALE_UP              SPN_TTS_SUBPIXEL_X_RESL
 #define SPN_TTS_SUBPIXEL_X_SCALE_DOWN            (1.0f / SPN_TTS_SUBPIXEL_X_RESL)

 //
 // Y axis: same quantities as above.
 //
 #define SPN_TTS_SUBPIXEL_Y_SIZE                  (1 << SPN_TTS_SUBPIXEL_Y_LOG2)
 #define SPN_TTS_PIXEL_Y_LOG2                     (SPN_TTS_BITS_TY - SPN_TTS_SUBPIXEL_Y_LOG2)
 #define SPN_TTS_SUBPIXEL_Y_RESL                  float(SPN_TTS_SUBPIXEL_Y_SIZE)
 #define SPN_TTS_SUBPIXEL_Y_SCALE_UP              SPN_TTS_SUBPIXEL_Y_RESL
 #define SPN_TTS_SUBPIXEL_Y_SCALE_DOWN            (1.0f / SPN_TTS_SUBPIXEL_Y_RESL)

 //
 // TTXK.X and .Y are biased and unsigned
 //

 // Log2 of the number of subpixels spanned by one tile along each axis:
 // tile size in pixels (log2) plus subpixels per pixel (log2).
 #define SPN_TILE_SUBPIXEL_X_BITS_LOG2            (SPN_DEVICE_TILE_WIDTH_LOG2  + SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TILE_SUBPIXEL_Y_BITS_LOG2            (SPN_DEVICE_TILE_HEIGHT_LOG2 + SPN_TTS_SUBPIXEL_Y_LOG2)

 // Number of subpixels spanned by one tile along each axis.
 #define SPN_TILE_SUBPIXEL_X_SIZE                 (1 << SPN_TILE_SUBPIXEL_X_BITS_LOG2)
 #define SPN_TILE_SUBPIXEL_Y_SIZE                 (1 << SPN_TILE_SUBPIXEL_Y_BITS_LOG2)

 // Half of the range covered by the TTXK tile-coordinate bits plus the
 // per-tile subpixel bits, i.e. a bias expressed in subpixel units.
 // NOTE(review): presumably added to signed subpixel coordinates so the stored
 // value is unsigned -- confirm at the use sites.
 #define SPN_TTXK_X_BIAS                          (1 << (SPN_TTXK_HI_BITS_X + SPN_TILE_SUBPIXEL_X_BITS_LOG2 - 1))
 #define SPN_TTXK_Y_BIAS                          (1 << (SPN_TTXK_HI_BITS_Y + SPN_TILE_SUBPIXEL_Y_BITS_LOG2 - 1))

 // Half of the range covered by the TTXK tile-coordinate bits alone, i.e. the
 // same bias expressed in tile units.
 #define SPN_TTXK_TILE_X_BIAS                     (1 << (SPN_TTXK_HI_BITS_X - 1))
 #define SPN_TTXK_TILE_Y_BIAS                     (1 << (SPN_TTXK_HI_BITS_Y - 1))

 // Both subpixel biases packed into an ivec2 (GLSL constructor syntax).
 #define SPN_TTXK_XY_BIAS                         ivec2(SPN_TTXK_X_BIAS, SPN_TTXK_Y_BIAS)

 //
 //
 //

 //
 // Bit offsets of the four TTS fields, packed low-to-high as TX, DX, TY, DY.
 //
 #define SPN_TTS_OFFSET_TX                        0
 #define SPN_TTS_OFFSET_DX                        (SPN_TTS_OFFSET_TX + SPN_TTS_BITS_TX)
 #define SPN_TTS_OFFSET_TY                        (SPN_TTS_OFFSET_DX + SPN_TTS_BITS_DX)
 #define SPN_TTS_OFFSET_DY                        (SPN_TTS_OFFSET_TY + SPN_TTS_BITS_TY)

 //
 // Bit offsets of the whole-pixel part of TX and TY, which sits directly above
 // the subpixel bits of each field.
 //
 #define SPN_TTS_OFFSET_TX_PIXEL                  (SPN_TTS_OFFSET_TX + SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_OFFSET_TY_PIXEL                  (SPN_TTS_OFFSET_TY + SPN_TTS_SUBPIXEL_Y_LOG2)

 //
 // Field masks.  No DY mask is defined here.
 //
 #define SPN_TTS_MASK_TX                          SPN_BITS_TO_MASK(SPN_TTS_BITS_TX)
 #define SPN_TTS_MASK_DX                          SPN_BITS_TO_MASK_AT(SPN_TTS_BITS_DX,SPN_TTS_OFFSET_DX)
 #define SPN_TTS_MASK_TY                          SPN_BITS_TO_MASK_AT(SPN_TTS_BITS_TY,SPN_TTS_OFFSET_TY)

 //
 // X accessors.  The delta is extracted from an int() conversion of the key
 // and the coordinate parts from a uint() conversion.
 // NOTE(review): presumably the int() form makes SPN_BITFIELD_EXTRACT
 // sign-extend the delta -- confirm against its definition.
 //
 #define SPN_TTS_GET_DX(v_)                       SPN_BITFIELD_EXTRACT(int(v_),SPN_TTS_OFFSET_DX,SPN_TTS_BITS_DX)
 #define SPN_TTS_GET_TX(v_)                       SPN_BITFIELD_EXTRACT(uint(v_),SPN_TTS_OFFSET_TX,SPN_TTS_BITS_TX)
 #define SPN_TTS_GET_TX_SUBPIXEL(v_)              SPN_BITFIELD_EXTRACT(uint(v_),SPN_TTS_OFFSET_TX,SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_GET_TX_PIXEL(v_)                 SPN_BITFIELD_EXTRACT(uint(v_),SPN_TTS_OFFSET_TX_PIXEL,SPN_TTS_PIXEL_X_LOG2)

 //
 // Y accessors, mirroring the X accessors above.
 //
 #define SPN_TTS_GET_DY(v_)                       SPN_BITFIELD_EXTRACT(int(v_),SPN_TTS_OFFSET_DY,SPN_TTS_BITS_DY)
 #define SPN_TTS_GET_TY(v_)                       SPN_BITFIELD_EXTRACT(uint(v_),SPN_TTS_OFFSET_TY,SPN_TTS_BITS_TY)
 #define SPN_TTS_GET_TY_SUBPIXEL(v_)              SPN_BITFIELD_EXTRACT(uint(v_),SPN_TTS_OFFSET_TY,SPN_TTS_SUBPIXEL_Y_LOG2)
 #define SPN_TTS_GET_TY_PIXEL(v_)                 SPN_BITFIELD_EXTRACT(uint(v_),SPN_TTS_OFFSET_TY_PIXEL,SPN_TTS_PIXEL_Y_LOG2)

 //
 // Use impossible DX/DY values for TTS_INVALID
 //
 // An all-zero TTS decodes to DX == 0 and DY == 0.  Per the TTS encoding notes
 // in this file, horizontal lines are not encoded, so DY == 0 does not occur in
 // a valid TTS.
 //
 // NOTE(review): the encoding notes also say an invalid TTS relies on DX being
 // infeasible, yet DX == 0 lies inside the documented DX range -- confirm that
 // consumers detect invalid keys through DY or by comparing against this value.
 //

 #define SPN_TTS_INVALID                          0

 //
 // Note that for a subpixel resolution of 5 bits, 2048.0 can be represented
 // exactly with fp16... fortuitous!
 //

 // Twice the number of subpixels in one pixel (X subpixels * Y subpixels).
 #define SPN_TTS_FILL_MAX_AREA                    ((SPN_TTS_SUBPIXEL_X_SIZE * SPN_TTS_SUBPIXEL_Y_SIZE) * 2)
 // Double the maximum area; a power of two because both subpixel sizes are.
 #define SPN_TTS_FILL_MAX_AREA_2                  (SPN_TTS_FILL_MAX_AREA * 2)
 // All bits below SPN_TTS_FILL_MAX_AREA_2 set.
 #define SPN_TTS_FILL_EVEN_ODD_MASK               (SPN_TTS_FILL_MAX_AREA_2 - 1)
 // Reciprocal of the maximum area as a 32-bit float.
 #define SPN_TTS_FILL_MAX_AREA_RCP_F32            (1.0f / SPN_TTS_FILL_MAX_AREA)

 //
 //
 //

 #endif // defined(SPN_DEVICE_TILE_WIDTH_LOG2) && defined(SPN_DEVICE_TILE_HEIGHT_LOG2)

 //
 // RASTER COHORT METADATA
 //
 // NOTE: Don't trim array even though the last entry in the pow2 array is not
 // used because there are aligned structure members following the metadata
 // table.
 //
 // FIXME(allanmac): the UINT64_MAX key is reserved in this segmenting phase.
 // This implies that the cohort id of all 1's needs to be reserved.  TL;DR: the
 // raster builder must only build (SPN_RASTER_COHORT_METAS_SIZE-1) rasters.
 //
 // FIXME(allanmac): split RKOFF from UVEC4/alloc
 //
 // struct spinel_rc_meta
 // {
 //   SPN_TYPE_U32VEC2 alloc [SPN_RASTER_COHORT_METAS_SIZE]; // block pool reads    -- uninitialized
 //   SPN_TYPE_U32     rk_off[SPN_RASTER_COHORT_METAS_SIZE]; // offset of rk keys   -- zeroed
 //   SPN_TYPE_U32     blocks[SPN_RASTER_COHORT_METAS_SIZE]; // number of blocks    -- zeroed
 //   SPN_TYPE_U32     ttpks [SPN_RASTER_COHORT_METAS_SIZE]; // number of TTPK keys -- zeroed
 //   SPN_TYPE_U32     ttrks [SPN_RASTER_COHORT_METAS_SIZE]; // number of TTRK keys -- zeroed
 //   //
 //   // FIXME(allanmac): the signed bounding box will be added to the meta
 //   // using the atomic signed min/max trick.
 //   //
 // };
 //

 // The metadata table has one entry per value of the TTRK cohort bitfield.
 #define SPN_RASTER_COHORT_METAS_SIZE_LOG2             SPN_TTRK_HI_BITS_COHORT
 #define SPN_RASTER_COHORT_METAS_SIZE                  (1 << SPN_RASTER_COHORT_METAS_SIZE_LOG2)

 // Component indices into a cohort's two-element `alloc` entry.
 #define SPN_RASTER_COHORT_META_ALLOC_OFFSET_SK_READS  0 // alloc[0] - block holding first ttsk (head)
 #define SPN_RASTER_COHORT_META_ALLOC_OFFSET_PK_READS  1 // alloc[1] - block holding first ttpk (head/node)

 // One less than the table size: per the FIXME in the notes above, the
 // all-ones cohort id is reserved.
 #define SPN_RASTER_COHORT_MAX_SIZE                    (SPN_RASTER_COHORT_METAS_SIZE - 1)

 //
 // STYLING STRUCTS
 //
 //
 // LAYER
 //
 //   |     LAYER     |
 //   +---------------+
 //   | cmds | parent |
 //   +------+--------+
 //   0      1        2
 //
 // GROUP
 //
 //   |                 GROUP                  |
 //   +--------------+---------+---------------+
 //   |    parents   |  range  |     cmds      |
 //   | depth | base | lo | hi | enter | leave |
 //   +-------+------+----+----+-------+-------+
 //   0       1      2    3    4       5       6
 //
 //
 // It's simpler to define the group as a uvec2[3]:
 //
 //   struct spinel_group_node
 //   {
 //     spinel_group_parents parents; // path of parent groups leading back to root
 //     spinel_group_range   range;   // range of layers enclosed by this group
 //     spinel_group_cmds    cmds;    // enter/leave command indices
 //   };
 //
 // The RENDER kernel lays out the current layer node, group node and
 // flags in either registers or shared memory:
 //
 // LGF -- layer / group / flags
 //                                                               optional
 //   | current layer |          current group           |       |       |       |
 //   +---------------+------------+-------+-------------+.......+.......+.......+....
 //   |     layer     |   parents  | range |    cmds     | layer | group | flags | ...
 //   |  cmds parent  | depth base | lo hi | enter leave |  id   |  id   |       |
 //   +------+--------+------+-----+---+---+------+------+.......+.......+.......+....
 //   0      1        2      3     4   5   6      7      8       9       10      11
 //
 //
 // struct spinel_layer_node
 // {
 //   uint32_t cmds;   // starting index of sequence of command dwords
 //   uint32_t parent; // index of parent group
 // };
 //
 // struct spinel_group_parents
 // {
 //   uint32_t depth;
 //   uint32_t base;
 // };
 //
 // struct spinel_group_range
 // {
 //   uint32_t lo; // first layer
 //   uint32_t hi; // last  layer
 // };
 //
 // struct spinel_group_cmds
 // {
 //   uint32_t enter; // starting index of sequence of command dwords
 //   uint32_t leave; // starting index of sequence of command dwords
 // };
 //

 //
 //
 //

 // Layer node: dword indices of { cmds, parent }, followed by the node size.
 #define SPN_STYLING_LAYER_OFFSET_CMDS                  0
 #define SPN_STYLING_LAYER_OFFSET_PARENT                (SPN_STYLING_LAYER_OFFSET_CMDS + 1)
 #define SPN_STYLING_LAYER_COUNT_DWORDS                 (SPN_STYLING_LAYER_OFFSET_PARENT + 1)

 // Group node: dword indices of { parents.depth, parents.base, range.lo,
 // range.hi, cmds.enter, cmds.leave }, followed by the node size.  Each index
 // is one past its predecessor.
 #define SPN_STYLING_GROUP_OFFSET_PARENTS_DEPTH         0
 #define SPN_STYLING_GROUP_OFFSET_PARENTS_BASE          (SPN_STYLING_GROUP_OFFSET_PARENTS_DEPTH + 1)
 #define SPN_STYLING_GROUP_OFFSET_RANGE_LO              (SPN_STYLING_GROUP_OFFSET_PARENTS_BASE + 1)
 #define SPN_STYLING_GROUP_OFFSET_RANGE_HI              (SPN_STYLING_GROUP_OFFSET_RANGE_LO + 1)
 #define SPN_STYLING_GROUP_OFFSET_CMDS_ENTER            (SPN_STYLING_GROUP_OFFSET_RANGE_HI + 1)
 #define SPN_STYLING_GROUP_OFFSET_CMDS_LEAVE            (SPN_STYLING_GROUP_OFFSET_CMDS_ENTER + 1)
 #define SPN_STYLING_GROUP_COUNT_DWORDS                 (SPN_STYLING_GROUP_OFFSET_CMDS_LEAVE + 1)

 //
 //
 //

 // A styling "cmds" dword packs a base index in its low bits and a command
 // count in its high bits.
 #define SPN_STYLING_CMDS_BITS_COUNT                    3
 #define SPN_STYLING_CMDS_BITS_BASE                     (32 - SPN_STYLING_CMDS_BITS_COUNT)

 // Bit positions of the two fields: base starts at bit 0, count follows it.
 #define SPN_STYLING_CMDS_OFFSET_BASE                   0
 #define SPN_STYLING_CMDS_OFFSET_COUNT                  SPN_STYLING_CMDS_BITS_BASE

 // Largest value each field can hold.
 #define SPN_STYLING_CMDS_MAX_BASE                      ((1 << SPN_STYLING_CMDS_BITS_BASE) - 1)
 #define SPN_STYLING_CMDS_MAX_COUNT                     ((1 << SPN_STYLING_CMDS_BITS_COUNT) - 1)

 // Field accessors for a packed cmds dword.
 #define SPN_STYLING_CMDS_GET_COUNT(cmds_)              SPN_BITFIELD_EXTRACT(cmds_,SPN_STYLING_CMDS_OFFSET_COUNT,SPN_STYLING_CMDS_BITS_COUNT)
 #define SPN_STYLING_CMDS_GET_BASE(cmds_)               SPN_BITFIELD_EXTRACT(cmds_,SPN_STYLING_CMDS_OFFSET_BASE,SPN_STYLING_CMDS_BITS_BASE)

 // Disabled (#if 0) declaration of a gradient vector union.  It offers three
 // views of the same storage: a 4-wide float vector, four named floats, and an
 // array of four slopes.  The member types still carry the skc_ prefix and are
 // not defined in the visible part of this file.
 #if 0

 union spinel_gradient_vector
 {
   skc_float4               f32v4;

   struct {
     skc_float              dx;
     skc_float              p0;
     skc_float              dy;
     skc_float              denom;
   };

   union skc_gradient_slope slopes[4];
 };

 #endif

 //
 // FIXME -- will eventually need to know if this gradient is
 // perspective transformed and if so additional values will need to be
 // encoded
 //
 // VERSION 1
 // =============================================================
 //
 // LINEAR GRADIENT HEADER FOR N STOPS
 //
 // +----------+----------+------------+----------+-------------+
 // |  HEADER  |   INFO   |    LUTS    |  FLOORS  |    COLORS   |
 // +----------+----------+------------+----------+-------------+
 // |  uintv4  | u32v2[1] | f32v2[N-1] | f32[N-2] | ushort2[4N] |
 // +----------+----------+------------+----------+-------------+
 //
 //   COLOR PAIR            WORD EXPANSION            TOTAL
 // +------------+---------------------------------+--------+-------------------------+
 // |  ushort2   |  4 + 2 + 2*(N-1) + N - 2 + 4*N  | 7N + 2 | = 7(N-1+1)+2 = 7(N-1)+9 |
 // +------------+---------------------------------+--------+-------------------------+
 //
 // COLOR LAYOUT:
 //
 //   R[0]R[1], R[1]R[2], ... R[N-1]R[N-1]
 //   G[0]G[1], G[1]G[2], ... G[N-1]G[N-1]
 //   B[0]B[1], B[1]B[2], ... B[N-1]B[N-1]
 //   A[0]A[1], A[1]A[2], ... A[N-1]A[N-1]
 //
 //
 // MINIMUM DWORDS:  N=2 --> 16
 //
 //
 // VERSION 2
 // =============================================================
 //
 // LINEAR GRADIENT DESCRIPTOR FOR N STOPS
 //
 //                           +--------------- REMOVE ME LATER
 //                           v
 // +--------+------+-------+---+----------+-----------+
 // | VECTOR | TYPE | COUNT | N |  SLOPES  |   COLORS  |
 // +--------+------+-------+---+----------+-----------+
 // |  f32v4 |   1  |   1   | 1 | f32[N-1] | f16v2[4N] |
 // +--------+------+-------+---+----------+-----------+
 //
 //   COLOR PAIR           WORD EXPANSION            TOTAL
 // +------------+--------------------------------+--------+
 // |   f16v2    |  4 + 1 + 1 + 1 + [N-1] + [4*N] | 5N + 6 |
 // +------------+--------------------------------+--------+
 //
 // COLOR LAYOUT:
 //
 //   R[0]R[1], R[1]R[2], ... R[N-1]R[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //   G[0]G[1], G[1]G[2], ... G[N-1]G[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //   B[0]B[1], B[1]B[2], ... B[N-1]B[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //   A[0]A[1], A[1]A[2], ... A[N-1]A[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //
 //
 // MINIMUM DWORDS:  N=2 --> 16
 //
 //
 // VERSION 3+
 // =============================================================
 //
 // FIXME -- will probably want to try using the sampler/texture
 // hardware to interpolate colors.
 //
 // This will require that the colors are laid out in sampler-friendly
 // order:
 //
 //    RGBA[0]RGBA[1], RGBA[1]RGBA[2], ..., RGBA[N-1]RGBA[N-1]
 //
 //

 // Disabled (#if 0) V1 header sizing macros, expressed in terms of (N-1).
 // The disabled SPN_GRADIENT_CMD_DWORDS_V1(n) expands to
 // 1 + 7*(n-1) + 9 = 7n + 3, which differs from the live definition that
 // follows this block (7n + 2).
 #if 0
 #define SPN_GRADIENT_HEADER_DWORDS_LUTS_OFFSET       4
 #define SPN_GRADIENT_HEADER_DWORDS_TOTAL(n_minus_1)  (7 * (n_minus_1) + 9)
 #define SPN_GRADIENT_HEADER_DWORDS_MIN               SPN_GRADIENT_HEADER_DWORDS_TOTAL(1)
 #define SPN_GRADIENT_CMD_DWORDS_V1(n)                (1 + SPN_GRADIENT_HEADER_DWORDS_TOTAL(n-1))
 #endif

 // Dword count of a VERSION 1 linear gradient with `stops_` stops: 7N + 2.
 #define SPN_GRADIENT_CMD_DWORDS_V1(stops_)           (7 * (stops_) + 2)
 // Dword count of a VERSION 2 linear gradient with `stops_` stops: 5N + 6.
 #define SPN_GRADIENT_CMD_DWORDS_V2(stops_)           (5 * (stops_) + 6)
 // V1 dword count for `v1_` stops minus (`v2_` + 6).
 // NOTE(review): `v2_` is added to 6 directly rather than passed through the
 // V2 formula -- confirm what callers are expected to supply.
 #define SPN_GRADIENT_CMD_DWORDS_V2_ADJUST(v1_,v2_)   (SPN_GRADIENT_CMD_DWORDS_V1(v1_) - ((v2_) + 6))

 //
 // clang-format on
 //

 #endif  // SRC_GRAPHICS_LIB_COMPUTE_SPINEL_CORE_H_