// src/graphics/lib/compute/spinel/core.h - fuchsia - Git at Google

 // Copyright 2019 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef SRC_GRAPHICS_LIB_COMPUTE_SPINEL_CORE_H_
 #define SRC_GRAPHICS_LIB_COMPUTE_SPINEL_CORE_H_

 #include "include/spinel/spinel_opcodes.h"

 //
 // clang-format off
 //

 // Expands to no tokens.  Presumably passed where a macro argument must be
 // supplied but left empty -- confirm against users of this macro.
 #define SPN_EMPTY

 //
 // MAXIMUM SUBGROUP SIZE
 //
 // This is used to properly align GLSL buffers so the variable-sized
 // arrays are aligned on an architectural memory transaction boundary.
 //

 // NOTE(review): the section title says "maximum subgroup size" while the
 // name says "align limit"; the unit of 256 is not visible in this header
 // -- TODO confirm against the GLSL buffer declarations that use it.
 #define SPN_SUBGROUP_ALIGN_LIMIT                256

 //
 // TILE SIZE
 //
 // Width is a power-of-2 of height
 //

 // Tile dimensions are powers of two derived from the SPN_DEVICE_TILE_*_LOG2
 // values, which are defined outside this header.
 #define SPN_TILE_WIDTH                          (1 << SPN_DEVICE_TILE_WIDTH_LOG2)
 #define SPN_TILE_HEIGHT                         (1 << SPN_DEVICE_TILE_HEIGHT_LOG2)

 // All bits below the SPN_TILE_HEIGHT bit.
 #define SPN_TILE_HEIGHT_MASK                    (SPN_TILE_HEIGHT - 1)

 //
 // TAGGED BLOCK ID
 //
 //   0     5                    31
 //   | TAG |       BLOCK ID      |
 //   |     | SUBBLOCK |   BLOCK  |
 //   +-----+----------+----------+
 //   |  5  |    SUB   | 27 - SUB |
 //
 // BLOCK ID
 //
 //   0                    27    31
 //   |       BLOCK ID      |     |
 //   | SUBBLOCK |   BLOCK  | N/A |
 //   +----------+----------+-----+
 //   |    SUB   | 27 - SUB |  5  |
 //
 //
 // There are 27 bits of subblocks and 5 bits of tag.
 //
 // The block pool vends block ids.
 //
 // There are (2^S) subblocks in a block.
 //
 // There are at least 2 subblocks per block.
 //

 // Field widths of a 32-bit tagged block id: the tag occupies the low bits
 // (see GET_TAG) and the id occupies the bits above it (see GET_ID).
 #define SPN_TAGGED_BLOCK_ID_BITS_ID             27 // this size is cast in stone
 #define SPN_TAGGED_BLOCK_ID_BITS_TAG            5  // which leaves 5 bits of tag

 // Sentinel and tag mask.  SPN_UINT_MAX and SPN_BITS_TO_MASK() are defined
 // outside this header.
 #define SPN_TAGGED_BLOCK_ID_INVALID             SPN_UINT_MAX
 #define SPN_TAGGED_BLOCK_ID_MASK_TAG            SPN_BITS_TO_MASK(SPN_TAGGED_BLOCK_ID_BITS_TAG)

 // Field accessors.  GET_ID assumes SPN_BITFIELD_EXTRACT(value,offset,bits)
 // argument order -- TODO confirm against its definition.
 #define SPN_TAGGED_BLOCK_ID_GET_TAG(tbid_)      ((tbid_) & SPN_TAGGED_BLOCK_ID_MASK_TAG)
 #define SPN_TAGGED_BLOCK_ID_GET_ID(tbid_)       SPN_BITFIELD_EXTRACT(tbid_,SPN_TAGGED_BLOCK_ID_BITS_TAG,SPN_TAGGED_BLOCK_ID_BITS_ID)

 // Untagged block id limits: MAX is the all-ones value of the id field.
 #define SPN_BLOCK_ID_MAX                        SPN_BITS_TO_MASK(SPN_TAGGED_BLOCK_ID_BITS_ID)
 #define SPN_BLOCK_ID_INVALID                    SPN_UINT_MAX

 // Values of the tag field.  Tags 0-4 name the path primitive types; the
 // two highest tag values are reserved for NEXT (30) and INVALID (31).
 #define SPN_BLOCK_ID_TAG_PATH_LINE              0  // 0 -- 4  segments
 #define SPN_BLOCK_ID_TAG_PATH_QUAD              1  // 1 -- 6  segments
 #define SPN_BLOCK_ID_TAG_PATH_CUBIC             2  // 2 -- 8  segments
 #define SPN_BLOCK_ID_TAG_PATH_RAT_QUAD          3  // 3 -- 7  segments : 6 + w1      -- w0 = w2 = 1
 #define SPN_BLOCK_ID_TAG_PATH_RAT_CUBIC         4  // 4 -- 10 segments : 8 + w1 + w2 -- w0 = w3 = 1
 // ...
 // tags 5-29 are available
 // ...
 #define SPN_BLOCK_ID_TAG_PATH_COUNT             5  // how many path types?  can share same value with PATH_NEXT
 #define SPN_BLOCK_ID_TAG_PATH_NEXT              (SPN_TAGGED_BLOCK_ID_MASK_TAG - 1) // 30 : 0x1E
 #define SPN_BLOCK_ID_TAG_INVALID                SPN_TAGGED_BLOCK_ID_MASK_TAG       // 31 : 0x1F

 //
 // BLOCK POOL
 //

 // Block and subblock sizes in dwords, plus the matching low-bit masks.
 #define SPN_BLOCK_POOL_BLOCK_DWORDS             (1 << SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCK_DWORDS          (1 << SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2)

 #define SPN_BLOCK_POOL_BLOCK_DWORDS_MASK        SPN_BITS_TO_MASK(SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCK_DWORDS_MASK     SPN_BITS_TO_MASK(SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2)

 // Subblocks per block: log2, count and mask.
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2 (SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2 - SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK      (1 << SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2)
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_MASK SPN_BITS_TO_MASK(SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2)

 // Block size in qwords (half the dword count).
 #define SPN_BLOCK_POOL_BLOCK_QWORDS_LOG2        (SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2 - 1)
 #define SPN_BLOCK_POOL_BLOCK_QWORDS             (1 << SPN_BLOCK_POOL_BLOCK_QWORDS_LOG2)
 #define SPN_BLOCK_POOL_BLOCK_QWORDS_MASK        SPN_BITS_TO_MASK(SPN_BLOCK_POOL_BLOCK_QWORDS_LOG2)

 // Subblock size in qwords (half the dword count).
 #define SPN_BLOCK_POOL_SUBBLOCK_QWORDS_LOG2     (SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2 - 1)
 #define SPN_BLOCK_POOL_SUBBLOCK_QWORDS          (1 << SPN_BLOCK_POOL_SUBBLOCK_QWORDS_LOG2)

 // Subblock size in owords (a quarter of the dword count).
 #define SPN_BLOCK_POOL_SUBBLOCK_OWORDS_LOG2     (SPN_DEVICE_BLOCK_POOL_SUBBLOCK_DWORDS_LOG2 - 2)
 #define SPN_BLOCK_POOL_SUBBLOCK_OWORDS          (1 << SPN_BLOCK_POOL_SUBBLOCK_OWORDS_LOG2)

 // Indices of the two block pool atomics -- presumably into an array of
 // counters declared elsewhere; TODO confirm.
 #define SPN_BLOCK_POOL_ATOMICS_READS            0
 #define SPN_BLOCK_POOL_ATOMICS_WRITES           1

 //
 //
 //

 // Subgroup-relative block pool helpers.  Every macro parameter is
 // parenthesized so an expression argument (e.g. `a + b`) is used as a unit.

 // Number of subblocks covered by `subgroup_size_` dwords.
 #define SPN_BLOCK_POOL_SUBBLOCKS_PER_SUBGROUP(subgroup_size_)           \
   ((subgroup_size_) / SPN_BLOCK_POOL_SUBBLOCK_DWORDS)

 // log2 of the number of subgroup-sized spans in one block.
 #define SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_LOG2(subgroup_size_log2_)    \
   (SPN_DEVICE_BLOCK_POOL_BLOCK_DWORDS_LOG2 - (subgroup_size_log2_))

 // Number of subgroup-sized spans in one block.
 #define SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK(subgroup_size_log2_)         \
   (1 << SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_LOG2(subgroup_size_log2_))

 // Low-bit mask matching SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK().
 #define SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_MASK(subgroup_size_log2_)    \
   SPN_BITS_TO_MASK(SPN_BLOCK_POOL_SUBGROUPS_PER_BLOCK_LOG2(subgroup_size_log2_))

 //
 //
 //

 // True when the low SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2 bits of `bid_`
 // are all zero.  The bitfield-extract form is the default; defining
 // SPN_BLOCK_ID_IS_BLOCK_USE_MASK selects the AND-with-mask form instead.
 #ifndef SPN_BLOCK_ID_IS_BLOCK_USE_MASK
 #define SPN_BLOCK_ID_IS_BLOCK(bid_)             (SPN_BITFIELD_EXTRACT(bid_,0,SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_LOG2) == 0)
 #else
 #define SPN_BLOCK_ID_IS_BLOCK(bid_)             (((bid_) & SPN_BLOCK_POOL_SUBBLOCKS_PER_BLOCK_MASK) == 0)
 #endif

 //
 // PATH HEAD
 //
 //
 //   struct spn_path_header
 //   {
 //     struct {
 //       uint32_t handle; // host handle
 //       uint32_t blocks; // total number of blocks in path object
 //       uint32_t nodes;  // number of path node blocks -- does not include head
 //       uint32_t na;     // unused
 //     } count;           // uvec4
 //
 //     uvec4      prims;  // packed counts: lines, quads, cubics, rat-quads, rat-cubics
 //
 //     struct {
 //       float    x0;
 //       float    y0;
 //       float    x1;
 //       float    y1;
 //     } bounds;
 //   };
 //

 // Header size: 12 dwords, matching the three 4-dword groups (count, prims,
 // bounds) in the struct sketch above.  POW2_RU is presumably DWORDS rounded
 // up to the next power of two -- TODO confirm.
 #define SPN_PATH_HEAD_DWORDS                    12
 #define SPN_PATH_HEAD_QWORDS                    (SPN_PATH_HEAD_DWORDS / 2)
 #define SPN_PATH_HEAD_DWORDS_POW2_RU            16

 // Dword offsets of header members; PRIMS starts the second 4-dword group.
 #define SPN_PATH_HEAD_OFFSET_HANDLE             0
 #define SPN_PATH_HEAD_OFFSET_BLOCKS             1
 #define SPN_PATH_HEAD_OFFSET_NODES              2
 #define SPN_PATH_HEAD_OFFSET_PRIMS              4

 //
 // Counts of the 5 path types are packed into a uvec4
 //
 // lines:      26 -- 64m
 // quads:      26 -- 64m
 // cubics:     26 -- 64m
 // rat_quads:  25 -- 32m
 // rat_cubics: 25 -- 32m
 //

 // Getters: `p_` is the packed 4-dword prims array.  It is parenthesized
 // before indexing so that any expression yielding the array can be passed.
 #define SPN_PATH_PRIMS_GET_LINES(p_)       (SPN_BITFIELD_EXTRACT((p_)[0], 0,26))                                               // 26
 #define SPN_PATH_PRIMS_GET_QUADS(p_)       (SPN_BITFIELD_EXTRACT((p_)[0],26, 6) | (SPN_BITFIELD_EXTRACT((p_)[1],0,20) <<  6))  // 26
 #define SPN_PATH_PRIMS_GET_CUBICS(p_)      (SPN_BITFIELD_EXTRACT((p_)[1],20,12) | (SPN_BITFIELD_EXTRACT((p_)[2],0,14) << 12))  // 26
 #define SPN_PATH_PRIMS_GET_RAT_QUADS(p_)   (SPN_BITFIELD_EXTRACT((p_)[2],14,18) | (SPN_BITFIELD_EXTRACT((p_)[3],0, 7) << 18))  // 25
 #define SPN_PATH_PRIMS_GET_RAT_CUBICS(p_)  (SPN_BITFIELD_EXTRACT((p_)[3], 7,25))                                               // 25

 // Brace initializer for the packed 4-dword array: lines, quads, cubics,
 // rat-quads and rat-cubics counts laid end to end from bit 0 of dword 0.
 //
 // "UNSAFE": the counts are not masked to their field widths, so a count
 // that exceeds its field spills into the neighboring field.
 //
 // Each argument is parenthesized so that an expression argument
 // (e.g. `a | b`) is shifted as a unit.
 #define SPN_PATH_PRIMS_INIT_UNSAFE(ll_,qq_,cc_,rq_,rc_) \
   {                                                     \
     ((ll_)      ) | ((qq_) << 26),                      \
     ((qq_) >>  6) | ((cc_) << 20),                      \
     ((cc_) >> 12) | ((rq_) << 14),                      \
     ((rq_) >> 18) | ((rc_) <<  7)                       \
   }

 // Currently a plain alias of the unchecked initializer.
 #define SPN_PATH_PRIMS_INIT(ll_,qq_,cc_,rq_,rc_)        \
   SPN_PATH_PRIMS_INIT_UNSAFE(ll_,qq_,cc_,rq_,rc_)

 //
 // PATH HEAD COMPILE-TIME PREDICATES
 //

 // In these predicates `sgsz_` is the subgroup size and `i_` a row index,
 // so row i_ covers elements [i_ * sgsz_, (i_ + 1) * sgsz_).  Every
 // parameter is parenthesized so expression arguments keep their meaning.

 // True when x_ is at or beyond the first element of row i_.
 #define SPN_PATH_HEAD_ELEM_GTE(sgsz_,x_,i_)     \
   ((x_) >= (i_) * (sgsz_))

 // True when x_ falls inside row i_.
 #define SPN_PATH_HEAD_ELEM_IN_RANGE(sgsz_,x_,i_)        \
   (SPN_PATH_HEAD_ELEM_GTE(sgsz_,x_,i_) &&               \
    !SPN_PATH_HEAD_ELEM_GTE(sgsz_,x_,(i_)+1))

 // True when all of row i_ lies within the first SPN_PATH_HEAD_DWORDS dwords.
 #define SPN_PATH_HEAD_ENTIRELY_HEADER(sgsz_,i_)                 \
   SPN_PATH_HEAD_ELEM_GTE(sgsz_,SPN_PATH_HEAD_DWORDS,(i_)+1)

 // True when the header ends inside row i_.
 #define SPN_PATH_HEAD_PARTIALLY_HEADER(sgsz_,i_)                \
   SPN_PATH_HEAD_ELEM_IN_RANGE(sgsz_,SPN_PATH_HEAD_DWORDS,i_)

 // Per-invocation test (reads gl_SubgroupInvocationID): true when this
 // lane's element of row i_ is a header dword.
 #define SPN_PATH_HEAD_IS_HEADER(sgsz_,i_)                       \
   (gl_SubgroupInvocationID + (i_) * (sgsz_) < SPN_PATH_HEAD_DWORDS)

 //
 // FILL COMMANDS
 //
 //
 // Fill and rasterize cmds only differ in their first word semantics.
 //
 // The rasterize command points to a 32-bit nodeword so will need to
 // be more than 32-bits if we want to access more than 16GB of blocks.
 //
 // FIXME(allanmac): drop support for more than 16GB
 //
 // For GLSL we will use a uvec4 laid out as follows:
 //
 //  union {
 //
 //    uvec4 u32v4;
 //
 //    struct spn_cmd_fill {
 //      uint32_t path_h;               // host id
 //      uint32_t na              : 16; // unused
 //      uint32_t cohort          : 15; // cohort is 8-11 bits
 //      uint32_t transform_type  : 1;  // transform type: 0=affine,1=projective
 //      uint32_t transform;            // transform index
 //      uint32_t clip;                 // clip index
 //    } fill;
 //
 //    struct spn_cmd_rast {
 //      uint32_t node_id;              // device block id
 //      uint32_t node_dword      : 16; // block dword offset
 //      uint32_t cohort          : 15; // cohort is 8-11 bits
 //      uint32_t transform_type  : 1;  // transform type: 0=affine,1=projective
 //      uint32_t transform;            // transform index
 //      uint32_t clip;                 // clip index
 //    } rasterize;
 //
 //  };
 //
 //
 // NOTE(allanmac): We can pack the transform and clip indices down to a
 // more practical 16 bits in case we want to add additional
 // rasterization command indices or flags.
 //

 // Values of the 1-bit transform_type field (bit 31 of dword 1).
 #define SPN_CMD_FILL_TRANSFORM_TYPE_AFFINE                 0
 #define SPN_CMD_FILL_TRANSFORM_TYPE_PROJECTIVE             1

 // Field getters for a 4-dword fill command `c_`.  `c_` is parenthesized
 // before indexing so any expression yielding the command can be passed.
 #define SPN_CMD_FILL_GET_PATH_H(c_)                        (c_)[0]
 #define SPN_CMD_FILL_GET_COHORT(c_)                        SPN_BITFIELD_EXTRACT((c_)[1],16,15)
 #define SPN_CMD_FILL_GET_TRANSFORM_TYPE(c_)                SPN_BITFIELD_EXTRACT((c_)[1],31,1)
 #define SPN_CMD_FILL_GET_TRANSFORM(c_)                     (c_)[2]
 #define SPN_CMD_FILL_GET_CLIP(c_)                          (c_)[3]

 // Transform-type predicates: test bit 31 of dword 1 without extracting it.
 #define SPN_CMD_FILL_IS_TRANSFORM_TYPE_AFFINE(c_)          (((c_)[1] & SPN_BITS_TO_MASK_AT(31,1)) == 0)
 #define SPN_CMD_FILL_IS_TRANSFORM_TYPE_PROJECTIVE(c_)      (((c_)[1] & SPN_BITS_TO_MASK_AT(31,1)) != 0)

 //
 //
 //

 // The rasterize command shares dwords 1..3 with the fill command, so these
 // accessors simply forward to the fill versions.
 #define SPN_CMD_RASTERIZE_GET_COHORT(c_)                   SPN_CMD_FILL_GET_COHORT(c_)
 #define SPN_CMD_RASTERIZE_GET_TRANSFORM_TYPE(c_)           SPN_CMD_FILL_GET_TRANSFORM_TYPE(c_)
 #define SPN_CMD_RASTERIZE_GET_TRANSFORM(c_)                SPN_CMD_FILL_GET_TRANSFORM(c_)
 #define SPN_CMD_RASTERIZE_GET_CLIP(c_)                     SPN_CMD_FILL_GET_CLIP(c_)

 #define SPN_CMD_RASTERIZE_IS_TRANSFORM_TYPE_AFFINE(c_)     SPN_CMD_FILL_IS_TRANSFORM_TYPE_AFFINE(c_)
 #define SPN_CMD_RASTERIZE_IS_TRANSFORM_TYPE_PROJECTIVE(c_) SPN_CMD_FILL_IS_TRANSFORM_TYPE_PROJECTIVE(c_)

 // Rasterize-only fields: node id in dword 0, node dword offset in the low
 // 16 bits of dword 1.  Parameters are parenthesized for macro hygiene.
 #define SPN_CMD_RASTERIZE_GET_NODE_ID(c_)                  (c_)[0]
 #define SPN_CMD_RASTERIZE_GET_NODE_DWORD(c_)               SPN_BITFIELD_EXTRACT((c_)[1],0,16)

 // Setters expand to an assignment; use them as statements.
 #define SPN_CMD_RASTERIZE_SET_NODE_ID(c_,n_id_)            (c_)[0] = (n_id_)
 #define SPN_CMD_RASTERIZE_SET_NODE_DWORD(c_,n_lo_)         (c_)[1] = SPN_BITFIELD_INSERT((c_)[1],(n_lo_),0,16)

 //
 // Spinel supports a projective transformation matrix with the
 // requirement that w2 is implicitly 1.0.
 //
 //   A---------B----+
 //   | sx  shx | tx |
 //   | shy sy  | ty |
 //   C---------D----+
 //   | w0  w1  | 1  |
 //   +---------+----+
 //
 // The transformation matrix can be initialized with the array:
 //
 //   { sx shx shy sy tx ty w0 w1 }
 //
 // struct spn_transform
 // {
 //   SPN_TYPE_MAT2X2 a; //  { { sx shx } {shy sy } } -- rotate
 //   SPN_TYPE_VEC2   b; //  { tx ty }                -- translate
 //   SPN_TYPE_VEC2   c; //  { w0 w1 }                -- project
 // };
 //
 // struct spn_transform_lo
 // {
 //   SPN_TYPE_MAT2X2 a; //  { { sx shx } {shy sy } } -- rotate
 // };
 //
 // struct spn_transform_hi
 // {
 //   SPN_TYPE_VEC2   b; //  { tx ty }                -- translate
 //   SPN_TYPE_VEC2   c; //  { w0 w1 }                -- project
 // };
 //
 //
 // Note that the raster builder is storing the transform as two
 // float[4] quads.
 //
 // The rasterization shaders then load these vec4 quads as mat2
 // matrices.
 //

 // Element indices within the "lo" quad { sx shx shy sy }.
 #define SPN_TRANSFORM_LO_INDEX_SX                 0
 #define SPN_TRANSFORM_LO_INDEX_SHX                1
 #define SPN_TRANSFORM_LO_INDEX_SHY                2
 #define SPN_TRANSFORM_LO_INDEX_SY                 3

 // Element indices within the "hi" quad { tx ty w0 w1 }.
 #define SPN_TRANSFORM_HI_INDEX_TX                 0
 #define SPN_TRANSFORM_HI_INDEX_TY                 1
 #define SPN_TRANSFORM_HI_INDEX_W0                 2
 #define SPN_TRANSFORM_HI_INDEX_W1                 3

 //
 // PATHS COPY COMMANDS
 //
 // The PATH COPY command is simply a 32-bit tagged block id with a
 // host-controlled rolling counter stuffed into the id field.
 //

 // Command types, carried in the tag field of the command dword.
 #define SPN_PATHS_COPY_CMD_TYPE_SEGS              0
 #define SPN_PATHS_COPY_CMD_TYPE_NODE              1
 #define SPN_PATHS_COPY_CMD_TYPE_HEAD              2

 // The command type is read with the tagged-block-id tag accessor.
 #define SPN_PATHS_COPY_CMD_GET_TYPE(cmd)          SPN_TAGGED_BLOCK_ID_GET_TAG(cmd)

 //
 // RASTER HEAD
 //
 // The raster header and nodes use a strided layout so that the block is
 // split in two with the low dword of the 64-bit keys stored in the
 // first half of the block and the high dword in the second half.
 //
 // Note: a simple 32-bit .pkidx implies a 16 GB limit to the block pool.
 //
 // Note: we could interpret the 32-bit .pkidx as the low bits indexing
 // the dwords in the low half of the block and the high bits indexing
 // qwords.  This will index a 32 GB block pool.
 //
 //   raster head block
 //   {
 //     struct spn_raster_header.lo
 //     {
 //       uint32_t nodes;   // # of nodes  -- not including header
 //       uint32_t ttsks;   // # of ttsks
 //       uint32_t ttpks;   // # of ttpks
 //       uint32_t pkidx;   // block pool dword of first ttpk.lo
 //       uint32_t blocks;  // # of blocks -- head+node+skb+pkb
 //
 //       ... TTXK.lo ...
 //     };
 //
 //     struct spn_raster_header.hi
 //     {
 //       int32_t  x0;      // axis-aligned bounding box
 //       int32_t  x1;      // axis-aligned bounding box
 //       int32_t  y0;      // axis-aligned bounding box
 //       int32_t  y1;      // axis-aligned bounding box
 //       uint32_t na0;     // reserved
 //
 //       ... TTXK.hi ...
 //     };
 //   }
 //
 // Usage:
 //
 //   - RASTERS_RECLAIM: this shader only needs to load the low dwords of
 //     each block because only the block and node counts and the TTXB id
 //     of each key are required.
 //
 //   - RASTERS_PREFIX: this shader needs to vector load the values
 //     calculated by RASTERS_ALLOC and write them back to the block.
 //
 //   - PLACE_TT*K: these shaders need to efficiently load the raster
 //     header.
 //

 // A raster node holds one block's worth of qwords.
 #define SPN_RASTER_NODE_QWORDS                   SPN_BLOCK_POOL_BLOCK_QWORDS

 // Header size: 10 dwords = 5 in the .lo half plus 5 in the .hi half of the
 // struct sketch above, i.e. 5 qwords.
 #define SPN_RASTER_HEAD_DWORDS                   10
 #define SPN_RASTER_HEAD_QWORDS                   (SPN_RASTER_HEAD_DWORDS / 2)

 // Dword offsets of the members of spn_raster_header.lo.
 #define SPN_RASTER_HEAD_LO_OFFSET_NODES          0
 #define SPN_RASTER_HEAD_LO_OFFSET_TTSKS          1
 #define SPN_RASTER_HEAD_LO_OFFSET_TTPKS          2
 #define SPN_RASTER_HEAD_LO_OFFSET_PKIDX          3
 #define SPN_RASTER_HEAD_LO_OFFSET_BLOCKS         4

 // Dword offsets of the members of spn_raster_header.hi.
 #define SPN_RASTER_HEAD_HI_OFFSET_X0             0
 #define SPN_RASTER_HEAD_HI_OFFSET_X1             1
 #define SPN_RASTER_HEAD_HI_OFFSET_Y0             2
 #define SPN_RASTER_HEAD_HI_OFFSET_Y1             3
 #define SPN_RASTER_HEAD_HI_OFFSET_NA0            4

 //
 // RASTER HEAD COMPILE-TIME PREDICATES
 //

 // In these predicates `sgsz_` is the subgroup size and `i_` a row index,
 // so row i_ covers elements [i_ * sgsz_, (i_ + 1) * sgsz_).  Every
 // parameter is parenthesized so expression arguments keep their meaning.

 // True when x_ is at or beyond the first element of row i_.
 #define SPN_RASTER_HEAD_ELEM_GTE(sgsz_,x_,i_)      \
   ((x_) >= (i_) * (sgsz_))

 // True when x_ falls inside row i_.
 #define SPN_RASTER_HEAD_ELEM_IN_RANGE(sgsz_,x_,i_) \
   (SPN_RASTER_HEAD_ELEM_GTE(sgsz_,x_,i_) &&        \
    !SPN_RASTER_HEAD_ELEM_GTE(sgsz_,x_,(i_)+1))

 // True when all of row i_ lies within the first SPN_RASTER_HEAD_QWORDS qwords.
 #define SPN_RASTER_HEAD_ENTIRELY_HEADER(sgsz_,i_)                 \
   SPN_RASTER_HEAD_ELEM_GTE(sgsz_,SPN_RASTER_HEAD_QWORDS,(i_)+1)

 // True when the header ends inside row i_.
 #define SPN_RASTER_HEAD_PARTIALLY_HEADER(sgsz_,i_)                \
   SPN_RASTER_HEAD_ELEM_IN_RANGE(sgsz_,SPN_RASTER_HEAD_QWORDS,i_)

 // Per-invocation test (reads gl_SubgroupInvocationID): true when this
 // lane's element of row i_ is a header qword.
 #define SPN_RASTER_HEAD_IS_HEADER(sgsz_,i_)                       \
   (gl_SubgroupInvocationID + (i_) * (sgsz_) < SPN_RASTER_HEAD_QWORDS)

 //
 // Hard requirements:
 //
 //   - A TTXB "block pool" extent that is at least 1GB.
 //
 //   - A virtual surface of at least 8K x 8K
 //
 //   - A physical surface of __don't really care__ because it's
 //     advantageous to tile the physical surface since it's likely
 //     to shrink the post-place TTCK sorting step.
 //
 //                             TTXB BITS
 //    EXTENT     +------------------------------------+
 //   SIZE (MB)   |  22    23    24    25    26    27  |
 //          +----+------------------------------------+
 //    TTXB  |  8 |  128   256   512  1024  2048  4096 |
 //   DWORDS | 16 |  256   512  1024  2048  4096  8192 |
 //          +----+------------------------------------+
 //
 //
 //                                     X/Y BITS
 //   SURFACE DIM +------------------------------------------------------+
 //               |   5     6     7     8     9    10    11    12    13  |
 //    TILE  +----+------------------------------------------------------+
 //   HEIGHT |  3 |  256   512  1024  2048  4096  8192 16384 32768 65536 |
 //    LOG2  |  4 |  512  1024  2048  4096  8192 16384 32768 65536  128K |
 //          +----+------------------------------------------------------+
 //   TILES^2     | 1024  4096 16384 65536  256K    1M    4M   16M   64M |
 //               +------------------------------------------------------+
 //
 // The following values should be pretty future-proof across all GPUs:
 //
 //   - The minimum addressable subblock size is 16 dwords (64 bytes)
 //     to ensure there is enough space for a path or raster header and
 //     its payload.
 //
 //   - Blocks are power-of-2 multiples of subblocks. Larger blocks can
 //     reduce allocation activity (fewer atomic adds).
 //
 //   - 27 bits of TTXB_ID space implies a max of 4GB-32GB of
 //     rasterized paths depending on the size of the TTXB block.
 //     This could enable interesting use cases.
 //
 //   - A virtual rasterization surface that's from +/-16K to +/-128K
 //     depending on the size of the TTXB block.
 //
 //   - Keys that (optionally) only require a 32-bit high word
 //     comparison.
 //
 //   - Support for a minimum of 256K layers. This can be practically
 //     raised to 1m or 2m layers.
 //

 //
 // The size of the cohort determines the max number of rasters that can
 // be submitted to the GPU in a single dispatch.  We want this number to
 // be as large as possible.  A dispatch of 2048 subgroups is very large
 // but there is potential to push this to 8192 with modifications to the
 // segmenter and possibly an auxiliary extent.
 //
 // The max cohort id is reserved as it indicates an invalid TTRK.
 //
 // Each cohort member launches one subgroup per block of common path
 // geometry.
 //
 // The rasterizer produces TTRK keys:
 //
 // TTRK (64-BIT COMPARE)
 //
 //  0                                                                  63
 //  | TTSB_ID | NEW_X  | NEW_Y  | X_LO | X_HI |   Y  | RASTER COHORT ID |
 //  +---------+--------+--------+------+------+------+------------------+
 //  |    27   | 1 (=0) | 1 (=0) |   3  |   9  |  12  |        11        |
 //
 // After segmentation the cohort id can be ignored as we've gathered
 // enough statistics on the cohort to execute the prefix kernel.
 //

 //
 // FIXME(allanmac): Harmonize on low-to-high "XY" naming instead of "YX"!
 //

 // Field widths.  X straddles the dword boundary: 3 bits at the top of the
 // low dword and 9 bits at the bottom of the high dword (12 in total).
 #define SPN_TTRK_LO_BITS_TTSB_ID                 SPN_TAGGED_BLOCK_ID_BITS_ID
 #define SPN_TTRK_LO_HI_BITS_X                    12
 #define SPN_TTRK_LO_BITS_X                       3
 #define SPN_TTRK_HI_BITS_X                       9
 #define SPN_TTRK_HI_BITS_Y                       12
 #define SPN_TTRK_HI_BITS_COHORT                  11

 // Combined XY widths: whole key, low-dword part and high-dword part.
 #define SPN_TTRK_BITS_XY                         (SPN_TTRK_LO_HI_BITS_X + SPN_TTRK_HI_BITS_Y) // 24
 #define SPN_TTRK_LO_BITS_XY                      SPN_TTRK_LO_BITS_X                           // 3
 #define SPN_TTRK_HI_BITS_XY                      (SPN_TTRK_HI_BITS_X + SPN_TTRK_HI_BITS_Y)    // 21

 // Bit offsets within the low (LO) and high (HI) dwords.
 #define SPN_TTRK_LO_OFFSET_NEW_X                 SPN_TTRK_LO_BITS_TTSB_ID         // 27
 #define SPN_TTRK_LO_OFFSET_NEW_Y                 (SPN_TTRK_LO_OFFSET_NEW_X + 1)   // 28
 #define SPN_TTRK_LO_OFFSET_X                     (SPN_TTRK_LO_OFFSET_NEW_Y + 1)   // 29
 #define SPN_TTRK_HI_OFFSET_Y                     SPN_TTRK_HI_BITS_X               // 9
 #define SPN_TTRK_HI_OFFSET_COHORT                (32 - SPN_TTRK_HI_BITS_COHORT)   // 21

 // In-place masks, built as SPN_BITS_TO_MASK_AT(offset,bits).
 #define SPN_TTRK_LO_MASK_X                       SPN_BITS_TO_MASK_AT(SPN_TTRK_LO_OFFSET_X,SPN_TTRK_LO_BITS_X)
 #define SPN_TTRK_HI_MASK_Y                       SPN_BITS_TO_MASK_AT(SPN_TTRK_HI_OFFSET_Y,SPN_TTRK_HI_BITS_Y)

 // Flag values for the 2-bit NEW_X/NEW_Y field written by SPN_TTRK_SET_NEW_XY().
 #define SPN_TTRK_NEW_X                           1
 #define SPN_TTRK_NEW_Y                           2

 // Getters taking a single dword of the key.
 #define SPN_TTRK_LO_GET_TTSB_ID(t_lo_)           SPN_BITFIELD_EXTRACT(t_lo_,0,SPN_TTRK_LO_BITS_TTSB_ID)
 #define SPN_TTRK_HI_GET_COHORT(t_hi_)            SPN_BITFIELD_EXTRACT(t_hi_,SPN_TTRK_HI_OFFSET_COHORT,SPN_TTRK_HI_BITS_COHORT)

 // Setters taking the whole key `t_` ([0] = low dword, [1] = high dword).
 // SET_XY writes a field that straddles both dwords via SPN_GLSL_INSERT_UVEC2_UINT.
 #define SPN_TTRK_SET_NEW_Y(t_,y_)                (t_)[0] = SPN_BITFIELD_INSERT((t_)[0],y_,SPN_TTRK_LO_OFFSET_NEW_Y,1)
 #define SPN_TTRK_SET_NEW_XY(t_,xy_)              (t_)[0] = SPN_BITFIELD_INSERT((t_)[0],xy_,SPN_TTRK_LO_OFFSET_NEW_X,2)
 #define SPN_TTRK_SET_XY(t_,xy_)                  SPN_GLSL_INSERT_UVEC2_UINT(t_,xy_,SPN_TTRK_LO_OFFSET_X,SPN_TTRK_BITS_XY)
 #define SPN_TTRK_SET_COHORT(t_,c_)               (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],c_,SPN_TTRK_HI_OFFSET_COHORT,SPN_TTRK_HI_BITS_COHORT)

 // Predicates on the NEW_X / NEW_Y bits of the low dword.
 #define SPN_TTRK_IS_NEW_X(t_)                    (SPN_BITFIELD_EXTRACT((t_)[0],SPN_TTRK_LO_OFFSET_NEW_X,1) != 0)
 #define SPN_TTRK_IS_NEW_Y(t_)                    (SPN_BITFIELD_EXTRACT((t_)[0],SPN_TTRK_LO_OFFSET_NEW_Y,1) != 0)

 //
 // TTSK v1 ( DEFAULT )
 //
 //  0                            63
 //  | TTSB_ID |   SPAN  |  X |  Y |
 //  +---------+---------+----+----+
 //  |    27   | 13 [<0] | 12 | 12 |
 //
 //
 // TTPK v2 ( DEFAULT )
 //
 //  0                                  63
 //  | TTPB_ID |      SPAN     |  X |  Y |
 //  +---------+---------------+----+----+
 //  |    27   | 13 [+1,+4095] | 12 | 12 |
 //
 //
 // A TTSK.SPAN inherits the TTRK[0] dword unmodified (in flux).
 //
 // A TTPK.SPAN has a range of [+1,+4095].
 //
 // A TTXK.SPAN of 0 indicates either:
 //
 //   - an invalid key
 //   - a TTXK key pointing to all TTS_INVALID values
 //   - a TTPK key pointing to all zero values
 //
 // In all cases, this key can be skipped during rendering.
 //
 // TTXK.Y and TTXK.X are signed but stored as biased unsigned.
 //
 // An invalid TTXK has a span of zero and a TTXB_ID of all 1's.
 //

 // A TTSK keeps the TTRK NEW_X/NEW_Y bits, so these forward to the TTRK predicates.
 #define SPN_TTSK_IS_NEW_X(t_)                    SPN_TTRK_IS_NEW_X(t_)
 #define SPN_TTSK_IS_NEW_Y(t_)                    SPN_TTRK_IS_NEW_Y(t_)

 // Field widths.  SPAN is 13 bits split 5 (low dword) + 8 (high dword).
 #define SPN_TTXK_LO_BITS_TTXB_ID                 SPN_TTRK_LO_BITS_TTSB_ID
 #define SPN_TTXK_LO_HI_BITS_SPAN                 13
 #define SPN_TTXK_LO_BITS_SPAN                    5  // straddles a
 #define SPN_TTXK_HI_BITS_SPAN                    8  // word boundary
 #define SPN_TTXK_HI_BITS_X                       SPN_TTRK_LO_HI_BITS_X
 #define SPN_TTXK_HI_BITS_Y                       SPN_TTRK_HI_BITS_Y
 #define SPN_TTXK_HI_BITS_XY                      (SPN_TTXK_HI_BITS_Y + SPN_TTXK_HI_BITS_X)

 // Bit offsets within the low (LO) and high (HI) dwords.
 #define SPN_TTXK_LO_OFFSET_SPAN                  SPN_TTXK_LO_BITS_TTXB_ID
 #define SPN_TTXK_HI_OFFSET_X                     (32 - SPN_TTXK_HI_BITS_XY) // 8
 #define SPN_TTXK_HI_OFFSET_Y                     (32 - SPN_TTXK_HI_BITS_Y)  // 20
 #define SPN_TTXK_HI_OFFSET_XY                    (32 - SPN_TTXK_HI_BITS_XY) // 8

 // In-place masks, built as SPN_BITS_TO_MASK_AT(offset,bits).
 //
 // The combined XY mask starts at the XY offset (bit 8) so that it covers
 // both the X and Y fields; anchoring it at the Y offset would push the
 // 24-bit mask past bit 31 and leave only the Y bits.
 #define SPN_TTXK_LO_MASK_TTXB_ID                 SPN_BITS_TO_MASK(SPN_TTXK_LO_BITS_TTXB_ID)
 #define SPN_TTXK_LO_MASK_SPAN                    SPN_BITS_TO_MASK_AT(SPN_TTXK_LO_OFFSET_SPAN,SPN_TTXK_LO_BITS_SPAN)
 #define SPN_TTXK_HI_MASK_SPAN                    SPN_BITS_TO_MASK(SPN_TTXK_HI_BITS_SPAN)
 #define SPN_TTXK_HI_MASK_X                       SPN_BITS_TO_MASK_AT(SPN_TTXK_HI_OFFSET_X,SPN_TTXK_HI_BITS_X)
 #define SPN_TTXK_HI_MASK_Y                       SPN_BITS_TO_MASK_AT(SPN_TTXK_HI_OFFSET_Y,SPN_TTXK_HI_BITS_Y)
 #define SPN_TTXK_HI_MASK_XY                      SPN_BITS_TO_MASK_AT(SPN_TTXK_HI_OFFSET_XY,SPN_TTXK_HI_BITS_XY)

 // Value that increments the X field by one when added to the high dword.
 #define SPN_TTXK_HI_ONE_X                        (1u << SPN_TTXK_HI_OFFSET_X)

 // Getters taking a single dword of the key.
 #define SPN_TTXK_LO_GET_TTXB_ID(t_lo_)           SPN_BITFIELD_EXTRACT(t_lo_,0,SPN_TTXK_LO_BITS_TTXB_ID)
 #define SPN_TTXK_HI_GET_XY(t_hi_)                SPN_BITFIELD_EXTRACT(t_hi_,SPN_TTXK_HI_OFFSET_XY,SPN_TTXK_HI_BITS_XY)

 // Getters taking the whole key `t_` ([0] = low dword, [1] = high dword).
 #define SPN_TTXK_GET_HI(t_)                      (t_)[1]
 #define SPN_TTXK_GET_TTXB_ID(t_)                 SPN_TTXK_LO_GET_TTXB_ID((t_)[0])
 #define SPN_TTXK_GET_SPAN(t_)                    SPN_GLSL_EXTRACT_UVEC2_INT((t_),SPN_TTXK_LO_OFFSET_SPAN,SPN_TTXK_LO_HI_BITS_SPAN)
 #define SPN_TTXK_GET_X(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTXK_HI_OFFSET_X,SPN_TTXK_HI_BITS_X)
 #define SPN_TTXK_GET_Y(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTXK_HI_OFFSET_Y,SPN_TTXK_HI_BITS_Y)
 #define SPN_TTXK_GET_XY(t_)                      SPN_TTXK_HI_GET_XY((t_)[1])

 // Setters expand to an assignment (or a helper call for the straddling
 // SPAN field); use them as statements.  `t_` is parenthesized before indexing.
 #define SPN_TTXK_SET_TTXB_ID(t_,i_)              (t_)[0] = SPN_BITFIELD_INSERT((t_)[0],(i_),0,SPN_TTXK_LO_BITS_TTXB_ID)
 #define SPN_TTXK_SET_SPAN(t_,s_)                 SPN_GLSL_INSERT_UVEC2_UINT((t_),(s_),SPN_TTXK_LO_OFFSET_SPAN,SPN_TTXK_LO_HI_BITS_SPAN)
 #define SPN_TTXK_SET_XY(t_,i_)                   (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],(i_),SPN_TTXK_HI_OFFSET_XY,SPN_TTXK_HI_BITS_XY)

 // Invalid key: TTXB_ID of all ones, everything else (including SPAN) zero.
 // Uses the GLSL uvec2 constructor.
 #define SPN_TTXK_INVALID                         uvec2(SPN_TTXK_LO_MASK_TTXB_ID,0)

 //
 // YX
 //
 //  0        31
 //  |  X |  Y |
 //  +----+----+
 //  | 12 | 20 |
 //
 // A few shaders probe the YX value.
 //
 // X occupies the low 12 bits, so the max value of X is 4095.  Y is the
 // remaining 32 - 12 = 20 bits above it.
 //

 #define SPN_XY_GET_Y(yx_)                        SPN_BITFIELD_EXTRACT(yx_,SPN_TTXK_HI_BITS_X,32-SPN_TTXK_HI_BITS_X)
 #define SPN_XY_X_MASK                            SPN_BITS_TO_MASK(SPN_TTXK_HI_BITS_X)

 //
 // PLACE
 //

 // Place command record.  SPN_TYPE_UINT / SPN_TYPE_INT are defined outside
 // this header.
 struct spn_cmd_place
 {
   SPN_TYPE_UINT raster_h; // presumably the raster's host handle (cf. path_h in the fill command) -- TODO confirm
   SPN_TYPE_UINT layer_id; // layer id; NOTE(review): likely bounded by SPN_TTCK_LAYER_MAX -- confirm
   SPN_TYPE_INT  txty[2];  // two signed values; presumably an { x, y } translation -- TODO confirm units
 };

 //
 // TTCK (64-BIT COMPARE) -- DEFAULT
 //
 //  0                                                           63
 //  | PAYLOAD/TTSB/TTPB_ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
 //  +----------------------+--------+--------+-------+-----+-----+
 //  |          27          |    1   |    1   |   18  |  9  |  8  |
 //
 //  0                                                  32                     63
 //  | PAYLOAD/TTSB/TTPB_ID | PREFIX | ESCAPE | LAYER_LO | LAYER_HI |  X  |  Y  |
 //  +----------------------+--------+--------+----------+----------+-----+-----+
 //  |          27          |    1   |    1   |     3    |    15    |  9  |  8  |
 //
 //
 // TTCK (32-BIT COMPARE) v2 -- NOT USED
 //
 //  0                                                           63
 //  | PAYLOAD/TTSB/TTPB_ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
 //  +----------------------+--------+--------+-------+-----+-----+
 //  |          30          |    1   |    1   |   15  |  9  |  8  |
 //
 //
 // TTCK.X and TTCK.Y are unsigned
 //
 //  +-----------+-------------+
 //  | TILE SIZE | MAX SURFACE |
 //  +-----------+-------------+
 //  |   16x16   |   8K x 4K   | NVIDIA, AMD
 //  |    8x8    |   4K x 2K   | INTEL GEN, Mali G52+
 //  |    4x4    |   2K x 1K   | Mali G31, SwiftShader
 //  +-----------+-------------+
 //

 // Field widths in the low dword.
 #define SPN_TTCK_LO_BITS_TTXB_ID                 SPN_TAGGED_BLOCK_ID_BITS_ID
 #define SPN_TTCK_LO_BITS_PREFIX                  1
 #define SPN_TTCK_LO_BITS_ESCAPE                  1

 // LAYER is 18 bits split 3 (low dword) + 15 (high dword).
 #define SPN_TTCK_LO_HI_BITS_LAYER                18
 #define SPN_TTCK_LO_BITS_LAYER                   3
 #define SPN_TTCK_HI_BITS_LAYER                   15

 // X and Y occupy the top 17 bits of the high dword.
 #define SPN_TTCK_HI_BITS_X                       9
 #define SPN_TTCK_HI_BITS_Y                       8
 #define SPN_TTCK_HI_BITS_XY                      (SPN_TTCK_HI_BITS_X + SPN_TTCK_HI_BITS_Y)

 // Bit offsets within the low dword.
 #define SPN_TTCK_LO_OFFSET_PREFIX                SPN_TTCK_LO_BITS_TTXB_ID
 #define SPN_TTCK_LO_OFFSET_ESCAPE                (SPN_TTCK_LO_OFFSET_PREFIX + SPN_TTCK_LO_BITS_PREFIX)
 #define SPN_TTCK_LO_OFFSET_LAYER                 (SPN_TTCK_LO_OFFSET_ESCAPE + SPN_TTCK_LO_BITS_ESCAPE)

 // Bit offsets within the high dword.
 #define SPN_TTCK_HI_OFFSET_X                     (32 - SPN_TTCK_HI_BITS_XY)
 #define SPN_TTCK_HI_OFFSET_Y                     (32 - SPN_TTCK_HI_BITS_Y)
 #define SPN_TTCK_HI_OFFSET_XY                    (32 - SPN_TTCK_HI_BITS_XY)

 // In-place masks, built as SPN_BITS_TO_MASK_AT(offset,bits).
 #define SPN_TTCK_LO_MASK_TTXB_ID                 SPN_BITS_TO_MASK(SPN_TTCK_LO_BITS_TTXB_ID)
 #define SPN_TTCK_LO_MASK_PREFIX                  SPN_BITS_TO_MASK_AT(SPN_TTCK_LO_OFFSET_PREFIX,SPN_TTCK_LO_BITS_PREFIX)
 #define SPN_TTCK_LO_MASK_ESCAPE                  SPN_BITS_TO_MASK_AT(SPN_TTCK_LO_OFFSET_ESCAPE,SPN_TTCK_LO_BITS_ESCAPE)
 #define SPN_TTCK_LO_MASK_LAYER                   SPN_BITS_TO_MASK_AT(SPN_TTCK_LO_OFFSET_LAYER,SPN_TTCK_LO_BITS_LAYER)

 #define SPN_TTCK_HI_MASK_LAYER                   SPN_BITS_TO_MASK(SPN_TTCK_HI_BITS_LAYER)
 #define SPN_TTCK_HI_MASK_XY                      SPN_BITS_TO_MASK_AT(SPN_TTCK_HI_OFFSET_XY,SPN_TTCK_HI_BITS_XY)

 // Accessors.  `t_` is the whole key ([0] = low dword, [1] = high dword) and
 // `t_lo_` is the low dword alone.  Both are parenthesized so an expression
 // argument (e.g. `a | b` or `keys + i`) is masked or indexed as a unit.
 #define SPN_TTCK_GET_TTXB_ID(t_)                 ((t_)[0] & SPN_TTCK_LO_MASK_TTXB_ID)
 #define SPN_TTCK_LO_GET_TTXB_ID(t_lo_)           ((t_lo_) & SPN_TTCK_LO_MASK_TTXB_ID)

 #define SPN_TTCK_IS_PREFIX(t_)                   (((t_)[0] & SPN_TTCK_LO_MASK_PREFIX) != 0)
 #define SPN_TTCK_LO_IS_PREFIX(t_lo_)             (((t_lo_) & SPN_TTCK_LO_MASK_PREFIX) != 0)

 #define SPN_TTCK_IS_ESCAPE(t_)                   (((t_)[0] & SPN_TTCK_LO_MASK_ESCAPE) != 0)

 // LAYER straddles both dwords, so it goes through the uvec2 helpers.
 #define SPN_TTCK_GET_LAYER(t_)                   SPN_GLSL_EXTRACT_UVEC2_UINT(t_,SPN_TTCK_LO_OFFSET_LAYER,SPN_TTCK_LO_HI_BITS_LAYER)
 #define SPN_TTCK_SET_LAYER(t_,l_)                SPN_GLSL_INSERT_UVEC2_UINT(t_,l_,SPN_TTCK_LO_OFFSET_LAYER,SPN_TTCK_LO_HI_BITS_LAYER)

 #define SPN_TTCK_GET_Y(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTCK_HI_OFFSET_Y,SPN_TTCK_HI_BITS_Y)
 #define SPN_TTCK_GET_X(t_)                       SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTCK_HI_OFFSET_X,SPN_TTCK_HI_BITS_X)

 // Adds d_ to the X field in place; a carry out of X is not masked and
 // propagates into Y.
 #define SPN_TTCK_ADD_X(t_,d_)                    ((t_)[1] += ((d_) << SPN_TTCK_HI_OFFSET_X))

 // Largest representable layer id (all 18 layer bits set).
 #define SPN_TTCK_LAYER_MAX                       SPN_BITS_TO_MASK(SPN_TTCK_LO_HI_BITS_LAYER)

 //
 // TILE TRACE SUBPIXEL v2 (DEFAULT)
 //
 // TTS:
 //
 //  0                  31
 //  | TX | DX | TY | DY |
 //  +----+----+----+----+
 //  | 10 |  7 |  9 |  6 |
 //
 //
 // A subpixel-resolution line segment within a 32x16 (WxH) tile is
 // encoded in a 32-bit dword.
 //
 // Subpixel resolution is 5 bits.
 //
 // We're using this representation across all target
 // architectures.
 //
 // A tile X is encoded as:
 //
 //   TX : 10 : unsigned min(x0,x1) tile subpixel coordinate with a range
 //             of [0,1023].
 //
 //   DX :  7 : signed subpixel delta x1-x0. The range of the delta is
 //             [-32,32] including 0.  Note that with 7 signed bits the
 //             range of the bitfield is [-64,63].  An "invalid" TTS
 //             relies on DX being an infeasible value.
 //
 // A tile Y is encoded as:
 //
 //   TY :  9 : unsigned min(y0,y1) tile subpixel coordinate with a range
 //             of [0,511].
 //
 //   DY :  6 : signed subpixel delta y1-y0. The range of delta is
 //             [-32,32] but horizontal lines are not encoded so [1,32]
 //             is mapped to [0,31]. The resulting range [-32,31] fits in
 //             6 bits.
 //
 // Note: There are assumptions in the shaders that the X and Y subpixel
 // resolutions are the same.  Despite this, let's keep the X and Y
 // definitions separated.
 //

 //
 // TTS bitfield widths.  The four fields total 10 + 7 + 9 + 6 = 32 bits.
 //
 #define SPN_TTS_BITS_TX                          10
 #define SPN_TTS_BITS_DX                          7
 #define SPN_TTS_BITS_TY                          9
 #define SPN_TTS_BITS_DY                          6

 //
 // Subpixel resolution: 2^5 = 32 subpixel steps per pixel on each axis.
 //

 #define SPN_TTS_SUBPIXEL_X_LOG2                  5
 #define SPN_TTS_SUBPIXEL_Y_LOG2                  5

 #define SPN_TTS_SUBPIXEL_X_SIZE                  (1 << SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_SUBPIXEL_Y_SIZE                  (1 << SPN_TTS_SUBPIXEL_Y_LOG2)

 // Whole-pixel bits remaining in TX/TY once the subpixel bits are
 // removed: 10 - 5 = 5 for X and 9 - 5 = 4 for Y.
 #define SPN_TTS_PIXEL_X_LOG2                     (SPN_TTS_BITS_TX - SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_PIXEL_Y_LOG2                     (SPN_TTS_BITS_TY - SPN_TTS_SUBPIXEL_Y_LOG2)

 // Subpixel size as a float.  Note the constructor-style `float(...)`
 // conversion rather than a C cast.
 #define SPN_TTS_SUBPIXEL_X_RESL                  float(SPN_TTS_SUBPIXEL_X_SIZE)
 #define SPN_TTS_SUBPIXEL_Y_RESL                  float(SPN_TTS_SUBPIXEL_Y_SIZE)

 // SCALE_UP is the subpixel size itself (32.0).
 #define SPN_TTS_SUBPIXEL_X_SCALE_UP              SPN_TTS_SUBPIXEL_X_RESL
 #define SPN_TTS_SUBPIXEL_Y_SCALE_UP              SPN_TTS_SUBPIXEL_Y_RESL

 // SCALE_DOWN is the reciprocal of the subpixel size (1/32).
 #define SPN_TTS_SUBPIXEL_X_SCALE_DOWN            (1.0f / SPN_TTS_SUBPIXEL_X_RESL)
 #define SPN_TTS_SUBPIXEL_Y_SCALE_DOWN            (1.0f / SPN_TTS_SUBPIXEL_Y_RESL)

 //
 // TTXK.X and .Y are biased and unsigned
 //

 // log2 of a tile's extent measured in subpixels: the device tile
 // width/height log2 plus the subpixel resolution log2.
 #define SPN_TILE_SUBPIXEL_X_BITS_LOG2            (SPN_DEVICE_TILE_WIDTH_LOG2  + SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TILE_SUBPIXEL_Y_BITS_LOG2            (SPN_DEVICE_TILE_HEIGHT_LOG2 + SPN_TTS_SUBPIXEL_Y_LOG2)

 // A tile's extent in subpixels.
 #define SPN_TILE_SUBPIXEL_X_SIZE                 (1 << SPN_TILE_SUBPIXEL_X_BITS_LOG2)
 #define SPN_TILE_SUBPIXEL_Y_SIZE                 (1 << SPN_TILE_SUBPIXEL_Y_BITS_LOG2)

 // Half of 2^(TTXK X/Y bits + tile subpixel bits) -- apparently the
 // bias expressed in subpixel units (NOTE(review): confirm units).
 #define SPN_TTXK_X_BIAS                          (1 << (SPN_TTXK_HI_BITS_X + SPN_TILE_SUBPIXEL_X_BITS_LOG2 - 1))
 #define SPN_TTXK_Y_BIAS                          (1 << (SPN_TTXK_HI_BITS_Y + SPN_TILE_SUBPIXEL_Y_BITS_LOG2 - 1))

 // Half of 2^(TTXK X/Y bits): the midpoint of the X/Y bitfield range.
 #define SPN_TTXK_TILE_X_BIAS                     (1 << (SPN_TTXK_HI_BITS_X - 1))
 #define SPN_TTXK_TILE_Y_BIAS                     (1 << (SPN_TTXK_HI_BITS_Y - 1))

 // Both biases packed with the constructor-style `ivec2(...)`.
 #define SPN_TTXK_XY_BIAS                         ivec2(SPN_TTXK_X_BIAS, SPN_TTXK_Y_BIAS)

 //
 //
 //

 // Bit offsets of the TTS fields, packed low-to-high as TX, DX, TY, DY.
 // With the widths above these evaluate to 0, 10, 17 and 26.
 #define SPN_TTS_OFFSET_TX                        0
 #define SPN_TTS_OFFSET_DX                        (SPN_TTS_OFFSET_TX + SPN_TTS_BITS_TX)
 #define SPN_TTS_OFFSET_TY                        (SPN_TTS_OFFSET_DX + SPN_TTS_BITS_DX)
 #define SPN_TTS_OFFSET_DY                        (SPN_TTS_OFFSET_TY + SPN_TTS_BITS_TY)

 // Offsets of the whole-pixel part of TX/TY, i.e. just above the
 // subpixel bits of each field.
 #define SPN_TTS_OFFSET_TX_PIXEL                  (SPN_TTS_OFFSET_TX + SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_OFFSET_TY_PIXEL                  (SPN_TTS_OFFSET_TY + SPN_TTS_SUBPIXEL_Y_LOG2)

 // Field masks.  TX sits at offset 0 so it uses the offset-less helper.
 // No DY mask is defined here.
 #define SPN_TTS_MASK_TX                          SPN_BITS_TO_MASK(SPN_TTS_BITS_TX)
 #define SPN_TTS_MASK_DX                          SPN_BITS_TO_MASK_AT(SPN_TTS_BITS_DX,SPN_TTS_OFFSET_DX)
 #define SPN_TTS_MASK_TY                          SPN_BITS_TO_MASK_AT(SPN_TTS_BITS_TY,SPN_TTS_OFFSET_TY)

 // The delta fields are extracted from `int(tts_)`.
 // NOTE(review): presumably so that the extract sign-extends the
 // field -- depends on SPN_BITFIELD_EXTRACT, which is defined elsewhere.
 #define SPN_TTS_GET_DX(tts_)                     SPN_BITFIELD_EXTRACT(int(tts_),SPN_TTS_OFFSET_DX,SPN_TTS_BITS_DX)
 #define SPN_TTS_GET_DY(tts_)                     SPN_BITFIELD_EXTRACT(int(tts_),SPN_TTS_OFFSET_DY,SPN_TTS_BITS_DY)

 // Low SUBPIXEL_*_LOG2 bits of TX/TY, extracted from `uint(tts_)`.
 #define SPN_TTS_GET_TX_SUBPIXEL(tts_)            SPN_BITFIELD_EXTRACT(uint(tts_),SPN_TTS_OFFSET_TX,SPN_TTS_SUBPIXEL_X_LOG2)
 #define SPN_TTS_GET_TY_SUBPIXEL(tts_)            SPN_BITFIELD_EXTRACT(uint(tts_),SPN_TTS_OFFSET_TY,SPN_TTS_SUBPIXEL_Y_LOG2)

 // Remaining PIXEL_*_LOG2 bits of TX/TY above the subpixel bits.
 #define SPN_TTS_GET_TX_PIXEL(tts_)               SPN_BITFIELD_EXTRACT(uint(tts_),SPN_TTS_OFFSET_TX_PIXEL,SPN_TTS_PIXEL_X_LOG2)
 #define SPN_TTS_GET_TY_PIXEL(tts_)               SPN_BITFIELD_EXTRACT(uint(tts_),SPN_TTS_OFFSET_TY_PIXEL,SPN_TTS_PIXEL_Y_LOG2)

 // The complete TX/TY fields (pixel and subpixel bits together).
 #define SPN_TTS_GET_TX(tts_)                     SPN_BITFIELD_EXTRACT(uint(tts_),SPN_TTS_OFFSET_TX,SPN_TTS_BITS_TX)
 #define SPN_TTS_GET_TY(tts_)                     SPN_BITFIELD_EXTRACT(uint(tts_),SPN_TTS_OFFSET_TY,SPN_TTS_BITS_TY)

 //
 // Use an impossible DX value for TTS_INVALID
 //

 // 63 placed in the DX field with every other field zero.  The TTS
 // description gives valid DX deltas as [-32,32], so 63 never encodes a
 // real segment.
 #define SPN_TTS_INVALID                          (63<<SPN_TTS_OFFSET_DX)

 //
 // Note that 2048.0 can be represented exactly with fp16... fortuitous!
 //

 // MAX_AREA   = 2 * 32 * 32 = 2048
 // MAX_AREA_2 = 2 * 2048    = 4096
 // EVEN_ODD_MASK = 4096 - 1 = 4095 (low 12 bits set)
 // MAX_AREA_RCP_F32 = 1 / 2048
 #define SPN_TTS_FILL_MAX_AREA                    (2 * SPN_TTS_SUBPIXEL_X_SIZE * SPN_TTS_SUBPIXEL_Y_SIZE)
 #define SPN_TTS_FILL_MAX_AREA_2                  (2 * SPN_TTS_FILL_MAX_AREA)
 #define SPN_TTS_FILL_EVEN_ODD_MASK               (SPN_TTS_FILL_MAX_AREA_2 - 1)
 #define SPN_TTS_FILL_MAX_AREA_RCP_F32            (1.0f / SPN_TTS_FILL_MAX_AREA)

 //
 // RASTER COHORT METADATA
 //
 // MAXIMUM RASTER COHORT META TABLE SIZE IS DETERMINED BY COHORT BITFIELD
 //
 // NOTE: Don't trim array even though the last entry in the pow2 array is not used.
 //
 // FIXME(allanmac): get rid of PKNODE/NA as soon as possible
 //

 // The table has 2^SPN_TTRK_HI_BITS_COHORT entries, i.e. its size is
 // fixed by the width of the TTRK cohort bitfield.
 #define SPN_RASTER_COHORT_METAS_SIZE_LOG2          SPN_TTRK_HI_BITS_COHORT
 #define SPN_RASTER_COHORT_METAS_SIZE               (1 << SPN_RASTER_COHORT_METAS_SIZE_LOG2)

 //
 // FIXME(allanmac): the UINT64_MAX key is reserved in this segmenting
 // phase.  This implies that the cohort id of all 1's needs to be
 // reserved.  TL;DR: the raster builder must only build
 // (SPN_RASTER_COHORT_METAS_SIZE-1) rasters.
 //

 // NOTE(review): presumably component indices within each two-component
 // `alloc` entry of struct spn_rc_meta -- confirm against the shaders.
 #define SPN_RASTER_COHORT_META_ALLOC_OFFSET_SK_READS  0 // alloc[0] - block holding first ttsk (head)
 #define SPN_RASTER_COHORT_META_ALLOC_OFFSET_PK_READS  1 // alloc[1] - block holding first ttpk (head/node)

 //
 // FIXME(allanmac): split RKOFF from UVEC4/alloc
 //

 //
 // Raster cohort metadata table, laid out as parallel arrays that each
 // hold SPN_RASTER_COHORT_METAS_SIZE entries.
 //
 // The trailing comment on each member records whether that array
 // starts out uninitialized or zeroed.
 //
 // NOTE(review): the FIXME preceding this struct mentions "UVEC4" but
 // `alloc` is declared with SPN_TYPE_UVEC2 -- confirm which is current.
 //
 struct spn_rc_meta
 {
   SPN_TYPE_UVEC2 alloc [SPN_RASTER_COHORT_METAS_SIZE]; // block pool reads    -- uninitialized
   SPN_TYPE_UINT  rk_off[SPN_RASTER_COHORT_METAS_SIZE]; // offset of rk keys   -- zeroed
   SPN_TYPE_UINT  blocks[SPN_RASTER_COHORT_METAS_SIZE]; // number of blocks    -- zeroed
   SPN_TYPE_UINT  ttpks [SPN_RASTER_COHORT_METAS_SIZE]; // number of TTPK keys -- zeroed
   SPN_TYPE_UINT  ttrks [SPN_RASTER_COHORT_METAS_SIZE]; // number of TTRK keys -- zeroed
   //
   // FIXME(allanmac): the signed bounding box will be added to the meta
   // using the atomic signed min/max trick.
   //
 };

 //
 // STYLING STRUCTS
 //
 //
 // LAYER
 //
 //   |     LAYER     |
 //   +---------------+
 //   | cmds | parent |
 //   +------+--------+
 //   0      1        2
 //
 // GROUP
 //
 //   |                 GROUP                  |
 //   +--------------+---------+---------------+
 //   |    parents   |  range  |     cmds      |
 //   | depth | base | lo | hi | enter | leave |
 //   +-------+------+----+----+-------+-------+
 //   0       1      2    3    4       5       6
 //
 //
 // It's simpler to define the group as a uvec2[3]:
 //
 //   struct spn_group_node
 //   {
 //     spn_group_parents parents; // path of parent groups leading back to root
 //     spn_group_range   range;   // range of layers enclosed by this group
 //     spn_group_cmds    cmds;    // enter/leave command indices
 //   };
 //
 // The RENDER kernel lays out the current layer node, group node and
 // flags in either registers or shared memory:
 //
 // LGF -- layer / group / flags
 //                                                               optional
 //   | current layer |          current group           |       |       |       |
 //   +---------------+------------+-------+-------------+.......+.......+.......+....
 //   |     layer     |   parents  | range |    cmds     | layer | group | flags | ...
 //   |  cmds parent  | depth base | lo hi | enter leave |  id   |  id   |       |
 //   +------+--------+------+-----+---+---+------+------+.......+.......+.......+....
 //   0      1        2      3     4   5   6      7      8       9       10      11
 //

 //
 // Styling layer entry: two dwords, in the same order as
 // SPN_STYLING_LAYER_OFFSET_CMDS (0) and SPN_STYLING_LAYER_OFFSET_PARENT (1).
 //
 struct spn_layer_node
 {
   SPN_TYPE_UINT cmds;   // starting index of sequence of command dwords
   SPN_TYPE_UINT parent; // index of parent group
 };

 //
 // Dwords 0-1 of a styling group: the "parents" pair, described in the
 // layout notes as the path of parent groups leading back to the root.
 //
 struct spn_group_parents
 {
   SPN_TYPE_UINT depth; // NOTE(review): presumably the length of the parent path -- confirm
   SPN_TYPE_UINT base;  // NOTE(review): presumably where the parent path starts -- confirm
 };

 //
 // Dwords 2-3 of a styling group: the layers enclosed by the group.
 //
 struct spn_group_range
 { // inclusive layer range [lo,hi]
   SPN_TYPE_UINT lo; // first layer
   SPN_TYPE_UINT hi; // last  layer
 };

 //
 // Dwords 4-5 of a styling group: the enter/leave command references.
 //
 // NOTE(review): SPN_STYLING_CMDS_GET_BASE/GET_COUNT split a cmds dword
 // into a base and a count; presumably these members use that packing
 // -- confirm.
 //
 struct spn_group_cmds
 {
   SPN_TYPE_UINT enter; // starting index of sequence of command dwords
   SPN_TYPE_UINT leave; // starting index of sequence of command dwords
 };

 //
 //
 //

 // Dword offsets within a layer entry (matches struct spn_layer_node)
 // and the entry's total size in dwords.
 #define SPN_STYLING_LAYER_OFFSET_CMDS                  0
 #define SPN_STYLING_LAYER_OFFSET_PARENT                1
 #define SPN_STYLING_LAYER_COUNT_DWORDS                 2

 // Dword offsets within a group entry, in the order parents (depth,
 // base), range (lo, hi), cmds (enter, leave), and the entry's total
 // size in dwords.
 #define SPN_STYLING_GROUP_OFFSET_PARENTS_DEPTH         0
 #define SPN_STYLING_GROUP_OFFSET_PARENTS_BASE          1
 #define SPN_STYLING_GROUP_OFFSET_RANGE_LO              2
 #define SPN_STYLING_GROUP_OFFSET_RANGE_HI              3
 #define SPN_STYLING_GROUP_OFFSET_CMDS_ENTER            4
 #define SPN_STYLING_GROUP_OFFSET_CMDS_LEAVE            5
 #define SPN_STYLING_GROUP_COUNT_DWORDS                 6

 //
 //
 //

 //
 // A styling "cmds" dword packs two fields into 32 bits:
 //
 //   BASE  : 29 bits at offset 0
 //   COUNT :  3 bits at offset 29
 //
 #define SPN_STYLING_CMDS_BITS_COUNT                    3
 #define SPN_STYLING_CMDS_BITS_BASE                     (32-SPN_STYLING_CMDS_BITS_COUNT)

 #define SPN_STYLING_CMDS_OFFSET_BASE                   0
 #define SPN_STYLING_CMDS_OFFSET_COUNT                  SPN_STYLING_CMDS_BITS_BASE

 // Both limits are powers of two (2^29 and 2^3), i.e. one greater than
 // the largest value the corresponding field can hold.
 #define SPN_STYLING_CMDS_MAX_BASE                      (1<<SPN_STYLING_CMDS_BITS_BASE)
 #define SPN_STYLING_CMDS_MAX_COUNT                     (1<<SPN_STYLING_CMDS_BITS_COUNT)

 // Extract the COUNT field of cmds dword `c_`.
 #define SPN_STYLING_CMDS_GET_COUNT(c_)                 SPN_BITFIELD_EXTRACT(c_,                                 \
                                                                             SPN_STYLING_CMDS_OFFSET_COUNT,      \
                                                                             SPN_STYLING_CMDS_BITS_COUNT)

 // Extract the BASE field of cmds dword `c_`.
 #define SPN_STYLING_CMDS_GET_BASE(c_)                  SPN_BITFIELD_EXTRACT(c_,                                 \
                                                                             SPN_STYLING_CMDS_OFFSET_BASE,       \
                                                                             SPN_STYLING_CMDS_BITS_BASE)

 //
 // Compiled out by `#if 0`.  The body still uses skc_-prefixed types
 // rather than the SPN_ names used by the rest of this header.
 //
 #if 0

 union spn_gradient_vector
 {
   skc_float4               f32v4;

   struct {
     skc_float              dx;
     skc_float              p0;
     skc_float              dy;
     skc_float              denom;
   };

   union skc_gradient_slope slopes[4];
 };

 #endif

 //
 // FIXME -- will eventually need to know if this gradient is
 // perspective transformed and if so additional values will need to be
 // encoded
 //
 // VERSION 1
 // =============================================================
 //
 // LINEAR GRADIENT HEADER FOR N STOPS
 //
 // +----------+----------+------------+----------+-------------+
 // |  HEADER  |   INFO   |    LUTS    |  FLOORS  |    COLORS   |
 // +----------+----------+------------+----------+-------------+
 // |  uintv4  | u32v2[1] | f32v2[N-1] | f32[N-2] | ushort2[4N] |
 // +----------+----------+------------+----------+-------------+
 //
 //   COLOR PAIR            WORD EXPANSION            TOTAL
 // +------------+---------------------------------+--------+-------------------------+
 // |  ushort2   |  4 + 2 + 2*(N-1) + N - 2 + 4*N  | 7N + 2 | = 7(N-1+1)+2 = 7(N-1)+9 |
 // +------------+---------------------------------+--------+-------------------------+
 //
 // COLOR LAYOUT:
 //
 //   R[0]R[1], R[1]R[2], ... R[N-1]R[N-1]
 //   G[0]G[1], G[1]G[2], ... G[N-1]G[N-1]
 //   B[0]B[1], B[1]B[2], ... B[N-1]B[N-1]
 //   A[0]A[1], A[1]A[2], ... A[N-1]A[N-1]
 //
 //
 // MINIMUM DWORDS:  N=2 --> 16
 //
 //
 // VERSION 2
 // =============================================================
 //
 // LINEAR GRADIENT DESCRIPTOR FOR N STOPS
 //
 //                           +--------------- REMOVE ME LATER
 //                           v
 // +--------+------+-------+---+----------+-----------+
 // | VECTOR | TYPE | COUNT | N |  SLOPES  |   COLORS  |
 // +--------+------+-------+---+----------+-----------+
 // |  f32v4 |   1  |   1   | 1 | f32[N-1] | f16v2[4N] |
 // +--------+------+-------+---+----------+-----------+
 //
 //   COLOR PAIR           WORD EXPANSION            TOTAL
 // +------------+--------------------------------+--------+
 // |   f16v2    |  4 + 1 + 1 + 1 + [N-1] + [4*N] | 5N + 6 |
 // +------------+--------------------------------+--------+
 //
 // COLOR LAYOUT:
 //
 //   R[0]R[1], R[1]R[2], ... R[N-1]R[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //   G[0]G[1], G[1]G[2], ... G[N-1]G[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //   B[0]B[1], B[1]B[2], ... B[N-1]B[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //   A[0]A[1], A[1]A[2], ... A[N-1]A[N-1] <-------------------------- FIXME -- USE HERB'S SINGLE FMA REPRESENTATION
 //
 //
 // MINIMUM DWORDS:  N=2 --> 16
 //
 //
 // VERSION 3+
 // =============================================================
 //
 // FIXME -- will probably want to try using the sampler/texture
 // hardware to interpolate colors.
 //
 // This will require that the colors are laid out in sampler-friendly
 // order:
 //
 //    RGBA[0]RGBA[1], RGBA[1]RGBA[2], ..., RGBA[N-1]RGBA[N-1]
 //
 //

 //
 // Compiled out by `#if 0`.  This variant sizes the VERSION 1 layout
 // from (N-1) as 7(N-1)+9 and adds one further dword in its
 // SPN_GRADIENT_CMD_DWORDS_V1(); the live definition that follows this
 // block uses 7N+2 instead.
 //
 #if 0
 #define SPN_GRADIENT_HEADER_DWORDS_LUTS_OFFSET       4
 #define SPN_GRADIENT_HEADER_DWORDS_TOTAL(n_minus_1)  (7 * (n_minus_1) + 9)
 #define SPN_GRADIENT_HEADER_DWORDS_MIN               SPN_GRADIENT_HEADER_DWORDS_TOTAL(1)
 #define SPN_GRADIENT_CMD_DWORDS_V1(n)                (1 + SPN_GRADIENT_HEADER_DWORDS_TOTAL(n-1))
 #endif

 //
 // Dword counts of an N-stop linear gradient command, matching the
 // totals in the layout tables: VERSION 1 is 7N+2, VERSION 2 is 5N+6.
 //
 // SPN_GRADIENT_CMD_DWORDS_V2_ADJUST(v1,v2) is the VERSION 1 size for
 // `v1` stops minus (`v2` + 6).
 //
 #define SPN_GRADIENT_CMD_DWORDS_V1(n)                (2 + 7 * (n))
 #define SPN_GRADIENT_CMD_DWORDS_V2(n)                (6 + 5 * (n))
 #define SPN_GRADIENT_CMD_DWORDS_V2_ADJUST(v1,v2)     (SPN_GRADIENT_CMD_DWORDS_V1(v1) - (6 + (v2)))

 //
 // clang-format on
 //

 #endif  // SRC_GRAPHICS_LIB_COMPUTE_SPINEL_CORE_H_