| // Copyright 2019 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #version 460 |
| |
| // |
| // |
| // |
| |
| #extension GL_GOOGLE_include_directive : require |
| #extension GL_KHR_shader_subgroup_basic : require |
| #extension GL_EXT_shader_explicit_arithmetic_types : require |
| |
| // |
| // RENDER KERNEL |
| // |
| |
| #include "spn_config.h" |
| #include "vk_layouts.h" |
| |
| // |
| // COLOR/COVER CHANNELS ARE DETERMINED BY TARGET HARDWARE |
| // |
| // clang-format off |
| // |
| |
| // |
| // SINGLE PRECISION FLOAT |
| // |
| #if defined(SPN_DEVICE_RENDER_TILE_CHANNEL_IS_FLOAT32) |
| |
| #define SPN_RENDER_TILE_CHANNEL float |
| #define SPN_RENDER_TILE_COVER SPN_RENDER_TILE_CHANNEL |
| #define SPN_RENDER_TILE_COLOR vec4 |
| |
| #define SPN_RENDER_PIXEL_COVER float |
| |
| #define SPN_RENDER_TILE_CHANNEL_IS_ZERO(c) ((c) == SPN_RENDER_TILE_CHANNEL(0)) |
| |
| #define SPN_RENDER_COLOR_UNPACK(rg32,ba32) \ |
| SPN_RENDER_TILE_COLOR(unpackHalf2x16(rg32),unpackHalf2x16(ba32)); |
| |
| #define SPN_RENDER_COLOR_ACC_RGBA vec4 |
| |
| // |
| // HALF PRECISION FLOAT (FP16) |
| // |
| #elif defined(SPN_DEVICE_RENDER_TILE_CHANNEL_IS_FLOAT16) |
| |
| #ifdef SPN_DEVICE_AMD_GCN3 |
| #extension GL_AMD_gpu_shader_half_float : require // GCN3/AMDVLK disables float16 |
| #endif |
| |
| #define SPN_RENDER_TILE_CHANNEL float16_t |
| #define SPN_RENDER_TILE_COVER SPN_RENDER_TILE_CHANNEL |
| #define SPN_RENDER_TILE_COLOR f16vec4 |
| |
| #define SPN_RENDER_PIXEL_COVER float16_t |
| |
| #define SPN_RENDER_TILE_CHANNEL_IS_ZERO(c) ((c) == SPN_RENDER_TILE_CHANNEL(0)) |
| |
| #define SPN_RENDER_COLOR_UNPACK(rg32,ba32) \ |
| SPN_RENDER_TILE_COLOR(unpackFloat2x16(rg32),unpackFloat2x16(ba32)); |
| |
| #define SPN_RENDER_COLOR_ACC_RGBA f16vec4 |
| |
| #else |
| |
| #error "SPN_DEVICE_RENDER_TILE_CHANNEL_IS_FLOATXX is not defined!" |
| |
| #endif |
| |
| // |
| // COMMON DEFINES |
| // |
| |
| #define SPN_RENDER_WORKGROUP_SIZE (1 << SPN_DEVICE_RENDER_WORKGROUP_SIZE_LOG2) |
| |
| #define SPN_RENDER_SUBGROUP_SIZE (1 << SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2) |
| #define SPN_RENDER_SUBGROUP_MASK SPN_GLSL_BITS_TO_MASK(SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2) |
| |
| #define SPN_RENDER_SUBGROUPS (SPN_DEVICE_RENDER_WORKGROUP_SIZE / SPN_RENDER_SUBGROUP_SIZE) |
| |
| #define SPN_RENDER_SUBTILE_COUNT_LOG2 (SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2 - SPN_DEVICE_TILE_HEIGHT_LOG2) |
| #define SPN_RENDER_SUBTILE_COUNT (1 << SPN_RENDER_SUBTILE_COUNT_LOG2) |
| |
| #define SPN_RENDER_TTS SPN_RENDER_TTX |
| #define SPN_RENDER_TTP SPN_RENDER_TTX |
| |
| #define SPN_RENDER_SUBTILE_WIDTH_LOG2 (SPN_DEVICE_TILE_WIDTH_LOG2 + SPN_DEVICE_TILE_HEIGHT_LOG2 - SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2) |
| #define SPN_RENDER_SUBTILE_WIDTH (1 << SPN_RENDER_SUBTILE_WIDTH_LOG2) |
| |
| // |
| // Make sure the config has all necessary steering switches |
| // |
| |
| #if !defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE) && \ |
| !defined(SPN_DEVICE_RENDER_LGF_USE_SHARED) |
| #error "SPN_DEVICE_RENDER_LGF_XXX undefined!" |
| #endif |
| |
| #if !defined(SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE) && \ |
| !defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED) && \ |
| !defined(SPN_DEVICE_RENDER_TTCKS_NO_SHARED) |
| #error "SPN_DEVICE_RENDER_TTCKS_XXX undefined!" |
| #endif |
| |
| #if !defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE) && \ |
| !defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHARED) && \ |
| !defined(SPN_DEVICE_RENDER_STYLING_CMDS_NO_SHARED) |
| #error "SPN_DEVICE_RENDER_STYLING_CMDS_XXX undefined!" |
| #endif |
| |
| #if (SPN_RENDER_SUBTILE_COUNT > 1) && \ |
| !defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHUFFLE) && \ |
| !defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHARED) |
| #error "SPN_DEVICE_RENDER_COVERAGE_XXX undefined!" |
| #endif |
| |
| // |
| // Coarsely enable all advanced subgroup features if we are using any |
| // shuffle. Improve this switch if a new architecture requires it. |
| // |
| |
| #if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE) || \ |
| defined(SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE) || \ |
| defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE) || \ |
| defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHUFFLE) |
| |
| #extension GL_KHR_shader_subgroup_shuffle : require |
| #extension GL_KHR_shader_subgroup_ballot : require |
| #extension GL_KHR_shader_subgroup_shuffle_relative : require |
| |
| #endif |
| |
| // |
| // Do we have vote support? |
| // |
| |
| #ifndef SPN_DEVICE_RENDER_NO_VOTE |
| |
| #extension GL_KHR_shader_subgroup_vote : require |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| layout(local_size_x = SPN_DEVICE_RENDER_WORKGROUP_SIZE) in; |
| |
| // |
| // |
| // |
| |
| SPN_VK_GLSL_DECL_KERNEL_RENDER(); |
| |
| // |
| // SUBTILE WIDTH EXPANSION |
| // |
| |
| #if (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 0) |
| #define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_1() |
| #elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 1) |
| #define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_2() |
| #elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 2) |
| #define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_4() |
| #elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 3) |
| #define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_8() |
| #elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 4) |
| #define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_16() |
| #elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 5) |
| #define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_32() |
| #else |
| #error "SPN_RENDER_SUBTILE_WIDTH_LOG2 not supported!" |
| #endif |
| |
| // |
| // Globally declare tile cover and color registers |
| // |
| // Total number of color and cover "channels" is 11. |
| // |
| // A channel is represented with either a float16 or float32. |
| // |
| // If the target hardware supports float16 and the driver isn't |
| // broken, then use float16. |
| // |
| // This occupies '11 * SPN_RENDER_SUBTILE_WIDTH' channels per |
| // subgroup lane. |
| // |
| // Tile Size |
| // Subgroup +---------------------------- |
| // Size | 4x4 8x8 16x16 32x32 |
| // ---------+---------------------------- |
| // 4 | 44 176 704 2816 |
| // 8 | 22 88 352 1408 |
| // 16 | 11 44 176 704 |
| // 32 | --- 22 88 352 |
| // 64 | --- 11 44 176 |
| // |
| |
| // |
| // COLOR |
| // |
| // color_wip |
| // color_acc |
| // |
| #ifndef SPN_RENDER_TILE_COLOR_WIP_ENABLED |
| // this results in the color_wip being a scalar -- per thread or per |
| // subgroup depending on the capability of the target arch. |
| SPN_SUBGROUP_UNIFORM SPN_RENDER_TILE_COLOR color_wip; |
| |
| #define SPN_RENDER_TILE_COLOR_WIP(I) color_wip |
| |
| #else |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COLOR color_wip##I; |
| |
| SPN_RENDER_SUBTILE_WIDTH_EXPAND() |
| |
| #define SPN_RENDER_TILE_COLOR_WIP(I) color_wip##I |
| |
| #endif |
| |
| |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COLOR color_acc##I; |
| |
| SPN_RENDER_SUBTILE_WIDTH_EXPAND() |
| |
| // |
| // COVER |
| // |
| // cover_wip |
| // cover_acc |
| // cover_msk |
| // |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COVER cover_wip##I; |
| |
| SPN_RENDER_SUBTILE_WIDTH_EXPAND() |
| |
| // cover_acc |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COVER cover_acc##I; |
| |
| SPN_RENDER_SUBTILE_WIDTH_EXPAND() |
| |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COVER cover_msk##I; |
| |
| SPN_RENDER_SUBTILE_WIDTH_EXPAND() |
| |
| // |
| // GENERAL DATA TYPES THAT WE MAY WANT TO TWEAK LATER |
| // |
| // NOTE(allanmac): these could vary in OpenCL |
| // |
| |
| #define SPN_RENDER_TTX int |
| #define SPN_RENDER_PIXEL_AREA int |
| |
| // |
| // Shared memory is primarily for accumulating areas but is also |
| // used as a scratch buffer for gradients and other operations that |
| // might require random-access lookups. |
| // |
| |
| #define SPN_RENDER_TILE_SMEM_DWORDS ((SPN_TILE_WIDTH + 1) * SPN_TILE_HEIGHT) |
| |
| struct spn_subgroup_smem |
| { |
| SPN_RENDER_PIXEL_AREA area[SPN_RENDER_TILE_SMEM_DWORDS]; |
| }; |
| |
| // |
| // |
| // |
| #if (SPN_RENDER_SUBGROUPS == 1) |
| |
| shared spn_subgroup_smem smem; |
| |
| #define SPN_RENDER_SMEM() smem |
| |
| #else |
| |
| shared spn_subgroup_smem smem[SPN_RENDER_SUBGROUPS]; |
| |
| #define SPN_RENDER_SMEM() smem[gl_WorkGroupID.x] |
| |
| #endif |
| |
| // |
| // render flags |
| // |
| // |
| // FIXME: testing for opacity and skipping scattering is on its way to |
| // becoming a much more programmable option because sometimes we may |
| // be compositing/blending from back-to-front and/or be using group |
| // blend rules that ignore opacity. |
| // |
| // The point is that all of these decisions should be encoded in |
| // styling commands and, as much as possible, removed from the final |
| // group/layer styling traversal render loop. |
| // |
| |
| // FLUSH FLAGS |
| #define SPN_RENDER_FLAGS_FLUSH_FINALIZE 0x1 |
| #define SPN_RENDER_FLAGS_FLUSH_UNWIND 0x2 |
| #define SPN_RENDER_FLAGS_FLUSH_COMPLETE 0x4 |
| // OPACITY FLAG |
| #define SPN_RENDER_FLAGS_SCATTER_SKIP 0x8 |
| |
| // |
| // LGF -- layer / group / flags |
| // optional |
| // | current layer | current group | | | | |
| // +---------------+------------+-------+-------------+.......+.......+.......f.... |
| // | layer | parents | range | cmds | layer | group | flags | ... |
| // | cmds parent | depth base | lo hi | enter leave | id | id | | |
| // +------+--------+------+-----+---+---+------+------+.......+-......+.......+.... |
| // 0 1 2 3 4 5 6 7 8 9 10 11 |
| // |
| |
| // |
| // FIXME(allanmac): harmonize these constants with core.h |
| // |
| #define SPN_LGF_LAYER_CMDS 0 |
| #define SPN_LGF_LAYER_PARENT 1 |
| |
| #define SPN_LGF_GROUP_PARENTS_DEPTH 2 |
| #define SPN_LGF_GROUP_FIRST SPN_LGF_GROUP_PARENTS_DEPTH |
| #define SPN_LGF_GROUP_PARENTS_BASE 3 |
| #define SPN_LGF_GROUP_RANGE_LO 4 |
| #define SPN_LGF_GROUP_RANGE_HI 5 |
| #define SPN_LGF_GROUP_CMDS_ENTER 6 |
| #define SPN_LGF_GROUP_CMDS_LEAVE 7 |
| #define SPN_LGF_GROUP_LAST SPN_LGF_GROUP_CMDS_LEAVE |
| |
| #define SPN_LGF_COUNT 8 |
| |
| // |
| // SHUFFLE |
| // |
| #if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE) |
| |
| #define SPN_LGF_BANKS ((SPN_LGF_COUNT + SPN_RENDER_SUBGROUP_SIZE - 1) / SPN_RENDER_SUBGROUP_SIZE) |
| #define SPN_LGF_BANK(idx) ((idx) / SPN_RENDER_SUBGROUP_SIZE) |
| |
| #define SPN_LGF_LANE(idx) ((idx)-SPN_LGF_BANK(idx) * SPN_RENDER_SUBGROUP_SIZE) |
| #define SPN_LGF_IS_LANE(idx) (gl_SubgroupInvocationID == SPN_LGF_LANE(idx)) |
| |
| #define SPN_LGF_LOAD(idx) subgroupBroadcast(lgf[SPN_LGF_BANK(idx)], idx) |
| |
| uint lgf[SPN_LGF_BANKS]; // subgroup-wide register variable at global scope |
| |
| // |
| // SHARED |
| // |
| #elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED) |
| |
| shared uint lgf[SPN_LGF_COUNT]; |
| |
| #define SPN_LGF_LOAD(idx) lgf[idx] |
| |
| #endif |
| |
| // |
| // clang-format on |
| // |
| |
| SPN_SUBGROUP_UNIFORM uvec2 lgf_lxy; |
| SPN_SUBGROUP_UNIFORM uint lgf_flags; |
| SPN_SUBGROUP_UNIFORM uint lgf_group_id; |
| |
| // |
| // |
| // |
| |
//
// Reset the cached LGF state: every dword becomes SPN_UINT_MAX
// ("invalid") except GROUP_RANGE_LO which is zeroed, and the
// subgroup-uniform flags/group-id registers are cleared.
//
void
spn_lgf_init()
{
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE)
  //
  // SHUFFLE -- each lane owns one dword per bank
  //
  lgf[0] = SPN_UINT_MAX;

#if (SPN_LGF_BANKS >= 2)
  lgf[1] = SPN_UINT_MAX;
#endif

  // only the lane holding GROUP_RANGE_LO zeroes its bank entry
  if (SPN_LGF_IS_LANE(SPN_LGF_GROUP_RANGE_LO))
    lgf[SPN_LGF_BANK(SPN_LGF_GROUP_RANGE_LO)] = 0;

#elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
  //
  // SHARED -- fill all SPN_LGF_COUNT (8) slots cooperatively
  //
#if (SPN_RENDER_SUBGROUP_SIZE == 4)

  lgf[gl_SubgroupInvocationID + 0] = SPN_UINT_MAX;
  lgf[gl_SubgroupInvocationID + 4] = SPN_UINT_MAX;

  // CAREFUL -- if gl_SubgroupInvocationID doesn't match!
#elif (SPN_RENDER_SUBGROUP_SIZE == 8)

  lgf[gl_SubgroupInvocationID] = SPN_UINT_MAX;

#else // >= 16

  if (gl_SubgroupInvocationID < SPN_LGF_COUNT)
    lgf[gl_SubgroupInvocationID] = SPN_UINT_MAX;

#endif

  // all lanes store the same value -- benign in shared memory
  lgf[SPN_LGF_GROUP_RANGE_LO] = 0;

#endif

  lgf_flags = 0;
  lgf_group_id = SPN_UINT_MAX;
}
| |
// Raise the FLUSH_FINALIZE bit in the subgroup-uniform render flags.
void
spn_lgf_flag_set_flush_finalize()
{
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_FLUSH_FINALIZE;
}
| |
// Raise the FLUSH_UNWIND bit in the subgroup-uniform render flags.
void
spn_lgf_flag_set_flush_unwind()
{
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_FLUSH_UNWIND;
}
| |
// Raise the FLUSH_COMPLETE bit in the subgroup-uniform render flags.
void
spn_lgf_flag_set_flush_complete()
{
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_FLUSH_COMPLETE;
}
| |
// Clear the FLUSH_COMPLETE bit in the subgroup-uniform render flags.
void
spn_lgf_flag_clear_flush_complete()
{
  lgf_flags = lgf_flags & ~SPN_RENDER_FLAGS_FLUSH_COMPLETE;
}
| |
// Raise the SCATTER_SKIP bit in the subgroup-uniform render flags.
void
spn_lgf_flag_set_scatter_skip()
{
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_SCATTER_SKIP;
}
| |
// True when the SCATTER_SKIP bit is clear.
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_scatter_noskip()
{
  const bool skip = (lgf_flags & SPN_RENDER_FLAGS_SCATTER_SKIP) != 0;

  return !skip;
}
| |
// True when the FLUSH_UNWIND bit is set.
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_flush_unwind()
{
  const bool unwind = (lgf_flags & SPN_RENDER_FLAGS_FLUSH_UNWIND) != 0;

  return unwind;
}
| |
// True when the FLUSH_FINALIZE bit is clear.
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_not_flush_finalize()
{
  const bool finalize = (lgf_flags & SPN_RENDER_FLAGS_FLUSH_FINALIZE) != 0;

  return !finalize;
}
| |
// If FINALIZE isn't pending, mark the flush COMPLETE; otherwise
// request an UNWIND.
void
spn_lgf_if_not_flush_finalize_then_complete_else_unwind()
{
  if (spn_lgf_flag_is_not_flush_finalize())
    lgf_flags |= SPN_RENDER_FLAGS_FLUSH_COMPLETE;
  else
    lgf_flags |= SPN_RENDER_FLAGS_FLUSH_UNWIND;
}
| |
// True when the FLUSH_COMPLETE bit is clear.
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_not_flush_complete()
{
  const bool complete = (lgf_flags & SPN_RENDER_FLAGS_FLUSH_COMPLETE) != 0;

  return !complete;
}
| |
// Fetch the cached layer-commands dword (subgroup-uniform).
SPN_SUBGROUP_UNIFORM
uint
spn_lgf_get_layer_cmds()
{
  SPN_SUBGROUP_UNIFORM const uint layer_cmds = SPN_LGF_LOAD(SPN_LGF_LAYER_CMDS);

  return layer_cmds;
}
| |
// Fetch the cached group "enter" commands dword (subgroup-uniform).
SPN_SUBGROUP_UNIFORM
uint
spn_lgf_get_group_cmds_enter()
{
  SPN_SUBGROUP_UNIFORM const uint cmds_enter = SPN_LGF_LOAD(SPN_LGF_GROUP_CMDS_ENTER);

  return cmds_enter;
}
| |
// Fetch the cached group "leave" commands dword (subgroup-uniform).
SPN_SUBGROUP_UNIFORM
uint
spn_lgf_get_group_cmds_leave()
{
  SPN_SUBGROUP_UNIFORM const uint cmds_leave = SPN_LGF_LOAD(SPN_LGF_GROUP_CMDS_LEAVE);

  return cmds_leave;
}
| |
// Is 'layer_id' within the current group's [lo,hi] layer range?
//
// Implemented as a single unsigned comparison: (layer_id - lo)
// wraps around for layer_id < lo, so one '<=' covers both bounds.
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_layer_in_group_range(SPN_SUBGROUP_UNIFORM const uint layer_id)
{
  SPN_SUBGROUP_UNIFORM const uint range_lo = SPN_LGF_LOAD(SPN_LGF_GROUP_RANGE_LO);
  SPN_SUBGROUP_UNIFORM const uint range_hi = SPN_LGF_LOAD(SPN_LGF_GROUP_RANGE_HI);
  SPN_SUBGROUP_UNIFORM const uint rel      = layer_id - range_lo;

  return rel <= (range_hi - range_lo);
}
| |
// Does the cached layer's parent group id match the currently loaded
// group?
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_layer_parent_equals_group()
{
  return SPN_LGF_LOAD(SPN_LGF_LAYER_PARENT) == lgf_group_id;
}
| |
//
// Load the two layer dwords for 'layer_id' from the styling buffer
// into the LGF cache (lanes 0..1 cooperate).
//
void
spn_lgf_layer_load(SPN_SUBGROUP_UNIFORM const uint layer_id)
{
  //
  // Load dwords:
  //
  //   SPN_LGF_LAYER_CMDS    0
  //   SPN_LGF_LAYER_PARENT  1
  //
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE)
  //
  // SHUFFLE -- lanes 0..1 each load one dword into bank 0
  //
  if (gl_SubgroupInvocationID <= SPN_LGF_LAYER_PARENT)
    lgf[0] = styling[layer_id * SPN_STYLING_LAYER_COUNT_DWORDS + gl_SubgroupInvocationID];

#elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
  //
  // SHARED -- lanes 0..1 each store one dword
  //
  if (gl_SubgroupInvocationID <= SPN_LGF_LAYER_PARENT)
    lgf[gl_SubgroupInvocationID] =
      styling[layer_id * SPN_STYLING_LAYER_COUNT_DWORDS + gl_SubgroupInvocationID];

#endif
}
| |
//
// Load the six group dwords for the group at 'lgf_group_id' from the
// styling buffer into the LGF cache (indices 2..7).
//
void
spn_lgf_group_load()
{
  //
  // Load dwords:
  //
  //   SPN_LGF_GROUP_PARENTS_DEPTH  2  (SPN_LGF_GROUP_FIRST)
  //   SPN_LGF_GROUP_PARENTS_BASE   3
  //   SPN_LGF_GROUP_RANGE_LO       4
  //   SPN_LGF_GROUP_RANGE_HI       5
  //   SPN_LGF_GROUP_CMDS_ENTER     6
  //   SPN_LGF_GROUP_CMDS_LEAVE     7  (SPN_LGF_GROUP_LAST)
  //
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE)
  //
  // SHUFFLE -- lanes 2..7 of bank 0 each load one dword; 'iid'
  // underflows (wraps) for lanes 0..1 so they fail the range test.
  //
  // highp = mediump - highp;
  const uint lgf_group_first = SPN_LGF_GROUP_FIRST;
  const uint iid = gl_SubgroupInvocationID - lgf_group_first;

  if (iid < SPN_LGF_GROUP_LAST - SPN_LGF_GROUP_FIRST + 1)
    lgf[0] = styling[lgf_group_id + iid];

#if (SPN_RENDER_SUBGROUP_SIZE == 4)
  // second bank: lanes 0..3 hold LGF dwords 4..7, i.e. styling
  // offsets 2..5
  lgf[1] = styling[lgf_group_id + 2 + gl_SubgroupInvocationID];
#endif

#elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
  //
  // SHARED -- each lane stores up to one dword per pass
  //
  const uint iid = lgf_group_id + gl_SubgroupInvocationID;
  const uint lgf_idx = SPN_LGF_GROUP_FIRST + gl_SubgroupInvocationID;

#if (SPN_RENDER_SUBGROUP_SIZE == 4)

  lgf[lgf_idx] = styling[iid];

  // remaining 2 of the 6 dwords on a second pass
  if (gl_SubgroupInvocationID < SPN_LGF_GROUP_LAST - SPN_LGF_GROUP_FIRST + 1 - 4)
    lgf[lgf_idx + 4] = styling[iid + 4];

#else // >= 8

  if (gl_SubgroupInvocationID < SPN_LGF_GROUP_LAST - SPN_LGF_GROUP_FIRST + 1)
    lgf[lgf_idx] = styling[iid];

#endif
#endif
}
| |
//
// Descend from the current group to the layer's parent group.
//
// After loading the candidate group, the depth delta decides whether
// an intermediate group id must be fetched from the parents list and
// a second load performed.
//
// NOTE(review): the '+ 1' on group_depth_old appears to pre-account
// for descending one level -- confirm against the styling encoder.
//
void
spn_lgf_load_child_group()
{
  lgf_group_id = SPN_LGF_LOAD(SPN_LGF_LAYER_PARENT);
  SPN_SUBGROUP_UNIFORM const uint group_depth_old = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_DEPTH) + 1;

  spn_lgf_group_load();

  SPN_SUBGROUP_UNIFORM const uint group_depth_new = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_DEPTH);
  SPN_SUBGROUP_UNIFORM const uint group_base_offset = group_depth_new - group_depth_old;

  if (group_base_offset != 0)
    {
      // pick the intermediate ancestor from the parents list and
      // reload
      SPN_SUBGROUP_UNIFORM const uint group_base = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_BASE);
      SPN_SUBGROUP_UNIFORM const uint group_id_idx = group_base + group_base_offset - 1;

      lgf_group_id = styling[group_id_idx];

      spn_lgf_group_load();
    }
}
| |
// Pop to the enclosing group.  A depth of zero means we're already at
// the root group, so the flush is complete.
void
spn_lgf_load_parent_group()
{
  SPN_SUBGROUP_UNIFORM const uint depth = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_DEPTH);

  if (depth != 0)
    {
      // the immediate parent's id is the first entry in the parents
      // list
      SPN_SUBGROUP_UNIFORM const uint base = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_BASE);

      lgf_group_id = styling[base];

      spn_lgf_group_load();
    }
  else
    {
      spn_lgf_flag_set_flush_complete();
    }
}
| |
| // |
| // |
| // |
| |
// Compare a TTCK key against a layer/x/y reference: equal iff the
// layer bits of the lo dword and the entire hi dword match.
bool
spn_ttck_lxy_equal(const uvec2 a, SPN_SUBGROUP_UNIFORM const uvec2 lxy)
{
  // FIXME FIXME
  const uvec2 diff = a ^ lxy;

  return (diff[0] & SPN_TTCK_LO_MASK_LAYER) == 0 && diff[1] == 0;
}
| |
// Subgroup-uniform inequality test on { layer bits of lo, all of hi }.
SPN_SUBGROUP_UNIFORM
bool
spn_ttck_lxy_neq_uni(SPN_SUBGROUP_UNIFORM const uvec2 a, SPN_SUBGROUP_UNIFORM const uvec2 lxy)
{
  // FIXME FIXME
  const uvec2 diff = a ^ lxy;

  return ((diff[0] & SPN_TTCK_LO_MASK_LAYER) | diff[1]) != 0;
}
| |
// Compare only the X/Y bits of two TTCK hi dwords.
bool
spn_ttck_hi_xy_equal(const uint a, SPN_SUBGROUP_UNIFORM const uint lxy_hi)
{
  // FIXME FIXME
  const uint diff = (a ^ lxy_hi) & SPN_TTCK_HI_MASK_XY;

  return diff == 0;
}
| |
// Subgroup-uniform variant of the X/Y comparison above.
SPN_SUBGROUP_UNIFORM
bool
spn_ttck_hi_xy_equal_uni(SPN_SUBGROUP_UNIFORM const uint a, SPN_SUBGROUP_UNIFORM const uint lxy_hi)
{
  // FIXME FIXME
  const uint diff = (a ^ lxy_hi) & SPN_TTCK_HI_MASK_XY;

  return diff == 0;
}
| |
// Extract the layer id from a subgroup-uniform TTCK key.
SPN_SUBGROUP_UNIFORM
uint
spn_ttck_get_layer_uni(SPN_SUBGROUP_UNIFORM const uvec2 lxy)
{
  SPN_SUBGROUP_UNIFORM const uint layer = SPN_TTCK_GET_LAYER(lxy);

  return layer;
}
| |
| // |
| // |
| // |
| |
//
// Zero this subgroup's shared-memory area accumulator; each lane
// clears SPN_RENDER_SUBTILE_WIDTH dwords with a subgroup-size stride.
//
void
spn_tile_smem_zero()
{
  //
  // Note that atomic_init() is likely implemented as a simple
  // assignment so there is no identifiable performance difference on
  // current targets.
  //
  // If such an architecture appears in the future then we'll probably
  // still want to implement this zero'ing operation as below but
  // follow with an appropriate fence that occurs before any scatter
  // operations.
  //
  // FIXME: try to (re)implement 8-byte writes in GLSL for GEN9
  //
  // NOT IMPLEMENTED:
  //
  // Intel GENx has a documented 64 byte per cycle SLM write limit.
  // So having each lane in an 8 lane subgroup zero-write 8 bytes is
  // probably a safe bet (Later: benchmarking backs this up!).
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
  SPN_RENDER_SMEM().area[gl_SubgroupInvocationID + I * SPN_RENDER_SUBGROUP_SIZE] = 0;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // Note this is going to be vectorizable on most architectures. |
| // |
| // The return of the key translation feature might complicate things. |
| // |
| |
| #if (SPN_RENDER_SUBTILE_COUNT == 1) |
| |
| void |
| spn_tile_scatter_ttpb(const SPN_RENDER_TTP ttp) |
| { |
| if (ttp != 0) |
| { |
| const int area = ttp * SPN_TTS_SUBPIXEL_Y_SIZE * 2; |
| |
| SPN_RENDER_SMEM().area[gl_SubgroupInvocationID] += area; |
| } |
| } |
| |
| #else // SPN_RENDER_SUBTILE_COUNT >= 2 |
| |
| void |
| spn_tile_scatter_ttpb(const SPN_RENDER_TTP ttp, const uint iid) |
| { |
| if (ttp != 0) |
| { |
| const int area = ttp * SPN_TTS_SUBPIXEL_Y_SIZE * 2; |
| |
| atomicAdd(SPN_RENDER_SMEM().area[iid], area); |
| } |
| } |
| |
| #endif |
| |
| // |
| // |
| // |
| |
// Decode the dy bitfield of a TTS key.
//
// The raw field maps [-32,-1] to itself and [1,32] onto [0,31], so a
// non-negative raw value is bumped by one to skip zero:
//
//   [-32,-1] -> [-32,-1]
//   [  0,31] -> [  1,32]
int
spn_tts_get_dy(const SPN_RENDER_TTS tts)
{
  int dy = SPN_TTS_GET_DY(tts);

  if (dy >= 0)
    dy += 1;

  return dy;
}
| |
| // |
| // Accumulate altitudes and areas -- see docs to understand what's |
| // going on here with Surveyor's Algorithm. |
| // |
| // Note that other coverate calculation algorithms are possible |
| // because the TTS values encode (flattened) subpixel line segments. |
| // |
| // Note that spn_scatter_ttsb is *not* vectorizable unless the |
| // architecture supports a "scatter-add" capability. All relevant |
| // GPUs support atomic add on shared/local memory and thus support |
| // scatter-add. |
| // |
| // On a SIMD device without scatter support, the vector components are |
| // are stored sequentially. |
| // |
| |
//
// Scatter one TTS subpixel line segment into the shared area buffer
// as a pair of left/right trapezoid contributions (Surveyor's
// Algorithm -- see the block comment above).
//
void
spn_tile_scatter_ttsb(const SPN_RENDER_TTS tts)
{
#ifdef SPN_DEVICE_RENDER_TEST_TTS_INVALID_EARLY
  if (tts != SPN_TTS_INVALID)
#endif
    {
      //
      // FIXME(allanmac): skipping per-key pixel and subpixel
      // translation for now -- implement via a dedicated opcode.
      //

      // The "min(x0,x1) * 2 + abs(dx)" is equivalent to "x0 + x1"
      // and is always positive and <= 1023
      const uint tx_sub = SPN_TTS_GET_TX_SUBPIXEL(tts);
      const int dx = SPN_TTS_GET_DX(tts);
      const int dx_abs = abs(dx);
      const uint widths = tx_sub * 2 + dx_abs;

      // Calculate left and right coverage contribution trapezoids
      const int dy = spn_tts_get_dy(tts);
      const int left = dy * int(widths);
      const int right = dy * (SPN_TTS_SUBPIXEL_X_SIZE * 2) - left;

      //
      // The final column is a guard column that is OK to write to
      // but will never be read. It simplifies the TTSB scatter but
      // could be predicated if SMEM is really at a premium.
      //
      // area[] is column-major: column stride is SPN_TILE_HEIGHT.
      //
      const uint tx_pix = SPN_TTS_GET_TX_PIXEL(tts);
      const uint ty_pix = SPN_TTS_GET_TY_PIXEL(tts);
      const uint tile_idx = tx_pix * SPN_TILE_HEIGHT + ty_pix;

      //
      // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
      //
#ifndef SPN_DEVICE_RENDER_TEST_TTS_INVALID_EARLY
      if (tts != SPN_TTS_INVALID)
#endif
        {
          atomicAdd(SPN_RENDER_SMEM().area[tile_idx], right);
          atomicAdd(SPN_RENDER_SMEM().area[tile_idx + SPN_TILE_HEIGHT], left);
        }
    }
}
| |
| // |
| // clang-format off |
| // |
| |
| #define SPN_PIXEL_SMEM_AREA(I, lane) SPN_RENDER_SMEM().area[lane + I * SPN_RENDER_SUBGROUP_SIZE] |
| |
| // |
| // If there are multiple subtiles per subgroup then we need to |
| // horizontally exclusive scan add the accumulated areas |
| // |
| |
| // |
| // SUBTILE IS ENTIRE TILE |
| // |
| #if (SPN_RENDER_SUBTILE_COUNT == 1) |
| |
| #define SPN_RENDER_PIXEL_AREA_PREAMBLE() // noop |
| #define SPN_SUBTILE_AREA_SCAN_PRE(I, area) area += SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID) |
| #define SPN_SUBTILE_AREA_SCAN_POST(area) // noop |
| |
| // |
| // MULTIPLE SUBTILES |
| // |
| #else |
| |
| #define SPN_RENDER_SUBTILE_LAST (SPN_RENDER_SUBTILE_COUNT - 1) |
| #define SPN_RENDER_SUBTILE_LAST_BASE (SPN_RENDER_SUBTILE_LAST * SPN_TILE_HEIGHT) |
| |
| // |
| // clang-format on |
| // |
| |
| // |
| // COVERAGE USES SHUFFLE |
| // |
| #if defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHUFFLE) |
| |
| // |
| // -- SUBTILES COUNT = 2 |
| // |
| #if (SPN_RENDER_SUBTILE_COUNT_LOG2 == 1) |
| |
| #define SPN_RENDER_PIXEL_AREA_PREAMBLE() \ |
| const bool is_p0 = (gl_SubgroupInvocationID >= SPN_TILE_HEIGHT); \ |
| SPN_RENDER_PIXEL_AREA total |
| |
| #define SPN_SUBTILE_AREA_SCAN_PRE(I, area) \ |
| { \ |
| total = area; \ |
| \ |
| SPN_RENDER_PIXEL_AREA pp = SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID); \ |
| SPN_RENDER_PIXEL_AREA x0 = subgroupShuffleXor(pp, SPN_TILE_HEIGHT); \ |
| SPN_RENDER_PIXEL_AREA rr = pp + x0; \ |
| \ |
| total += rr; \ |
| \ |
| if (is_p0) \ |
| pp = rr; \ |
| \ |
| area += pp; \ |
| } |
| |
| #define SPN_SUBTILE_AREA_SCAN_POST(area) area = total; |
| |
| // |
| // -- SUBTILES COUNT = 4 |
| // |
| #elif (SPN_RENDER_SUBTILE_COUNT_LOG2 == 2) |
| |
| #define SPN_RENDER_PIXEL_AREA_PREAMBLE() \ |
| const bool is_p0 = (gl_SubgroupInvocationID & SPN_TILE_HEIGHT) != 0; \ |
| const bool is_p1 = (gl_SubgroupInvocationID >= SPN_TILE_HEIGHT * 2); \ |
| SPN_RENDER_PIXEL_AREA total |
| |
| #define SPN_SUBTILE_AREA_SCAN_PRE(I, area) \ |
| { \ |
| total = area; \ |
| \ |
| SPN_RENDER_PIXEL_AREA pp = SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID); \ |
| SPN_RENDER_PIXEL_AREA x0 = subgroupShuffleXor(pp, SPN_TILE_HEIGHT); \ |
| SPN_RENDER_PIXEL_AREA rr = pp + x0; \ |
| \ |
| total += rr; \ |
| \ |
| if (is_p0) \ |
| pp = rr; \ |
| \ |
| SPN_RENDER_PIXEL_AREA x1 = subgroupShuffleXor(rr, SPN_TILE_HEIGHT * 2); \ |
| \ |
| total += x1; \ |
| \ |
| if (is_p1) \ |
| pp += x1; \ |
| \ |
| area += pp; \ |
| } |
| |
| #define SPN_SUBTILE_AREA_SCAN_POST(area) area = total; |
| |
| // |
| // -- SUBTILES COUNT >= 8 |
| // |
| #else |
| |
| #error "SPN_RENDER_SUBTILE_COUNT_LOG2 > 2 not supported" |
| |
| #endif |
| |
| // |
| // COVERAGE USES SHARED |
| // |
| #elif defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHARED) |
| |
| // |
| // -- SUBTILES COUNT = 2 |
| // |
| #define SPN_RENDER_PIXEL_AREA_PREAMBLE() \ |
| const bool is_p0 = (gl_SubgroupInvocationID >= SPN_TILE_HEIGHT); \ |
| const uint iid_xor = (gl_SubgroupInvocationID ^ SPN_TILE_HEIGHT); \ |
| SPN_RENDER_PIXEL_AREA total |
| |
| #define SPN_SUBTILE_AREA_SCAN_PRE(I, area) \ |
| { \ |
| total = area; \ |
| \ |
| SPN_RENDER_PIXEL_AREA pp = SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID); \ |
| SPN_RENDER_PIXEL_AREA x0 = SPN_PIXEL_SMEM_AREA(I, iid_xor); \ |
| SPN_RENDER_PIXEL_AREA rr = pp + x0; \ |
| \ |
| total += rr; \ |
| \ |
| if (is_p0) \ |
| pp = rr; \ |
| \ |
| area += pp; \ |
| } |
| |
| #define SPN_SUBTILE_AREA_SCAN_POST(area) area = total; |
| |
| #if (SPN_RENDER_SUBTILE_COUNT_LOG2 > 1) |
| #error "SPN_DEVICE_RENDER_COVERAGE_USE_SHARED missing support for a subtile count > 2" |
| #endif |
| |
| #endif |
| #endif |
| |
| // |
| // Compute accumulated pixel coverage "fill rules" using Surveyor's |
| // Algorithm. |
| // |
| // FIXME -- we may want SPN_DEVICE_RENDER_COVER_AREA to be an int2() |
| // which means the initial SMEM load and subsequent shuffles would |
| // need to hide the second load and shuffle. |
| // |
| |
| void |
| spn_tile_cover_nonzero() |
| { |
| SPN_RENDER_PIXEL_AREA_PREAMBLE(); |
| |
| SPN_RENDER_PIXEL_AREA area = 0; |
| |
| subgroupMemoryBarrierShared(); |
| |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) \ |
| { \ |
| SPN_SUBTILE_AREA_SCAN_PRE(I, area); \ |
| \ |
| const SPN_RENDER_PIXEL_AREA trapabs = abs(area); \ |
| const SPN_RENDER_PIXEL_AREA trapmin = min(trapabs, SPN_TTS_FILL_MAX_AREA); \ |
| const SPN_RENDER_PIXEL_COVER nonzero = SPN_RENDER_PIXEL_COVER(trapmin); \ |
| \ |
| cover_wip##I = nonzero * SPN_RENDER_PIXEL_COVER(SPN_TTS_FILL_MAX_AREA_RCP_F32); \ |
| \ |
| if (!L) \ |
| { \ |
| SPN_SUBTILE_AREA_SCAN_POST(area); \ |
| } \ |
| } |
| |
| SPN_RENDER_SUBTILE_WIDTH_EXPAND(); |
| } |
| |
//
// Even-odd fill rule: reflect the running area around the max-area
// boundary so coverage alternates with winding count, normalized
// into cover_wip as [0,1].
//
void
spn_tile_cover_evenodd()
{
  SPN_RENDER_PIXEL_AREA_PREAMBLE();

  SPN_RENDER_PIXEL_AREA area = 0;

  // the TTSB/TTPB scatters wrote smem -- make them visible
  subgroupMemoryBarrierShared();

#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                         \
  {                                                                                      \
    SPN_SUBTILE_AREA_SCAN_PRE(I, area);                                                  \
                                                                                         \
    const SPN_RENDER_PIXEL_AREA trapabs = abs(area);                                     \
    const SPN_RENDER_PIXEL_AREA maskabs = trapabs & SPN_TTS_FILL_EVEN_ODD_MASK;          \
    const SPN_RENDER_PIXEL_AREA reflect = abs(maskabs - SPN_TTS_FILL_MAX_AREA);          \
    const SPN_RENDER_PIXEL_COVER evenodd =                                               \
      SPN_RENDER_PIXEL_COVER(SPN_TTS_FILL_MAX_AREA - reflect);                           \
                                                                                         \
    cover_wip##I = evenodd * SPN_RENDER_PIXEL_COVER(SPN_TTS_FILL_MAX_AREA_RCP_F32);      \
                                                                                         \
    if (!L)                                                                              \
      {                                                                                  \
        SPN_SUBTILE_AREA_SCAN_POST(area);                                                \
      }                                                                                  \
  }

  SPN_RENDER_SUBTILE_WIDTH_EXPAND();
}
| |
| // |
| // |
| // |
| |
//
// Fill color_wip with a solid color unpacked from two packed-half
// dwords.  The alpha channel is stored negated -- the blend
// functions below rely on this (see spn_tile_blend_over()).
//
void
spn_tile_color_fill_solid(SPN_SUBGROUP_UNIFORM const uint rg32,
                          SPN_SUBGROUP_UNIFORM const uint ba32)
{
  //
  // solid fill
  //
  // loads { fp16x2 rg, fp16x2 ba } from cmd stream
  //
  // NOTE(allanmac): we could load the color into column 0 and then
  // copy it to the remaining columns.
  //
#ifndef SPN_RENDER_TILE_COLOR_WIP_ENABLED
  color_wip = SPN_RENDER_COLOR_UNPACK(rg32, ba32);

  color_wip.a = -color_wip.a;
#else
  SPN_SUBGROUP_UNIFORM SPN_RENDER_TILE_COLOR rgba = SPN_RENDER_COLOR_UNPACK(rg32, ba32);

  rgba.a = -rgba.a;  // temporarily here

  // replicate into every per-pixel color_wip register
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_wip##I = rgba;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
#endif
}
| |
| // |
| // |
| // |
| |
//
// "Over" compositing of color_wip onto color_acc, weighted by
// cover_wip.
//
void
spn_tile_blend_over()
{
  //
  // fralunco = cover.wip * acc.a
  //
  // acc.r = +fralunco * wip.r + acc.r
  // acc.g = +fralunco * wip.g + acc.g
  // acc.b = +fralunco * wip.b + acc.b
  // acc.a = -fralunco * wip.a + acc.a  <-- wip.a is negated
  //
  // Assumes color.wip.a is negated.
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                         \
  color_acc##I += (cover_wip##I * color_acc##I.a) * SPN_RENDER_TILE_COLOR_WIP(I);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
//
// "Plus" compositing: add cover-weighted color_wip, with the cover
// clamped by the remaining accumulator alpha.
//
void
spn_tile_blend_plus()
{
  //
  // cover_min = min(cover.wip,acc.a)
  //
  // r.acc =  cover_min * wip.r + acc.r
  // g.acc =  cover_min * wip.g + acc.g
  // b.acc =  cover_min * wip.b + acc.b
  // a.acc = -cover_min * wip.a + acc.a
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                         \
  color_acc##I += min(cover_wip##I, color_acc##I.a) * SPN_RENDER_TILE_COLOR_WIP(I);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
//
// "Multiply" blend of cover-weighted color_wip into color_acc.
//
void
spn_tile_blend_multiply()
{
  //
  // acc.r = (cover.wip * wip.r) * acc.r
  // acc.g = (cover.wip * wip.g) * acc.g
  // acc.b = (cover.wip * wip.b) * acc.b
  // acc.a = (cover.wip * wip.a) * (1.0 - acc.a)  <-- acc.a is already (1.0 - alpha)
  //
  // FIXME(allanmac): This may be incorrect.
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                         \
  color_acc##I = cover_wip##I * SPN_RENDER_TILE_COLOR_WIP(I) * color_acc##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
//
// Knockout blend: only the portion of cover_wip not already covered
// by cover_acc contributes, and cover_acc is updated to include it.
//
void
spn_tile_blend_knockout()
{
  //
  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
  // cover.acc         = cover.acc + cover.wip.contrib
  //
  // r.acc =  cover.wip.contrib * wip.r + acc.r
  // g.acc =  cover.wip.contrib * wip.g + acc.g
  // b.acc =  cover.wip.contrib * wip.b + acc.b
  // a.acc = -cover.wip.contrib * wip.a + acc.a
  //
  // Destructively updates cover.wip
  //

  //
  // 1. cover_wip = cover_wip - cover_wip * cover.acc
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_wip##I = fma(-cover_wip##I, cover_acc##I, cover_wip##I);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()

  //
  // 2. cover_acc += cover_wip
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_acc##I += cover_wip##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()

  //
  // 3. color_acc = color_wip * cover_wip + color_acc
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_acc##I += SPN_RENDER_TILE_COLOR_WIP(I) * cover_wip##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
//
// cover.msk = cover.wip
//
void
spn_tile_cover_msk_copy_wip()
{
  // overwrite the mask cover with the WIP cover in every register row
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                                   \
  cover_msk##I = cover_wip##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
//
// cover.msk = cover.acc
//
void
spn_tile_cover_msk_copy_acc()
{
  // overwrite the mask cover with the accumulated cover in every row
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                                   \
  cover_msk##I = cover_acc##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
//
// COVER: ACCUMULATE
//
// Composites the WIP cover into the cover accumulator with the "over"
// identity.  Step 1 must run before step 2 because step 2 consumes the
// adjusted cover_wip.
//
void
spn_tile_cover_accumulate()
{
  //
  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
  // cover.acc         = cover.acc + cover.wip.contrib
  //
  // Destructively updates cover.wip
  //

  //
  // cover.wip = cover.wip - cover.acc * cover.wip
  // cover.acc = cover.acc + cover.wip
  //
  // cover.acc = cover.acc + cover.wip - cover.acc * cover.wip
  //

  //
  // 1. cover_wip = -cover_wip * cover_acc + cover_wip
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_wip##I = fma(-cover_wip##I, cover_acc##I, cover_wip##I);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()

  //
  // 2. cover_acc = cover_acc + cover_wip
  //
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_acc##I += cover_wip##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // COVER MASK |
| // |
| |
//
// cover.wip = cover.wip * cover.msk
//
void
spn_tile_cover_wip_mask()
{
  // attenuate the WIP cover by the mask cover in every register row
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                                   \
  cover_wip##I = cover_wip##I * cover_msk##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // COVER ZERO |
| // |
| |
| // |
| // FIXME(allanmac): cover_wip_zero() is never going to be used |
| // |
void
spn_tile_cover_wip_zero()
{
  // zero the WIP cover across all register rows -- the zero constant
  // is constructed inline rather than through a named local
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_wip##I = SPN_RENDER_TILE_COVER(0);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
void
spn_tile_cover_acc_zero()
{
  // zero the cover accumulator across all register rows -- the zero
  // constant is constructed inline rather than through a named local
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_acc##I = SPN_RENDER_TILE_COVER(0);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
void
spn_tile_cover_msk_zero()
{
  // zero the mask cover across all register rows -- the zero constant
  // is constructed inline rather than through a named local
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = SPN_RENDER_TILE_COVER(0);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // COVER ONE |
| // |
| |
void
spn_tile_cover_msk_one()
{
  // saturate the mask cover to full coverage in every register row --
  // the one constant is constructed inline rather than via a local
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = SPN_RENDER_TILE_COVER(1);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
//
// cover.msk = 1 - cover.msk
//
void
spn_tile_cover_msk_invert()
{
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = SPN_RENDER_TILE_COVER(1) - cover_msk##I;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
| // |
| // FIXME(allanmac): color_wip_zero() will never be used |
| // |
void
spn_tile_color_wip_zero()
{
  //
  // NOTE(review): the WIP color is cleared to { 0, 0, 0, -1 } while
  // spn_tile_color_acc_zero() clears the accumulator with alpha = +1
  // -- presumably tied to the front-to-back (1.0 - alpha) convention
  // noted in the blend routines; confirm before relying on the -1.
  //
  // When per-row WIP color registers are disabled there is a single
  // shared color_wip register; otherwise one store per register row.
  //
#ifndef SPN_RENDER_TILE_COLOR_WIP_ENABLED
  color_wip = SPN_RENDER_TILE_COLOR(0, 0, 0, -1);
#else
  const SPN_RENDER_TILE_COLOR rgba = SPN_RENDER_TILE_COLOR(0, 0, 0, -1);

#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_wip##I = rgba;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
#endif
}
| |
void
spn_tile_color_acc_zero()
{
  // reset every register row of the accumulator to { 0, 0, 0, 1 } --
  // per the blend routines' comments, acc.a carries (1.0 - alpha)
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_acc##I = SPN_RENDER_TILE_COLOR(0, 0, 0, 1);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // |
| // |
| |
void
spn_tile_color_acc_test_opacity()
{
  //
  // returns true if tile is opaque
  //
  // various hacks to test for complete tile opacity
  //
  // note that front-to-back currently has alpha at 0.0f -- this can
  // be harmonized to use a traditional alpha if we want to support
  // rendering in either direction
  //
  // hack -- ADD/MAX/OR all alphas together and test for non-zero
  //
#ifndef SPN_DEVICE_RENDER_NO_VOTE
  //
  // VOTE
  //
  // reduce the per-row alpha channels to a single per-lane max
  SPN_RENDER_TILE_CHANNEL a = SPN_RENDER_TILE_CHANNEL(0);

#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) a = max(a, color_acc##I.a);

  SPN_RENDER_SUBTILE_WIDTH_EXPAND();

  // are all components in the subtile zero?
  //
  // if so, the tile is fully opaque and later layers can skip their
  // scatter work via the skip flag
  if (subgroupAll(SPN_RENDER_TILE_CHANNEL_IS_ZERO(a)))
    spn_lgf_flag_set_scatter_skip();

#else
  //
  // NO VOTE
  //

  // FIXME -- for now, do nothing on basic-only devices

#endif
}
| |
| // |
| // |
| // |
| |
//
// Composites the subgroup-uniform background color (packed as two
// half2x16 words) under the accumulated layers.  The accumulator's
// alpha channel is left untouched.
//
void
spn_tile_color_acc_over_background(SPN_SUBGROUP_UNIFORM const uint rg32,
                                   SPN_SUBGROUP_UNIFORM const uint ba32)
{
  //
  // acc.r = acc.a * r + acc.r
  // acc.g = acc.a * g + acc.g
  // acc.b = acc.a * b + acc.b
  //
  // NOTE(review): per the blend routines' comments, acc.a carries
  // (1.0 - alpha) during front-to-back rendering, which is why a plain
  // multiply-add suffices here -- confirm.
  //
  SPN_SUBGROUP_UNIFORM const SPN_RENDER_TILE_COLOR rgb1 = SPN_RENDER_COLOR_UNPACK(rg32, ba32);

#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                                   \
  {                                                                                                \
    color_acc##I.rg += color_acc##I.a * rgb1.rg;                                                   \
    color_acc##I.b += color_acc##I.a * rgb1.b;                                                     \
  }

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| // |
| // Map accumulator register rows to surface coordinates |
| // |
| |
| #if (SPN_RENDER_SUBTILE_COUNT == 1) |
| |
| #define SPN_RENDER_SUBTILE_LANE_TO_X(sgid) 0 |
| #define SPN_RENDER_SUBTILE_LANE_TO_Y(sgid) (sgid) |
| |
| #else |
| |
| #define SPN_RENDER_SUBTILE_LANE_TO_X(sgid) (sgid >> SPN_DEVICE_TILE_HEIGHT_LOG2) |
| #define SPN_RENDER_SUBTILE_LANE_TO_Y(sgid) (sgid & SPN_TILE_HEIGHT_MASK) |
| |
| #endif |
| |
| // |
| // FIXME(allanmac): use a specialization constant to steer codegen for |
| // different color depths or multi-plane images. |
| // |
| // Multi-plane might be optimal because the R/G/B arrays can be |
| // directly copied? |
| // |
| |
| #ifndef SPN_RENDER_STORE_TO_SURFACE_REFLECTED |
| // |
| // X |
| // +-------> |
| // | |
| // Y | |
| // | |
| // v |
| // |
void
spn_tile_color_acc_store_to_surface()
{
  // tile origin in pixels, uniform across the subgroup
  SPN_SUBGROUP_UNIFORM const uint x_uni = SPN_TTCK_GET_X(lgf_lxy) * SPN_TILE_WIDTH;
  SPN_SUBGROUP_UNIFORM const uint y_uni = SPN_TTCK_GET_Y(lgf_lxy) * SPN_TILE_HEIGHT;

  // this lane's pixel coordinate for its first register row
  ivec2 xy = ivec2(x_uni + SPN_RENDER_SUBTILE_LANE_TO_X(gl_SubgroupInvocationID),
                   y_uni + SPN_RENDER_SUBTILE_LANE_TO_Y(gl_SubgroupInvocationID));

  // each expansion stores one accumulator row then steps x to the next
  // column owned by this lane
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                                   \
  imageStore(surface, xy, color_acc##I);                                                           \
  xy.x += SPN_RENDER_SUBTILE_COUNT;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| #else |
| // |
| // Y |
| // +-------> |
| // | |
| // X | REFLECTED |
| // | |
| // v |
| // |
void
spn_tile_color_acc_store_to_surface()
{
  // tile origin in pixels, uniform across the subgroup
  SPN_SUBGROUP_UNIFORM const uint x_uni = SPN_TTCK_GET_X(lgf_lxy) * SPN_TILE_WIDTH;
  SPN_SUBGROUP_UNIFORM const uint y_uni = SPN_TTCK_GET_Y(lgf_lxy) * SPN_TILE_HEIGHT;

  // reflected layout: the tile's X maps to the surface's second axis
  ivec2 xy = ivec2(y_uni + SPN_RENDER_SUBTILE_LANE_TO_Y(gl_SubgroupInvocationID),
                   x_uni + SPN_RENDER_SUBTILE_LANE_TO_X(gl_SubgroupInvocationID));

  // each expansion stores one accumulator row then steps the reflected
  // coordinate to the next column owned by this lane
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L)                                                                   \
  imageStore(surface, xy, color_acc##I);                                                           \
  xy.y += SPN_RENDER_SUBTILE_COUNT;

  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
| |
| #endif |
| |
| // |
| // The default "TTCKS_USE_SHUFFLE" will load a subgroup size of TTCK |
| // keys in registers and index them with a subgroup shuffle. |
| // |
| // The "TTCKS_USE_SHARED" switch enables loading a number of TTCK keys |
| // and storing them to shared memory. |
| // |
| // The "TTCKS_NO_SHARED" switch results in one TTCK key being loaded |
| // at a time. |
| // |
| |
| #if defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED) |
| |
| shared uvec2 ttck_smem[SPN_RENDER_SUBGROUP_SIZE]; // this could be smaller |
| |
| #endif |
| |
| // |
| // The "STYLING_CMDS_USE_SHUFFLE" is to load up to a subgroup size of |
| // commands in registers and index them with a subgroup shuffle. |
| // |
| // The "STYLING_CMDS_USE_SHARED" switch enables loading a number of |
| // styling commands and storing them to shared memory. |
| // |
| // The "STYLING_CMDS_NO_SHARED" switch is an even lower performance |
| // implementation that reads commands one at a time from global |
| // memory. |
| // |
| |
| #if defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE) |
| |
| #if SPN_RENDER_SUBGROUP_SIZE < SPN_STYLING_CMDS_MAX_COUNT |
| #error "SPN_RENDER_SUBGROUP_SIZE < SPN_STYLING_CMDS_MAX_COUNT" |
| #endif |
| |
| #endif |
| |
| // |
| // |
| // |
| |
| #if defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHARED) |
| |
| shared uint spn_cmds[SPN_STYLING_CMDS_MAX_COUNT]; |
| |
| #endif |
| |
| // |
| // |
| // |
| |
//
// Entry point: each subgroup processes the run of TTCK keys belonging
// to one tile.  Per layer it scatters the layer's TTXB blocks into the
// tile accumulator, then walks the layer/group styling command stream,
// which eventually stores the composited tile to the surface.
//
void
main()
{
#if (SPN_RENDER_SUBGROUPS == 1)
  //
  // A workgroup contains a single subgroup
  //
  SPN_SUBGROUP_UNIFORM
  const uint ttck_offset_idx = gl_WorkGroupID.x;
#else
  //
  // A workgroup contains multiple subgroups. Subgroups with no work exit early.
  //
  SPN_SUBGROUP_UNIFORM
  const uint ttck_offset_idx = gl_WorkGroupID.x * SPN_RENDER_SUBGROUPS + gl_SubgroupID;

  if (ttck_offset_idx >= offsets_count[0])
    return;
#endif

  //
  //
  //

  SPN_SUBGROUP_UNIFORM const uint ttcks_count_minus_1 = ttcks_count[0] - 1;

  //
  // load the starting ttck for this offset and get a bound on the max
  // number of keys that might be loaded
  //
  // then load one or more TTCK keys
  //
#if defined(SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE)
  //
  // SHUFFLE
  //
  SPN_SUBGROUP_UNIFORM const uint ttck_base = offsets[ttck_offset_idx];

  // align on a subgroup
  uint ttck_idx_next = (ttck_base & ~SPN_RENDER_SUBGROUP_MASK) + gl_SubgroupInvocationID;

  // row of TTCK keys in registers
  uvec2 ttck_sg;

  {
    SPN_SUBGROUP_UNIFORM const uint ttck_lane = (ttck_base & SPN_RENDER_SUBGROUP_MASK);

    // lanes below the starting lane or past the last key hold no key
    const bool is_valid =
      (gl_SubgroupInvocationID >= ttck_lane) && (ttck_idx_next <= ttcks_count_minus_1);

    if (is_valid)
      ttck_sg = ttcks_keys[ttck_idx_next];

    ttck_idx_next += SPN_RENDER_SUBGROUP_SIZE;

    // broadcast the first key's layer and tile coordinate
    lgf_lxy[0] = subgroupShuffle(ttck_sg[0], ttck_lane) & SPN_TTCK_LO_MASK_LAYER;
    lgf_lxy[1] = subgroupShuffle(ttck_sg[1], ttck_lane);

    // bit-twiddle invalid keys so they mismatch: ~xy
    if (!is_valid)
      ttck_sg[1] = ~lgf_lxy[1];
  }

#elif defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED)
  //
  // SHARED
  //
  SPN_SUBGROUP_UNIFORM uint ttck_idx_next = offsets[ttck_offset_idx];

  {
    const uint ttck_idx_aligned =
      (ttck_idx_next & ~SPN_RENDER_SUBGROUP_MASK) + gl_SubgroupInvocationID;

    SPN_SUBGROUP_UNIFORM const uint ttck_lane = (ttck_idx_next & SPN_RENDER_SUBGROUP_MASK);

    const bool is_valid =
      (gl_SubgroupInvocationID >= ttck_lane) && (ttck_idx_aligned <= ttcks_count_minus_1);

    uvec2 ttck_new = { 0, 0 };

    if (is_valid)
      ttck_new = ttcks_keys[ttck_idx_aligned];

    ttck_smem[gl_SubgroupInvocationID] = ttck_new;

    subgroupMemoryBarrierShared();

    SPN_SUBGROUP_UNIFORM const uvec2 ttck_first = ttck_smem[ttck_lane];

    // bit-twiddle invalid keys so they mismatch
    if (!is_valid)
      ttck_smem[gl_SubgroupInvocationID][1] = ~ttck_first[1]; // ~xy

    lgf_lxy[0] = ttck_first[0] & SPN_TTCK_LO_MASK_LAYER;
    lgf_lxy[1] = ttck_first[1];
  }

#elif defined(SPN_DEVICE_RENDER_TTCKS_NO_SHARED)
  //
  // NO SHARED
  //
  SPN_SUBGROUP_UNIFORM uint ttck_idx_next = offsets[ttck_offset_idx];
  SPN_SUBGROUP_UNIFORM uvec2 ttck_sgu; // subgroup uniform TTCK key
  {
    ttck_sgu = ttcks_keys[ttck_idx_next++];
    lgf_lxy[0] = ttck_sgu[0] & SPN_TTCK_LO_MASK_LAYER;
    lgf_lxy[1] = ttck_sgu[1];
  }

#endif

  //
  // evaluate the coarse clip as late as possible
  //
  SPN_SUBGROUP_UNIFORM const uint ttck_x = SPN_TTCK_GET_X(lgf_lxy);

  if (ttck_x < render_clip[0])
    return;

  if (ttck_x >= render_clip[2])
    return;

  SPN_SUBGROUP_UNIFORM const uint ttck_y = SPN_TTCK_GET_Y(lgf_lxy);

  if (ttck_y < render_clip[1])
    return;

  if (ttck_y >= render_clip[3])
    return;

  //
  // initialize rendering and styling state
  //
  // save the first key so we know what tile we're in
  //
  spn_lgf_init();

  //
  // load -> scatter -> flush
  //
  do
    {
      // clear the accumulator for this layer
      spn_tile_smem_zero();

      // load the layer we're working on
      SPN_SUBGROUP_UNIFORM const uint layer_id = spn_ttck_get_layer_uni(lgf_lxy);

      spn_lgf_layer_load(layer_id);

      // do we need to skip all keys on this layer because the tile
      // was marked as opaque or for some other reason?
      SPN_SUBGROUP_UNIFORM const bool is_scatter = spn_lgf_flag_is_scatter_noskip();

      //
      // load and scatter all TTXBs on this layer
      //
#ifdef SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE
      //
      // SHUFFLE IS SUPPORTED
      //
      while (true)
        {
          //
          // How many matches? Note that matches will be contiguous.
          //
          const bool lxy_equal = spn_ttck_lxy_equal(ttck_sg, lgf_lxy);
          SPN_SUBGROUP_UNIFORM const uvec4 match = subgroupBallot(lxy_equal);
          SPN_SUBGROUP_UNIFORM uint count = subgroupBallotBitCount(match);
          SPN_SUBGROUP_UNIFORM uint last = 0;

          if ((count > 0) && is_scatter)
            {
              SPN_SUBGROUP_UNIFORM uint next = subgroupBallotFindLSB(match);

              last = next + count;

#if (SPN_RENDER_SUBTILE_COUNT == 1)
              //
              // SUBTILES == 1
              //
              // one matching key is processed per iteration
              for (; next < last; next += SPN_RENDER_SUBTILE_COUNT)
                {
                  SPN_SUBGROUP_UNIFORM const uint ttck_lo = subgroupShuffle(ttck_sg[0], next);
                  SPN_SUBGROUP_UNIFORM const bool is_ttpb = SPN_TTCK_LO_IS_PREFIX(ttck_lo);

                  const uint ttxb_id = SPN_TTCK_LO_GET_TTXB_ID(ttck_lo);
                  const uint ttxb_base = ttxb_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS;

                  const SPN_RENDER_TTX ttx = int(bp_blocks[ttxb_base + gl_SubgroupInvocationID]);

                  if (is_ttpb)
                    {
                      spn_tile_scatter_ttpb(ttx);
                    }
                  else
                    {
                      spn_tile_scatter_ttsb(ttx);
                    }
                }
#else
              //
              // SUBTILES >= 2
              //
              // each group of SPN_TILE_HEIGHT lanes handles one key

              // hopefully these lane constants get hoisted upwards as necessary
              const uint subtile_idx = gl_SubgroupInvocationID >> SPN_DEVICE_TILE_HEIGHT_LOG2;
              const uint subtile_iid = gl_SubgroupInvocationID & SPN_TILE_HEIGHT_MASK;

              for (; next < last; next += SPN_RENDER_SUBTILE_COUNT)
                {
                  //
                  // NOTE: we don't care if the shuffle index is out of bounds
                  //
                  const uint next_subtile = next + subtile_idx;
                  const bool is_valid_subtile = (next_subtile < last);
                  const uint ttck_lo = subgroupShuffle(ttck_sg[0], next_subtile);

                  // predicates valid subtiles
                  if (is_valid_subtile)
                    {
                      const uint ttxb_id = SPN_TTCK_LO_GET_TTXB_ID(ttck_lo);
                      const uint ttxb_base = ttxb_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS;

                      const SPN_RENDER_TTX ttx = int(bp_blocks[ttxb_base + subtile_iid]);

                      //
                      // NOTE(allanmac): POTENTIAL OPTIMIZATION -- It's
                      // not a requirement, but sorting against all
                      // 64-bits of the TTCK keys results in all PREFIX
                      // keys being placed at the end of a LXY sequence.
                      //
                      const bool is_ttpb = SPN_TTCK_LO_IS_PREFIX(ttck_lo);

                      if (is_ttpb)
                        {
                          spn_tile_scatter_ttpb(ttx, subtile_iid);
                        }
                      else
                        {
                          spn_tile_scatter_ttsb(ttx);
                        }
                    }
                }
#endif
            }

          //
          // Is the subgroup out of keys?
          //
          if (last == SPN_RENDER_SUBGROUP_SIZE)
            {
              // mark all keys invalid
              last = 0;
              ttck_sg[1] = ~lgf_lxy[1];

              if (ttck_idx_next <= ttcks_count_minus_1)
                ttck_sg = ttcks_keys[ttck_idx_next];

              ttck_idx_next += SPN_RENDER_SUBGROUP_SIZE;
            }

          // broadcast the next unprocessed key
          SPN_SUBGROUP_UNIFORM const uvec2 ttck_first = {

            subgroupShuffle(ttck_sg[0], last),
            subgroupShuffle(ttck_sg[1], last)
          };

          // is this a new LXY?
          if (spn_ttck_lxy_neq_uni(ttck_first, lgf_lxy))
            {
              if (spn_ttck_hi_xy_equal_uni(ttck_first[1], lgf_lxy[1]))
                {
                  // this is a new layer and the ttck is the new lxy
                  lgf_lxy[0] = ttck_first[0] & SPN_TTCK_LO_MASK_LAYER;
                  lgf_lxy[1] = ttck_first[1];
                }
              else
                {
                  // no more tiles left to process!
                  spn_lgf_flag_set_flush_finalize();
                }
              break;
            }
        }

#elif defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED)
      //
      // SHARED
      //
      while (true)
        {
          SPN_SUBGROUP_UNIFORM const uint ttck_lane = (ttck_idx_next & SPN_RENDER_SUBGROUP_MASK);
          SPN_SUBGROUP_UNIFORM const uvec2 ttck = ttck_smem[ttck_lane];

          // is this a new LXY?
          if (spn_ttck_lxy_neq_uni(ttck, lgf_lxy))
            {
              if (spn_ttck_hi_xy_equal_uni(ttck[1], lgf_lxy[1]))
                {
                  // this is a new layer and the ttck is the new lxy
                  lgf_lxy[0] = ttck[0] & SPN_TTCK_LO_MASK_LAYER;
                  lgf_lxy[1] = ttck[1];
                }
              else
                {
                  // no more tiles left to process
                  spn_lgf_flag_set_flush_finalize();
                }
              break;
            }

          //
          // scatter the key?
          //
          if (is_scatter)
            {
              SPN_SUBGROUP_UNIFORM const bool is_ttpb = SPN_TTCK_LO_IS_PREFIX(ttck[0]);
              SPN_SUBGROUP_UNIFORM const uint ttxb_id = SPN_TTCK_LO_GET_TTXB_ID(ttck[0]);
              SPN_SUBGROUP_UNIFORM const uint ttxb_base = ttxb_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS;

              const SPN_RENDER_TTX ttx = int(bp_blocks[ttxb_base + gl_SubgroupInvocationID]);

              if (is_ttpb)
                {
                  spn_tile_scatter_ttpb(ttx);
                }
              else
                {
                  spn_tile_scatter_ttsb(ttx);
                }
            }

          //
          // are we now out of keys?
          //
          if ((++ttck_idx_next & SPN_RENDER_SUBGROUP_MASK) == 0)
            {
              const uint ttck_idx_aligned = ttck_idx_next + gl_SubgroupInvocationID;

              const bool is_valid = (ttck_idx_aligned <= ttcks_count_minus_1);

              // invalid lanes get a key that can never match
              uvec2 ttck_new = { 0, ~lgf_lxy[1] };

              if (is_valid)
                ttck_new = ttcks_keys[ttck_idx_aligned];

              ttck_smem[gl_SubgroupInvocationID] = ttck_new;

              subgroupMemoryBarrierShared();
            }
        }

#elif defined(SPN_DEVICE_RENDER_TTCKS_NO_SHARED)
      //
      // NO SHARED
      //
#endif

      //
      // given: new layer id from ttxk key
      //
      // load [layer id]{ group id, depth }
      //
      // if within current group's layer range
      //
      // if at same depth
      //
      // load and execute cover>[mask>]color>blend commands
      //
      // else if not at same depth then move deeper
      //
      // for all groups in group trail from cur depth to new depth
      // enter group, saving and initializing regs as necessary
      // increment depth and update layer range
      // load and execute cover>[mask>]color>blend commands
      //
      // else not within layer range
      //
      // exit current group, restoring regs as necessary
      // decrement depth and update layer range

      // clear flag that controls group/layer traversal
      spn_lgf_flag_clear_flush_complete();

      do
        {
          SPN_SUBGROUP_UNIFORM const bool unwind = spn_lgf_flag_is_flush_unwind();

          //
          // is layer a child of the current parent group?
          //
          SPN_SUBGROUP_UNIFORM uint cmd_next;

          if (!unwind && spn_lgf_layer_parent_equals_group())
            {
              // if there are no more TTCK keys then configure the loop
              // so groups get unwound until done
              spn_lgf_if_not_flush_finalize_then_complete_else_unwind();

              // execute this layer's cmds
              cmd_next = spn_lgf_get_layer_cmds();
            }
          else if (!unwind && spn_lgf_layer_in_group_range(layer_id))
            {
              //
              // is layer in a child group?
              //
              spn_lgf_load_child_group();

              // enter new group
              cmd_next = spn_lgf_get_group_cmds_enter();
            }
          else // otherwise, exit this group
            {
              // leave current group
              cmd_next = spn_lgf_get_group_cmds_leave();

              // load parent group
              spn_lgf_load_parent_group();
            }

          //
          // execute cmds
          //
          // currently limited to 8 commands -- a subgroup size of 4 will
          // break this but is easily fixed or avoided by using shared
          // memory or reading the commands one at a time.
          //
          // implicitly add 1 to the cmd_count
          //
          // FIXME -- all tiles will be picking their way through the
          // smallish styling buffer so performing these subgroup uniform
          // reads through the texture cache (or equivalent) would
          // probably be a performance win.
          //
          SPN_SUBGROUP_UNIFORM const uint cmd_base = SPN_STYLING_CMDS_GET_BASE(cmd_next);
          SPN_SUBGROUP_UNIFORM const uint cmd_count = SPN_STYLING_CMDS_GET_COUNT(cmd_next);

#if defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE)
          //
          // DEFAULT
          //
#define SPN_STYLING_CMDS_LOAD(ii_) subgroupShuffle(cmds, ii_)

          uint cmds;

          // "<=" because cmd_count is implicitly one less than the
          // number of commands (see comment above)
          if (gl_SubgroupInvocationID <= cmd_count)
            {
              cmds = styling[cmd_base + gl_SubgroupInvocationID];
            }

#elif defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHARED)
          //
          // ONLY SUBGROUP BASIC SUPPORT
          //
          // load a number of commands into shared
          //
#if SPN_RENDER_SUBGROUP_SIZE >= SPN_STYLING_CMDS_MAX_COUNT
          if (gl_SubgroupInvocationID <= cmd_count)
            spn_cmds[gl_SubgroupInvocationID] = styling[cmd_base + gl_SubgroupInvocationID];
#else
          for (uint ii = gl_SubgroupInvocationID; ii <= cmd_count; ii += SPN_RENDER_SUBGROUP_SIZE)
            {
              spn_cmds[ii] = styling[cmd_base + ii];
            }
#endif

#define SPN_STYLING_CMDS_LOAD(ii_) spn_cmds[ii_]

#elif defined(SPN_DEVICE_RENDER_STYLING_CMDS_NO_SHARED)
          //
          // ONLY SUBGROUP BASIC SUPPORT
          //
          // load each command from styling buffer
          //
#define SPN_STYLING_CMDS_LOAD(ii_) styling[cmd_base + ii_]

#endif

          // dispatch each styling opcode; multi-dword opcodes consume
          // their operands by incrementing ii inline
          for (SPN_SUBGROUP_UNIFORM uint ii = 0; ii < cmd_count; ii++)
            {
              SPN_SUBGROUP_UNIFORM uint cmd = SPN_STYLING_CMDS_LOAD(ii);

              switch (cmd)
                {
                  case SPN_STYLING_OPCODE_NOOP:
                    break;

                  case SPN_STYLING_OPCODE_COVER_NONZERO:
                    spn_tile_cover_nonzero();
                    break;

                  case SPN_STYLING_OPCODE_COVER_EVENODD:
                    spn_tile_cover_evenodd();
                    break;

                  case SPN_STYLING_OPCODE_COVER_ACCUMULATE:
                    spn_tile_cover_accumulate();
                    break;

                  case SPN_STYLING_OPCODE_COVER_MASK:
                    spn_tile_cover_wip_mask();
                    break;

                  case SPN_STYLING_OPCODE_COVER_WIP_ZERO:
                    spn_tile_cover_wip_zero();
                    break;

                  case SPN_STYLING_OPCODE_COVER_ACC_ZERO:
                    spn_tile_cover_acc_zero();
                    break;

                  case SPN_STYLING_OPCODE_COVER_MASK_ZERO:
                    spn_tile_cover_msk_zero();
                    break;

                  case SPN_STYLING_OPCODE_COVER_MASK_ONE:
                    spn_tile_cover_msk_one();
                    break;

                  case SPN_STYLING_OPCODE_COVER_MASK_INVERT:
                    spn_tile_cover_msk_invert();
                    break;

                  case SPN_STYLING_OPCODE_COLOR_FILL_SOLID: {
                    // two operand dwords: packed rg and ba half pairs
                    SPN_SUBGROUP_UNIFORM const uint rg = SPN_STYLING_CMDS_LOAD(++ii);
                    SPN_SUBGROUP_UNIFORM const uint ba = SPN_STYLING_CMDS_LOAD(++ii);
                    spn_tile_color_fill_solid(rg, ba);
                  }
                  break;

                  case SPN_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
                    //
                    // FIXME -- gradients shouldn't be executing so much
                    // conditional driven code at runtime since we *know*
                    // the gradient style on the host can just create a
                    // new styling command to exploit this.
                    //
                    // FIXME -- it might be time to try using the GPU's
                    // sampler on a linear array of half4 vectors -- it
                    // might outperform the explicit load/lerp routines.
                    //
                    // FIXME -- optimizing for vertical gradients (uhhh,
                    // they're actually horizontal due to the -90 degree
                    // view transform) is nice but is it worthwhile to
                    // have this in the kernel? Easy to add it back...
                    //
                    // spn_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);

                    // disable gradients for now
                    //
                    // NOTE(review): this advances cmd_next rather than
                    // the loop index ii -- presumably gradients never
                    // reach this kernel while disabled; confirm before
                    // re-enabling.
                    cmd_next += SPN_GRADIENT_CMD_DWORDS_V1(styling[cmd_next + 6]);
                    break;

                  case SPN_STYLING_OPCODE_COLOR_WIP_ZERO:
                    spn_tile_color_wip_zero();
                    break;

                  case SPN_STYLING_OPCODE_COLOR_ACC_ZERO:
                    spn_tile_color_acc_zero();
                    break;

                  case SPN_STYLING_OPCODE_BLEND_OVER:
                    spn_tile_blend_over();
                    break;

                  case SPN_STYLING_OPCODE_BLEND_PLUS:
                    spn_tile_blend_plus();
                    break;

                  case SPN_STYLING_OPCODE_BLEND_MULTIPLY:
                    spn_tile_blend_multiply();
                    break;

                  case SPN_STYLING_OPCODE_BLEND_KNOCKOUT:
                    spn_tile_blend_knockout();
                    break;

                  case SPN_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
                    spn_tile_cover_msk_copy_wip();
                    break;

                  case SPN_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
                    spn_tile_cover_msk_copy_acc();
                    break;

                  case SPN_STYLING_OPCODE_COLOR_ACC_OVER_BACKGROUND: {
                    // two operand dwords: packed rg and ba half pairs
                    SPN_SUBGROUP_UNIFORM const uint rg = SPN_STYLING_CMDS_LOAD(++ii);
                    SPN_SUBGROUP_UNIFORM const uint ba = SPN_STYLING_CMDS_LOAD(++ii);
                    spn_tile_color_acc_over_background(rg, ba);
                  }
                  break;

                  case SPN_STYLING_OPCODE_COLOR_ACC_STORE_TO_SURFACE:
                    spn_tile_color_acc_store_to_surface();
                    break;

                  case SPN_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
                    spn_tile_color_acc_test_opacity();
                    break;

                    // default:
                    //   return; // this is an illegal opcode -- trap and die!
                }
            }
        } // continue as long as tile flush isn't complete
      while (spn_lgf_flag_is_not_flush_complete());

    } // continue as long as there are still keys in this tile
  while (spn_lgf_flag_is_not_flush_finalize());
}
| |
| // |
| // |
| // |