blob: 25744a50be27ae839ea4cd73166de210c72601d6 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
#extension GL_GOOGLE_include_directive : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_EXT_shader_explicit_arithmetic_types : require
//
// RENDER KERNEL
//
#include "spn_config.h"
#include "vk_layouts.h"
//
// COLOR/COVER CHANNELS ARE DETERMINED BY TARGET HARDWARE
//
// clang-format off
//
//
// SINGLE PRECISION FLOAT
//
#if defined(SPN_DEVICE_RENDER_TILE_CHANNEL_IS_FLOAT32)
#define SPN_RENDER_TILE_CHANNEL float
#define SPN_RENDER_TILE_COVER SPN_RENDER_TILE_CHANNEL
#define SPN_RENDER_TILE_COLOR vec4
#define SPN_RENDER_PIXEL_COVER float
#define SPN_RENDER_TILE_CHANNEL_IS_ZERO(c) ((c) == SPN_RENDER_TILE_CHANNEL(0))
#define SPN_RENDER_COLOR_UNPACK(rg32,ba32) \
SPN_RENDER_TILE_COLOR(unpackHalf2x16(rg32),unpackHalf2x16(ba32));
#define SPN_RENDER_COLOR_ACC_RGBA vec4
//
// HALF PRECISION FLOAT (FP16)
//
#elif defined(SPN_DEVICE_RENDER_TILE_CHANNEL_IS_FLOAT16)
#ifdef SPN_DEVICE_AMD_GCN3
#extension GL_AMD_gpu_shader_half_float : require // GCN3/AMDVLK disables float16
#endif
#define SPN_RENDER_TILE_CHANNEL float16_t
#define SPN_RENDER_TILE_COVER SPN_RENDER_TILE_CHANNEL
#define SPN_RENDER_TILE_COLOR f16vec4
#define SPN_RENDER_PIXEL_COVER float16_t
#define SPN_RENDER_TILE_CHANNEL_IS_ZERO(c) ((c) == SPN_RENDER_TILE_CHANNEL(0))
#define SPN_RENDER_COLOR_UNPACK(rg32,ba32) \
SPN_RENDER_TILE_COLOR(unpackFloat2x16(rg32),unpackFloat2x16(ba32));
#define SPN_RENDER_COLOR_ACC_RGBA f16vec4
#else
#error "SPN_DEVICE_RENDER_TILE_CHANNEL_IS_FLOATXX is not defined!"
#endif
//
// COMMON DEFINES
//
#define SPN_RENDER_WORKGROUP_SIZE (1 << SPN_DEVICE_RENDER_WORKGROUP_SIZE_LOG2)
#define SPN_RENDER_SUBGROUP_SIZE (1 << SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2)
#define SPN_RENDER_SUBGROUP_MASK SPN_GLSL_BITS_TO_MASK(SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2)
#define SPN_RENDER_SUBGROUPS (SPN_DEVICE_RENDER_WORKGROUP_SIZE / SPN_RENDER_SUBGROUP_SIZE)
#define SPN_RENDER_SUBTILE_COUNT_LOG2 (SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2 - SPN_DEVICE_TILE_HEIGHT_LOG2)
#define SPN_RENDER_SUBTILE_COUNT (1 << SPN_RENDER_SUBTILE_COUNT_LOG2)
#define SPN_RENDER_TTS SPN_RENDER_TTX
#define SPN_RENDER_TTP SPN_RENDER_TTX
#define SPN_RENDER_SUBTILE_WIDTH_LOG2 (SPN_DEVICE_TILE_WIDTH_LOG2 + SPN_DEVICE_TILE_HEIGHT_LOG2 - SPN_DEVICE_RENDER_SUBGROUP_SIZE_LOG2)
#define SPN_RENDER_SUBTILE_WIDTH (1 << SPN_RENDER_SUBTILE_WIDTH_LOG2)
//
// Make sure the config has all necessary steering switches
//
#if !defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE) && \
!defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
#error "SPN_DEVICE_RENDER_LGF_XXX undefined!"
#endif
#if !defined(SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE) && \
!defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED) && \
!defined(SPN_DEVICE_RENDER_TTCKS_NO_SHARED)
#error "SPN_DEVICE_RENDER_TTCKS_XXX undefined!"
#endif
#if !defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE) && \
!defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHARED) && \
!defined(SPN_DEVICE_RENDER_STYLING_CMDS_NO_SHARED)
#error "SPN_DEVICE_RENDER_STYLING_CMDS_XXX undefined!"
#endif
#if (SPN_RENDER_SUBTILE_COUNT > 1) && \
!defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHUFFLE) && \
!defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHARED)
#error "SPN_DEVICE_RENDER_COVERAGE_XXX undefined!"
#endif
//
// Coarsely enable all advanced subgroup features if we are using any
// shuffle. Improve this switch if a new architecture requires it.
//
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE) || \
defined(SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE) || \
defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE) || \
defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHUFFLE)
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_shuffle_relative : require
#endif
//
// Do we have vote support?
//
#ifndef SPN_DEVICE_RENDER_NO_VOTE
#extension GL_KHR_shader_subgroup_vote : require
#endif
//
//
//
layout(local_size_x = SPN_DEVICE_RENDER_WORKGROUP_SIZE) in;
//
//
//
SPN_VK_GLSL_DECL_KERNEL_RENDER();
//
// SUBTILE WIDTH EXPANSION
//
#if (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 0)
#define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_1()
#elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 1)
#define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_2()
#elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 2)
#define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_4()
#elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 3)
#define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_8()
#elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 4)
#define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_16()
#elif (SPN_RENDER_SUBTILE_WIDTH_LOG2 == 5)
#define SPN_RENDER_SUBTILE_WIDTH_EXPAND() SPN_EXPAND_32()
#else
#error "SPN_RENDER_SUBTILE_WIDTH_LOG2 not supported!"
#endif
//
// Globally declare tile cover and color registers
//
// Total number of color and cover "channels" is 11.
//
// A channel is represented with either a float16 or float32.
//
// If the target hardware supports float16 and the driver isn't
// broken, then use float16.
//
// This occupies '11 * SPN_RENDER_SUBTILE_WIDTH' channels per
// subgroup lane.
//
// Tile Size
// Subgroup +----------------------------
// Size | 4x4 8x8 16x16 32x32
// ---------+----------------------------
// 4 | 44 176 704 2816
// 8 | 22 88 352 1408
// 16 | 11 44 176 704
// 32 | --- 22 88 352
// 64 | --- 11 44 176
//
//
// COLOR
//
// color_wip
// color_acc
//
#ifndef SPN_RENDER_TILE_COLOR_WIP_ENABLED
// this results in the color_wip being a scalar -- per thread or per
// subgroup depending on the capability of the target arch.
SPN_SUBGROUP_UNIFORM SPN_RENDER_TILE_COLOR color_wip;
#define SPN_RENDER_TILE_COLOR_WIP(I) color_wip
#else
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COLOR color_wip##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
#define SPN_RENDER_TILE_COLOR_WIP(I) color_wip##I
#endif
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COLOR color_acc##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
//
// COVER
//
// cover_wip
// cover_acc
// cover_msk
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COVER cover_wip##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
// cover_acc
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COVER cover_acc##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) SPN_RENDER_TILE_COVER cover_msk##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
//
// GENERAL DATA TYPES THAT WE MAY WANT TO TWEAK LATER
//
// NOTE(allanmac): these could vary in OpenCL
//
#define SPN_RENDER_TTX int
#define SPN_RENDER_PIXEL_AREA int
//
// Shared memory is primarily for accumulating areas but is also
// used as a scratch buffer for gradients and other operations that
// might require random-access lookups.
//
//
// Per-subgroup scratch: one 32-bit area accumulator per tile texel
// plus one trailing guard column (the "+ 1") that TTSB scatters may
// write but that pixel reads never consume.
//
#define SPN_RENDER_TILE_SMEM_DWORDS ((SPN_TILE_WIDTH + 1) * SPN_TILE_HEIGHT)
struct spn_subgroup_smem
{
  SPN_RENDER_PIXEL_AREA area[SPN_RENDER_TILE_SMEM_DWORDS];
};
//
// One smem instance per subgroup resident in the workgroup.
//
#if (SPN_RENDER_SUBGROUPS == 1)
shared spn_subgroup_smem smem;
#define SPN_RENDER_SMEM() smem
#else
shared spn_subgroup_smem smem[SPN_RENDER_SUBGROUPS];
//
// BUGFIX: shared memory is per-workgroup, so the slot must be selected
// by the subgroup's index *within* the workgroup (gl_SubgroupID from
// GL_KHR_shader_subgroup_basic).  The previous gl_WorkGroupID.x index
// aliased every subgroup of a workgroup onto a single slot and indexed
// out of bounds for dispatches wider than SPN_RENDER_SUBGROUPS.
//
#define SPN_RENDER_SMEM() smem[gl_SubgroupID]
#endif
//
// render flags
//
//
// FIXME: testing for opacity and skipping scattering is on its way to
// becoming a much more programmable option because sometimes we may
// be compositing/blending from back-to-front and/or be using group
// blend rules that ignore opacity.
//
// The point is that all of these decisions should be encoded in
// styling commands and, as much as possible, removed from the final
// group/layer styling traversal render loop.
//
// FLUSH FLAGS
#define SPN_RENDER_FLAGS_FLUSH_FINALIZE 0x1
#define SPN_RENDER_FLAGS_FLUSH_UNWIND 0x2
#define SPN_RENDER_FLAGS_FLUSH_COMPLETE 0x4
// OPACITY FLAG
#define SPN_RENDER_FLAGS_SCATTER_SKIP 0x8
//
// LGF -- layer / group / flags
//                                                      optional
// | current layer  | current group                     |.......................
// +----------------+-------------+-------+-------------+.......+.......+.......+....
// | layer          | parents     | range | cmds        | layer | group | flags | ...
// | cmds  parent   | depth base  | lo hi | enter leave | id    | id    |       |
// +------+---------+------+------+---+---+------+------+.......+.......+.......+....
//    0       1        2      3     4   5    6      7       8       9      10     11
//
//
// FIXME(allanmac): harmonize these constants with core.h
//
#define SPN_LGF_LAYER_CMDS 0
#define SPN_LGF_LAYER_PARENT 1
#define SPN_LGF_GROUP_PARENTS_DEPTH 2
#define SPN_LGF_GROUP_FIRST SPN_LGF_GROUP_PARENTS_DEPTH
#define SPN_LGF_GROUP_PARENTS_BASE 3
#define SPN_LGF_GROUP_RANGE_LO 4
#define SPN_LGF_GROUP_RANGE_HI 5
#define SPN_LGF_GROUP_CMDS_ENTER 6
#define SPN_LGF_GROUP_CMDS_LEAVE 7
#define SPN_LGF_GROUP_LAST SPN_LGF_GROUP_CMDS_LEAVE
#define SPN_LGF_COUNT 8
//
// SHUFFLE
//
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE)
#define SPN_LGF_BANKS ((SPN_LGF_COUNT + SPN_RENDER_SUBGROUP_SIZE - 1) / SPN_RENDER_SUBGROUP_SIZE)
#define SPN_LGF_BANK(idx) ((idx) / SPN_RENDER_SUBGROUP_SIZE)
#define SPN_LGF_LANE(idx) ((idx)-SPN_LGF_BANK(idx) * SPN_RENDER_SUBGROUP_SIZE)
#define SPN_LGF_IS_LANE(idx) (gl_SubgroupInvocationID == SPN_LGF_LANE(idx))
#define SPN_LGF_LOAD(idx) subgroupBroadcast(lgf[SPN_LGF_BANK(idx)], idx)
uint lgf[SPN_LGF_BANKS]; // subgroup-wide register variable at global scope
//
// SHARED
//
#elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
shared uint lgf[SPN_LGF_COUNT];
#define SPN_LGF_LOAD(idx) lgf[idx]
#endif
//
// clang-format on
//
SPN_SUBGROUP_UNIFORM uvec2 lgf_lxy;
SPN_SUBGROUP_UNIFORM uint lgf_flags;
SPN_SUBGROUP_UNIFORM uint lgf_group_id;
//
//
//
//
// Initialize the LGF (layer/group/flags) cache and its companion
// globals.  Every LGF slot starts invalid (SPN_UINT_MAX) except
// GROUP_RANGE_LO, which starts at 0.
//
void
spn_lgf_init()
{
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE)
//
// SHUFFLE
//
// Each lane owns one LGF slot per bank -- invalidate every bank.
lgf[0] = SPN_UINT_MAX;
#if (SPN_LGF_BANKS >= 2)
lgf[1] = SPN_UINT_MAX;
#endif
// Only the lane owning the GROUP_RANGE_LO slot zeroes its register.
if (SPN_LGF_IS_LANE(SPN_LGF_GROUP_RANGE_LO))
lgf[SPN_LGF_BANK(SPN_LGF_GROUP_RANGE_LO)] = 0;
#elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
//
// SHARED
//
// Invalidate all SPN_LGF_COUNT (8) shared dwords using however many
// lanes the subgroup provides.
#if (SPN_RENDER_SUBGROUP_SIZE == 4)
lgf[gl_SubgroupInvocationID + 0] = SPN_UINT_MAX;
lgf[gl_SubgroupInvocationID + 4] = SPN_UINT_MAX;
// CAREFUL -- if gl_SubgroupInvocationID doesn't match!
#elif (SPN_RENDER_SUBGROUP_SIZE == 8)
lgf[gl_SubgroupInvocationID] = SPN_UINT_MAX;
#else // >= 16
if (gl_SubgroupInvocationID < SPN_LGF_COUNT)
lgf[gl_SubgroupInvocationID] = SPN_UINT_MAX;
#endif
// All lanes store the same value -- no race of consequence.
lgf[SPN_LGF_GROUP_RANGE_LO] = 0;
#endif
lgf_flags = 0;
lgf_group_id = SPN_UINT_MAX;
}
void
spn_lgf_flag_set_flush_finalize()
{
  // Request flush finalization.
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_FLUSH_FINALIZE;
}
void
spn_lgf_flag_set_flush_unwind()
{
  // Request a flush unwind.
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_FLUSH_UNWIND;
}
void
spn_lgf_flag_set_flush_complete()
{
  // Mark the flush as complete.
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_FLUSH_COMPLETE;
}
void
spn_lgf_flag_clear_flush_complete()
{
  // Drop the flush-complete bit, leaving the other flags intact.
  lgf_flags = lgf_flags & ~SPN_RENDER_FLAGS_FLUSH_COMPLETE;
}
void
spn_lgf_flag_set_scatter_skip()
{
  // Mark that subsequent scatters for this tile can be skipped.
  lgf_flags = lgf_flags | SPN_RENDER_FLAGS_SCATTER_SKIP;
}
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_scatter_noskip()
{
  // True when the SCATTER_SKIP flag is not set.
  const uint skip = lgf_flags & SPN_RENDER_FLAGS_SCATTER_SKIP;
  return skip == 0;
}
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_flush_unwind()
{
  // True when a flush unwind has been requested.
  const uint unwind = lgf_flags & SPN_RENDER_FLAGS_FLUSH_UNWIND;
  return unwind != 0;
}
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_not_flush_finalize()
{
  // True when flush finalization has not been requested.
  const uint finalize = lgf_flags & SPN_RENDER_FLAGS_FLUSH_FINALIZE;
  return finalize == 0;
}
void
spn_lgf_if_not_flush_finalize_then_complete_else_unwind()
{
  // If finalization isn't pending, the flush is complete; otherwise
  // request an unwind.
  if (spn_lgf_flag_is_not_flush_finalize())
    lgf_flags |= SPN_RENDER_FLAGS_FLUSH_COMPLETE;
  else
    lgf_flags |= SPN_RENDER_FLAGS_FLUSH_UNWIND;
}
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_flag_is_not_flush_complete()
{
  // True while the flush has not been marked complete.
  const uint complete = lgf_flags & SPN_RENDER_FLAGS_FLUSH_COMPLETE;
  return complete == 0;
}
SPN_SUBGROUP_UNIFORM
uint
spn_lgf_get_layer_cmds()
{
  // Fetch the current layer's styling-commands dword from the LGF cache.
  SPN_SUBGROUP_UNIFORM const uint layer_cmds = SPN_LGF_LOAD(SPN_LGF_LAYER_CMDS);
  return layer_cmds;
}
SPN_SUBGROUP_UNIFORM
uint
spn_lgf_get_group_cmds_enter()
{
  // Fetch the current group's "enter" commands dword from the LGF cache.
  SPN_SUBGROUP_UNIFORM const uint cmds_enter = SPN_LGF_LOAD(SPN_LGF_GROUP_CMDS_ENTER);
  return cmds_enter;
}
SPN_SUBGROUP_UNIFORM
uint
spn_lgf_get_group_cmds_leave()
{
  // Fetch the current group's "leave" commands dword from the LGF cache.
  SPN_SUBGROUP_UNIFORM const uint cmds_leave = SPN_LGF_LOAD(SPN_LGF_GROUP_CMDS_LEAVE);
  return cmds_leave;
}
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_layer_in_group_range(SPN_SUBGROUP_UNIFORM const uint layer_id)
{
  //
  // Single-comparison range test via unsigned wraparound:
  // (layer_id - lo) <= (hi - lo) is equivalent to
  // lo <= layer_id <= hi for uints.
  //
  SPN_SUBGROUP_UNIFORM const uint lo  = SPN_LGF_LOAD(SPN_LGF_GROUP_RANGE_LO);
  SPN_SUBGROUP_UNIFORM const uint hi  = SPN_LGF_LOAD(SPN_LGF_GROUP_RANGE_HI);
  SPN_SUBGROUP_UNIFORM const uint rel = layer_id - lo;

  return rel <= (hi - lo);
}
SPN_SUBGROUP_UNIFORM
bool
spn_lgf_layer_parent_equals_group()
{
  // Does the cached layer's parent group match the current group id?
  SPN_SUBGROUP_UNIFORM const uint parent = SPN_LGF_LOAD(SPN_LGF_LAYER_PARENT);
  return parent == lgf_group_id;
}
//
// Load the two per-layer dwords from the styling buffer into the LGF
// cache.  Lanes 0 and 1 each fetch one dword.
//
void
spn_lgf_layer_load(SPN_SUBGROUP_UNIFORM const uint layer_id)
{
//
// Load dwords:
//
// SPN_LGF_LAYER_CMDS 0
// SPN_LGF_LAYER_PARENT 1
//
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE)
//
// SHUFFLE
//
// Bank 0 holds LGF slots [0,subgroup_size) -- lanes 0..1 own the
// layer slots, so only they load.
if (gl_SubgroupInvocationID <= SPN_LGF_LAYER_PARENT)
lgf[0] = styling[layer_id * SPN_STYLING_LAYER_COUNT_DWORDS + gl_SubgroupInvocationID];
#elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
//
// SHARED
//
// Lanes 0..1 store their dword directly into the shared LGF array.
if (gl_SubgroupInvocationID <= SPN_LGF_LAYER_PARENT)
lgf[gl_SubgroupInvocationID] =
styling[layer_id * SPN_STYLING_LAYER_COUNT_DWORDS + gl_SubgroupInvocationID];
#endif
}
//
// Load the six per-group dwords -- at styling[lgf_group_id + 0..5] --
// into LGF slots SPN_LGF_GROUP_FIRST..SPN_LGF_GROUP_LAST.
//
void
spn_lgf_group_load()
{
//
// Load dwords:
//
// SPN_LGF_GROUP_PARENTS_DEPTH 2 (SPN_LGF_GROUP_FIRST)
// SPN_LGF_GROUP_PARENTS_BASE 3
// SPN_LGF_GROUP_RANGE_LO 4
// SPN_LGF_GROUP_RANGE_HI 5
// SPN_LGF_GROUP_CMDS_ENTER 6
// SPN_LGF_GROUP_CMDS_LEAVE 7 (SPN_LGF_GROUP_LAST)
//
#if defined(SPN_DEVICE_RENDER_LGF_USE_SHUFFLE)
//
// SHUFFLE
//
// The subtraction below deliberately wraps for lanes below
// SPN_LGF_GROUP_FIRST, making the unsigned comparison reject them.
// highp = mediump - highp;
const uint lgf_group_first = SPN_LGF_GROUP_FIRST;
const uint iid = gl_SubgroupInvocationID - lgf_group_first;
if (iid < SPN_LGF_GROUP_LAST - SPN_LGF_GROUP_FIRST + 1)
lgf[0] = styling[lgf_group_id + iid];
#if (SPN_RENDER_SUBGROUP_SIZE == 4)
// With 4 lanes, bank 1 carries LGF slots 4..7: lane L fetches group
// dword (2 + L) so SPN_LGF_LANE() indexing stays consistent.
lgf[1] = styling[lgf_group_id + 2 + gl_SubgroupInvocationID];
#endif
#elif defined(SPN_DEVICE_RENDER_LGF_USE_SHARED)
//
// SHARED
//
const uint iid = lgf_group_id + gl_SubgroupInvocationID;
const uint lgf_idx = SPN_LGF_GROUP_FIRST + gl_SubgroupInvocationID;
#if (SPN_RENDER_SUBGROUP_SIZE == 4)
// 4 lanes: two passes cover all 6 group dwords.
lgf[lgf_idx] = styling[iid];
if (gl_SubgroupInvocationID < SPN_LGF_GROUP_LAST - SPN_LGF_GROUP_FIRST + 1 - 4)
lgf[lgf_idx + 4] = styling[iid + 4];
#else // >= 8
if (gl_SubgroupInvocationID < SPN_LGF_GROUP_LAST - SPN_LGF_GROUP_FIRST + 1)
lgf[lgf_idx] = styling[iid];
#endif
#endif
}
//
// Descend one nesting level: make the cached layer's parent group (or
// the ancestor at the expected depth) the current group.
//
// NOTE(review): this assumes the group's parents array is ordered
// nearest-ancestor-first, so entry (offset - 1) is the ancestor at the
// expected depth -- confirm against the styling encoding in core.h.
//
void
spn_lgf_load_child_group()
{
lgf_group_id = SPN_LGF_LOAD(SPN_LGF_LAYER_PARENT);
// Expected depth of the child group: one deeper than the current group.
SPN_SUBGROUP_UNIFORM const uint group_depth_old = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_DEPTH) + 1;
spn_lgf_group_load();
SPN_SUBGROUP_UNIFORM const uint group_depth_new = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_DEPTH);
SPN_SUBGROUP_UNIFORM const uint group_base_offset = group_depth_new - group_depth_old;
if (group_base_offset != 0)
{
// The layer's parent is deeper than expected -- replace it with its
// ancestor at the expected depth via the parents array.
SPN_SUBGROUP_UNIFORM const uint group_base = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_BASE);
SPN_SUBGROUP_UNIFORM const uint group_id_idx = group_base + group_base_offset - 1;
lgf_group_id = styling[group_id_idx];
spn_lgf_group_load();
}
}
//
// Ascend one nesting level: load the current group's immediate parent,
// or mark the flush complete when already at the root (depth 0).
//
// NOTE(review): styling[group_base] is taken to be the immediate
// parent (parents array nearest-first) -- confirm against core.h.
//
void
spn_lgf_load_parent_group()
{
SPN_SUBGROUP_UNIFORM const uint group_depth = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_DEPTH);
if (group_depth == 0)
{
// Root group has no parent -- we're done flushing.
spn_lgf_flag_set_flush_complete();
}
else
{
SPN_SUBGROUP_UNIFORM const uint group_base = SPN_LGF_LOAD(SPN_LGF_GROUP_PARENTS_BASE);
lgf_group_id = styling[group_base];
spn_lgf_group_load();
}
}
//
//
//
bool
spn_ttck_lxy_equal(const uvec2 a, SPN_SUBGROUP_UNIFORM const uvec2 lxy)
{
  // Equal iff the layer bits of the lo dwords and all bits of the hi
  // dwords match.
  // FIXME FIXME
  const uint lo_diff = (a.x ^ lxy.x) & SPN_TTCK_LO_MASK_LAYER;
  const uint hi_diff = a.y ^ lxy.y;

  return (lo_diff | hi_diff) == 0;
}
SPN_SUBGROUP_UNIFORM
bool
spn_ttck_lxy_neq_uni(SPN_SUBGROUP_UNIFORM const uvec2 a, SPN_SUBGROUP_UNIFORM const uvec2 lxy)
{
  // Subgroup-uniform inequality over the { layer, x, y } bits.
  // FIXME FIXME
  const uvec2 diff = a ^ lxy;

  return ((diff.x & SPN_TTCK_LO_MASK_LAYER) | diff.y) != 0;
}
bool
spn_ttck_hi_xy_equal(const uint a, SPN_SUBGROUP_UNIFORM const uint lxy_hi)
{
  // Equal iff the XY bits of the TTCK hi dwords match.
  // FIXME FIXME
  const uint diff = a ^ lxy_hi;

  return (diff & SPN_TTCK_HI_MASK_XY) == 0;
}
SPN_SUBGROUP_UNIFORM
bool
spn_ttck_hi_xy_equal_uni(SPN_SUBGROUP_UNIFORM const uint a, SPN_SUBGROUP_UNIFORM const uint lxy_hi)
{
  // Subgroup-uniform variant: equal iff the XY bits match.
  // FIXME FIXME
  const uint diff = a ^ lxy_hi;

  return (diff & SPN_TTCK_HI_MASK_XY) == 0;
}
SPN_SUBGROUP_UNIFORM
uint
spn_ttck_get_layer_uni(SPN_SUBGROUP_UNIFORM const uvec2 lxy)
{
  // Extract the layer id from a subgroup-uniform TTCK key.
  SPN_SUBGROUP_UNIFORM const uint layer = SPN_TTCK_GET_LAYER(lxy);
  return layer;
}
//
//
//
//
// Zero the subgroup's area accumulators.  The expansion writes
// SPN_RENDER_SUBGROUP_SIZE dwords per step over SPN_RENDER_SUBTILE_WIDTH
// steps -- i.e. SPN_TILE_WIDTH * SPN_TILE_HEIGHT dwords.  The trailing
// guard column is intentionally left untouched since it is never read.
//
void
spn_tile_smem_zero()
{
//
// Note that atomic_init() is likely implemented as a simple
// assignment so there is no identifiable performance difference on
// current targets.
//
// If such an architecture appears in the future then we'll probably
// still want to implement this zero'ing operation as below but
// follow with an appropriate fence that occurs before any scatter
// operations.
//
// FIXME: try to (re)implement 8-byte writes in GLSL for GEN9
//
// NOT IMPLEMENTED:
//
// Intel GENx has a documented 64 byte per cycle SLM write limit.
// So having each lane in an 8 lane subgroup zero-write 8 bytes is
// probably a safe bet (Later: benchmarking backs this up!).
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
SPN_RENDER_SMEM().area[gl_SubgroupInvocationID + I * SPN_RENDER_SUBGROUP_SIZE] = 0;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
// Note this is going to be vectorizable on most architectures.
//
// The return of the key translation feature might complicate things.
//
#if (SPN_RENDER_SUBTILE_COUNT == 1)
//
// Accumulate a TTP altitude into the lane's area slot, scaled to a
// full pixel column (2 * SPN_TTS_SUBPIXEL_Y_SIZE subpixel rows).
// One subtile per subgroup: each lane owns its slot, so a plain
// add suffices -- no atomics needed.
//
void
spn_tile_scatter_ttpb(const SPN_RENDER_TTP ttp)
{
if (ttp != 0)
{
const int area = ttp * SPN_TTS_SUBPIXEL_Y_SIZE * 2;
SPN_RENDER_SMEM().area[gl_SubgroupInvocationID] += area;
}
}
#else // SPN_RENDER_SUBTILE_COUNT >= 2
//
// Multiple subtiles per subgroup: lanes from different subtiles may
// target the same slot 'iid', so the add must be atomic.
//
void
spn_tile_scatter_ttpb(const SPN_RENDER_TTP ttp, const uint iid)
{
if (ttp != 0)
{
const int area = ttp * SPN_TTS_SUBPIXEL_Y_SIZE * 2;
atomicAdd(SPN_RENDER_SMEM().area[iid], area);
}
}
#endif
//
//
//
int
spn_tts_get_dy(const SPN_RENDER_TTS tts)
{
  //
  // The tts.dy bitfield skips zero:
  //
  //   [-32,-1] -> [-32,-1]
  //   [  0,31] -> [  1,32]
  //
  // so non-negative raw values are bumped up by one after extraction.
  //
  const int dy_raw = SPN_TTS_GET_DY(tts);

  return (dy_raw >= 0) ? dy_raw + 1 : dy_raw;
}
//
// Accumulate altitudes and areas -- see docs to understand what's
// going on here with Surveyor's Algorithm.
//
// Note that other coverate calculation algorithms are possible
// because the TTS values encode (flattened) subpixel line segments.
//
// Note that spn_scatter_ttsb is *not* vectorizable unless the
// architecture supports a "scatter-add" capability. All relevant
// GPUs support atomic add on shared/local memory and thus support
// scatter-add.
//
// On a SIMD device without scatter support, the vector components are
// are stored sequentially.
//
//
// Scatter one TTS key's left/right trapezoid contributions into the
// shared area accumulators (Surveyor's Algorithm).  The invalid-key
// predicate is hoisted before the arithmetic only when the device
// requests early testing; otherwise it guards just the atomics.
//
void
spn_tile_scatter_ttsb(const SPN_RENDER_TTS tts)
{
#ifdef SPN_DEVICE_RENDER_TEST_TTS_INVALID_EARLY
if (tts != SPN_TTS_INVALID)
#endif
{
//
// FIXME(allanmac): skipping per-key pixel and subpixel
// translation for now -- implement via a dedicated opcode.
//
// The "min(x0,x1) * 2 + abs(dx)" is equivalent to "x0 + x1"
// and is always positive and <= 1023
const uint tx_sub = SPN_TTS_GET_TX_SUBPIXEL(tts);
const int dx = SPN_TTS_GET_DX(tts);
const int dx_abs = abs(dx);
const uint widths = tx_sub * 2 + dx_abs;
// Calculate left and right coverage contribution trapezoids
const int dy = spn_tts_get_dy(tts);
const int left = dy * int(widths);
// left + right always sums to dy * full pixel width.
const int right = dy * (SPN_TTS_SUBPIXEL_X_SIZE * 2) - left;
//
// The final column is a guard column that is OK to write to
// but will never be read. It simplifies the TTSB scatter but
// could be predicated if SMEM is really at a premium.
//
// Accumulators are laid out column-major: column * height + row.
const uint tx_pix = SPN_TTS_GET_TX_PIXEL(tts);
const uint ty_pix = SPN_TTS_GET_TY_PIXEL(tts);
const uint tile_idx = tx_pix * SPN_TILE_HEIGHT + ty_pix;
//
// GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
//
#ifndef SPN_DEVICE_RENDER_TEST_TTS_INVALID_EARLY
if (tts != SPN_TTS_INVALID)
#endif
{
// 'left' lands one column to the right (possibly the guard column).
atomicAdd(SPN_RENDER_SMEM().area[tile_idx], right);
atomicAdd(SPN_RENDER_SMEM().area[tile_idx + SPN_TILE_HEIGHT], left);
}
}
}
//
// clang-format off
//
#define SPN_PIXEL_SMEM_AREA(I, lane) SPN_RENDER_SMEM().area[lane + I * SPN_RENDER_SUBGROUP_SIZE]
//
// If there are multiple subtiles per subgroup then we need to
// horizontally exclusive scan add the accumulated areas
//
//
// SUBTILE IS ENTIRE TILE
//
#if (SPN_RENDER_SUBTILE_COUNT == 1)
#define SPN_RENDER_PIXEL_AREA_PREAMBLE() // noop
#define SPN_SUBTILE_AREA_SCAN_PRE(I, area) area += SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID)
#define SPN_SUBTILE_AREA_SCAN_POST(area) // noop
//
// MULTIPLE SUBTILES
//
#else
#define SPN_RENDER_SUBTILE_LAST (SPN_RENDER_SUBTILE_COUNT - 1)
#define SPN_RENDER_SUBTILE_LAST_BASE (SPN_RENDER_SUBTILE_LAST * SPN_TILE_HEIGHT)
//
// clang-format on
//
//
// COVERAGE USES SHUFFLE
//
#if defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHUFFLE)
//
// -- SUBTILES COUNT = 2
//
#if (SPN_RENDER_SUBTILE_COUNT_LOG2 == 1)
#define SPN_RENDER_PIXEL_AREA_PREAMBLE() \
const bool is_p0 = (gl_SubgroupInvocationID >= SPN_TILE_HEIGHT); \
SPN_RENDER_PIXEL_AREA total
#define SPN_SUBTILE_AREA_SCAN_PRE(I, area) \
{ \
total = area; \
\
SPN_RENDER_PIXEL_AREA pp = SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID); \
SPN_RENDER_PIXEL_AREA x0 = subgroupShuffleXor(pp, SPN_TILE_HEIGHT); \
SPN_RENDER_PIXEL_AREA rr = pp + x0; \
\
total += rr; \
\
if (is_p0) \
pp = rr; \
\
area += pp; \
}
#define SPN_SUBTILE_AREA_SCAN_POST(area) area = total;
//
// -- SUBTILES COUNT = 4
//
#elif (SPN_RENDER_SUBTILE_COUNT_LOG2 == 2)
#define SPN_RENDER_PIXEL_AREA_PREAMBLE() \
const bool is_p0 = (gl_SubgroupInvocationID & SPN_TILE_HEIGHT) != 0; \
const bool is_p1 = (gl_SubgroupInvocationID >= SPN_TILE_HEIGHT * 2); \
SPN_RENDER_PIXEL_AREA total
#define SPN_SUBTILE_AREA_SCAN_PRE(I, area) \
{ \
total = area; \
\
SPN_RENDER_PIXEL_AREA pp = SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID); \
SPN_RENDER_PIXEL_AREA x0 = subgroupShuffleXor(pp, SPN_TILE_HEIGHT); \
SPN_RENDER_PIXEL_AREA rr = pp + x0; \
\
total += rr; \
\
if (is_p0) \
pp = rr; \
\
SPN_RENDER_PIXEL_AREA x1 = subgroupShuffleXor(rr, SPN_TILE_HEIGHT * 2); \
\
total += x1; \
\
if (is_p1) \
pp += x1; \
\
area += pp; \
}
#define SPN_SUBTILE_AREA_SCAN_POST(area) area = total;
//
// -- SUBTILES COUNT >= 8
//
#else
#error "SPN_RENDER_SUBTILE_COUNT_LOG2 > 2 not supported"
#endif
//
// COVERAGE USES SHARED
//
#elif defined(SPN_DEVICE_RENDER_COVERAGE_USE_SHARED)
//
// -- SUBTILES COUNT = 2
//
#define SPN_RENDER_PIXEL_AREA_PREAMBLE() \
const bool is_p0 = (gl_SubgroupInvocationID >= SPN_TILE_HEIGHT); \
const uint iid_xor = (gl_SubgroupInvocationID ^ SPN_TILE_HEIGHT); \
SPN_RENDER_PIXEL_AREA total
#define SPN_SUBTILE_AREA_SCAN_PRE(I, area) \
{ \
total = area; \
\
SPN_RENDER_PIXEL_AREA pp = SPN_PIXEL_SMEM_AREA(I, gl_SubgroupInvocationID); \
SPN_RENDER_PIXEL_AREA x0 = SPN_PIXEL_SMEM_AREA(I, iid_xor); \
SPN_RENDER_PIXEL_AREA rr = pp + x0; \
\
total += rr; \
\
if (is_p0) \
pp = rr; \
\
area += pp; \
}
#define SPN_SUBTILE_AREA_SCAN_POST(area) area = total;
#if (SPN_RENDER_SUBTILE_COUNT_LOG2 > 1)
#error "SPN_DEVICE_RENDER_COVERAGE_USE_SHARED missing support for a subtile count > 2"
#endif
#endif
#endif
//
// Compute accumulated pixel coverage "fill rules" using Surveyor's
// Algorithm.
//
// FIXME -- we may want SPN_DEVICE_RENDER_COVER_AREA to be an int2()
// which means the initial SMEM load and subsequent shuffles would
// need to hide the second load and shuffle.
//
//
// Non-zero fill rule: prefix-sum the per-column trapezoid areas across
// the tile row, clamp |area| to SPN_TTS_FILL_MAX_AREA, and normalize
// into [0,1] coverage stored in the cover_wip registers.
//
void
spn_tile_cover_nonzero()
{
SPN_RENDER_PIXEL_AREA_PREAMBLE();
SPN_RENDER_PIXEL_AREA area = 0;
// Make the scatter-phase shared writes visible to this subgroup.
subgroupMemoryBarrierShared();
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
{ \
SPN_SUBTILE_AREA_SCAN_PRE(I, area); \
\
const SPN_RENDER_PIXEL_AREA trapabs = abs(area); \
const SPN_RENDER_PIXEL_AREA trapmin = min(trapabs, SPN_TTS_FILL_MAX_AREA); \
const SPN_RENDER_PIXEL_COVER nonzero = SPN_RENDER_PIXEL_COVER(trapmin); \
\
cover_wip##I = nonzero * SPN_RENDER_PIXEL_COVER(SPN_TTS_FILL_MAX_AREA_RCP_F32); \
\
if (!L) \
{ \
SPN_SUBTILE_AREA_SCAN_POST(area); \
} \
}
SPN_RENDER_SUBTILE_WIDTH_EXPAND();
}
//
// Even-odd fill rule: prefix-sum the per-column trapezoid areas, wrap
// |area| with the even-odd mask, reflect around SPN_TTS_FILL_MAX_AREA
// so coverage alternates between fills, and normalize into [0,1].
//
void
spn_tile_cover_evenodd()
{
SPN_RENDER_PIXEL_AREA_PREAMBLE();
SPN_RENDER_PIXEL_AREA area = 0;
// Make the scatter-phase shared writes visible to this subgroup.
subgroupMemoryBarrierShared();
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
{ \
SPN_SUBTILE_AREA_SCAN_PRE(I, area); \
\
const SPN_RENDER_PIXEL_AREA trapabs = abs(area); \
const SPN_RENDER_PIXEL_AREA maskabs = trapabs & SPN_TTS_FILL_EVEN_ODD_MASK; \
const SPN_RENDER_PIXEL_AREA reflect = abs(maskabs - SPN_TTS_FILL_MAX_AREA); \
const SPN_RENDER_PIXEL_COVER evenodd = \
SPN_RENDER_PIXEL_COVER(SPN_TTS_FILL_MAX_AREA - reflect); \
\
cover_wip##I = evenodd * SPN_RENDER_PIXEL_COVER(SPN_TTS_FILL_MAX_AREA_RCP_F32); \
\
if (!L) \
{ \
SPN_SUBTILE_AREA_SCAN_POST(area); \
} \
}
SPN_RENDER_SUBTILE_WIDTH_EXPAND();
}
//
//
//
//
// Broadcast a solid fill color into the color_wip registers.  The
// alpha channel is pre-negated so the blend_over FMA can subtract it
// without an extra per-pixel negation.
//
void
spn_tile_color_fill_solid(SPN_SUBGROUP_UNIFORM const uint rg32,
SPN_SUBGROUP_UNIFORM const uint ba32)
{
//
// solid fill
//
// loads { fp16x2 rg, fp16x2 ba } from cmd stream
//
// NOTE(allanmac): we could load the color into column 0 and then
// copy it to the remaining columns.
//
#ifndef SPN_RENDER_TILE_COLOR_WIP_ENABLED
color_wip = SPN_RENDER_COLOR_UNPACK(rg32, ba32);
color_wip.a = -color_wip.a;
#else
SPN_SUBGROUP_UNIFORM SPN_RENDER_TILE_COLOR rgba = SPN_RENDER_COLOR_UNPACK(rg32, ba32);
rgba.a = -rgba.a; // temporarily here
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_wip##I = rgba;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
#endif
}
//
//
//
//
// Front-to-back OVER blend: scale the WIP color by the WIP coverage
// and the accumulated transparency (acc.a) and add into the
// accumulator in a single vector FMA.
//
void
spn_tile_blend_over()
{
//
// fralunco = cover.wip * acc.a
//
// acc.r = +fralunco * wip.r + acc.r
// acc.g = +fralunco * wip.g + acc.g
// acc.b = +fralunco * wip.b + acc.b
// acc.a = -fralunco * wip.a + acc.a <-- wip.a is negated
//
// Assumes color.wip.a is negated.
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
color_acc##I += (cover_wip##I * color_acc##I.a) * SPN_RENDER_TILE_COLOR_WIP(I);
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// PLUS blend: like OVER but the contribution is limited by the lesser
// of the WIP coverage and the accumulated transparency (acc.a).
//
void
spn_tile_blend_plus()
{
//
// cover_min = min(cover.wip,acc.a)
//
// r.acc = cover_min * wip.r + acc.r
// g.acc = cover_min * wip.g + acc.g
// b.acc = cover_min * wip.b + acc.b
// a.acc = -cover_min * wip.a + acc.a
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
color_acc##I += min(cover_wip##I, color_acc##I.a) * SPN_RENDER_TILE_COLOR_WIP(I);
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// MULTIPLY blend: modulate the accumulator by the coverage-weighted
// WIP color componentwise.
//
void
spn_tile_blend_multiply()
{
//
// acc.r = (cover.wip * wip.r) * acc.r
// acc.g = (cover.wip * wip.g) * acc.g
// acc.b = (cover.wip * wip.b) * acc.b
// acc.a = (cover.wip * wip.a) * (1.0 - acc.a) <-- acc.a is already (1.0 - alpha)
//
// FIXME(allanmac): This may be incorrect.
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
color_acc##I = cover_wip##I * SPN_RENDER_TILE_COLOR_WIP(I) * color_acc##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// KNOCKOUT blend: only the coverage not yet claimed by cover.acc
// contributes, then that contribution is folded into cover.acc and the
// color accumulator.  On exit, cover_wip holds the contribution -- the
// original WIP coverage is destroyed.
//
void
spn_tile_blend_knockout()
{
//
// cover.wip.contrib = (1.0 - cover.acc) * cover.wip
// cover.acc = cover.acc + cover.wip.contrib
//
// r.acc = cover.wip.contrib * wip.r + acc.r
// g.acc = cover.wip.contrib * wip.g + acc.g
// b.acc = cover.wip.contrib * wip.b + acc.b
// a.acc = -cover.wip.contrib * wip.a + acc.a
//
// Destructively updates cover.wip
//
//
// 1. cover_wip = cover_wip - cover_wip * cover.acc
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_wip##I = fma(-cover_wip##I, cover_acc##I, cover_wip##I);
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
//
// 2. cover_acc += cover_wip
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_acc##I += cover_wip##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
//
// 3. color_acc = color_wip * cover_wip + color_acc
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_acc##I += SPN_RENDER_TILE_COLOR_WIP(I) * cover_wip##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// Copy the WIP coverage registers into the mask registers:
// cover.msk = cover.wip
//
void
spn_tile_cover_msk_copy_wip()
{
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = cover_wip##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// Copy the accumulated coverage registers into the mask registers:
// cover.msk = cover.acc
//
void
spn_tile_cover_msk_copy_acc()
{
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = cover_acc##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// Fold the WIP coverage into the accumulated coverage using the
// complement rule.  On exit, cover_wip holds only the contribution --
// the original WIP coverage is destroyed.
//
void
spn_tile_cover_accumulate()
{
//
// cover.wip.contrib = (1.0 - cover.acc) * cover.wip
// cover.acc = cover.acc + cover.wip.contrib
//
// Destructively updates cover.wip
//
//
// cover.wip = cover.wip - cover.acc * cover.wip
// cover.acc = cover.acc + cover.wip
//
// cover.acc = cover.acc + cover.wip - cover.acc * cover.wip
//
//
// 1. cover_wip = -cover_wip * cover_acc + cover_wip
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_wip##I = fma(-cover_wip##I, cover_acc##I, cover_wip##I);
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
//
// 2. cover_acc = cover_acc + cover_wip
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_acc##I += cover_wip##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
// COVER MASK
//
//
// Attenuate the WIP coverage by the mask registers.
//
void
spn_tile_cover_wip_mask()
{
//
// cover.wip *= cover.msk
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_wip##I *= cover_msk##I;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
// COVER ZERO
//
//
// FIXME(allanmac): cover_wip_zero() is never going to be used
//
void
spn_tile_cover_wip_zero()
{
  // Clear every cover_wip register across the subtile.
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_wip##I = SPN_RENDER_TILE_COVER(0);
  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
void
spn_tile_cover_acc_zero()
{
  // Clear every cover_acc register across the subtile.
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_acc##I = SPN_RENDER_TILE_COVER(0);
  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
void
spn_tile_cover_msk_zero()
{
  // Clear every cover_msk register across the subtile.
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = SPN_RENDER_TILE_COVER(0);
  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
// COVER ONE
//
void
spn_tile_cover_msk_one()
{
  // Saturate every cover_msk register to full coverage.
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = SPN_RENDER_TILE_COVER(1);
  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
void
spn_tile_cover_msk_invert()
{
  // Complement the mask: cover.msk = 1 - cover.msk
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) cover_msk##I = SPN_RENDER_TILE_COVER(1) - cover_msk##I;
  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// FIXME(allanmac): color_wip_zero() will never be used
//
//
// Reset the WIP color registers to { 0, 0, 0, -1 } -- the alpha is
// stored pre-negated (see spn_tile_color_fill_solid / blend_over).
//
void
spn_tile_color_wip_zero()
{
#ifndef SPN_RENDER_TILE_COLOR_WIP_ENABLED
color_wip = SPN_RENDER_TILE_COLOR(0, 0, 0, -1);
#else
const SPN_RENDER_TILE_COLOR rgba = SPN_RENDER_TILE_COLOR(0, 0, 0, -1);
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_wip##I = rgba;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
#endif
}
void
spn_tile_color_acc_zero()
{
  // Reset the accumulator to { 0, 0, 0, 1 }: zero color with full
  // remaining transparency -- acc.a carries (1 - alpha) in this
  // front-to-back pipeline (see spn_tile_blend_multiply's note).
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) color_acc##I = SPN_RENDER_TILE_COLOR(0, 0, 0, 1);
  SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
//
//
//
// Max-reduce the accumulator alphas across the subtile and, if the
// entire subgroup agrees they are all zero (no transparency left, i.e.
// the tile is opaque), set the scatter-skip flag so later layers can
// skip their scatters.
//
void
spn_tile_color_acc_test_opacity()
{
//
// returns true if tile is opaque
//
// various hacks to test for complete tile opacity
//
// note that front-to-back currently has alpha at 0.0f -- this can
// be harmonized to use a traditional alpha if we want to support
// rendering in either direction
//
// hack -- ADD/MAX/OR all alphas together and test for non-zero
//
#ifndef SPN_DEVICE_RENDER_NO_VOTE
//
// VOTE
//
SPN_RENDER_TILE_CHANNEL a = SPN_RENDER_TILE_CHANNEL(0);
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) a = max(a, color_acc##I.a);
SPN_RENDER_SUBTILE_WIDTH_EXPAND();
// are all components in the subtile zero?
if (subgroupAll(SPN_RENDER_TILE_CHANNEL_IS_ZERO(a)))
spn_lgf_flag_set_scatter_skip();
#else
//
// NO VOTE
//
// FIXME -- for now, do nothing on basic-only devices
#endif
}
//
//
//
//
// Composite the background color under the accumulated tile color:
// each RGB channel gains acc.a (remaining transparency) times the
// background channel.  acc.a itself is left untouched.
//
void
spn_tile_color_acc_over_background(SPN_SUBGROUP_UNIFORM const uint rg32,
SPN_SUBGROUP_UNIFORM const uint ba32)
{
//
// acc.r = acc.a * r + acc.r
// acc.g = acc.a * g + acc.g
// acc.b = acc.a * b + acc.b
//
SPN_SUBGROUP_UNIFORM const SPN_RENDER_TILE_COLOR rgb1 = SPN_RENDER_COLOR_UNPACK(rg32, ba32);
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
{ \
color_acc##I.rg += color_acc##I.a * rgb1.rg; \
color_acc##I.b += color_acc##I.a * rgb1.b; \
}
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
//
// Map accumulator register rows to surface coordinates
//
#if (SPN_RENDER_SUBTILE_COUNT == 1)
// one subtile: every lane maps to x == 0 and y == lane id
#define SPN_RENDER_SUBTILE_LANE_TO_X(sgid) 0
#define SPN_RENDER_SUBTILE_LANE_TO_Y(sgid) (sgid)
#else
// multiple subtiles: high bits of the lane select the subtile column,
// low bits select the row -- parenthesize the argument so arbitrary
// expressions expand safely
#define SPN_RENDER_SUBTILE_LANE_TO_X(sgid) ((sgid) >> SPN_DEVICE_TILE_HEIGHT_LOG2)
#define SPN_RENDER_SUBTILE_LANE_TO_Y(sgid) ((sgid) & SPN_TILE_HEIGHT_MASK)
#endif
//
// FIXME(allanmac): use a specialization constant to steer codegen for
// different color depths or multi-plane images.
//
// Multi-plane might be optimal because the R/G/B arrays can be
// directly copied?
//
#ifndef SPN_RENDER_STORE_TO_SURFACE_REFLECTED
//
// X
// +------->
// |
// Y |
// |
// v
//
//
// Writes the tile's color accumulator registers to the output image
// in the standard (non-reflected) orientation: x advances across
// accumulator columns, y is the lane's row within the tile.
//
void
spn_tile_color_acc_store_to_surface()
{
// tile origin in surface pixels (subgroup uniform)
SPN_SUBGROUP_UNIFORM const uint x_uni = SPN_TTCK_GET_X(lgf_lxy) * SPN_TILE_WIDTH;
SPN_SUBGROUP_UNIFORM const uint y_uni = SPN_TTCK_GET_Y(lgf_lxy) * SPN_TILE_HEIGHT;
// per-lane pixel coordinate of the first accumulator column
ivec2 xy = ivec2(x_uni + SPN_RENDER_SUBTILE_LANE_TO_X(gl_SubgroupInvocationID),
y_uni + SPN_RENDER_SUBTILE_LANE_TO_Y(gl_SubgroupInvocationID));
// X-macro: store each column, stepping x by the subtile count
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
imageStore(surface, xy, color_acc##I); \
xy.x += SPN_RENDER_SUBTILE_COUNT;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
#else
//
// Y
// +------->
// |
// X | REFLECTED
// |
// v
//
//
// Reflected variant: writes the accumulator registers with x/y
// swapped relative to the standard orientation -- y advances across
// accumulator columns, x is the lane's row within the tile.
//
void
spn_tile_color_acc_store_to_surface()
{
// tile origin in surface pixels (subgroup uniform)
SPN_SUBGROUP_UNIFORM const uint x_uni = SPN_TTCK_GET_X(lgf_lxy) * SPN_TILE_WIDTH;
SPN_SUBGROUP_UNIFORM const uint y_uni = SPN_TTCK_GET_Y(lgf_lxy) * SPN_TILE_HEIGHT;
// note the swapped (y,x) order versus the non-reflected path
ivec2 xy = ivec2(y_uni + SPN_RENDER_SUBTILE_LANE_TO_Y(gl_SubgroupInvocationID),
x_uni + SPN_RENDER_SUBTILE_LANE_TO_X(gl_SubgroupInvocationID));
// X-macro: store each column, stepping y by the subtile count
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
imageStore(surface, xy, color_acc##I); \
xy.y += SPN_RENDER_SUBTILE_COUNT;
SPN_RENDER_SUBTILE_WIDTH_EXPAND()
}
#endif
//
// The default "TTCKS_USE_SHUFFLE" will load a subgroup size of TTCK
// keys in registers and index them with a subgroup shuffle.
//
// The "TTCKS_USE_SHARED" switch enables loading a number of TTCK keys
// and storing them to shared memory.
//
// The "TTCKS_NO_SHARED" switch results in one TTCK key being loaded
// at a time.
//
// Workgroup-shared staging for one subgroup's worth of TTCK keys --
// only needed by the "TTCKS_USE_SHARED" load strategy above.
#if defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED)
shared uvec2 ttck_smem[SPN_RENDER_SUBGROUP_SIZE]; // this could be smaller
#endif
//
// The "STYLING_CMDS_USE_SHUFFLE" is to load up to a subgroup size of
// commands in registers and index them with a subgroup shuffle.
//
// The "STYLING_CMDS_USE_SHARED" switch enables loading a number of
// styling commands and storing them to shared memory.
//
// The "STYLING_CMDS_NO_SHARED" switch is an even lower performance
// implementation that reads commands one at a time from global
// memory.
//
#if defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE)
// the shuffle strategy keeps all styling commands in registers, so
// the subgroup must be wide enough to hold the maximum command count
#if SPN_RENDER_SUBGROUP_SIZE < SPN_STYLING_CMDS_MAX_COUNT
#error "SPN_RENDER_SUBGROUP_SIZE < SPN_STYLING_CMDS_MAX_COUNT"
#endif
#endif
//
// Workgroup-shared staging for styling commands -- only needed by
// the "STYLING_CMDS_USE_SHARED" strategy.
//
#if defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHARED)
shared uint spn_cmds[SPN_STYLING_CMDS_MAX_COUNT];
#endif
//
//
//
//
// Render kernel entry point.
//
// Each subgroup independently processes one sorted run of TTCK keys:
//
//   1) resolve the subgroup's starting TTCK key from the offsets
//      table (per the configured SHUFFLE / SHARED / NO_SHARED
//      key-load strategy)
//   2) coarse-clip the tile against render_clip
//   3) loop: scatter all TTSB/TTPB blocks for the current layer into
//      the tile accumulators, then walk the styling group/layer
//      state machine executing styling opcodes, until the tile's
//      flush is finalized
//
void
main()
{
#if (SPN_RENDER_SUBGROUPS == 1)
//
// A workgroup contains a single subgroup
//
SPN_SUBGROUP_UNIFORM
const uint ttck_offset_idx = gl_WorkGroupID.x;
#else
//
// A workgroup contains multiple subgroups. Subgroups with no work exit early.
//
SPN_SUBGROUP_UNIFORM
const uint ttck_offset_idx = gl_WorkGroupID.x * SPN_RENDER_SUBGROUPS + gl_SubgroupID;
if (ttck_offset_idx >= offsets_count[0])
return;
#endif
//
// highest valid TTCK key index -- bounds the speculative
// subgroup-wide key loads below
//
SPN_SUBGROUP_UNIFORM const uint ttcks_count_minus_1 = ttcks_count[0] - 1;
//
// load the starting ttck for this offset and get a bound on the max
// number of keys that might be loaded
//
// then load one or more TTCK keys
//
#if defined(SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE)
//
// SHUFFLE
//
SPN_SUBGROUP_UNIFORM const uint ttck_base = offsets[ttck_offset_idx];
// align on a subgroup
uint ttck_idx_next = (ttck_base & ~SPN_RENDER_SUBGROUP_MASK) + gl_SubgroupInvocationID;
// row of TTCK keys in registers
uvec2 ttck_sg;
{
SPN_SUBGROUP_UNIFORM const uint ttck_lane = (ttck_base & SPN_RENDER_SUBGROUP_MASK);
// lanes before ttck_lane or past the last key hold invalid slots
const bool is_valid =
(gl_SubgroupInvocationID >= ttck_lane) && (ttck_idx_next <= ttcks_count_minus_1);
if (is_valid)
ttck_sg = ttcks_keys[ttck_idx_next];
ttck_idx_next += SPN_RENDER_SUBGROUP_SIZE;
// broadcast the starting key's layer/x/y to the whole subgroup
lgf_lxy[0] = subgroupShuffle(ttck_sg[0], ttck_lane) & SPN_TTCK_LO_MASK_LAYER;
lgf_lxy[1] = subgroupShuffle(ttck_sg[1], ttck_lane);
// bit-twiddle invalid keys so they mismatch: ~xy
if (!is_valid)
ttck_sg[1] = ~lgf_lxy[1];
}
#elif defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED)
//
// SHARED
//
SPN_SUBGROUP_UNIFORM uint ttck_idx_next = offsets[ttck_offset_idx];
{
const uint ttck_idx_aligned =
(ttck_idx_next & ~SPN_RENDER_SUBGROUP_MASK) + gl_SubgroupInvocationID;
SPN_SUBGROUP_UNIFORM const uint ttck_lane = (ttck_idx_next & SPN_RENDER_SUBGROUP_MASK);
const bool is_valid =
(gl_SubgroupInvocationID >= ttck_lane) && (ttck_idx_aligned <= ttcks_count_minus_1);
uvec2 ttck_new = { 0, 0 };
if (is_valid)
ttck_new = ttcks_keys[ttck_idx_aligned];
ttck_smem[gl_SubgroupInvocationID] = ttck_new;
subgroupMemoryBarrierShared();
SPN_SUBGROUP_UNIFORM const uvec2 ttck_first = ttck_smem[ttck_lane];
if (!is_valid)
ttck_smem[gl_SubgroupInvocationID][1] = ~ttck_first[1]; // ~xy
lgf_lxy[0] = ttck_first[0] & SPN_TTCK_LO_MASK_LAYER;
lgf_lxy[1] = ttck_first[1];
}
#elif defined(SPN_DEVICE_RENDER_TTCKS_NO_SHARED)
//
// NO SHARED
//
SPN_SUBGROUP_UNIFORM uint ttck_idx_next = offsets[ttck_offset_idx];
SPN_SUBGROUP_UNIFORM uvec2 ttck_sgu; // subgroup uniform TTCK key
{
ttck_sgu = ttcks_keys[ttck_idx_next++];
lgf_lxy[0] = ttck_sgu[0] & SPN_TTCK_LO_MASK_LAYER;
lgf_lxy[1] = ttck_sgu[1];
}
#endif
//
// evaluate the coarse clip as late as possible
//
SPN_SUBGROUP_UNIFORM const uint ttck_x = SPN_TTCK_GET_X(lgf_lxy);
if (ttck_x < render_clip[0])
return;
if (ttck_x >= render_clip[2])
return;
SPN_SUBGROUP_UNIFORM const uint ttck_y = SPN_TTCK_GET_Y(lgf_lxy);
if (ttck_y < render_clip[1])
return;
if (ttck_y >= render_clip[3])
return;
//
// initialize rendering and styling state
//
// save the first key so we know what tile we're in
//
spn_lgf_init();
//
// load -> scatter -> flush
//
do
{
// clear the accumulator for this layer
spn_tile_smem_zero();
// load the layer we're working on
SPN_SUBGROUP_UNIFORM const uint layer_id = spn_ttck_get_layer_uni(lgf_lxy);
spn_lgf_layer_load(layer_id);
// do we need to skip all keys on this layer because the tile
// was marked as opaque or for some other reason?
SPN_SUBGROUP_UNIFORM const bool is_scatter = spn_lgf_flag_is_scatter_noskip();
//
// load and scatter all TTXBs on this layer
//
#ifdef SPN_DEVICE_RENDER_TTCKS_USE_SHUFFLE
//
// SHUFFLE IS SUPPORTED
//
while (true)
{
//
// How many matches? Note that matches will be contiguous.
//
const bool lxy_equal = spn_ttck_lxy_equal(ttck_sg, lgf_lxy);
SPN_SUBGROUP_UNIFORM const uvec4 match = subgroupBallot(lxy_equal);
SPN_SUBGROUP_UNIFORM uint count = subgroupBallotBitCount(match);
SPN_SUBGROUP_UNIFORM uint last = 0;
if ((count > 0) && is_scatter)
{
SPN_SUBGROUP_UNIFORM uint next = subgroupBallotFindLSB(match);
last = next + count;
#if (SPN_RENDER_SUBTILE_COUNT == 1)
//
// SUBTILES == 1
//
for (; next < last; next += SPN_RENDER_SUBTILE_COUNT)
{
SPN_SUBGROUP_UNIFORM const uint ttck_lo = subgroupShuffle(ttck_sg[0], next);
SPN_SUBGROUP_UNIFORM const bool is_ttpb = SPN_TTCK_LO_IS_PREFIX(ttck_lo);
const uint ttxb_id = SPN_TTCK_LO_GET_TTXB_ID(ttck_lo);
const uint ttxb_base = ttxb_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS;
const SPN_RENDER_TTX ttx = int(bp_blocks[ttxb_base + gl_SubgroupInvocationID]);
if (is_ttpb)
{
spn_tile_scatter_ttpb(ttx);
}
else
{
spn_tile_scatter_ttsb(ttx);
}
}
#else
//
// SUBTILES >= 2
//
// hopefully these lane constants get hoisted upwards as necessary
const uint subtile_idx = gl_SubgroupInvocationID >> SPN_DEVICE_TILE_HEIGHT_LOG2;
const uint subtile_iid = gl_SubgroupInvocationID & SPN_TILE_HEIGHT_MASK;
for (; next < last; next += SPN_RENDER_SUBTILE_COUNT)
{
//
// NOTE: we don't care if the shuffle index is out of bounds
//
const uint next_subtile = next + subtile_idx;
const bool is_valid_subtile = (next_subtile < last);
const uint ttck_lo = subgroupShuffle(ttck_sg[0], next_subtile);
// predicates valid subtiles
if (is_valid_subtile)
{
const uint ttxb_id = SPN_TTCK_LO_GET_TTXB_ID(ttck_lo);
const uint ttxb_base = ttxb_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS;
const SPN_RENDER_TTX ttx = int(bp_blocks[ttxb_base + subtile_iid]);
//
// NOTE(allanmac): POTENTIAL OPTIMIZATION -- It's
// not a requirement, but sorting against all
// 64-bits of the TTCK keys results in all PREFIX
// keys being placed at the end of a LXY sequence.
//
const bool is_ttpb = SPN_TTCK_LO_IS_PREFIX(ttck_lo);
if (is_ttpb)
{
spn_tile_scatter_ttpb(ttx, subtile_iid);
}
else
{
spn_tile_scatter_ttsb(ttx);
}
}
}
#endif
}
//
// Is the subgroup out of keys?
//
if (last == SPN_RENDER_SUBGROUP_SIZE)
{
// mark all keys invalid
last = 0;
ttck_sg[1] = ~lgf_lxy[1];
if (ttck_idx_next <= ttcks_count_minus_1)
ttck_sg = ttcks_keys[ttck_idx_next];
ttck_idx_next += SPN_RENDER_SUBGROUP_SIZE;
}
SPN_SUBGROUP_UNIFORM const uvec2 ttck_first = {
subgroupShuffle(ttck_sg[0], last),
subgroupShuffle(ttck_sg[1], last)
};
// is this a new LXY?
if (spn_ttck_lxy_neq_uni(ttck_first, lgf_lxy))
{
if (spn_ttck_hi_xy_equal_uni(ttck_first[1], lgf_lxy[1]))
{
// this is a new layer and the ttck is the new lxy
lgf_lxy[0] = ttck_first[0] & SPN_TTCK_LO_MASK_LAYER;
lgf_lxy[1] = ttck_first[1];
}
else
{
// no more tiles left to process!
spn_lgf_flag_set_flush_finalize();
}
break;
}
}
#elif defined(SPN_DEVICE_RENDER_TTCKS_USE_SHARED)
//
// SHARED
//
while (true)
{
SPN_SUBGROUP_UNIFORM const uint ttck_lane = (ttck_idx_next & SPN_RENDER_SUBGROUP_MASK);
SPN_SUBGROUP_UNIFORM const uvec2 ttck = ttck_smem[ttck_lane];
// is this a new LXY?
if (spn_ttck_lxy_neq_uni(ttck, lgf_lxy))
{
if (spn_ttck_hi_xy_equal_uni(ttck[1], lgf_lxy[1]))
{
// this is a new layer and the ttck is the new lxy
lgf_lxy[0] = ttck[0] & SPN_TTCK_LO_MASK_LAYER;
lgf_lxy[1] = ttck[1];
}
else
{
// no more tiles left to process
spn_lgf_flag_set_flush_finalize();
}
break;
}
//
// scatter the key?
//
if (is_scatter)
{
SPN_SUBGROUP_UNIFORM const bool is_ttpb = SPN_TTCK_LO_IS_PREFIX(ttck[0]);
SPN_SUBGROUP_UNIFORM const uint ttxb_id = SPN_TTCK_LO_GET_TTXB_ID(ttck[0]);
SPN_SUBGROUP_UNIFORM const uint ttxb_base = ttxb_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS;
const SPN_RENDER_TTX ttx = int(bp_blocks[ttxb_base + gl_SubgroupInvocationID]);
if (is_ttpb)
{
spn_tile_scatter_ttpb(ttx);
}
else
{
spn_tile_scatter_ttsb(ttx);
}
}
//
// are we now out of keys?
//
if ((++ttck_idx_next & SPN_RENDER_SUBGROUP_MASK) == 0)
{
const uint ttck_idx_aligned = ttck_idx_next + gl_SubgroupInvocationID;
const bool is_valid = (ttck_idx_aligned <= ttcks_count_minus_1);
uvec2 ttck_new = { 0, ~lgf_lxy[1] };
if (is_valid)
ttck_new = ttcks_keys[ttck_idx_aligned];
ttck_smem[gl_SubgroupInvocationID] = ttck_new;
subgroupMemoryBarrierShared();
}
}
#elif defined(SPN_DEVICE_RENDER_TTCKS_NO_SHARED)
//
// NO SHARED
//
#endif
//
// given: new layer id from ttxk key
//
// load [layer id]{ group id, depth }
//
// if within current group's layer range
//
// if at same depth
//
// load and execute cover>[mask>]color>blend commands
//
// else if not at same depth then move deeper
//
// for all groups in group trail from cur depth to new depth
// enter group, saving and initializing regs as necessary
// increment depth and update layer range
// load and execute cover>[mask>]color>blend commands
//
// else not within layer range
//
// exit current group, restoring regs as necessary
// decrement depth and update layer range
// clear flag that controls group/layer traversal
spn_lgf_flag_clear_flush_complete();
do
{
SPN_SUBGROUP_UNIFORM const bool unwind = spn_lgf_flag_is_flush_unwind();
//
// is layer a child of the current parent group?
//
SPN_SUBGROUP_UNIFORM uint cmd_next;
if (!unwind && spn_lgf_layer_parent_equals_group())
{
// if there are no more TTCK keys then configure the loop
// so groups get unwound until done
spn_lgf_if_not_flush_finalize_then_complete_else_unwind();
// execute this layer's cmds
cmd_next = spn_lgf_get_layer_cmds();
}
else if (!unwind && spn_lgf_layer_in_group_range(layer_id))
{
//
// is layer in a child group?
//
spn_lgf_load_child_group();
// enter new group
cmd_next = spn_lgf_get_group_cmds_enter();
}
else // otherwise, exit this group
{
// leave current group
cmd_next = spn_lgf_get_group_cmds_leave();
// load parent group
spn_lgf_load_parent_group();
}
//
// execute cmds
//
// currently limited to 8 commands -- a subgroup size of 4 will
// break this but is easily fixed or avoided by using shared
// memory or reading the commands one at a time.
//
// implicitly add 1 to the cmd_count
//
// FIXME -- all tiles will be picking their way through the
// smallish styling buffer so performing these subgroup uniform
// reads through the texture cache (or equivalent) would
// probably be a performance win.
//
SPN_SUBGROUP_UNIFORM const uint cmd_base = SPN_STYLING_CMDS_GET_BASE(cmd_next);
SPN_SUBGROUP_UNIFORM const uint cmd_count = SPN_STYLING_CMDS_GET_COUNT(cmd_next);
#if defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHUFFLE)
//
// DEFAULT
//
#define SPN_STYLING_CMDS_LOAD(ii_) subgroupShuffle(cmds, ii_)
uint cmds;
if (gl_SubgroupInvocationID <= cmd_count)
{
cmds = styling[cmd_base + gl_SubgroupInvocationID];
}
#elif defined(SPN_DEVICE_RENDER_STYLING_CMDS_USE_SHARED)
//
// ONLY SUBGROUP BASIC SUPPORT
//
// load a number of commands into shared
//
#if SPN_RENDER_SUBGROUP_SIZE >= SPN_STYLING_CMDS_MAX_COUNT
if (gl_SubgroupInvocationID <= cmd_count)
spn_cmds[gl_SubgroupInvocationID] = styling[cmd_base + gl_SubgroupInvocationID];
#else
for (uint ii = gl_SubgroupInvocationID; ii <= cmd_count; ii += SPN_RENDER_SUBGROUP_SIZE)
{
spn_cmds[ii] = styling[cmd_base + ii];
}
#endif
#define SPN_STYLING_CMDS_LOAD(ii_) spn_cmds[ii_]
#elif defined(SPN_DEVICE_RENDER_STYLING_CMDS_NO_SHARED)
//
// ONLY SUBGROUP BASIC SUPPORT
//
// load each command from styling buffer
//
#define SPN_STYLING_CMDS_LOAD(ii_) styling[cmd_base + ii_]
#endif
// dispatch each styling opcode; multi-dword opcodes consume their
// operands with ++ii
for (SPN_SUBGROUP_UNIFORM uint ii = 0; ii < cmd_count; ii++)
{
SPN_SUBGROUP_UNIFORM uint cmd = SPN_STYLING_CMDS_LOAD(ii);
switch (cmd)
{
case SPN_STYLING_OPCODE_NOOP:
break;
case SPN_STYLING_OPCODE_COVER_NONZERO:
spn_tile_cover_nonzero();
break;
case SPN_STYLING_OPCODE_COVER_EVENODD:
spn_tile_cover_evenodd();
break;
case SPN_STYLING_OPCODE_COVER_ACCUMULATE:
spn_tile_cover_accumulate();
break;
case SPN_STYLING_OPCODE_COVER_MASK:
spn_tile_cover_wip_mask();
break;
case SPN_STYLING_OPCODE_COVER_WIP_ZERO:
spn_tile_cover_wip_zero();
break;
case SPN_STYLING_OPCODE_COVER_ACC_ZERO:
spn_tile_cover_acc_zero();
break;
case SPN_STYLING_OPCODE_COVER_MASK_ZERO:
spn_tile_cover_msk_zero();
break;
case SPN_STYLING_OPCODE_COVER_MASK_ONE:
spn_tile_cover_msk_one();
break;
case SPN_STYLING_OPCODE_COVER_MASK_INVERT:
spn_tile_cover_msk_invert();
break;
case SPN_STYLING_OPCODE_COLOR_FILL_SOLID: {
SPN_SUBGROUP_UNIFORM const uint rg = SPN_STYLING_CMDS_LOAD(++ii);
SPN_SUBGROUP_UNIFORM const uint ba = SPN_STYLING_CMDS_LOAD(++ii);
spn_tile_color_fill_solid(rg, ba);
}
break;
case SPN_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
//
// FIXME -- gradients shouldn't be executing so much
// conditional driven code at runtime since we *know*
// the gradient style on the host can just create a
// new styling command to exploit this.
//
// FIXME -- it might be time to try using the GPU's
// sampler on a linear array of half4 vectors -- it
// might outperform the explicit load/lerp routines.
//
// FIXME -- optimizing for vertical gradients (uhhh,
// they're actually horizontal due to the -90 degree
// view transform) is nice but is it worthwhile to
// have this in the kernel? Easy to add it back...
//
// spn_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
// disable gradients for now
cmd_next += SPN_GRADIENT_CMD_DWORDS_V1(styling[cmd_next + 6]);
break;
case SPN_STYLING_OPCODE_COLOR_WIP_ZERO:
spn_tile_color_wip_zero();
break;
case SPN_STYLING_OPCODE_COLOR_ACC_ZERO:
spn_tile_color_acc_zero();
break;
case SPN_STYLING_OPCODE_BLEND_OVER:
spn_tile_blend_over();
break;
case SPN_STYLING_OPCODE_BLEND_PLUS:
spn_tile_blend_plus();
break;
case SPN_STYLING_OPCODE_BLEND_MULTIPLY:
spn_tile_blend_multiply();
break;
case SPN_STYLING_OPCODE_BLEND_KNOCKOUT:
spn_tile_blend_knockout();
break;
case SPN_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
spn_tile_cover_msk_copy_wip();
break;
case SPN_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
spn_tile_cover_msk_copy_acc();
break;
case SPN_STYLING_OPCODE_COLOR_ACC_OVER_BACKGROUND: {
SPN_SUBGROUP_UNIFORM const uint rg = SPN_STYLING_CMDS_LOAD(++ii);
SPN_SUBGROUP_UNIFORM const uint ba = SPN_STYLING_CMDS_LOAD(++ii);
spn_tile_color_acc_over_background(rg, ba);
}
break;
case SPN_STYLING_OPCODE_COLOR_ACC_STORE_TO_SURFACE:
spn_tile_color_acc_store_to_surface();
break;
case SPN_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
spn_tile_color_acc_test_opacity();
break;
// default:
// return; // this is an illegal opcode -- trap and die!
}
}
} // continue as long as tile flush isn't complete
while (spn_lgf_flag_is_not_flush_complete());
} // continue as long as there are still keys in this tile
while (spn_lgf_flag_is_not_flush_finalize());
}
//
//
//