blob: 33b2a69a6c33be2d79e4f2a9532871d07a924220 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// PLACE TTSK
//
//
// IMPORTANT:
//
// Note that the RASTER TTXK keys are already in *sorted* YX order.
//
// This enables some huge future optimizations:
//
// 1. The PLACE kernel can be exited early once TTXK.x >= clip.x1
//
// 2. If the TTXK keys can be stored together then the composition's
// high-performance sorting problem becomes a merging problem --
// this is especially useful for a CPU/SIMD implementation.
//
// 3. Finally, the PLACE kernel can "bin" TTCK keys and
// significantly shrink the TTCK YX coordinates freeing bits for
// either an increased address space or layer stack.
//
//
//
//
#extension GL_EXT_debug_printf : enable
//
//
//
#extension GL_GOOGLE_include_directive : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_shuffle : require
//
//
//
#include "config.h"
#include "push.h"
//
//
//
//
// Workgroup size. A workgroup holds SPN_PLACE_SUBGROUPS subgroups and
// main() assigns one place command to each subgroup.
//
layout(local_size_x = SPN_DEVICE_PLACE_WORKGROUP_SIZE) in;
//
// Push constants
//
// This shader reads the following members of `push`:
//
// - place_clip[0..3] : tile clip bounds tested against each key
// - place_head/place_size/place_span : ring of place commands
// - devaddr_place, devaddr_block_pool_host_map,
// devaddr_block_pool_blocks, devaddr_ttcks : buffer references
//
SPN_PUSH_LAYOUT_PLACE();
//
// Buffer references
//
// - block_pool_blocks : raster head/node blocks are loaded from here
// - block_pool_host_map : maps a command's raster handle to a block id
// - ttcks : keys are appended to `ttck_keyvals` and the running key
// count is bumped with an atomicAdd on `segment_dispatch.w`
// - place : the extent of place commands
//
// NOTE(review): the four access qualifiers passed to
// SPN_BUFFER_DEFINE_TTCKS presumably apply to the members of the
// ttcks block in declaration order -- confirm against the macro.
//
SPN_BUFFER_DEFINE_BLOCK_POOL_BLOCKS(readonly);
SPN_BUFFER_DEFINE_BLOCK_POOL_HOST_MAP(readonly);
SPN_BUFFER_DEFINE_TTCKS(readwrite, noaccess, readwrite, noaccess);
SPN_BUFFER_DEFINE_PLACE(readonly);
//
// Local defines
//
// clang-format off
// Number of invocations in a subgroup, derived from the configured log2 size.
#define SPN_PLACE_SUBGROUP_SIZE (1 << SPN_DEVICE_PLACE_SUBGROUP_SIZE_LOG2)
// Mask built from the log2 subgroup size (presumably SUBGROUP_SIZE - 1 -- confirm SPN_BITS_TO_MASK).
#define SPN_PLACE_SUBGROUP_MASK SPN_BITS_TO_MASK(SPN_DEVICE_PLACE_SUBGROUP_SIZE_LOG2)
// Subgroups per workgroup; when this is 1 main() indexes commands by workgroup id alone.
#define SPN_PLACE_SUBGROUPS (SPN_DEVICE_PLACE_WORKGROUP_SIZE / SPN_PLACE_SUBGROUP_SIZE)
// clang-format on
//
// Block expansion size
//
// Number of subgroup-wide rows needed to cover one block of qwords.
// Each lane therefore holds this many keys (t0, t1, ...) per block.
#define SPN_PLACE_BLOCK_EXPAND_SIZE (SPN_BLOCK_POOL_BLOCK_QWORDS / SPN_PLACE_SUBGROUP_SIZE)
//
// Block expansion
//
// SPN_PLACE_BLOCK_EXPAND() invokes the currently defined
// SPN_EXPAND_X(I, N, P, L) once per row. main() redefines
// SPN_EXPAND_X before each use to declare, load, clip and append the
// per-row keys.
//
// SPN_PLACE_BLOCK_EXPAND_I_LAST is the index of the final row
// (EXPAND_SIZE - 1); main() uses it to name the register holding the
// last qword of the block.
//
// Only power-of-two expansion sizes from 1 to 32 are supported; any
// other size is a compile-time error.
//
#if (SPN_PLACE_BLOCK_EXPAND_SIZE == 1)
#define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_1()
#define SPN_PLACE_BLOCK_EXPAND_I_LAST 0
#elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 2)
#define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_2()
#define SPN_PLACE_BLOCK_EXPAND_I_LAST 1
#elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 4)
#define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_4()
#define SPN_PLACE_BLOCK_EXPAND_I_LAST 3
#elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 8)
#define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_8()
#define SPN_PLACE_BLOCK_EXPAND_I_LAST 7
#elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 16)
#define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_16()
#define SPN_PLACE_BLOCK_EXPAND_I_LAST 15
#elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 32)
#define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_32()
#define SPN_PLACE_BLOCK_EXPAND_I_LAST 31
#else
#error "Define larger expansion!"
#endif
//
// Broadcast to a compile-time lane
//
// Broadcasts E from the lane with id (O - I * SPN_PLACE_SUBGROUP_SIZE).
//
// Every argument is parenthesized so that expression arguments such
// as `base + 1` keep their intended grouping after expansion.
//
#define SPN_PLACE_BROADCAST(E, O, I) subgroupBroadcast((E), (O) - (I) * SPN_PLACE_SUBGROUP_SIZE)
//
// Translate and clip the TTSK key and massage it into a TTIK key.
//
// Note that all TTSK keys are considered to have a negative span that
// is implicitly 1 so clipping is simplified.
//
// TTSK v1 ( DEFAULT )
//
//  0                            63
//  | TTSB_ID |  SPAN   | X  | Y  |
//  +---------+---------+----+----+
//  |   27    | 13 [<0] | 12 | 12 |
//
//  vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
//
// TTIK
//                              ZERO_TTSK
//  0                      +----------------+                 63
//  | PAYLOAD/TTSB/TTPB ID | ZERO | IS_TTSK | SPAN |  X  |  Y  |
//  +----------------------+------+---------+------+-----+-----+
//  |          27          |  4   |    1    |  14  |  9  |  9  |
//
//  vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
//
// TTCK (64-BIT COMPARE) -- DEFAULT
//
//  0                                                  63
//  | PAYLOAD/TTSB/TTPB_ID | PREFIX | LAYER |  X  |  Y  |
//  +----------------------+--------+-------+-----+-----+
//  |          27          |   1    |  18   |  9  |  9  |
//
//
// TTSK keys are simpler than TTPK keys so we can be fast and testing
// one bit in the span is enough.
//
// clang-format off
// True when the topmost SPAN bit held in the key's hi dword is set, i.e. the span is negative.
#define SPN_IS_VALID_TTSK(t_) (SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTXK_HI_BITS_SPAN-1,1) != 0)
// True when bit 31 of the key's lo dword (the IS_TTSK bit in the TTIK layout) is set.
#define SPN_TTIK_IS_TTSK(t_) (SPN_BITFIELD_EXTRACT((t_)[0],31,1) != 0)
// Values for the combined 5-bit ZERO|IS_TTSK field: 0x10 sets only the field's top bit (IS_TTSK).
#define SPN_TTIK_LO_MASK_ZERO_TTSK_TRUE 0x10
#define SPN_TTIK_LO_MASK_ZERO_TTSK_FALSE 0x00
// Width of the ZERO|IS_TTSK field; it is inserted at offset SPN_TTXK_LO_BITS_TTXB_ID of the lo dword.
#define SPN_TTIK_LO_BITS_ZERO_TTSK 5
// TTIK hi dword: the SPAN takes whatever bits remain after the TTCK-sized X and Y fields.
#define SPN_TTIK_HI_BITS_SPAN (32 - SPN_TTCK_HI_BITS_X - SPN_TTCK_HI_BITS_Y)
// TTIK X/Y reuse the TTCK field widths and offsets so a TTIK can be stored as a TTCK.
#define SPN_TTIK_HI_BITS_X SPN_TTCK_HI_BITS_X
#define SPN_TTIK_HI_BITS_Y SPN_TTCK_HI_BITS_Y
#define SPN_TTIK_HI_OFFSET_X SPN_TTCK_HI_OFFSET_X
#define SPN_TTIK_HI_OFFSET_Y SPN_TTCK_HI_OFFSET_Y
// Statement macros: overwrite the X or Y field in the key's hi dword in place.
#define SPN_TTIK_SET_X(t_,x_) (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],x_,SPN_TTIK_HI_OFFSET_X,SPN_TTIK_HI_BITS_X)
#define SPN_TTIK_SET_Y(t_,y_) (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],y_,SPN_TTIK_HI_OFFSET_Y,SPN_TTIK_HI_BITS_Y)
// clang-format on
//
// Note that the surface is rendered "reflected" but reflected back
// before being stored.
//
//
// Convert one TTSK key into a TTIK key in place.
//
// cmd       : the place command for this subgroup (not read here)
// ttsk      : the key; rewritten as a TTIK on return
// lane_keys : incremented by one when the key survives clipping
//
// A key survives when it is a valid TTSK and its unbiased tile
// coordinates fall inside the half-open clip box in push.place_clip.
// Survivors get their X/Y fields rewritten in TTIK form. In every case
// the ZERO|IS_TTSK field of the lo dword is overwritten so that later
// stages can test a single bit.
//
// NOTE(review): x is tested against place_clip[1]/[3] and y against
// place_clip[0]/[2]. Presumably this pairing follows from the
// "reflected" surface convention -- confirm against the host-side
// clip layout.
//
void
spn_ttsk_translate_and_clip(SPN_SUBGROUP_UNIFORM const SPN_STRUCT_TYPE(cmd_place) cmd,
                            inout u32vec2 ttsk,
                            inout uint32_t lane_keys)
{
  // assume the key is dropped until it passes both tests
  uint32_t zero_ttsk_bits = SPN_TTIK_LO_MASK_ZERO_TTSK_FALSE;

  // a valid TTSK has a negative span
  if (SPN_IS_VALID_TTSK(ttsk))
    {
      // remove the tile bias -- wrapping on negative values is ok
      const int32_t tile_x = int32_t(SPN_TTXK_GET_X(ttsk) - SPN_TTXK_TILE_X_BIAS);
      const int32_t tile_y = int32_t(SPN_TTXK_GET_Y(ttsk) - SPN_TTXK_TILE_Y_BIAS);

      //
      // FIXME(allanmac): potentially use the SIMD4 clip trick
      //
      const bool is_clipped = (tile_x < push.place_clip[1]) ||  //
                              (tile_x >= push.place_clip[3]) || //
                              (tile_y < push.place_clip[0]) ||  //
                              (tile_y >= push.place_clip[2]);

      if (!is_clipped)
        {
          SPN_TTIK_SET_X(ttsk, tile_x);
          SPN_TTIK_SET_Y(ttsk, tile_y);

          zero_ttsk_bits = SPN_TTIK_LO_MASK_ZERO_TTSK_TRUE;
          lane_keys += 1;
        }
    }

  // writing the zero/ttsk field implicitly updates the prefix/escape bits
  ttsk[0] = SPN_BITFIELD_INSERT(ttsk[0],
                                zero_ttsk_bits,
                                SPN_TTXK_LO_BITS_TTXB_ID,
                                SPN_TTIK_LO_BITS_ZERO_TTSK);
}
//
// There are TTSK keys but no TTPK keys. There are potentially
// invalid keys as well.
//
//
// Compact the subgroup's surviving TTIK keys into the TTCK extent.
//
// ttcks     : buffer reference receiving the keys
// cmd       : the place command; its layer_id is stamped into each key
// ttck_base : subgroup-uniform index of the next free slot; advanced
//             by the number of keys written by this call
// ttik      : this lane's key; stored only if its IS_TTSK bit is set
//
// Lanes holding a key are ranked with a ballot so that the keys land
// in consecutive slots starting at ttck_base.
//
void
spn_ttik_append(SPN_BUFFER_TYPE(ttcks) ttcks,
                SPN_SUBGROUP_UNIFORM const SPN_STRUCT_TYPE(cmd_place) cmd,
                inout SPN_SUBGROUP_UNIFORM uint32_t ttck_base,
                inout u32vec2 ttik)
{
  const bool     has_key    = SPN_TTIK_IS_TTSK(ttik);
  const u32vec4  key_ballot = subgroupBallot(has_key);
  const uint32_t key_count  = subgroupBallotBitCount(key_ballot);

  if (has_key)
    {
      // the key becomes a TTCK once the layer is set
      SPN_TTCK_SET_LAYER(ttik, cmd.layer_id);

      // rank of this lane among the lanes that hold a key
      const uint32_t key_rank = subgroupBallotExclusiveBitCount(key_ballot);

      ttcks.ttck_keyvals[ttck_base + key_rank] = ttik;
    }

  // the next row of keys is appended after this one
  ttck_base += key_count;
}
//
//
//
//
// Entry point: each subgroup consumes one place command, walks the
// command's raster (a head block followed by zero or more node
// blocks), clips every TTSK key against push.place_clip and appends
// the survivors to the TTCK extent.
//
// For each block the subgroup:
//
// 1. loads one key per lane per expansion row
// 2. converts/clips the keys and counts the survivors
// 3. reserves space with a single atomicAdd issued by lane 0
// 4. compacts the survivors into the reserved range
//
void
main()
{
//
// Each subgroup is responsible for a command.
//
// Test the raster's translated bounds against the composition's tile clip
//
// There are 3 cases:
//
// - the raster is completely clipped -> return
// - the raster is partially clipped -> all keys must be clipped
// - the raster is not clipped -> no keys are tested
//
// There are at least 4 implementations of place and we want to special-case
// them as much as possible so that, at the least, the fastpath remains fast.
//
// - implement CLIPPED + TILE TRANSLATION
// - implement CLIPPED + PIXEL TRANSLATION
// - implement CLIPPED + SUBPIXEL TRANSLATION
//
// FIXME(allanmac): split scan accumulator into a triple-bin 12:12:8 integer
// where:
//
// 12: ttsk
// 12: ttpk
// 8: /dev/null -- clipped or invalid key
//
// Three kinds of nodes in a raster's list:
//
// - the head node
// - internal nodes
// - the final node
//
// The layout of a raster node is optimized for reclamation:
//
// union {
// u32vec2 qwords[block_qwords];
// struct {
// uint32_t dwords_lo[block_qwords];
// uint32_t dwords_hi[block_qwords];
// };
// };
//
// This complicates the PREFIX and PLACE shaders.
//
// NOTE: as currently written, every valid key is clip-tested; the
// raster-level bounds test described above is not performed here.
//
// Which command does this subgroup own? With several subgroups per
// workgroup the last workgroup may be partially filled, so subgroups
// past the end of the span exit immediately.
//
#if (SPN_PLACE_SUBGROUPS == 1)
SPN_SUBGROUP_UNIFORM
uint32_t cmd_idx = gl_WorkGroupID.x;
#else
SPN_SUBGROUP_UNIFORM
uint32_t cmd_idx = gl_WorkGroupID.x * SPN_PLACE_SUBGROUPS + gl_SubgroupID;
if (cmd_idx >= push.place_span)
{
return;
}
#endif
// wrap to ring -- a single subtraction is used, so this relies on
// (span index + place_head) being less than 2 * place_size
cmd_idx += push.place_head;
if (cmd_idx >= push.place_size)
{
cmd_idx -= push.place_size;
}
// define place bufref
SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(place), place, push.devaddr_place);
// load the cmd
SPN_SUBGROUP_UNIFORM const SPN_STRUCT_TYPE(cmd_place) cmd = place.extent[cmd_idx];
// define host map bufref
SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(block_pool_host_map),
bp_host_map,
push.devaddr_block_pool_host_map);
// get the device id -- the host map is indexed by the command's raster handle
SPN_SUBGROUP_UNIFORM const uint32_t head_id = bp_host_map.extent[cmd.raster_h];
// where is the raster node in the pool?
SPN_SUBGROUP_UNIFORM const uint32_t head_base = head_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS;
// define block pool blocks bufref
SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(block_pool_blocks), bp_blocks, push.devaddr_block_pool_blocks);
// how many ttsks? -- the count is read from the raster head's header
SPN_SUBGROUP_UNIFORM uint32_t ttsks_rem = bp_blocks.extent[head_base + //
SPN_RASTER_HEAD_LO_OFFSET_TTSKS];
// no ttsks?
if (ttsks_rem == 0)
{
return;
}
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) u32vec2 t##I;
// declare the head node -- one u32vec2 register (t0, t1, ...) per expansion row
SPN_PLACE_BLOCK_EXPAND();
// this lane's dword index in the head block: row I reads its lo dword
// at (head_idx + SUBGROUP_SIZE * I) and its hi dword a further
// SPN_BLOCK_POOL_BLOCK_QWORDS dwords along
const uint32_t head_idx = head_base + gl_SubgroupInvocationID;
//
// load all ttsks in head block otherwise invalidating
//
// NOTE: highp = mediump - highp
//
// ttsks_base is this lane's key index relative to the first key after
// the header. The subtraction is unsigned, so lanes that fall inside
// the header wrap to a very large value and fail the `< ttsks_rem`
// test below, leaving their key as SPN_TTXK_INVALID.
//
const uint32_t raster_head_qwords = SPN_RASTER_HEAD_QWORDS;
const uint32_t ttsks_base = gl_SubgroupInvocationID - raster_head_qwords;
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
if (!SPN_RASTER_HEAD_ENTIRELY_HEADER(SPN_PLACE_SUBGROUP_SIZE, I)) \
{ \
t##I = SPN_TTXK_INVALID; \
\
if (ttsks_base + I * SPN_PLACE_SUBGROUP_SIZE < ttsks_rem) \
{ \
t##I = u32vec2(bp_blocks.extent[head_idx + /**/ \
SPN_PLACE_SUBGROUP_SIZE * I], /**/ \
bp_blocks.extent[head_idx + /**/ \
SPN_PLACE_SUBGROUP_SIZE * I + /**/ \
SPN_BLOCK_POOL_BLOCK_QWORDS]); \
} \
}
SPN_PLACE_BLOCK_EXPAND();
//
// - translate and clip TTSK keys in the head
// - count total clipped TTSK keys in this slab
//
// Rows that are entirely header are skipped here and below.
//
uint32_t h_lane_keys = 0;
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
if (!SPN_RASTER_HEAD_ENTIRELY_HEADER(SPN_PLACE_SUBGROUP_SIZE, I)) \
{ \
spn_ttsk_translate_and_clip(cmd, t##I, h_lane_keys); \
}
SPN_PLACE_BLOCK_EXPAND();
//
// define ttcks bufref
//
SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(ttcks), ttcks, push.devaddr_ttcks);
//
// atomically allocate space for keys in this node
//
// Only lane 0 performs the atomic and only lane 0's value is
// broadcast, so h_ttck_atomic being unwritten on other lanes is
// harmless.
//
SPN_SUBGROUP_UNIFORM const uint32_t h_block_keys = subgroupAdd(h_lane_keys);
uint32_t h_ttck_atomic;
if (gl_SubgroupInvocationID == 0)
{
h_ttck_atomic = atomicAdd(ttcks.segment_dispatch.w, h_block_keys);
}
SPN_SUBGROUP_UNIFORM uint32_t h_ttck_base = subgroupBroadcast(h_ttck_atomic, 0);
//
// dump the keys
//
// spn_ttik_append() advances h_ttck_base after each row so the rows
// are written back to back.
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
if (!SPN_RASTER_HEAD_ENTIRELY_HEADER(SPN_PLACE_SUBGROUP_SIZE, I)) \
{ \
spn_ttik_append(ttcks, cmd, h_ttck_base, t##I); \
}
SPN_PLACE_BLOCK_EXPAND();
//
// any more ttsk keys?
//
// The head holds at most (block qwords - header qwords - 1) keys; the
// final qword of the block is used below as the link to the next node.
//
if (ttsks_rem <= SPN_BLOCK_POOL_BLOCK_QWORDS - 1 - SPN_RASTER_HEAD_QWORDS)
{
return;
}
//
// otherwise, keep processing
//
ttsks_rem -= SPN_BLOCK_POOL_BLOCK_QWORDS - 1 - SPN_RASTER_HEAD_QWORDS;
//
// if more nodes, load next raster node
//
while (true)
{
//
// jump to next node
//
// The next node's id is taken from the lo dword of the last qword of
// the block just processed, which is held by the last lane in the
// last expansion row.
//
// NOTE(review): spn_ttsk_translate_and_clip() has already rewritten
// the ZERO|IS_TTSK field of this dword; this presumably relies on the
// node id fitting entirely below that field -- confirm.
//
SPN_SUBGROUP_UNIFORM const uint32_t node_id =
subgroupBroadcast(SPN_GLSL_CONCAT(t, SPN_PLACE_BLOCK_EXPAND_I_LAST)[0],
SPN_PLACE_SUBGROUP_SIZE - 1);
const uint32_t node_idx = node_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS + gl_SubgroupInvocationID;
//
// load node block
//
// Nodes have no header, so keys start at qword 0 and every row is
// loaded. Slots at or beyond ttsks_rem are left as SPN_TTXK_INVALID.
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) \
t##I = SPN_TTXK_INVALID; \
if (gl_SubgroupInvocationID + I * SPN_PLACE_SUBGROUP_SIZE < ttsks_rem) \
{ \
t##I = u32vec2( \
bp_blocks.extent[node_idx + SPN_PLACE_SUBGROUP_SIZE * I], \
bp_blocks.extent[node_idx + SPN_PLACE_SUBGROUP_SIZE * I + SPN_BLOCK_POOL_BLOCK_QWORDS]); \
}
SPN_PLACE_BLOCK_EXPAND();
//
// - translate and clip TTSK keys in the node
// - count total clipped TTSK keys in this slab
//
uint32_t n_lane_keys = 0;
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) spn_ttsk_translate_and_clip(cmd, t##I, n_lane_keys);
SPN_PLACE_BLOCK_EXPAND();
//
// atomically allocate space for keys in this node
//
// As with the head: lane 0 allocates, every lane receives the base.
//
SPN_SUBGROUP_UNIFORM const uint32_t n_block_keys = subgroupAdd(n_lane_keys);
uint32_t n_ttck_atomic;
if (gl_SubgroupInvocationID == 0)
{
n_ttck_atomic = atomicAdd(ttcks.segment_dispatch.w, n_block_keys);
}
SPN_SUBGROUP_UNIFORM uint32_t n_ttck_base = subgroupBroadcast(n_ttck_atomic, 0);
//
// dump the keys
//
#undef SPN_EXPAND_X
#define SPN_EXPAND_X(I, N, P, L) spn_ttik_append(ttcks, cmd, n_ttck_base, t##I);
SPN_PLACE_BLOCK_EXPAND();
//
// any more ttsk keys?
//
// A node holds at most (block qwords - 1) keys; the final qword is the
// link to the next node.
//
if (ttsks_rem <= SPN_BLOCK_POOL_BLOCK_QWORDS - 1)
{
return;
}
//
// otherwise, keep processing
//
ttsks_rem -= SPN_BLOCK_POOL_BLOCK_QWORDS - 1;
}
}
//
//
//