| // Copyright 2019 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #version 460 |
| |
| // |
| // PLACE TTSK |
| // |
| |
| // |
| // IMPORTANT: |
| // |
| // Note that the RASTER TTXK keys are already in *sorted* YX order. |
| // |
| // This enables some huge future optimizations: |
| // |
| // 1. The PLACE kernel can be exited early once TTXK.x >= clip.x1 |
| // |
| // 2. If the TTXK keys can be stored together then the composition's |
| // high-performance sorting problem becomes a merging problem -- |
| // this is especially useful for a CPU/SIMD implementation. |
| // |
| // 3. Finally, the PLACE kernel can "bin" TTCK keys and |
| // significantly shrink the TTCK YX coordinates freeing bits for |
| // either an increased address space or layer stack. |
| // |
| |
| // |
| // |
| // |
| #extension GL_EXT_debug_printf : enable |
| |
| // |
| // |
| // |
| #extension GL_GOOGLE_include_directive : require |
| #extension GL_KHR_shader_subgroup_vote : require |
| #extension GL_KHR_shader_subgroup_basic : require |
| #extension GL_KHR_shader_subgroup_ballot : require |
| #extension GL_KHR_shader_subgroup_arithmetic : require |
| #extension GL_KHR_shader_subgroup_shuffle : require |
| |
| // |
| // |
| // |
| #include "config.h" |
| #include "push.h" |
| |
| // |
| // |
| // |
| layout(local_size_x = SPN_DEVICE_PLACE_WORKGROUP_SIZE) in; |
| |
| // |
| // Push constants |
| // |
| SPN_PUSH_LAYOUT_PLACE(); |
| |
| // |
| // Buffer references |
| // |
| SPN_BUFFER_DEFINE_BLOCK_POOL_BLOCKS(readonly); |
| SPN_BUFFER_DEFINE_BLOCK_POOL_HOST_MAP(readonly); |
| SPN_BUFFER_DEFINE_TTCKS(readwrite, noaccess, readwrite, noaccess); |
| SPN_BUFFER_DEFINE_PLACE(readonly); |
| |
| // Local defines: |
| // lanes per subgroup, SPN_BITS_TO_MASK() of the same log2 value, and |
| // the number of subgroups per workgroup (workgroup size / subgroup size) |
| // clang-format off |
| #define SPN_PLACE_SUBGROUP_SIZE (1 << SPN_DEVICE_PLACE_SUBGROUP_SIZE_LOG2) |
| #define SPN_PLACE_SUBGROUP_MASK SPN_BITS_TO_MASK(SPN_DEVICE_PLACE_SUBGROUP_SIZE_LOG2) |
| #define SPN_PLACE_SUBGROUPS (SPN_DEVICE_PLACE_WORKGROUP_SIZE / SPN_PLACE_SUBGROUP_SIZE) |
| // clang-format on |
| |
| // |
| // Block expansion size: block qwords divided by subgroup size, i.e. the |
| // number of rows a subgroup needs to cover every qword of one block |
| #define SPN_PLACE_BLOCK_EXPAND_SIZE (SPN_BLOCK_POOL_BLOCK_QWORDS / SPN_PLACE_SUBGROUP_SIZE) |
| |
| // Block expansion: select the SPN_EXPAND_N() matching the expansion size. |
| // SPN_EXPAND_N is defined elsewhere -- presumably it invokes SPN_EXPAND_X |
| // once per row I in [0,N). I_LAST is N-1 and names the last row. |
| #if (SPN_PLACE_BLOCK_EXPAND_SIZE == 1) |
| |
| #define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_1() |
| #define SPN_PLACE_BLOCK_EXPAND_I_LAST 0 |
| |
| #elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 2) |
| |
| #define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_2() |
| #define SPN_PLACE_BLOCK_EXPAND_I_LAST 1 |
| |
| #elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 4) |
| |
| #define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_4() |
| #define SPN_PLACE_BLOCK_EXPAND_I_LAST 3 |
| |
| #elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 8) |
| |
| #define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_8() |
| #define SPN_PLACE_BLOCK_EXPAND_I_LAST 7 |
| |
| #elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 16) |
| |
| #define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_16() |
| #define SPN_PLACE_BLOCK_EXPAND_I_LAST 15 |
| |
| #elif (SPN_PLACE_BLOCK_EXPAND_SIZE == 32) |
| |
| #define SPN_PLACE_BLOCK_EXPAND() SPN_EXPAND_32() |
| #define SPN_PLACE_BLOCK_EXPAND_I_LAST 31 |
| |
| #else |
| #error "Define larger expansion!" |
| #endif |
| |
| // Broadcast E from the compile-time lane (O - I * subgroup size). |
| // NOTE(review): O and I are not parenthesized in the expansion, so pass |
| // simple expressions only. No use of this macro is visible in this source. |
| #define SPN_PLACE_BROADCAST(E, O, I) subgroupBroadcast(E, O - I * SPN_PLACE_SUBGROUP_SIZE) |
| |
| // |
| // Translate and clip the TTSK key and massage it into a TTIK key. |
| // |
| // Note that all TTSK keys are considered to have a negative span that |
| // is implicitly 1 so clipping is simplified. |
| // |
| // TTSK v1 ( DEFAULT ) |
| // |
| // 0 63 |
| // | TTSB_ID | SPAN | X | Y | |
| // +---------+---------+----+----+ |
| // | 27 | 13 [<0] | 12 | 12 | |
| // |
| // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv |
| // |
| // TTIK |
| // ZERO_TTSK |
| // 0 +----------------+ 63 |
| // | PAYLOAD/TTSB/TTPB ID | ZERO | IS_TTSK | SPAN | X | Y | |
| // +----------------------+------+---------+------+-----+-----+ |
| // | 27 | 4 | 1 | 14 | 9 | 9 | |
| // |
| // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv |
| // |
| // TTCK (64-BIT COMPARE) -- DEFAULT |
| // |
| // 0 63 |
| // | PAYLOAD/TTSB/TTPB_ID | PREFIX | LAYER | X | Y | |
| // +----------------------+--------+-------+-----+-----+ |
| // | 27 | 1 | 18 | 9 | 9 | |
| // |
| // TTSK keys are simpler than TTPK keys so we can be fast: testing one |
| // bit of the span (its top bit, held in the high word) is enough to |
| // validate a TTSK, and bit 31 of the low word is the TTIK IS_TTSK flag. |
| |
| // clang-format off |
| #define SPN_IS_VALID_TTSK(t_) (SPN_BITFIELD_EXTRACT((t_)[1],SPN_TTXK_HI_BITS_SPAN-1,1) != 0) |
| |
| #define SPN_TTIK_IS_TTSK(t_) (SPN_BITFIELD_EXTRACT((t_)[0],31,1) != 0) |
| |
| #define SPN_TTIK_LO_MASK_ZERO_TTSK_TRUE 0x10 |
| #define SPN_TTIK_LO_MASK_ZERO_TTSK_FALSE 0x00 |
| |
| #define SPN_TTIK_LO_BITS_ZERO_TTSK 5 |
| |
| #define SPN_TTIK_HI_BITS_SPAN (32 - SPN_TTCK_HI_BITS_X - SPN_TTCK_HI_BITS_Y) |
| #define SPN_TTIK_HI_BITS_X SPN_TTCK_HI_BITS_X |
| #define SPN_TTIK_HI_BITS_Y SPN_TTCK_HI_BITS_Y |
| |
| #define SPN_TTIK_HI_OFFSET_X SPN_TTCK_HI_OFFSET_X |
| #define SPN_TTIK_HI_OFFSET_Y SPN_TTCK_HI_OFFSET_Y |
| |
| #define SPN_TTIK_SET_X(t_,x_) (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],x_,SPN_TTIK_HI_OFFSET_X,SPN_TTIK_HI_BITS_X) |
| #define SPN_TTIK_SET_Y(t_,y_) (t_)[1] = SPN_BITFIELD_INSERT((t_)[1],y_,SPN_TTIK_HI_OFFSET_Y,SPN_TTIK_HI_BITS_Y) |
| // clang-format on |
| |
| // |
| // Convert one TTSK qword into a TTIK qword in place. |
| // |
| // A key survives only if it is a valid TTSK (negative span) and its |
| // unbiased tile coordinate lies inside push.place_clip. Each survivor |
| // adds one to lane_keys and has its X/Y fields rewritten with the |
| // unbiased coordinate. Every key -- kept or not -- has its ZERO/IS_TTSK |
| // bits overwritten so that later code can test a single bit. |
| // |
| // Note that the surface is rendered "reflected" but reflected back |
| // before being stored. |
| // |
| // NOTE(review): x is compared with place_clip[1]/[3] and y with |
| // place_clip[0]/[2] -- confirm this matches the component order of |
| // place_clip. The cmd argument is not read by this function. |
| // |
| void |
| spn_ttsk_translate_and_clip(SPN_SUBGROUP_UNIFORM const SPN_STRUCT_TYPE(cmd_place) cmd, |
|                             inout u32vec2 ttsk, |
|                             inout uint32_t lane_keys) |
| { |
|   // a valid TTSK always carries a negative span |
|   bool keep = SPN_IS_VALID_TTSK(ttsk); |
| |
|   if (keep) |
|     { |
|       // remove the tile bias -- a negative result wraps, which is ok |
|       const int32_t tile_x = int32_t(SPN_TTXK_GET_X(ttsk) - SPN_TTXK_TILE_X_BIAS); |
|       const int32_t tile_y = int32_t(SPN_TTXK_GET_Y(ttsk) - SPN_TTXK_TILE_Y_BIAS); |
| |
|       // |
|       // FIXME(allanmac): potentially use the SIMD4 clip trick |
|       // |
|       const bool x_inside = (tile_x >= push.place_clip[1]) && (tile_x < push.place_clip[3]); |
|       const bool y_inside = (tile_y >= push.place_clip[0]) && (tile_y < push.place_clip[2]); |
| |
|       keep = x_inside && y_inside; |
| |
|       if (keep) |
|         { |
|           SPN_TTIK_SET_X(ttsk, tile_x); |
|           SPN_TTIK_SET_Y(ttsk, tile_y); |
| |
|           lane_keys += 1; |
|         } |
|     } |
| |
|   // IS_TTSK is set only for survivors; the ZERO bits are always cleared |
|   const uint32_t zero_ttsk_bits = keep ? SPN_TTIK_LO_MASK_ZERO_TTSK_TRUE // |
|                                        : SPN_TTIK_LO_MASK_ZERO_TTSK_FALSE; |
| |
|   // rewriting the zero/ttsk bits implicitly updates the prefix/escape bits |
|   ttsk[0] = SPN_BITFIELD_INSERT(ttsk[0], zero_ttsk_bits, SPN_TTXK_LO_BITS_TTXB_ID, SPN_TTIK_LO_BITS_ZERO_TTSK); |
| } |
| |
| // |
| // Compact the kept keys of one row into the TTCK keyvals. |
| // |
| // There are TTSK keys but no TTPK keys, and potentially invalid keys |
| // as well -- only keys flagged IS_TTSK are written. Each written key is |
| // stamped with the command's layer id and stored contiguously starting |
| // at ttck_base, which is then advanced by the subgroup's kept-key count. |
| // |
| void |
| spn_ttik_append(SPN_BUFFER_TYPE(ttcks) ttcks, |
|                 SPN_SUBGROUP_UNIFORM const SPN_STRUCT_TYPE(cmd_place) cmd, |
|                 inout SPN_SUBGROUP_UNIFORM uint32_t ttck_base, |
|                 inout u32vec2 ttik) |
| { |
|   const bool keep = SPN_TTIK_IS_TTSK(ttik); |
|   const u32vec4 keep_ballot = subgroupBallot(keep); |
|   const uint32_t keep_total = subgroupBallotBitCount(keep_ballot); |
| |
|   if (keep) |
|     { |
|       // rank of this lane among the lanes that keep their key |
|       const uint32_t keep_rank = subgroupBallotExclusiveBitCount(keep_ballot); |
| |
|       SPN_TTCK_SET_LAYER(ttik, cmd.layer_id); |
| |
|       ttcks.ttck_keyvals[ttck_base + keep_rank] = ttik; |
|     } |
| |
|   // every lane advances the base by the same subgroup-wide count |
|   ttck_base += keep_total; |
| } |
| |
| // Entry point: one subgroup walks the raster of one place command, clips |
| // its TTSK keys against push.place_clip and appends the surviving keys |
| // to the TTCK keyvals. |
| void |
| main() |
| { |
| // |
| // Each subgroup is responsible for a command. |
| // |
| // Test the raster's translated bounds against the composition's tile clip |
| // |
| // There are 3 cases: |
| // |
| // - the raster is completely clipped -> return |
| // - the raster is partially clipped -> all keys must be clipped |
| // - the raster is not clipped -> no keys are tested |
| // |
| // There are at least 4 implementations of place and we want to special-case |
| // them as much as possible so that, at the least, the fastpath remains fast. |
| // |
| // - implement CLIPPED + TILE TRANSLATION |
| // - implement CLIPPED + PIXEL TRANSLATION |
| // - implement CLIPPED + SUBPIXEL TRANSLATION |
| // |
| // FIXME(allanmac): split scan accumulator into a triple-bin 12:12:8 integer |
| // where: |
| // |
| // 12: ttsk |
| // 12: ttpk |
| // 8: /dev/null -- clipped or invalid key |
| // |
| // Three kinds of nodes in a raster's list: |
| // |
| // - the head node |
| // - internal nodes |
| // - the final node |
| // |
| // The layout of a raster node is optimized for reclamation: |
| // |
| // union { |
| // u32vec2 qwords[block_qwords]; |
| // struct { |
| // uint32_t dwords_lo[block_qwords]; |
| // uint32_t dwords_hi[block_qwords]; |
| // }; |
| // }; |
| // |
| // This complicates the PREFIX and PLACE shaders. |
| // NOTE(review): as written below, every loaded key is clipped individually. |
| #if (SPN_PLACE_SUBGROUPS == 1) |
| |
| SPN_SUBGROUP_UNIFORM |
| uint32_t cmd_idx = gl_WorkGroupID.x; |
| |
| #else |
| |
| SPN_SUBGROUP_UNIFORM |
| uint32_t cmd_idx = gl_WorkGroupID.x * SPN_PLACE_SUBGROUPS + gl_SubgroupID; |
| |
| if (cmd_idx >= push.place_span) |
| { |
| return; |
| } |
| |
| #endif |
| |
| // wrap to ring: offset by the ring head and wrap once past the ring size |
| cmd_idx += push.place_head; |
| |
| if (cmd_idx >= push.place_size) |
| { |
| cmd_idx -= push.place_size; |
| } |
| |
| // define place bufref |
| SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(place), place, push.devaddr_place); |
| |
| // load the cmd |
| SPN_SUBGROUP_UNIFORM const SPN_STRUCT_TYPE(cmd_place) cmd = place.extent[cmd_idx]; |
| |
| // define host map bufref |
| SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(block_pool_host_map), |
| bp_host_map, |
| push.devaddr_block_pool_host_map); |
| |
| // get the device id of the raster's head node from the host raster handle |
| SPN_SUBGROUP_UNIFORM const uint32_t head_id = bp_host_map.extent[cmd.raster_h]; |
| |
| // where is the raster node in the pool? |
| SPN_SUBGROUP_UNIFORM const uint32_t head_base = head_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS; |
| |
| // define block pool blocks bufref |
| SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(block_pool_blocks), bp_blocks, push.devaddr_block_pool_blocks); |
| |
| // how many ttsks? (read from the raster head) |
| SPN_SUBGROUP_UNIFORM uint32_t ttsks_rem = bp_blocks.extent[head_base + // |
| SPN_RASTER_HEAD_LO_OFFSET_TTSKS]; |
| |
| // no ttsks? |
| if (ttsks_rem == 0) |
| { |
| return; |
| } |
| |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) u32vec2 t##I; |
| |
| // declare the head node: one u32vec2 t<I> per expansion row |
| SPN_PLACE_BLOCK_EXPAND(); |
| |
| // this lane's lo dword index in the head -- the hi dword is BLOCK_QWORDS further |
| const uint32_t head_idx = head_base + gl_SubgroupInvocationID; |
| |
| // |
| // load all ttsks in head block otherwise invalidating |
| // |
| // NOTE: highp = mediump - highp. The key index is the qword index minus the |
| // header qwords: a qword inside the header wraps to a huge index and fails the test. |
| const uint32_t raster_head_qwords = SPN_RASTER_HEAD_QWORDS; |
| const uint32_t ttsks_base = gl_SubgroupInvocationID - raster_head_qwords; |
| |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) \ |
| if (!SPN_RASTER_HEAD_ENTIRELY_HEADER(SPN_PLACE_SUBGROUP_SIZE, I)) \ |
| { \ |
| t##I = SPN_TTXK_INVALID; \ |
| \ |
| if (ttsks_base + I * SPN_PLACE_SUBGROUP_SIZE < ttsks_rem) \ |
| { \ |
| t##I = u32vec2(bp_blocks.extent[head_idx + /**/ \ |
| SPN_PLACE_SUBGROUP_SIZE * I], /**/ \ |
| bp_blocks.extent[head_idx + /**/ \ |
| SPN_PLACE_SUBGROUP_SIZE * I + /**/ \ |
| SPN_BLOCK_POOL_BLOCK_QWORDS]); \ |
| } \ |
| } |
| |
| SPN_PLACE_BLOCK_EXPAND(); |
| |
| // |
| // - translate and clip TTSK keys in the head |
| // - count the TTSK keys of this lane that survive clipping |
| // |
| uint32_t h_lane_keys = 0; |
| |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) \ |
| if (!SPN_RASTER_HEAD_ENTIRELY_HEADER(SPN_PLACE_SUBGROUP_SIZE, I)) \ |
| { \ |
| spn_ttsk_translate_and_clip(cmd, t##I, h_lane_keys); \ |
| } |
| |
| SPN_PLACE_BLOCK_EXPAND(); |
| |
| // |
| // define ttcks bufref |
| // |
| SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(ttcks), ttcks, push.devaddr_ttcks); |
| |
| // atomically allocate space for keys in this node: lane 0 adds the |
| // subgroup's total to ttcks.segment_dispatch.w and the returned base is |
| // broadcast from lane 0 -- h_ttck_atomic is only written on lane 0 |
| SPN_SUBGROUP_UNIFORM const uint32_t h_block_keys = subgroupAdd(h_lane_keys); |
| uint32_t h_ttck_atomic; |
| |
| if (gl_SubgroupInvocationID == 0) |
| { |
| h_ttck_atomic = atomicAdd(ttcks.segment_dispatch.w, h_block_keys); |
| } |
| |
| SPN_SUBGROUP_UNIFORM uint32_t h_ttck_base = subgroupBroadcast(h_ttck_atomic, 0); |
| |
| // |
| // dump the keys -- each row advances h_ttck_base by the keys it wrote |
| // |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) \ |
| if (!SPN_RASTER_HEAD_ENTIRELY_HEADER(SPN_PLACE_SUBGROUP_SIZE, I)) \ |
| { \ |
| spn_ttik_append(ttcks, cmd, h_ttck_base, t##I); \ |
| } |
| |
| SPN_PLACE_BLOCK_EXPAND(); |
| |
| // |
| // any more ttsk keys? the head holds at most BLOCK_QWORDS - 1 - HEAD_QWORDS |
| // keys -- its last qword is read below as the id of the next node |
| if (ttsks_rem <= SPN_BLOCK_POOL_BLOCK_QWORDS - 1 - SPN_RASTER_HEAD_QWORDS) |
| { |
| return; |
| } |
| |
| // |
| // otherwise, keep processing |
| // |
| ttsks_rem -= SPN_BLOCK_POOL_BLOCK_QWORDS - 1 - SPN_RASTER_HEAD_QWORDS; |
| |
| // |
| // if more nodes, load next raster node |
| // |
| while (true) |
| { |
| // |
| // jump to next node: its id is the lo dword of the previous block's |
| // last qword (last row, last lane) |
| SPN_SUBGROUP_UNIFORM const uint32_t node_id = |
| subgroupBroadcast(SPN_GLSL_CONCAT(t, SPN_PLACE_BLOCK_EXPAND_I_LAST)[0], |
| SPN_PLACE_SUBGROUP_SIZE - 1); |
| |
| const uint32_t node_idx = node_id * SPN_BLOCK_POOL_SUBBLOCK_DWORDS + gl_SubgroupInvocationID; |
| |
| // |
| // load node block -- qwords at or past ttsks_rem are left invalid |
| // |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) \ |
| t##I = SPN_TTXK_INVALID; \ |
| if (gl_SubgroupInvocationID + I * SPN_PLACE_SUBGROUP_SIZE < ttsks_rem) \ |
| { \ |
| t##I = u32vec2( \ |
| bp_blocks.extent[node_idx + SPN_PLACE_SUBGROUP_SIZE * I], \ |
| bp_blocks.extent[node_idx + SPN_PLACE_SUBGROUP_SIZE * I + SPN_BLOCK_POOL_BLOCK_QWORDS]); \ |
| } |
| |
| SPN_PLACE_BLOCK_EXPAND(); |
| |
| // |
| // - translate and clip TTSK keys in the node |
| // - count the TTSK keys of this lane that survive clipping |
| // |
| uint32_t n_lane_keys = 0; |
| |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) spn_ttsk_translate_and_clip(cmd, t##I, n_lane_keys); |
| |
| SPN_PLACE_BLOCK_EXPAND(); |
| |
| // |
| // atomically allocate space for keys in this node (same scheme as the head) |
| // |
| SPN_SUBGROUP_UNIFORM const uint32_t n_block_keys = subgroupAdd(n_lane_keys); |
| uint32_t n_ttck_atomic; |
| |
| if (gl_SubgroupInvocationID == 0) |
| { |
| n_ttck_atomic = atomicAdd(ttcks.segment_dispatch.w, n_block_keys); |
| } |
| |
| SPN_SUBGROUP_UNIFORM uint32_t n_ttck_base = subgroupBroadcast(n_ttck_atomic, 0); |
| |
| // |
| // dump the keys |
| // |
| #undef SPN_EXPAND_X |
| #define SPN_EXPAND_X(I, N, P, L) spn_ttik_append(ttcks, cmd, n_ttck_base, t##I); |
| |
| SPN_PLACE_BLOCK_EXPAND(); |
| |
| // |
| // any more ttsk keys? a non-head node holds at most BLOCK_QWORDS - 1 keys |
| // |
| if (ttsks_rem <= SPN_BLOCK_POOL_BLOCK_QWORDS - 1) |
| { |
| return; |
| } |
| |
| // |
| // otherwise, keep processing |
| // |
| ttsks_rem -= SPN_BLOCK_POOL_BLOCK_QWORDS - 1; |
| } |
| } |
| |
| // |
| // |
| // |