// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// SEGMENT TTRK
//
// FIXME(allanmac): The OpenCL and CUDA versions of this kernel are more
// sophisticated but let's see how this performs. Optimize this later
// using CUDA & OpenCL techniques.
//
// FIXME(allanmac): Transition to a split lo/hi sort.
//
// FIXME(allanmac): Add a "SKIP" bit to the TTRK.
//
//
//
//
#extension GL_EXT_debug_printf : enable
//
//
//
#extension GL_GOOGLE_include_directive : require
#extension GL_KHR_shader_subgroup_shuffle_relative : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_EXT_control_flow_attributes : require
//
//
//
#include "config.h"
#include "push.h"
//
//
//
layout(local_size_x = SPN_DEVICE_TTRKS_SEGMENT_WORKGROUP_SIZE) in;
//
// Push constants
//
SPN_PUSH_LAYOUT_TTRKS_SEGMENT();
//
// Buffer references
//
SPN_BUFFER_DEFINE_TTRKS_HEADER(readwrite, readonly);
SPN_BUFFER_DEFINE_TTRK_KEYVALS(readonly);
//
// MACROS
//
// clang-format off
#define SPN_SUBGROUP_SIZE (1 << SPN_DEVICE_TTRKS_SEGMENT_SUBGROUP_SIZE_LOG2)
#define SPN_SUBGROUPS (SPN_DEVICE_TTRKS_SEGMENT_WORKGROUP_SIZE / SPN_SUBGROUP_SIZE)
#define SPN_SUBGROUP_TTRKS (SPN_DEVICE_TTRKS_SEGMENT_ROWS * SPN_SUBGROUP_SIZE)
#define SPN_TTRK_COUNT_BITS_TTPK_COUNT 16
#define SPN_TTRK_COUNT_BITS_BLOCK_COUNT 16
#define SPN_TTRK_COUNT_OFFSET_TTPK_COUNT 0
#define SPN_TTRK_COUNT_OFFSET_BLOCK_COUNT (SPN_TTRK_COUNT_BITS_TTPK_COUNT)
#define SPN_TTRK_COUNT_ONE_TTPK_COUNT (1 << SPN_TTRK_COUNT_OFFSET_TTPK_COUNT)
#define SPN_TTRK_COUNT_ONE_BLOCK_COUNT (1 << SPN_TTRK_COUNT_OFFSET_BLOCK_COUNT)
#define SPN_TTRK_COUNT_GET_TTPK_COUNT(s_) SPN_BITFIELD_EXTRACT(s_,SPN_TTRK_COUNT_OFFSET_TTPK_COUNT,SPN_TTRK_COUNT_BITS_TTPK_COUNT)
#define SPN_TTRK_COUNT_GET_BLOCK_COUNT(s_) SPN_BITFIELD_EXTRACT(s_,SPN_TTRK_COUNT_OFFSET_BLOCK_COUNT,SPN_TTRK_COUNT_BITS_BLOCK_COUNT)
#define SPN_TTRK_IS_NEW_COHORT(curr_, prev_) (((curr_.y ^ prev_.y) & SPN_TTRK_HI_MASK_COHORT) != 0)
#define SPN_TTRK_ROW_LAST (SPN_DEVICE_TTRKS_SEGMENT_ROWS - 1)
// clang-format on
//
// Make sure the scan can't overflow.
//
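// Each keyval contributes at most one TTPK and one BLOCK count, so the
// subgroup-wide scan total is bounded by SPN_SUBGROUP_TTRKS per 16-bit
// field.  For example, assuming 8 rows and a 32-wide subgroup, the
// bound is 256, which fits comfortably in either field.
//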
#if (SPN_SUBGROUP_TTRKS >= (1 << SPN_TTRK_COUNT_BITS_TTPK_COUNT))
#error "Error: Too many TTRK keyvals in subgroup. Reduce row count."
#endif
#if (SPN_SUBGROUP_TTRKS >= (1 << SPN_TTRK_COUNT_BITS_BLOCK_COUNT))
#error "Error: Too many TTRK keyvals in subgroup. Reduce row count."
#endif
//
// Update curr TTRK.NEW_Y/NEW_X flags
//
// Also emit a `ttrk_count` for later scanning:
//
//   0                          31
//   | TTPK COUNT | BLOCK COUNT |
//   +------------+-------------+
//   |     16     |     16      |
//
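// For example (illustrative values only), a ttrk_count with both fields
// set to one packs as
//
//   SPN_TTRK_COUNT_ONE_TTPK_COUNT | SPN_TTRK_COUNT_ONE_BLOCK_COUNT == 0x00010001
//
// and SPN_TTRK_COUNT_GET_TTPK_COUNT() / SPN_TTRK_COUNT_GET_BLOCK_COUNT()
// each recover 1.
//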
uint32_t
spn_ttrk_update_and_compare(inout u32vec2 curr, const u32vec2 prev)
{
//
// Update curr TTRK.NEW_Y/NEW_X
//
// clang-format off
const uint32_t xor_lo = (curr.x ^ prev.x); // XOR TTRK.X_LO
const bool is_new_x_lo = ((xor_lo & SPN_TTRK_LO_MASK_X) != 0); // XOR.X_LO != 0
uint32_t new_xy = is_new_x_lo ? SPN_TTRK_LO_MASK_NEW_X : 0; // init NEW_XY
const uint32_t xor_hi = (curr.y ^ prev.y); // XOR TTRK.HI
const bool is_new_x_hi = ((xor_hi & SPN_TTRK_HI_MASK_X) != 0); // XOR.X_HI != 0
new_xy = is_new_x_hi ? SPN_TTRK_LO_MASK_NEW_X : new_xy; // update NEW_XY
const bool is_new_y = ((xor_hi & SPN_TTRK_HI_MASK_Y) != 0); // XOR.Y_HI != 0
new_xy = is_new_y ? SPN_TTRK_LO_MASK_NEW_Y : new_xy; // update NEW_XY
const bool is_new_cohort = ((xor_hi & SPN_TTRK_HI_MASK_COHORT) != 0); // XOR.COHORT != 0
new_xy = is_new_cohort ? SPN_TTRK_LO_MASK_NEW_Y : new_xy; // update NEW_XY
curr.x |= new_xy; // update curr.NEW_XY
// clang-format on
//
// Set TTPK count
//
const bool is_new_ttpk = (new_xy == SPN_TTRK_LO_MASK_NEW_X); // new_xy == NEW_X
const uint32_t ttpk_count = is_new_ttpk ? SPN_TTRK_COUNT_ONE_TTPK_COUNT : 0; // set .TTPK_COUNT
//
// Set BLOCK count
//
const uint32_t block_count = SPN_BLOCK_ID_IS_BLOCK(curr.x) ? SPN_TTRK_COUNT_ONE_BLOCK_COUNT : 0;
return (ttpk_count | block_count);
}
//
//
//
void
main()
{
//
// Define ttrks header bufref
//
SPN_BUFREF_DEFINE(SPN_BUFFER_TYPE(ttrks_header), ttrks, push.devaddr_ttrks_header);
//
// Get ttrks.count
//
SPN_SUBGROUP_UNIFORM const uint32_t ttrks_count = ttrks.count_dispatch.w;
//
// Every subgroup processes a block of ttrks
//
#if (SPN_SUBGROUPS == 1)
// clang-format off
SPN_SUBGROUP_UNIFORM const uint32_t ttrks_base = (gl_WorkGroupID.x * SPN_SUBGROUP_TTRKS);
#else
SPN_SUBGROUP_UNIFORM const uint32_t ttrks_base = (gl_WorkGroupID.x * SPN_SUBGROUPS + gl_SubgroupID) * SPN_SUBGROUP_TTRKS;
// clang-format on
// Does this subgroup have work?
if (ttrks_base >= ttrks_count)
{
return;
}
#endif
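//
// For example, assuming SPN_SUBGROUPS == 4 and SPN_SUBGROUP_TTRKS == 256,
// workgroup 2 / subgroup 1 starts at ttrks_base == (2 * 4 + 1) * 256 == 2304.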
//
// Keyvals base index
//
const uint32_t ttrks_idx_row0 = ttrks_base + gl_SubgroupInvocationID;
//
// Define ttrk_keyvals bufref
//
u32vec2 ttrks_curr_offset;
umulExtended(ttrks_idx_row0,
8, // sizeof(ttrk)
ttrks_curr_offset.y, // msb
ttrks_curr_offset.x); // lsb
SPN_BUFREF_DEFINE_AT_OFFSET_U32VEC2(SPN_BUFFER_TYPE(ttrk_keyvals),
ttrks_curr,
push.devaddr_ttrk_keyvals,
ttrks_curr_offset);
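//
// The keyval byte offset is the full 64-bit product ttrks_idx_row0 * 8,
// split into (lsb, msb) by umulExtended(), since the byte offset can
// exceed 32 bits.  For example, an index of 0x20000000 yields the byte
// offset 0x100000000, i.e. lsb == 0 and msb == 1.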
//
// Load ttrk keyvals
//
u32vec2 curr[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
const uint32_t idx = ttrks_idx_row0 + ii * SPN_SUBGROUP_SIZE;
if (idx < ttrks_count)
{
curr[ii] = ttrks_curr.extent[ii * SPN_SUBGROUP_SIZE];
}
else
{
curr[ii] = u32vec2(0, SPN_TTRK_HI_MASK_COHORT); // invalid cohort id
}
}
//
// Get prev keyval
//
u32vec2 prev[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
prev[ii] = subgroupShuffleUp(curr[ii], 1);
}
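//
// subgroupShuffleUp() does not define lane 0's result, and in this
// striped layout the keyval preceding row ii's lane 0 is the last lane
// of row ii-1.  For example, assuming a 4-wide subgroup:
//
//   row 0: k0 k1 k2 k3
//   row 1: k4 k5 k6 k7   -> prev of k4 is k3 (row 0, last lane)
//
// Those slots are patched below.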
//
// Fix lane 0 prev[] for rows [1,ROWS-1]
//
const bool is_lane0 = (gl_SubgroupInvocationID == 0);
[[unroll]] for (uint32_t ii = 1, jj = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++, jj++)
{
const u32vec2 last = subgroupBroadcast(curr[jj], SPN_SUBGROUP_SIZE - 1);
if (is_lane0)
{
prev[ii] = last;
}
}
//
// Fix lane 0 prev[] for row 0
//
if (is_lane0)
{
if (ttrks_idx_row0 > 0)
{
//
// If this is the first key in any block other than the first,
// then load the last key in the previous block.
//
// NOTE: This keyval may have already had its NEW_X/NEW_Y
// bits updated by another subgroup.
//
u32vec2 ttrks_prev_offset;
umulExtended(ttrks_idx_row0 - 1,
8, // sizeof(ttrk)
ttrks_prev_offset.y, // msb
ttrks_prev_offset.x); // lsb
SPN_BUFREF_DEFINE_AT_OFFSET_U32VEC2(SPN_BUFFER_TYPE(ttrk_keyvals),
ttrks_prev,
push.devaddr_ttrk_keyvals,
ttrks_prev_offset);
prev[0] = ttrks_prev.extent[0];
}
else
{
//
// This is the first block and first lane so we want to
// force recording of a new y in order to clear the prefix
// accumulator.
//
prev[0].x = curr[0].x;
prev[0].y = curr[0].y ^ SPN_TTRK_HI_MASK_Y;
}
}
//
// Update curr ttrks and get counts
//
uint32_t ttrk_count[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
ttrk_count[ii] = spn_ttrk_update_and_compare(curr[ii], prev[ii]);
}
//
// Store updated ttrk keyvals back to extent
//
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
const uint32_t idx = ttrks_idx_row0 + ii * SPN_SUBGROUP_SIZE;
if (idx < ttrks_count)
{
ttrks_curr.extent[ii * SPN_SUBGROUP_SIZE] = curr[ii];
}
}
//
// Exclusive add all TTPK and BLOCK counts
//
uint32_t ttrk_count_exc[SPN_DEVICE_TTRKS_SEGMENT_ROWS];
uint32_t prev_last = 0;
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
ttrk_count_exc[ii] = prev_last + subgroupInclusiveAdd(ttrk_count[ii]);
prev_last = subgroupBroadcast(ttrk_count_exc[ii], SPN_SUBGROUP_SIZE - 1);
ttrk_count_exc[ii] = ttrk_count_exc[ii] - ttrk_count[ii];
}
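//
// For example, assuming a 4-wide subgroup, two rows, and treating each
// packed count word as a scalar: per-lane counts {1,0,1,0} and
// {0,1,1,1} scan to exclusive sums {0,1,1,2} and {2,2,3,4}, with
// prev_last carrying 2 and then 5 across the rows.  The #error guards
// above guarantee neither 16-bit field can carry into the other.
//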
//////////////////////////////////////////////////////////////////////
//
// At this point every row has:
//
// * curr ttrk
// * prev ttrk
// * exclusive prefix-sum of block and ttpk counts
//
// An invalid cohort id identifies an invalid keyval.
//
//////////////////////////////////////////////////////////////////////
//
// Atomically accumulate meta data for the raster cohort:
//
// * meta.rk_off
// * meta.blocks
// * meta.ttpks
// * meta.ttrks
//
// Note that the explicit "+" signs in the atomicAdd() difference
// operations are for clarity.
//
[[unroll]] for (uint32_t ii = 0; ii < SPN_DEVICE_TTRKS_SEGMENT_ROWS; ii++)
{
const uint32_t idx = ttrks_idx_row0 + ii * SPN_SUBGROUP_SIZE;
if (SPN_TTRK_IS_NEW_COHORT(curr[ii], prev[ii]))
{
const uint32_t curr_cohort_id = SPN_TTRK_GET_COHORT(curr[ii]);
const uint32_t prev_cohort_id = SPN_TTRK_GET_COHORT(prev[ii]);
// meta.rk_off
ttrks.meta.rk_off[curr_cohort_id] = idx;
// meta.ttrks
atomicAdd(ttrks.meta.ttrks[curr_cohort_id], -idx);
atomicAdd(ttrks.meta.ttrks[prev_cohort_id], +idx);
// meta.blocks
const uint32_t block_count = SPN_TTRK_COUNT_GET_BLOCK_COUNT(ttrk_count_exc[ii]);
atomicAdd(ttrks.meta.blocks[curr_cohort_id], -block_count);
atomicAdd(ttrks.meta.blocks[prev_cohort_id], +block_count);
// meta.ttpks
const uint32_t ttpk_count = SPN_TTRK_COUNT_GET_TTPK_COUNT(ttrk_count_exc[ii]);
atomicAdd(ttrks.meta.ttpks[curr_cohort_id], -ttpk_count);
atomicAdd(ttrks.meta.ttpks[prev_cohort_id], +ttpk_count);
}
}
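//
// For example, if cohort A occupies keyvals [0,100) and cohort B begins
// at idx 100, the lane detecting the A->B transition performs
// meta.ttrks[B] -= 100 and meta.ttrks[A] += 100, while meta.rk_off[B]
// records the starting idx 100.  A later transition out of B at idx 250
// then adds +250 to meta.ttrks[B], leaving meta.ttrks[B] == 150, the
// number of B keyvals.  meta.blocks and meta.ttpks accumulate the same
// way using the exclusive prefix sums.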
//
// Finalize by accumulating the contributions of the very last
// keyval in the block.
//
// Note that `prev_last` contains the exclusive prefix-sum of the
// row *following* the last row in the block.
//
if (gl_SubgroupInvocationID == SPN_SUBGROUP_SIZE - 1)
{
const uint32_t curr_cohort_id = SPN_TTRK_GET_COHORT(curr[SPN_TTRK_ROW_LAST]);
//
// If the number of keyvals is a multiple of SPN_SUBGROUP_TTRKS
// then the final keyval has to contribute its position to the
// ttrk count.
//
if ((ttrks_idx_row0 + SPN_TTRK_ROW_LAST * SPN_SUBGROUP_SIZE + 1) == ttrks_count)
{
atomicAdd(ttrks.meta.ttrks[curr_cohort_id], +ttrks_count);
}
// meta.blocks
const uint32_t block_count = SPN_TTRK_COUNT_GET_BLOCK_COUNT(prev_last);
atomicAdd(ttrks.meta.blocks[curr_cohort_id], +block_count);
// meta.ttpks
const uint32_t ttpk_count = SPN_TTRK_COUNT_GET_TTPK_COUNT(prev_last);
atomicAdd(ttrks.meta.ttpks[curr_cohort_id], +ttpk_count);
}
}
//
//
//