blob: 3abe4ea1cd8bf61043ba6e027dd4bed09d5be7f8 [file] [log] [blame]
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Initialize the `rs_indirect_info` struct
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
// clang-format on
//
// Load arch/keyval configuration
//
#include "config.h"
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
// Subgroup uniform support
//
//
// RS_SUBGROUP_UNIFORM expands to the `subgroupuniformEXT` qualifier when the
// guarded branch is taken and to nothing otherwise.
//
// The `#extension` behavior token must be one of require/enable/warn/disable,
// so `require` is used here (`required` is not a valid behavior).
//
// NOTE(review): the guard tests `defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE)`,
// which reads as if a "disable" switch turns the qualifier on -- confirm
// whether `!defined(...)` was intended.  The condition is left unchanged here.
//
#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Declare the push constants
//
// NOTE(review): `RS_STRUCT_PUSH_INIT()` presumably declares the `rs_push_init`
// type used below; its definition is not in this file (likely "push.h").
RS_STRUCT_PUSH_INIT();
// Push constant block.  main() reads `push.devaddr_count`, `push.devaddr_info`
// and `push.passes` from it.
layout(push_constant) uniform block_push
{
rs_push_init push;
};
//
// The "init" shader configures the fill info structure.
//
// NOTE(review): presumably declares the `rs_indirect_info` and
// `rs_indirect_info_fill` types used further down; the macro itself is defined
// in an included header, not in this file.
RS_STRUCT_INDIRECT_INFO();
//
// Local macros
//
// clang-format off
// FILL: invocations per workgroup, then dwords covered by one workgroup
#define RS_FILL_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
#define RS_FILL_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE)
// SCATTER: invocations per workgroup, then keyvals covered by one workgroup
#define RS_SCATTER_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2)
#define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE)
// HISTOGRAM: invocations per workgroup, then keyvals covered by one workgroup
#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_HISTOGRAM_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_HISTOGRAM_WORKGROUP_SIZE)
// clang-format on
//
// This workgroup only has one invocation!
//
// main() below is straight-line scalar work, so a single invocation
// (local_size_x = 1) is declared.
//
layout(local_size_x = 1) in;
//
//
//
// Buffer reference to a single 32-bit count.  main() binds it to
// `push.devaddr_count` and reads `count` as the number of keyvals.
layout(buffer_reference, std430) buffer buffer_rs_count
{
uint32_t count;
};
// Buffer reference to the `rs_indirect_info` struct.  main() binds it to
// `push.devaddr_info` and writes the `pad` and `zero` fill descriptors and the
// `dispatch.{scatter,histogram,pad,zero}` vectors.
layout(buffer_reference, std430) buffer buffer_rs_indirect_info
{
rs_indirect_info info;
};
//
// Helper macros
//
// RU = Round Up
// RD = Round Down
//
//
// RS_COUNT_RU_BLOCKS: number of `block_size_` blocks needed to cover `count_`
//                     items (ceiling division).
// RS_COUNT_RD_BLOCKS: number of whole `block_size_` blocks contained in
//                     `count_` items (floor division).
//
// Every argument use is parenthesized so that expression arguments containing
// lower-precedence operators (shifts, ternaries, ...) expand correctly.
//
// NOTE: the round-up form computes `count_ + block_size_ - 1` and therefore
// wraps if `count_` is within `block_size_ - 1` of the type's maximum.
//
#define RS_COUNT_RU_BLOCKS(count_, block_size_) (((count_) + (block_size_) - 1) / (block_size_))
#define RS_COUNT_RD_BLOCKS(count_, block_size_) ((count_) / (block_size_))
//
//
//
//
// Entry point: reads the keyval count and fills in the indirect info struct
// (fill descriptors plus dispatch sizes for scatter, histogram, pad and zero).
//
// All values are computed first and stored to the info struct at the end.
//
void
main()
{
  //
  // Read-only view of the keyval count
  //
  readonly RS_BUFREF_DEFINE(buffer_rs_count, rs_count, push.devaddr_count);

  RS_SUBGROUP_UNIFORM const uint32_t count = rs_count.count;

  //
  // Write-only view of the indirect info struct
  //
  writeonly RS_BUFREF_DEFINE(buffer_rs_indirect_info, rs_indirect_info, push.devaddr_info);

  //
  // SCATTER: round the keyval count up to whole scatter blocks
  //
  const uint32_t n_scatter_blocks  = RS_COUNT_RU_BLOCKS(count, RS_SCATTER_BLOCK_KEYVALS);
  const uint32_t n_scatter_keyvals = n_scatter_blocks * RS_SCATTER_BLOCK_KEYVALS;

  //
  // HISTOGRAM: round the scatter-rounded count up to whole histogram blocks
  //
  const uint32_t n_histo_blocks  = RS_COUNT_RU_BLOCKS(n_scatter_keyvals, RS_HISTOGRAM_BLOCK_KEYVALS);
  const uint32_t n_histo_keyvals = n_histo_blocks * RS_HISTOGRAM_BLOCK_KEYVALS;

  //
  // PAD: the fill covers dwords [count, histogram-rounded count).
  //
  // The dispatch starts at the fill block containing the first pad dword
  // (count rounded down to a fill block) and runs for enough fill blocks to
  // reach the histogram-rounded end.
  //
  const uint32_t keyval_dwords    = count * RS_KEYVAL_DWORDS;
  const uint32_t histo_dwords     = n_histo_keyvals * RS_KEYVAL_DWORDS;
  const uint32_t pad_first_block  = RS_COUNT_RD_BLOCKS(keyval_dwords, RS_FILL_BLOCK_DWORDS);
  const uint32_t pad_first_dword  = pad_first_block * RS_FILL_BLOCK_DWORDS;
  const uint32_t pad_span_dwords  = histo_dwords - pad_first_dword;
  const uint32_t n_pad_blocks     = RS_COUNT_RU_BLOCKS(pad_span_dwords, RS_FILL_BLOCK_DWORDS);

  rs_indirect_info_fill pad_fill;

  pad_fill.block_offset               = pad_first_block;
  pad_fill.dword_offset_min           = keyval_dwords;
  pad_fill.dword_offset_max_minus_min = histo_dwords - keyval_dwords;

  //
  // ZERO: a single fill zeroes the histograms and the partitions.
  //
  // NOTE(allanmac): We could zero the histogram passes on the host
  // since the number of passes is known ahead of time but since the
  // 256-dword partitions directly follow the 256-dword histograms we
  // can dispatch just one FILL.
  //
  // The "internal" memory map looks like this:
  //
  //   +---------------------------------+ <-- 0
  //   | histograms[keyval_size]         |
  //   +---------------------------------+ <-- keyval_size * histo_dwords
  //   | partitions[scatter_blocks_ru-1] |
  //   +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords
  //   | workgroup_ids[keyval_size]      |
  //   +---------------------------------+ <-- (keyval_size + scatter_blocks_ru - 1) * histo_dwords + keyval_size
  //
  // NOTE(allanmac): The `.block_offset` and `.dword_offset_min`
  // parameters are zeroes because the host can offset the buffer
  // device address since the number of passes is known by the host.
  // If we ever wanted to support an indirect number of "key" bits
  // in the sort, then this would need to change.
  //
  // NOTE(allanmac): The `.workgroup_ids[]` are only used if
  // nonsequential dispatch isn't supported by the device.  Their dwords
  // are only added to the fill when RS_SCATTER_NONSEQUENTIAL_DISPATCH is
  // defined.
  //
  rs_indirect_info_fill zero_fill;

  zero_fill.block_offset               = 0;
  zero_fill.dword_offset_min           = 0;
  zero_fill.dword_offset_max_minus_min = (push.passes + n_scatter_blocks - 1) * RS_RADIX_SIZE;

#ifdef RS_SCATTER_NONSEQUENTIAL_DISPATCH
  zero_fill.dword_offset_max_minus_min += (RS_KEYVAL_DWORDS * 4);  // one pass per byte
#endif

  const uint32_t n_zero_blocks = RS_COUNT_RU_BLOCKS(zero_fill.dword_offset_max_minus_min,  //
                                                    RS_FILL_BLOCK_DWORDS);

  //
  // Store the fill descriptors
  //
  rs_indirect_info.info.pad  = pad_fill;
  rs_indirect_info.info.zero = zero_fill;

  //
  // Store the dispatch sizes: (x, 1, 1) workgroups plus a trailing zero
  //
  rs_indirect_info.info.dispatch.pad       = u32vec4(n_pad_blocks, 1, 1, 0);
  rs_indirect_info.info.dispatch.zero      = u32vec4(n_zero_blocks, 1, 1, 0);
  rs_indirect_info.info.dispatch.histogram = u32vec4(n_histo_blocks, 1, 1, 0);
  rs_indirect_info.info.dispatch.scatter   = u32vec4(n_scatter_blocks, 1, 1, 0);
}
//
//
//