// src/graphics/lib/compute/radix_sort/platforms/vk/shaders/init.comp

 // Copyright 2021 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #version 460

 //
 // Initialize the `rs_indirect_info` struct
 //

 // clang-format off
 #extension GL_GOOGLE_include_directive    : require
 #extension GL_EXT_control_flow_attributes : require
 // clang-format on

 //
 // Load arch/keyval configuration
 //
 #include "config.h"

 //
 // Buffer reference macros and push constants
 //
 #include "bufref.h"
 #include "push.h"

 //
 // Subgroup uniform support
 //
 // RS_SUBGROUP_UNIFORM expands to `subgroupuniformEXT` when the guard below
 // holds and to nothing otherwise, so a declaration can always be written
 // with the qualifier in front of it.
 //
 // The behavior token of an `#extension` directive must be one of
 // `require`, `enable`, `warn` or `disable`.
 //
 // NOTE(review): enabling the qualifier when
 // RS_SCATTER_SUBGROUP_UNIFORM_DISABLE is *defined* looks inverted --
 // confirm the intended polarity before relying on this path.
 //
 #if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
 #extension GL_EXT_subgroupuniform_qualifier : require
 #define RS_SUBGROUP_UNIFORM subgroupuniformEXT
 #else
 #define RS_SUBGROUP_UNIFORM
 #endif

 //
 // Declare the push constants
 //
 // RS_STRUCT_PUSH_INIT() is expected to declare the `rs_push_init` type
 // used by the block below -- presumably it comes from "push.h"; confirm.
 // This shader reads `push.devaddr_count`, `push.devaddr_info` and
 // `push.passes` from it.
 //
 RS_STRUCT_PUSH_INIT();

 layout(push_constant) uniform block_push
 {
   rs_push_init push;
 };

 //
 // The "init" shader configures the fill info structure.
 //
 // RS_STRUCT_INDIRECT_INFO() is expected to declare the `rs_indirect_info`
 // and `rs_indirect_info_fill` types used further down -- presumably it
 // also comes from "push.h"; confirm.
 //
 RS_STRUCT_INDIRECT_INFO();

 //
 // Local macros
 //
 // Grouped per stage.  Each `*_WORKGROUP_SIZE` is 1 shifted left by the
 // configured `*_WORKGROUP_SIZE_LOG2`, and each block size is the
 // configured `*_BLOCK_ROWS` multiplied by that workgroup size.
 //
 // clang-format off
 #define RS_FILL_WORKGROUP_SIZE        (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
 #define RS_FILL_BLOCK_DWORDS          (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE)

 #define RS_SCATTER_WORKGROUP_SIZE     (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2)
 #define RS_SCATTER_BLOCK_KEYVALS      (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE)

 #define RS_HISTOGRAM_WORKGROUP_SIZE   (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
 #define RS_HISTOGRAM_BLOCK_KEYVALS    (RS_HISTOGRAM_BLOCK_ROWS * RS_HISTOGRAM_WORKGROUP_SIZE)
 // clang-format on

 //
 // This workgroup only has one invocation!
 //
 // Only `local_size_x` is declared.  `main()` below is straight-line
 // scalar code, so a single invocation computes and stores every field.
 //
 layout(local_size_x = 1) in;

 //
 // Buffer reference types used by `main()`:
 //
 //   buffer_rs_count         : holds the keyval `count` that is read
 //   buffer_rs_indirect_info : holds the `info` struct that is written
 //
 layout(buffer_reference, std430) buffer buffer_rs_count
 {
   uint32_t count;
 };

 layout(buffer_reference, std430) buffer buffer_rs_indirect_info
 {
   rs_indirect_info info;
 };

 //
 // Helper macros
 //
 // RU = Round Up   : blocks of `block_size_` needed to cover `count_`
 // RD = Round Down : whole blocks of `block_size_` contained in `count_`
 //
 // Both arguments are parenthesized in the expansion so that an
 // expression argument (e.g. `a << b`) is grouped as the caller wrote it.
 //
 #define RS_COUNT_RU_BLOCKS(count_, block_size_) (((count_) + (block_size_) - 1) / (block_size_))
 #define RS_COUNT_RD_BLOCKS(count_, block_size_) ((count_) / (block_size_))

 //
 //
 //
 void
 main()
 {
   //
   // Read the number of keyvals to sort
   //
   readonly RS_BUFREF_DEFINE(buffer_rs_count, rs_count, push.devaddr_count);

   RS_SUBGROUP_UNIFORM const uint32_t keyval_count = rs_count.count;

   //
   // Bind the indirect info struct that the rest of this function fills in
   //
   writeonly RS_BUFREF_DEFINE(buffer_rs_indirect_info, rs_indirect_info, push.devaddr_info);

   //
   // SCATTER dispatch: enough scatter blocks to cover every keyval
   //
   const uint32_t scatter_blocks  = RS_COUNT_RU_BLOCKS(keyval_count, RS_SCATTER_BLOCK_KEYVALS);
   const uint32_t scatter_keyvals = scatter_blocks * RS_SCATTER_BLOCK_KEYVALS;

   rs_indirect_info.info.dispatch.scatter = u32vec4(scatter_blocks, 1, 1, 0);

   //
   // HISTOGRAM dispatch: enough histogram blocks to cover the keyval
   // count after it has been rounded up to whole scatter blocks
   //
   const uint32_t histo_blocks  = RS_COUNT_RU_BLOCKS(scatter_keyvals, RS_HISTOGRAM_BLOCK_KEYVALS);
   const uint32_t histo_keyvals = histo_blocks * RS_HISTOGRAM_BLOCK_KEYVALS;

   rs_indirect_info.info.dispatch.histogram = u32vec4(histo_blocks, 1, 1, 0);

   //
   // PAD fill and dispatch
   //
   // The padded dword range is [keyval_dwords, histo_keyval_dwords).
   // The fill starts at the fill block containing `keyval_dwords`
   // (rounded down), so the number of fill blocks is measured from the
   // first dword of that block.
   //
   const uint32_t keyval_dwords       = keyval_count * RS_KEYVAL_DWORDS;
   const uint32_t histo_keyval_dwords = histo_keyvals * RS_KEYVAL_DWORDS;
   const uint32_t pad_block_first     = RS_COUNT_RD_BLOCKS(keyval_dwords, RS_FILL_BLOCK_DWORDS);
   const uint32_t pad_dword_first     = pad_block_first * RS_FILL_BLOCK_DWORDS;
   const uint32_t pad_span_dwords     = histo_keyval_dwords - pad_dword_first;
   const uint32_t pad_blocks          = RS_COUNT_RU_BLOCKS(pad_span_dwords, RS_FILL_BLOCK_DWORDS);

   rs_indirect_info_fill pad;

   pad.dword_offset_max_minus_min = histo_keyval_dwords - keyval_dwords;
   pad.dword_offset_min           = keyval_dwords;
   pad.block_offset               = pad_block_first;

   rs_indirect_info.info.pad          = pad;
   rs_indirect_info.info.dispatch.pad = u32vec4(pad_blocks, 1, 1, 0);

   //
   // ZERO fill and dispatch
   //
   // NOTE(allanmac): We could zero the histogram passes on the host
   // since the number of passes is known ahead of time but since the
   // 256-dword partitions directly follow the 256-dword histograms we
   // can dispatch just one FILL.
   //
   // The "internal" memory map looks like this:
   //
   //   +---------------------------------+ <-- 0
   //   | histograms[keyval_size]         |
   //   +---------------------------------+ <-- keyval_size                        * histo_dwords
   //   | partitions[scatter_blocks-1]    |
   //   +---------------------------------+ <-- (keyval_size + scatter_blocks - 1) * histo_dwords
   //   | workgroup_ids[keyval_size]      |
   //   +---------------------------------+ <-- (keyval_size + scatter_blocks - 1) * histo_dwords + keyval_size
   //
   // The fill computed below spans `push.passes` histograms plus
   // `scatter_blocks - 1` partitions of RS_RADIX_SIZE dwords each.
   //
   // NOTE(allanmac): The `.block_offset` and `.dword_offset_min`
   // parameters are zeroes because the host can offset the buffer
   // device address since the number of passes is known by the host.
   // If we ever wanted to support an indirect number of "key" bits
   // in the sort, then this would need to change.
   //
   // NOTE(allanmac): The `.workgroup_ids[]` are only used if
   // nonsequential dispatch isn't supported by the device.
   //
   // NOTE(review): the extra workgroup id dwords are added when
   // RS_SCATTER_NONSEQUENTIAL_DISPATCH *is* defined -- confirm the
   // macro's polarity against the note above.
   //
   rs_indirect_info_fill zero;

   // clang-format off
   zero.dword_offset_max_minus_min = (push.passes + scatter_blocks - 1) * RS_RADIX_SIZE;
   zero.dword_offset_min           = 0;
   zero.block_offset               = 0;
   // clang-format on

 #ifdef RS_SCATTER_NONSEQUENTIAL_DISPATCH
   zero.dword_offset_max_minus_min += (RS_KEYVAL_DWORDS * 4);  // one pass per byte
 #endif

   const uint32_t zero_blocks = RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min,  //
                                                   RS_FILL_BLOCK_DWORDS);

   rs_indirect_info.info.zero          = zero;
   rs_indirect_info.info.dispatch.zero = u32vec4(zero_blocks, 1, 1, 0);
 }

 //
 //
 //
	// Copyright 2021 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#version 460

	//
	// Initialize the `rs_indirect_info` struct
	//

	// clang-format off
	#extension GL_GOOGLE_include_directive : require
	#extension GL_EXT_control_flow_attributes : require
	// clang-format on

	//
	// Load arch/keyval configuration
	//
	#include "config.h"

	//
	// Buffer reference macros and push constants
	//
	#include "bufref.h"
	#include "push.h"

	//
	// Subgroup uniform support
	//
	// RS_SUBGROUP_UNIFORM expands to `subgroupuniformEXT` when the guard below
	// holds and to nothing otherwise, so a declaration can always be written
	// with the qualifier in front of it.
	//
	// The behavior token of an `#extension` directive must be one of
	// `require`, `enable`, `warn` or `disable`.
	//
	// NOTE(review): enabling the qualifier when
	// RS_SCATTER_SUBGROUP_UNIFORM_DISABLE is *defined* looks inverted --
	// confirm the intended polarity before relying on this path.
	//
	#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
	#extension GL_EXT_subgroupuniform_qualifier : require
	#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
	#else
	#define RS_SUBGROUP_UNIFORM
	#endif

	//
	// Declare the push constants
	//
	// RS_STRUCT_PUSH_INIT() is expected to declare the `rs_push_init` type
	// used by the block below -- presumably it comes from "push.h"; confirm.
	// This shader reads `push.devaddr_count`, `push.devaddr_info` and
	// `push.passes` from it.
	//
	RS_STRUCT_PUSH_INIT();

	layout(push_constant) uniform block_push
	{
	rs_push_init push;
	};

	//
	// The "init" shader configures the fill info structure.
	//
	// RS_STRUCT_INDIRECT_INFO() is expected to declare the `rs_indirect_info`
	// and `rs_indirect_info_fill` types used further down -- presumably it
	// also comes from "push.h"; confirm.
	//
	RS_STRUCT_INDIRECT_INFO();

	//
	// Local macros
	//
	// Grouped per stage.  Each `*_WORKGROUP_SIZE` is 1 shifted left by the
	// configured `*_WORKGROUP_SIZE_LOG2`, and each block size is the
	// configured `*_BLOCK_ROWS` multiplied by that workgroup size.
	//
	// clang-format off
	#define RS_FILL_WORKGROUP_SIZE        (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
	#define RS_FILL_BLOCK_DWORDS          (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE)

	#define RS_SCATTER_WORKGROUP_SIZE     (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2)
	#define RS_SCATTER_BLOCK_KEYVALS      (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE)

	#define RS_HISTOGRAM_WORKGROUP_SIZE   (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
	#define RS_HISTOGRAM_BLOCK_KEYVALS    (RS_HISTOGRAM_BLOCK_ROWS * RS_HISTOGRAM_WORKGROUP_SIZE)
	// clang-format on

	//
	// This workgroup only has one invocation!
	//
	// Only `local_size_x` is declared.  `main()` below is straight-line
	// scalar code, so a single invocation computes and stores every field.
	//
	layout(local_size_x = 1) in;

	//
	// Buffer reference types used by `main()`:
	//
	//   buffer_rs_count         : holds the keyval `count` that is read
	//   buffer_rs_indirect_info : holds the `info` struct that is written
	//
	layout(buffer_reference, std430) buffer buffer_rs_count
	{
	uint32_t count;
	};

	layout(buffer_reference, std430) buffer buffer_rs_indirect_info
	{
	rs_indirect_info info;
	};

	//
	// Helper macros
	//
	// RU = Round Up   : blocks of `block_size_` needed to cover `count_`
	// RD = Round Down : whole blocks of `block_size_` contained in `count_`
	//
	// Both arguments are parenthesized in the expansion so that an
	// expression argument (e.g. `a << b`) is grouped as the caller wrote it.
	//
	#define RS_COUNT_RU_BLOCKS(count_, block_size_) (((count_) + (block_size_) - 1) / (block_size_))
	#define RS_COUNT_RD_BLOCKS(count_, block_size_) ((count_) / (block_size_))

	//
	//
	//
	void
	main()
	{
	  //
	  // Read the number of keyvals to sort
	  //
	  readonly RS_BUFREF_DEFINE(buffer_rs_count, rs_count, push.devaddr_count);

	  RS_SUBGROUP_UNIFORM const uint32_t keyval_count = rs_count.count;

	  //
	  // Bind the indirect info struct that the rest of this function fills in
	  //
	  writeonly RS_BUFREF_DEFINE(buffer_rs_indirect_info, rs_indirect_info, push.devaddr_info);

	  //
	  // SCATTER dispatch: enough scatter blocks to cover every keyval
	  //
	  const uint32_t scatter_blocks  = RS_COUNT_RU_BLOCKS(keyval_count, RS_SCATTER_BLOCK_KEYVALS);
	  const uint32_t scatter_keyvals = scatter_blocks * RS_SCATTER_BLOCK_KEYVALS;

	  rs_indirect_info.info.dispatch.scatter = u32vec4(scatter_blocks, 1, 1, 0);

	  //
	  // HISTOGRAM dispatch: enough histogram blocks to cover the keyval
	  // count after it has been rounded up to whole scatter blocks
	  //
	  const uint32_t histo_blocks  = RS_COUNT_RU_BLOCKS(scatter_keyvals, RS_HISTOGRAM_BLOCK_KEYVALS);
	  const uint32_t histo_keyvals = histo_blocks * RS_HISTOGRAM_BLOCK_KEYVALS;

	  rs_indirect_info.info.dispatch.histogram = u32vec4(histo_blocks, 1, 1, 0);

	  //
	  // PAD fill and dispatch
	  //
	  // The padded dword range is [keyval_dwords, histo_keyval_dwords).
	  // The fill starts at the fill block containing `keyval_dwords`
	  // (rounded down), so the number of fill blocks is measured from the
	  // first dword of that block.
	  //
	  const uint32_t keyval_dwords       = keyval_count * RS_KEYVAL_DWORDS;
	  const uint32_t histo_keyval_dwords = histo_keyvals * RS_KEYVAL_DWORDS;
	  const uint32_t pad_block_first     = RS_COUNT_RD_BLOCKS(keyval_dwords, RS_FILL_BLOCK_DWORDS);
	  const uint32_t pad_dword_first     = pad_block_first * RS_FILL_BLOCK_DWORDS;
	  const uint32_t pad_span_dwords     = histo_keyval_dwords - pad_dword_first;
	  const uint32_t pad_blocks          = RS_COUNT_RU_BLOCKS(pad_span_dwords, RS_FILL_BLOCK_DWORDS);

	  rs_indirect_info_fill pad;

	  pad.dword_offset_max_minus_min = histo_keyval_dwords - keyval_dwords;
	  pad.dword_offset_min           = keyval_dwords;
	  pad.block_offset               = pad_block_first;

	  rs_indirect_info.info.pad          = pad;
	  rs_indirect_info.info.dispatch.pad = u32vec4(pad_blocks, 1, 1, 0);

	  //
	  // ZERO fill and dispatch
	  //
	  // NOTE(allanmac): We could zero the histogram passes on the host
	  // since the number of passes is known ahead of time but since the
	  // 256-dword partitions directly follow the 256-dword histograms we
	  // can dispatch just one FILL.
	  //
	  // The "internal" memory map looks like this:
	  //
	  //   +---------------------------------+ <-- 0
	  //   | histograms[keyval_size]         |
	  //   +---------------------------------+ <-- keyval_size                        * histo_dwords
	  //   | partitions[scatter_blocks-1]    |
	  //   +---------------------------------+ <-- (keyval_size + scatter_blocks - 1) * histo_dwords
	  //   | workgroup_ids[keyval_size]      |
	  //   +---------------------------------+ <-- (keyval_size + scatter_blocks - 1) * histo_dwords + keyval_size
	  //
	  // The fill computed below spans `push.passes` histograms plus
	  // `scatter_blocks - 1` partitions of RS_RADIX_SIZE dwords each.
	  //
	  // NOTE(allanmac): The `.block_offset` and `.dword_offset_min`
	  // parameters are zeroes because the host can offset the buffer
	  // device address since the number of passes is known by the host.
	  // If we ever wanted to support an indirect number of "key" bits
	  // in the sort, then this would need to change.
	  //
	  // NOTE(allanmac): The `.workgroup_ids[]` are only used if
	  // nonsequential dispatch isn't supported by the device.
	  //
	  // NOTE(review): the extra workgroup id dwords are added when
	  // RS_SCATTER_NONSEQUENTIAL_DISPATCH *is* defined -- confirm the
	  // macro's polarity against the note above.
	  //
	  rs_indirect_info_fill zero;

	  // clang-format off
	  zero.dword_offset_max_minus_min = (push.passes + scatter_blocks - 1) * RS_RADIX_SIZE;
	  zero.dword_offset_min           = 0;
	  zero.block_offset               = 0;
	  // clang-format on

	#ifdef RS_SCATTER_NONSEQUENTIAL_DISPATCH
	  zero.dword_offset_max_minus_min += (RS_KEYVAL_DWORDS * 4);  // one pass per byte
	#endif

	  const uint32_t zero_blocks = RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min,  //
	                                                  RS_FILL_BLOCK_DWORDS);

	  rs_indirect_info.info.zero          = zero;
	  rs_indirect_info.info.dispatch.zero = u32vec4(zero_blocks, 1, 1, 0);
	}

	//
	//
	//