blob: 28836ee30fd85f9138fef8aab42d4521ef7f56fd [file]
/*
* Copyright © 2025 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#version 460
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_clustered : require
/* 64 invocations per workgroup. main() splits invocations into clusters of 8, so one
 * workgroup covers 8 clusters. */
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#include "build_interface.h"
#include "invocation_cluster.h"
#include "update.h"
/* Push constants of the update dispatch. main() reads the src/dst references,
 * leaf_node_count, geom_data, bounds and internal_ready_count from args. */
layout(push_constant) uniform CONSTS
{
update_gfx12_args args;
};
/*
 * Reads a bit field from a buffer that is addressed as an array of 32-bit words.
 *
 * data:  base reference of the buffer.
 * start: index of the first (least significant) bit of the field.
 * count: width of the field in bits; the field may straddle two consecutive words.
 *
 * Returns the field in the low `count` bits of the result.
 */
uint32_t
read_bits(VOID_REF data, uint32_t start, uint32_t count)
{
   uint32_t word_index = start / 32;
   uint32_t bit_offset = start % 32;

   /* Bits of the field that live in the first word. */
   uint32_t bits = DEREF(INDEX(uint32_t, data, word_index)) >> bit_offset;

   /* Pull in the remaining bits from the following word. The bit_offset != 0 check keeps
    * the left shift amount below 32. */
   bool spans_two_words = bit_offset != 0 && bit_offset + count > 32;
   if (spans_two_words)
      bits |= DEREF(INDEX(uint32_t, data, word_index + 1)) << (32 - bit_offset);

   /* A full-width field needs no masking (and 1u << 32 must be avoided). */
   if (count == 32)
      return bits;

   return bits & ((1u << count) - 1u);
}
/*
 * Shader entry point. Each cluster of 8 invocations handles one internal box node, with
 * one invocation per child slot:
 *   1. wait until every internal child of the node has been flagged in
 *      args.internal_ready_count,
 *   2. obtain the bounds of each child (leaf children are rebuilt from geometry data,
 *      internal children are read from args.bounds),
 *   3. reduce the child bounds to the node bounds and publish them for the parent,
 *   4. write the node header and the re-quantized children to the destination BVH.
 */
void
main()
{
uint32_t bvh_offset = DEREF(args.src).bvh_offset;
/* Source and destination use the same bvh_offset, read from the source header. */
VOID_REF src_bvh = OFFSET(args.src, bvh_offset);
VOID_REF dst_bvh = OFFSET(args.dst, bvh_offset);
uint32_t leaf_node_size = SIZEOF(radv_gfx12_primitive_node);
/* Layout assumed by the offsets below: root box node, then leaf_node_count leaf nodes,
 * then the remaining internal box nodes. */
uint32_t first_leaf_offset = id_to_offset(RADV_BVH_ROOT_NODE) + SIZEOF(radv_gfx12_box_node);
uint32_t internal_nodes_offset = first_leaf_offset + args.leaf_node_count * leaf_node_size;
/* 8 invocations per node. The node index decreases as the invocation ID increases, so
 * the cluster with the highest ID handles node 0 (the root). */
uint32_t node_count = DEREF(args.src).update_dispatch_size[0] / 8;
uint32_t node_index = node_count - 1 - gl_GlobalInvocationID.x / 8;
bool is_root_node = node_index == 0;
/* Each invocation cluster updates one internal node. */
radv_invocation_cluster cluster;
radv_invocation_cluster_init(cluster, 8);
/* Non-root node N lives at slot N - 1 of the internal node array. For the root the
 * expression wraps around, but the result is replaced right below. */
uint32_t node_offset = internal_nodes_offset + SIZEOF(radv_gfx12_box_node) * (node_index - 1);
if (is_root_node)
node_offset = id_to_offset(RADV_BVH_ROOT_NODE);
radv_gfx12_box_node src_node = DEREF(REF(radv_gfx12_box_node) OFFSET(src_bvh, node_offset));
REF(radv_gfx12_box_node) dst_node = REF(radv_gfx12_box_node) OFFSET(dst_bvh, node_offset);
/* The top 4 bits of child_count_exponents hold the valid child count minus one. */
uint32_t valid_child_count_minus_one = src_node.child_count_exponents >> 28;
/* Each invocation of the cluster looks at one child slot; the child type is stored in
 * bits 24..27 of dword2. */
radv_gfx12_box_child child = src_node.children[cluster.invocation_index];
uint32_t child_type = (child.dword2 >> 24) & 0xf;
/* NOTE(review): the name implies that unused child slots also carry the triangle type
 * (the null child written below has a type field of 0) - confirm against the encoder. */
bool is_leaf_or_invalid = child_type == radv_bvh_node_triangle;
/* A count field of 0xf is treated as "no valid children at all". */
bool is_valid = cluster.invocation_index <= valid_child_count_minus_one && valid_child_count_minus_one != 0xf;
bool is_leaf = is_leaf_or_invalid && is_valid;
/* Locate the child node: base offset of the matching node kind plus the number of
 * lower-indexed invocations reported by the ballot. The ballot is issued inside the
 * divergent branch, so presumably it only counts invocations that took the same branch,
 * i.e. children of the same kind - TODO confirm against radv_ballot(). */
uint32_t child_offset;
if (is_leaf_or_invalid) {
child_offset = id_to_offset(src_node.primitive_base_id);
uint32_t child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
child_offset += leaf_node_size * child_index;
} else {
child_offset = id_to_offset(src_node.internal_base_id);
uint32_t child_index = bitCount(radv_ballot(cluster, true) & ((1u << cluster.invocation_index) - 1));
child_offset += SIZEOF(radv_gfx12_box_node) * child_index;
}
/* Slot of the child in the internal node array; it indexes args.internal_ready_count and
 * args.bounds. The value is not used for leaf/invalid children. */
uint32_t child_index = (child_offset - internal_nodes_offset) / SIZEOF(radv_gfx12_box_node);
/* Leaf and invalid children never have to be waited for. */
bool is_ready = is_leaf_or_invalid;
while (true) {
/* Poll the ready flag of the internal child until it becomes non-zero. */
if (!is_ready)
is_ready = DEREF(INDEX(uint32_t, args.internal_ready_count, child_index)) != 0;
/* Keep spinning until all 8 invocations of the cluster report ready. */
if (radv_ballot(cluster, is_ready) != 0xff)
continue;
/* Start from an empty box so invalid children do not affect the min/max reduction. */
vk_aabb bounds;
bounds.min = vec3(INFINITY);
bounds.max = vec3(-INFINITY);
if (is_leaf) {
VOID_REF src_leaf_node = OFFSET(src_bvh, child_offset);
/* A 10-bit field at bit 42 of the source leaf gives the bit position that separates
 * the two indices: the geometry index occupies the 28 bits ending there and the
 * primitive index the 28 bits starting there. */
uint32_t indices_midpoint = read_bits(src_leaf_node, 42, 10);
uint32_t geometry_index = read_bits(src_leaf_node, indices_midpoint - 28, 28);
uint32_t primitive_index = read_bits(src_leaf_node, indices_midpoint, 28);
vk_bvh_geometry_data geom_data = DEREF(INDEX(vk_bvh_geometry_data, args.geom_data, geometry_index));
VOID_REF dst_leaf_node = OFFSET(dst_bvh, child_offset);
/* Presumably the build helpers write the leaf to dst_leaf_node and return its bounds
 * through the first argument - confirm in update.h / the build helpers. */
if (geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
radv_build_triangle(bounds, dst_leaf_node, geom_data, primitive_index, true);
} else {
VOID_REF src_ptr = OFFSET(geom_data.data, primitive_index * geom_data.stride);
radv_build_aabb(bounds, src_ptr, dst_leaf_node, geometry_index, primitive_index, true);
}
} else if (is_valid) {
/* Internal child: use the bounds published by the cluster that processed it. */
bounds = DEREF(INDEX(vk_aabb, args.bounds, child_index));
}
/* Reduce the child bounds across the 8 invocations to get the bounds of this node. */
vk_aabb total_bounds;
total_bounds.min.x = subgroupClusteredMin(bounds.min.x, 8);
total_bounds.min.y = subgroupClusteredMin(bounds.min.y, 8);
total_bounds.min.z = subgroupClusteredMin(bounds.min.z, 8);
total_bounds.max.x = subgroupClusteredMax(bounds.max.x, 8);
total_bounds.max.y = subgroupClusteredMax(bounds.max.y, 8);
total_bounds.max.z = subgroupClusteredMax(bounds.max.z, 8);
if (!is_root_node) {
/* Publish the bounds for the parent. The barrier sits between the bounds store and
 * the ready flag store so that the flag is only set after the bounds were written. */
DEREF(INDEX(vk_aabb, args.bounds, node_index - 1)) = total_bounds;
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
DEREF(INDEX(uint32_t, args.internal_ready_count, node_index - 1)) = 1;
}
vec3 origin = total_bounds.min;
vec3 extent = total_bounds.max - total_bounds.min;
/* Adding all mantissa bits and keeping only the exponent bits rounds every component up
 * to a power of two (an exact power of two or 0 is left unchanged). */
extent = uintBitsToFloat((floatBitsToUint(extent) + uvec3(0x7fffff)) & 0x7f800000);
/* Biased float exponents of the rounded extent; stored in the node header below. */
uvec3 extent_exponents = floatBitsToUint(extent) >> 23;
/* The node header is written once per cluster, by invocation 0. */
if (cluster.invocation_index == 0) {
/* When not updating in place the destination is a separate node, so the fields that
 * do not change during an update have to be written as well. */
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE)) {
DEREF(dst_node).primitive_base_id = src_node.primitive_base_id;
DEREF(dst_node).internal_base_id = src_node.internal_base_id;
}
DEREF(dst_node).origin = origin;
DEREF(dst_node).child_count_exponents = extent_exponents.x | (extent_exponents.y << 8) |
(extent_exponents.z << 16) | (valid_child_count_minus_one << 28);
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE))
DEREF(dst_node).obb_matrix_index = 0x7f;
}
if (is_valid) {
/* Quantize the child bounds to 12 bits per component relative to origin/extent. The
 * minimum is rounded down (floor) and the maximum is rounded up (ceil) and stored
 * minus one; if the ceil result is 0 the unsigned subtraction wraps and min() clamps
 * it to 0xfff. The top 8 bits of every dword are carried over from the source child.
 * Packing: dword0 = min.x | min.y << 12, dword1 = min.z | max.x << 12,
 * dword2 = max.y | max.z << 12.
 * NOTE(review): an extent component of 0 makes these divisions divide by zero -
 * confirm that the converted result is acceptable in that case. */
radv_gfx12_box_child box_child;
box_child.dword0 = (child.dword0 & 0xFF000000) |
min(uint32_t(floor((bounds.min.x - origin.x) / extent.x * float(0x1000))), 0xfff) |
(min(uint32_t(floor((bounds.min.y - origin.y) / extent.y * float(0x1000))), 0xfff) << 12);
box_child.dword1 =
(child.dword1 & 0xFF000000) |
min(uint32_t(floor((bounds.min.z - origin.z) / extent.z * float(0x1000))), 0xfff) |
(min(uint32_t(ceil((bounds.max.x - origin.x) / extent.x * float(0x1000))) - 1, 0xfff) << 12);
box_child.dword2 =
(child.dword2 & 0xFF000000) |
min(uint32_t(ceil((bounds.max.y - origin.y) / extent.y * float(0x1000))) - 1, 0xfff) |
(min(uint32_t(ceil((bounds.max.z - origin.z) / extent.z * float(0x1000))) - 1, 0xfff) << 12);
DEREF(dst_node).children[cluster.invocation_index] = box_child;
}
/* When writing to a separate destination, unused child slots get a fixed null pattern. */
if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE) && !is_valid) {
radv_gfx12_box_child null_child;
null_child.dword0 = 0xffffffff;
null_child.dword1 = 0xfff;
null_child.dword2 = 0;
DEREF(dst_node).children[cluster.invocation_index] = null_child;
}
/* The root bounds are also stored in the destination header; every invocation of the
 * root cluster writes the same value. */
if (is_root_node)
DEREF(args.dst).aabb = total_bounds;
/* Make changes to internal_ready_count available to the other invocations. */
memoryBarrier(gl_ScopeDevice, gl_StorageSemanticsBuffer,
gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
break;
}
}