/*
* Copyright © 2022 Bas Nieuwenhuizen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
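/* PLOC (Parallel Locally-Ordered Clustering) BVH builder: repeatedly merges
 * mutually nearest neighbours among the sorted leaves until a single root
 * node remains. See Meister and Bittner, "Parallel Locally-Ordered
 * Clustering for Bounding Volume Hierarchy Construction" (2018).
 */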
#version 460
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require
layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in;
#define USE_GLOBAL_SYNC
#include "vk_build_interface.h"
TYPE(ploc_prefix_scan_partition, 4);
layout(push_constant) uniform CONSTS
{
ploc_args args;
};
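/* Shared state for the workgroup-local part of the prefix scan:
 * exclusive_prefix_sum accumulates the look-back result from previous
 * workgroups, while aggregate_sums/aggregate_sums2 hold per-subgroup counts
 * that are scanned to produce per-subgroup base offsets.
 */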
shared uint32_t exclusive_prefix_sum;
shared uint32_t aggregate_sums[PLOC_SUBGROUPS_PER_WORKGROUP];
shared uint32_t aggregate_sums2[PLOC_SUBGROUPS_PER_WORKGROUP];
/*
 * Global prefix scan over all workgroups to find the index at which each
 * workgroup writes its surviving (collapsed) node ids. This is the decoupled
 * look-back scheme from
 * https://research.nvidia.com/sites/default/files/publications/nvr-2016-002.pdf:
 * each partition publishes its local aggregate, then walks backwards over
 * previous partitions, summing their aggregates until it reaches one whose
 * inclusive sum is already known. One partition = one workgroup in this case.
 */
uint32_t
prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index)
{
if (gl_LocalInvocationIndex == 0) {
exclusive_prefix_sum = 0;
if (task_index >= gl_WorkGroupSize.x) {
REF(ploc_prefix_scan_partition) current_partition =
REF(ploc_prefix_scan_partition)(INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
REF(ploc_prefix_scan_partition) previous_partition = current_partition - 1;
while (true) {
/* See if the previous workgroup has already set its inclusive sum */
if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice,
gl_StorageSemanticsBuffer,
gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) {
atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).inclusive_sum);
break;
} else {
atomicAdd(exclusive_prefix_sum, DEREF(previous_partition).aggregate);
previous_partition -= 1;
}
}
/* Set the inclusive sum for the next workgroups */
atomicStore(DEREF(current_partition).inclusive_sum,
DEREF(current_partition).aggregate + exclusive_prefix_sum, gl_ScopeDevice,
gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable);
}
}
if (subgroupElect())
aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot);
barrier();
if (PLOC_SUBGROUPS_PER_WORKGROUP <= SUBGROUP_SIZE) {
if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) {
aggregate_sums[gl_LocalInvocationID.x] =
exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]);
}
} else {
/* If the length of aggregate_sums[] is larger than SUBGROUP_SIZE,
* the prefix scan can't be done simply by subgroupExclusiveAdd.
*/
if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP)
aggregate_sums2[gl_LocalInvocationID.x] = aggregate_sums[gl_LocalInvocationID.x];
barrier();
/* Hillis-Steele inclusive scan on aggregate_sums2 */
for (uint32_t stride = 1; stride < PLOC_SUBGROUPS_PER_WORKGROUP; stride *= 2) {
uint32_t value = 0;
if (gl_LocalInvocationID.x >= stride && gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP)
value = aggregate_sums2[gl_LocalInvocationID.x - stride];
barrier();
if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP)
aggregate_sums2[gl_LocalInvocationID.x] += value;
barrier();
}
/* Convert the inclusive scan to an exclusive one and add the prefix sum from previous workgroups */
if (gl_LocalInvocationID.x < PLOC_SUBGROUPS_PER_WORKGROUP) {
if (gl_LocalInvocationID.x == 0)
aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum;
else
aggregate_sums[gl_LocalInvocationID.x] = exclusive_prefix_sum + aggregate_sums2[gl_LocalInvocationID.x - 1];
}
}
barrier();
return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot);
}
/* Relative cost of increasing the BVH depth. Deep BVHs will require more backtracking. */
#define BVH_LEVEL_COST 0.2
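/* Allocate an internal (box) node, write both children along with their
 * combined bounds and flags, and return the packed ir node id.
 */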
uint32_t
push_node(uint32_t children[2], vk_aabb bounds[2])
{
uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node);
uint32_t dst_id = pack_ir_node_id(dst_offset, vk_ir_node_internal);
REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset));
vk_aabb total_bounds;
total_bounds.min = vec3(INFINITY);
total_bounds.max = vec3(-INFINITY);
uint32_t dst_flags = VK_BVH_BOX_FLAG_ONLY_OPAQUE | VK_BVH_BOX_FLAG_NO_OPAQUE;
for (uint i = 0; i < 2; ++i) {
total_bounds.min = min(total_bounds.min, bounds[i].min);
total_bounds.max = max(total_bounds.max, bounds[i].max);
DEREF(dst_node).children[i] = children[i];
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
dst_flags &= fetch_child_flags(args.bvh, children[i]);
}
DEREF(dst_node).base.aabb = total_bounds;
DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
DEREF(dst_node).flags = dst_flags;
return dst_id;
}
#define PLOC_NEIGHBOURHOOD 16
#define PLOC_OFFSET_MASK ((1 << 5) - 1)
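/* Nearest-neighbour candidates are encoded so that a single atomicMin
 * resolves both the SAH comparison and the tie-break: the low 5 bits
 * (PLOC_OFFSET_MASK) of the SAH's float bit pattern are replaced with the
 * biased relative offset of the candidate. Since non-negative floats compare
 * identically as uints, the minimum encoded value belongs to the candidate
 * with the lowest SAH (quantized to 32 ULPs), with ties broken towards the
 * smallest encoded offset. With PLOC_NEIGHBOURHOOD = 16, offset -16 encodes
 * to 0, -1 to 15, +1 to 16 and +16 to 31, so all 32 valid offsets fit in
 * the mask.
 */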
uint32_t
encode_neighbour_offset(float sah, uint32_t i, uint32_t j)
{
int32_t offset = int32_t(j) - int32_t(i);
uint32_t encoded_offset = offset + PLOC_NEIGHBOURHOOD - (offset > 0 ? 1 : 0);
return (floatBitsToUint(sah) & (~PLOC_OFFSET_MASK)) | encoded_offset;
}
int32_t
decode_neighbour_offset(uint32_t encoded_offset)
{
int32_t offset = int32_t(encoded_offset & PLOC_OFFSET_MASK) - PLOC_NEIGHBOURHOOD;
return offset + (offset >= 0 ? 1 : 0);
}
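/* Each workgroup processes PLOC_WORKGROUP_SIZE tasks but also needs the
 * bounds of up to 2 * PLOC_NEIGHBOURHOOD nodes on either side of its range,
 * so the LDS arrays cover the workgroup plus that overlap on both ends.
 */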
#define NUM_PLOC_LDS_ITEMS (PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD)
shared vk_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];
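/* In iteration 0 the input is the key/id pair array produced by sorting;
 * later iterations read the plain id array written by the previous
 * compaction pass.
 */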
uint32_t
load_id(VOID_REF ids, uint32_t iter, uint32_t index)
{
if (iter == 0)
return DEREF(REF(key_id_pair)(INDEX(key_id_pair, ids, index))).id;
else
return DEREF(REF(uint32_t)(INDEX(uint32_t, ids, index)));
}
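/* Preload the AABBs of every node in this workgroup's search window into
 * shared memory, so the O(n * PLOC_NEIGHBOURHOOD) SAH evaluations below read
 * from LDS instead of VRAM.
 */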
void
load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
uint32_t neighbourhood_overlap, uint32_t search_bound)
{
for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound;
i += gl_WorkGroupSize.x) {
uint32_t id = load_id(ids, iter, i);
if (id == VK_BVH_INVALID_NODE)
continue;
VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id));
REF(vk_ir_node) node = REF(vk_ir_node)(addr);
shared_bounds[i - lds_base] = DEREF(node).aabb;
}
}
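/* PLOC distance metric: the surface area of the AABB enclosing both nodes,
 * i.e. the SAH cost of merging i and j under one parent.
 */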
float
combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
{
vk_aabb combined_bounds;
combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
return aabb_surface_area(combined_bounds);
}
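/* Number of nodes this workgroup still has after merging; published as this
 * partition's aggregate for the compaction prefix scan in the next phase.
 */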
shared uint32_t shared_aggregate_sum;
void
main(void)
{
VOID_REF src_ids = args.ids_0;
VOID_REF dst_ids = args.ids_1;
/* We try to use LBVH for BVHs where we know there will be fewer than 5
 * leaves, but leaves can still turn out to be inactive, so PLOC may see 2 or
 * fewer active leaves anyway. Emit the root node directly in that case. */
if (DEREF(args.header).active_leaf_count <= 2) {
if (gl_GlobalInvocationID.x == 0) {
uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(vk_ir_box_node);
REF(vk_ir_box_node) dst_node = REF(vk_ir_box_node)(OFFSET(args.bvh, dst_offset));
vk_aabb total_bounds;
total_bounds.min = vec3(INFINITY);
total_bounds.max = vec3(-INFINITY);
uint32_t i = 0;
for (; i < DEREF(args.header).active_leaf_count; i++) {
uint32_t child_id = DEREF(INDEX(key_id_pair, src_ids, i)).id;
if (child_id != VK_BVH_INVALID_NODE) {
VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(child_id));
REF(vk_ir_node) child = REF(vk_ir_node)(node);
vk_aabb bounds = DEREF(child).aabb;
total_bounds.min = min(total_bounds.min, bounds.min);
total_bounds.max = max(total_bounds.max, bounds.max);
}
DEREF(dst_node).children[i] = child_id;
}
for (; i < 2; i++)
DEREF(dst_node).children[i] = VK_BVH_INVALID_NODE;
DEREF(dst_node).base.aabb = total_bounds;
DEREF(dst_node).bvh_offset = VK_UNKNOWN_BVH_OFFSET;
}
return;
}
/* Only initialize sync_data once per workgroup. For intra-workgroup synchronization,
* fetch_task contains a workgroup-scoped control+memory barrier.
*/
if (gl_LocalInvocationIndex == 0) {
atomicCompSwap(DEREF(args.header).sync_data.task_counts[0], 0xFFFFFFFF,
DEREF(args.header).active_leaf_count);
atomicCompSwap(DEREF(args.header).sync_data.current_phase_end_counter, 0xFFFFFFFF,
DIV_ROUND_UP(DEREF(args.header).active_leaf_count, gl_WorkGroupSize.x));
}
REF(ploc_prefix_scan_partition)
partitions = REF(ploc_prefix_scan_partition)(args.prefix_scan_partitions);
uint32_t task_index = fetch_task(args.header, false);
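/* Every iteration runs two globally synchronized phases (PHASE and
 * fetch_task come from the vk_build_interface.h machinery): first, mutual
 * nearest neighbours are found and merged from src_ids into dst_ids; then
 * the surviving ids are compacted back into src_ids using the global prefix
 * scan. The loop terminates once a single node remains (see
 * next_phase_exit_flag below) and fetch_task returns TASK_INDEX_INVALID.
 */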
for (uint iter = 0;; ++iter) {
if (task_index == TASK_INDEX_INVALID)
break;
/* Find preferred partners and merge them */
PHASE (args.header) {
uint32_t current_task_count = task_count(args.header);
uint32_t base_index = task_index - gl_LocalInvocationID.x;
uint32_t neighbourhood_overlap = min(PLOC_NEIGHBOURHOOD, base_index);
uint32_t double_neighbourhood_overlap = min(2 * PLOC_NEIGHBOURHOOD, base_index);
/* Upper bound (exclusive) for where valid nearest-node indices are written. */
uint32_t write_bound =
min(current_task_count, base_index + gl_WorkGroupSize.x + PLOC_NEIGHBOURHOOD);
/* Upper bound (exclusive) for where valid nearest-node indices are searched. */
uint32_t search_bound =
min(current_task_count, base_index + gl_WorkGroupSize.x + 2 * PLOC_NEIGHBOURHOOD);
uint32_t lds_base = base_index - double_neighbourhood_overlap;
load_bounds(src_ids, iter, task_index, lds_base, neighbourhood_overlap, search_bound);
for (uint32_t i = gl_LocalInvocationID.x; i < NUM_PLOC_LDS_ITEMS; i += gl_WorkGroupSize.x)
nearest_neighbour_indices[i] = 0xFFFFFFFF;
barrier();
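/* Each node i only searches the neighbourhood to its right: the SAH of every
 * pair (i, j) with j > i is computed once and atomicMin'ed into both nodes'
 * candidate entries, halving the number of surface-area evaluations compared
 * to a full two-sided search.
 */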
for (uint32_t i = task_index - double_neighbourhood_overlap; i < write_bound;
i += gl_WorkGroupSize.x) {
uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
uint32_t fallback_pair = i == 0 ? (i + 1) : (i - 1);
uint32_t min_offset = encode_neighbour_offset(INFINITY, i, fallback_pair);
for (uint32_t j = max(i + 1, base_index - neighbourhood_overlap); j <= i + right_bound;
++j) {
float sah = combined_node_cost(lds_base, i, j);
uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j);
uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i);
min_offset = min(min_offset, i_encoded_offset);
atomicMin(nearest_neighbour_indices[j - lds_base], j_encoded_offset);
}
if (i >= base_index - neighbourhood_overlap)
atomicMin(nearest_neighbour_indices[i - lds_base], min_offset);
}
if (gl_LocalInvocationID.x == 0)
shared_aggregate_sum = 0;
barrier();
for (uint32_t i = task_index - neighbourhood_overlap; i < write_bound;
i += gl_WorkGroupSize.x) {
uint32_t left_bound = min(i, PLOC_NEIGHBOURHOOD);
uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
/*
 * Workaround for a worst-case scenario in PLOC: If the combined surface area
 * of all node pairs in the neighbourhood is equal, every node chooses its
 * first neighbour as the nearest one. In that case, no nodes except the
 * first two find each other as mutual nearest neighbours, so only one merge
 * happens per iteration and each BVH level ends up containing a single
 * internal node. By first testing whether the immediate neighbour on one
 * side is (tied for) the nearest, all immediate neighbour pairs get merged
 * on every step.
 */
uint32_t preferred_pair;
if ((i & 1) != 0)
preferred_pair = i - min(left_bound, 1);
else
preferred_pair = i + min(right_bound, 1);
if (preferred_pair != i) {
uint32_t encoded_min_sah =
nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK);
float sah = combined_node_cost(lds_base, i, preferred_pair);
uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK);
uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair);
if (encoded_sah <= encoded_min_sah) {
nearest_neighbour_indices[i - lds_base] = encoded_offset;
}
}
}
barrier();
bool has_valid_node = true;
if (task_index < current_task_count) {
uint32_t neighbour_index =
task_index +
decode_neighbour_offset(nearest_neighbour_indices[task_index - lds_base]);
uint32_t other_neighbour_index =
neighbour_index +
decode_neighbour_offset(nearest_neighbour_indices[neighbour_index - lds_base]);
uint32_t id = load_id(src_ids, iter, task_index);
if (other_neighbour_index == task_index) {
if (task_index < neighbour_index) {
uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index);
uint32_t children[2] = {id, neighbour_id};
vk_aabb bounds[2] = {shared_bounds[task_index - lds_base], shared_bounds[neighbour_index - lds_base]};
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children, bounds);
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) =
VK_BVH_INVALID_NODE;
} else {
/* The partner invocation with the lower index stores the invalid id into our
 * dst slot; storing our own id here as well would race with that store and
 * could leave behind a stale id for a node that has already been merged. */
has_valid_node = false;
}
} else {
DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = id;
}
/* Compact - prepare prefix scan */
uvec4 ballot = subgroupBallot(has_valid_node);
uint32_t aggregate_sum = subgroupBallotBitCount(ballot);
if (subgroupElect())
atomicAdd(shared_aggregate_sum, aggregate_sum);
}
barrier();
/*
* The paper proposes initializing all partitions to an invalid state
* and only computing aggregates afterwards. We skip that step and
* initialize the partitions to a valid state. This also simplifies
* the look-back, as there will never be any blocking due to invalid
* partitions.
*/
if (gl_LocalInvocationIndex == 0) {
REF(ploc_prefix_scan_partition)
current_partition = REF(ploc_prefix_scan_partition)(
INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
DEREF(current_partition).aggregate = shared_aggregate_sum;
if (task_index < gl_WorkGroupSize.x) {
DEREF(current_partition).inclusive_sum = shared_aggregate_sum;
} else {
DEREF(current_partition).inclusive_sum = 0xFFFFFFFF;
}
}
if (task_index == 0)
set_next_task_count(args.header, task_count(args.header));
}
/* Compact - prefix scan and update */
PHASE (args.header) {
uint32_t current_task_count = task_count(args.header);
uint32_t id = task_index < current_task_count
? DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index)))
: VK_BVH_INVALID_NODE;
uvec4 ballot = subgroupBallot(id != VK_BVH_INVALID_NODE);
uint32_t new_offset = prefix_scan(ballot, partitions, task_index);
if (task_index >= current_task_count)
continue;
if (id != VK_BVH_INVALID_NODE) {
DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id;
++new_offset;
}
if (task_index == current_task_count - 1) {
set_next_task_count(args.header, new_offset);
if (new_offset == 1)
DEREF(args.header).sync_data.next_phase_exit_flag = 1;
}
}
}
}