blob: e0df801134bc562faf323850355ee40cc529c387 [file] [log] [blame]
/*
* Copyright © 2024 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
/* For each output slot, gather which input components are used to compute it.
* Component-wise ALU instructions must be scalar.
*/
#include "nir_builder.h"
#include "util/hash_table.h"
#include "util/u_dynarray.h"
#include "util/u_memory.h"
/* OR the dependency bitset `src` into `dst` word by word. */
static void
accum_deps(BITSET_WORD *dst, BITSET_WORD *src, unsigned num_bitset_words)
{
   for (unsigned w = 0; w < num_bitset_words; w++)
      dst[w] |= src[w];
}
/* Context passed to accum_src_deps() via nir_foreach_src(). */
typedef struct {
   /* Per-instruction dependency bitsets, indexed by nir_instr::index. */
   BITSET_WORD **instr_deps;
   /* NOTE(review): always NULL at every call site visible in this file —
    * confirm whether this scratch field is used elsewhere.
    */
   BITSET_WORD *tmp;
   /* Size of each dependency bitset, in BITSET_WORDs. */
   unsigned num_bitset_words;
} foreach_src_data;
/* nir_foreach_src() callback: OR the dependencies of a source's producing
 * instruction into the dependencies of the consuming instruction.
 * Always returns true so the source walk continues.
 */
static bool
accum_src_deps(nir_src *src, void *opaque)
{
   foreach_src_data *data = (foreach_src_data *)opaque;
   nir_instr *producer = src->ssa->parent_instr;

   /* Constants and undefs contribute no dependencies of their own. */
   bool skip = producer->type == nir_instr_type_load_const ||
               producer->type == nir_instr_type_undef;

   if (!skip) {
      nir_instr *consumer = nir_src_parent_instr(src);

      accum_deps(data->instr_deps[consumer->index],
                 data->instr_deps[producer->index], data->num_bitset_words);
   }
   return true;
}
/* One entry of the loop stack maintained while walking the CFG. */
typedef struct {
   nir_block *start_block; /* the first block of the loop */
   nir_block *exit_block; /* the first block after the loop */
   /* Set when a loop-header phi gained new dependencies during the current
    * walk; forces the loop body to be walked again to reach a fixed point.
    */
   bool header_phi_changed;
} loop_entry;
/* Return the innermost loop currently on the loop stack.
 * Must only be called while walking inside at least one loop.
 */
static loop_entry *
get_current_loop(struct util_dynarray *loop_stack)
{
   assert(util_dynarray_num_elements(loop_stack, loop_entry) > 0);
   return util_dynarray_last_ptr(loop_stack, loop_entry);
}
/* For each output slot, gather which instructions are used to compute it.
* The result is that each output slot will have the list of all instructions
* that must execute to compute that output.
*
* If there are memory operations that affect other memory operations, those
* dependencies are not gathered.
*
* Required:
* - The shader must be in LCSSA form.
*
* Recommended:
* - IO intrinsics and component-wise ALU instructions should be scalar, and
* vecN opcodes should have their components copy-propagated. If not,
* the results will have false dependencies.
*
* Algorithm:
* - For each instruction, compute a bitset of instruction indices whose
* results are needed to compute the result of the instruction. The final
* bitset is the instruction index OR'd with bitsets of all its sources and
* also all if-conditions used to enter the block, recursively.
* - Since every instruction inherits instruction bitsets from its sources,
* every instruction contains the list of all instructions that must execute
* before the instruction can execute.
* - At the end, output stores contain the list of instructions that must
* execute to compute their results. This may be any subset of instructions
* from the shader, including all instructions.
*
* Control flow notes:
* - There is a stack of "if" conditions for entered ifs.
* - The dependencies of instructions are the union of dependencies of all
* their sources and all if conditions on the if-condition stack.
* - For each continue, all loop-header phis receive the dependencies of all
* if-conditions on the if-condition stack at the continue.
* - For each break, all loop-exit phis receive the dependencies of all
* if-conditions on the if-condition stack at the break.
* - If there is any change to loop-header phis while iterating over a loop,
* we iterate over the loop again after the current iteration is finished.
*/
void
nir_gather_output_dependencies(nir_shader *nir, nir_output_deps *deps)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   /* Instruction indices are used as bit positions in the bitsets below. */
   nir_metadata_require(impl, nir_metadata_instr_index);
   unsigned num_instr = nir_impl_last_block(impl)->end_ip;

   /* Allocate bitsets of instruction->instruction dependencies.
    * instr_deps doubles as the ralloc memory context for everything below,
    * so the single ralloc_free() at the end releases it all.
    */
   unsigned num_bitset_words = BITSET_WORDS(num_instr);
   BITSET_WORD **instr_deps = rzalloc_array(NULL, BITSET_WORD*, num_instr);
   void *mem_ctx = instr_deps;

   for (unsigned i = 0; i < num_instr; i++)
      instr_deps[i] = rzalloc_array(mem_ctx, BITSET_WORD, num_bitset_words);

   /* Allocate bitsets of instruction->output dependencies.
    * Entries start out NULL (rzalloc) and are allocated lazily when a store
    * to the corresponding slot is first seen.
    */
   BITSET_WORD **out_deps = rzalloc_array(mem_ctx, BITSET_WORD*,
                                          NUM_TOTAL_VARYING_SLOTS);

   /* Allocate stacks. */
   struct util_dynarray loop_stack, if_cond_stack;
   util_dynarray_init(&loop_stack, mem_ctx);
   util_dynarray_init(&if_cond_stack, mem_ctx);

   /* Gather dependencies of every instruction.
    * Dependencies of each instruction are OR'd dependencies of its sources and
    * control flow conditions.
    */
   nir_foreach_block(block, impl) {
      nir_cf_node *parent_cf = block->cf_node.parent;
      bool is_loop_first_block = parent_cf->type == nir_cf_node_loop &&
                                 block == nir_cf_node_cf_tree_first(parent_cf);

      /* Entering a loop: push it onto the loop stack. */
      if (is_loop_first_block) {
         loop_entry loop = {
            .start_block = block,
            .exit_block = nir_cf_node_cf_tree_next(parent_cf),
         };
         util_dynarray_append(&loop_stack, loop_entry, loop);
      }

      /* Entering an if (its first then-block): push the condition. */
      if (parent_cf->type == nir_cf_node_if &&
          block == nir_if_first_then_block(nir_cf_node_as_if(parent_cf))) {
         util_dynarray_append(&if_cond_stack, nir_def *,
                              nir_cf_node_as_if(parent_cf)->condition.ssa);
      }

   loop_again:
      nir_foreach_instr(instr, block) {
         /* Add self as a dependency. */
         BITSET_WORD *this_instr_deps = instr_deps[instr->index];
         BITSET_SET(this_instr_deps, instr->index);

         /* Add sources as dependencies. */
         nir_foreach_src(instr, accum_src_deps,
                         &(foreach_src_data){instr_deps, NULL, num_bitset_words});

         /* Add parent if-conditions as dependencies.
          *
          * Note that phis with sources inside conditional blocks don't need
          * this because the phi sources already contain if-conditions.
          */
         util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
            accum_deps(this_instr_deps,
                       instr_deps[(*cond)->parent_instr->index],
                       num_bitset_words);
         }

         /* Gather the current instruction. */
         switch (instr->type) {
         case nir_instr_type_jump:
            switch (nir_instr_as_jump(instr)->type) {
            case nir_jump_continue:
            case nir_jump_break: {
               loop_entry *loop = get_current_loop(&loop_stack);

               /* Iterate over all loop-header phis (for continue) or all
                * loop-exit phis (for break).
                *
                * Assumption: Only the loop-start block can have loop-header
                * phis.
                */
               bool is_continue =
                  nir_instr_as_jump(instr)->type == nir_jump_continue;
               nir_block *iter_block =
                  is_continue ? loop->start_block : loop->exit_block;
               assert(iter_block);

               nir_foreach_phi(phi, iter_block) {
                  /* We need to track whether any header phi of the current
                   * loop has changed because we need to walk such loops
                   * again. Use the bitset bitcount to determine whether
                   * any instruction has been added to header phis as
                   * a dependency.
                   */
                  unsigned old_count = 0;
                  if (is_continue) {
                     old_count = __bitset_count(instr_deps[phi->instr.index],
                                                num_bitset_words);
                  }

                  /* Add dependencies of all if-conditions affecting the
                   * jump statement to phis at the loop header / exit.
                   */
                  util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
                     accum_deps(instr_deps[phi->instr.index],
                                instr_deps[(*cond)->parent_instr->index],
                                num_bitset_words);
                  }

                  if (is_continue &&
                      old_count != __bitset_count(instr_deps[phi->instr.index],
                                                  num_bitset_words))
                     loop->header_phi_changed = true;
               }
               break;
            }
            default:
               unreachable("unexpected jump type");
            }
            break;

         case nir_instr_type_intrinsic: {
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {
            case nir_intrinsic_store_output:
            case nir_intrinsic_store_per_vertex_output:
            case nir_intrinsic_store_per_primitive_output:
            case nir_intrinsic_store_per_view_output: {
               /* The write mask must be contiguous starting from x. */
               ASSERTED unsigned writemask = nir_intrinsic_write_mask(intr);
               assert(writemask == BITFIELD_MASK(util_bitcount(writemask)));

               nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
               assert(sem.num_slots >= 1);

               /* Record this store's accumulated dependencies in every
                * output slot the store covers.
                */
               for (unsigned i = 0; i < sem.num_slots; i++) {
                  unsigned slot = sem.location + i;

                  if (!out_deps[slot]) {
                     out_deps[slot] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                    num_bitset_words);
                  }
                  accum_deps(out_deps[slot], this_instr_deps, num_bitset_words);
               }
               break;
            }
            default:
               break;
            }
            break;
         }
         default:
            break;
         }
      }

      /* Leaving an if (its last else-block): patch up merge phis and pop
       * the condition.
       */
      if (parent_cf->type == nir_cf_node_if &&
          block == nir_if_last_else_block(nir_cf_node_as_if(parent_cf))) {
         /* Add the current if stack to the phis after the if node because
          * this can happen:
          *
          *    a = load_const true
          *    b = load_const false
          *    if (cond) {
          *    } else {
          *    }
          *    c = phi a, b
          *
          * c depends on cond, but doesn't use any defs from then/else blocks.
          */
         nir_foreach_phi(phi, nir_cf_node_cf_tree_next(parent_cf)) {
            util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
               accum_deps(instr_deps[phi->instr.index],
                          instr_deps[(*cond)->parent_instr->index],
                          num_bitset_words);
            }
         }

         assert(util_dynarray_num_elements(&if_cond_stack, nir_def *));
         (void)util_dynarray_pop_ptr(&if_cond_stack, nir_def *);
      }

      /* Leaving a loop (its last block): decide whether to walk it again. */
      if (parent_cf->type == nir_cf_node_loop &&
          block == nir_cf_node_cf_tree_last(parent_cf)) {
         assert(util_dynarray_num_elements(&loop_stack, loop_entry));
         loop_entry *loop = get_current_loop(&loop_stack);

         /* Check if any loop header phis would be changed by iterating over
          * the loop again.
          */
         nir_foreach_phi(phi, loop->start_block) {
            unsigned old_count = __bitset_count(instr_deps[phi->instr.index],
                                                num_bitset_words);

            nir_foreach_src(&phi->instr, accum_src_deps,
                            &(foreach_src_data){instr_deps, NULL, num_bitset_words});

            if (old_count != __bitset_count(instr_deps[phi->instr.index],
                                            num_bitset_words)) {
               loop->header_phi_changed = true;
               break;
            }
         }

         if (loop->header_phi_changed) {
            loop->header_phi_changed = false;

            /* Iterate over the loop again: */
            /* NOTE(review): is_loop_first_block is not read again after this
             * point, so this assignment looks redundant — confirm.
             */
            is_loop_first_block = true;

            /* NOTE(review): reassigning `block` presumably makes the
             * enclosing nir_foreach_block re-walk the rest of the loop body
             * after the goto re-runs the start block — confirm against the
             * nir_foreach_block definition.
             */
            block = loop->start_block;
            assert(block);
            goto loop_again;
         }

         (void)util_dynarray_pop_ptr(&loop_stack, loop_entry);
      }
   }

   /* Gather instructions that affect each output from bitsets.
    * Walking the blocks in order means instr_list ends up in program order.
    */
   memset(deps, 0, sizeof(*deps));

   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (!out_deps[i])
         continue;

      unsigned total = __bitset_count(out_deps[i], num_bitset_words);
      unsigned added = 0;

      deps->output[i].num_instr = total;
      /* NOTE(review): malloc result is unchecked before use — confirm the
       * project's OOM policy for this pass.
       */
      deps->output[i].instr_list = malloc(total * sizeof(nir_instr*));

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (BITSET_TEST(out_deps[i], instr->index)) {
               assert(added < total);
               deps->output[i].instr_list[added++] = instr;
            }
         }
      }
      assert(added == total);
   }

   ralloc_free(mem_ctx);
}
/* Free the per-output instruction lists allocated by
 * nir_gather_output_dependencies() and reset the entries so the struct can
 * be safely freed (or gathered into) again.
 */
void
nir_free_output_dependencies(nir_output_deps *deps)
{
   for (unsigned i = 0; i < ARRAY_SIZE(deps->output); i++) {
      /* A slot has an instruction list iff it has a nonzero count. */
      assert(!!deps->output[i].instr_list == !!deps->output[i].num_instr);

      /* free(NULL) is a no-op, so no guard is needed. Clearing the fields
       * afterwards defends against accidental double-free / stale use.
       */
      free(deps->output[i].instr_list);
      deps->output[i].instr_list = NULL;
      deps->output[i].num_instr = 0;
   }
}
/* Flatten an input load into a packed per-component index: each varying
 * slot occupies 8 entries (4 components x 2 16-bit halves).
 */
static unsigned
get_slot_index(nir_intrinsic_instr *intr, unsigned slot_offset)
{
   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
   unsigned slot = sem.location + slot_offset;
   unsigned component = nir_intrinsic_component(intr);

   return slot * 8 + component * 2 + sem.high_16bits;
}
/* For each output slot, gather which inputs are used to compute it.
* The shader must be in LCSSA form.
*
* If there are memory operations that affect other memory operations, those
* dependencies are not gathered.
*/
/* Convert the per-output instruction lists produced by
 * nir_gather_output_dependencies() into per-output bitmasks of input
 * components, plus flags for SSBO/image reads feeding each output.
 */
void
nir_gather_input_to_output_dependencies(nir_shader *nir,
                                        nir_input_to_output_deps *out_deps)
{
   nir_output_deps deps;
   nir_gather_output_dependencies(nir, &deps);

   memset(out_deps, 0, sizeof(*out_deps));

   for (unsigned out = 0; out < ARRAY_SIZE(deps.output); out++) {
      unsigned count = deps.output[out].num_instr;
      if (count == 0)
         continue;

      out_deps->output[out].defined = true;

      for (unsigned i = 0; i < count; i++) {
         nir_instr *instr = deps.output[out].instr_list[i];

         /* Any non-query texture fetch counts as an image read. */
         if (instr->type == nir_instr_type_tex &&
             !nir_tex_instr_is_query(nir_instr_as_tex(instr)))
            out_deps->output[out].uses_image_reads = true;

         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

         switch (intr->intrinsic) {
         case nir_intrinsic_load_input:
         case nir_intrinsic_load_input_vertex:
         case nir_intrinsic_load_per_vertex_input:
         case nir_intrinsic_load_per_primitive_input:
         case nir_intrinsic_load_interpolated_input: {
            nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
            assert(intr->def.num_components == 1);
            assert(sem.num_slots >= 1);

            /* Mark every component entry the load covers. */
            for (unsigned s = 0; s < sem.num_slots; s++)
               BITSET_SET(out_deps->output[out].inputs,
                          get_slot_index(intr, s));
            break;
         }
         default: {
            /* Classify remaining memory reads by intrinsic name. */
            const char *name = nir_intrinsic_infos[intr->intrinsic].name;

            if (strstr(name, "load_ssbo") || strstr(name, "ssbo_atomic"))
               out_deps->output[out].uses_ssbo_reads = true;

            if (strstr(name, "image") &&
                (strstr(name, "load") || strstr(name, "atomic")))
               out_deps->output[out].uses_image_reads = true;
            break;
         }
         }
      }
   }

   nir_free_output_dependencies(&deps);
}
/* Dump the gathered input->output dependencies, one line per defined
 * output slot.
 */
void
nir_print_input_to_output_deps(nir_input_to_output_deps *deps,
                               nir_shader *nir, FILE *f)
{
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (!deps->output[i].defined)
         continue;

      /* Hoist the next-stage abbreviation out of the format call. */
      const char *next_stage =
         nir->info.next_stage != MESA_SHADER_NONE ?
            _mesa_shader_stage_to_abbrev(nir->info.next_stage) : "NONE";

      fprintf(f, "%s(->%s): %s =",
              _mesa_shader_stage_to_abbrev(nir->info.stage),
              next_stage,
              gl_varying_slot_name_for_stage(i, nir->info.stage));

      /* Each set bit encodes slot * 8 + component * 2 + high_16bits. */
      unsigned bit;
      BITSET_FOREACH_SET(bit, deps->output[i].inputs,
                         NUM_TOTAL_VARYING_SLOTS * 8) {
         fprintf(f, " %u.%c%s", bit / 8, "xyzw"[(bit % 8) / 2],
                 bit % 2 ? ".hi" : "");
      }

      fprintf(f, "%s%s",
              deps->output[i].uses_ssbo_reads ? " (ssbo read)" : "",
              deps->output[i].uses_image_reads ? " (image read)" : "");
      fprintf(f, "\n");
   }
}
/* Gather 3 disjoint sets:
* - the set of input components only used to compute outputs for the clipper
* (those that are only used to compute the position and clip outputs)
* - the set of input components only used to compute all other outputs
* - the set of input components that are used to compute BOTH outputs for
* the clipper and all other outputs
*
* If there are memory operations that affect other memory operations, those
* dependencies are not gathered.
*
* The shader must be in LCSSA form.
*
* Patch outputs are not gathered because shaders feeding the clipper don't
* have patch outputs.
*/
/* See the comment block above for the three disjoint sets computed here. */
void
nir_gather_output_clipper_var_groups(nir_shader *nir,
                                     nir_output_clipper_var_groups *groups)
{
   /* Start from empty sets so every exit path leaves a consistent result. */
   memset(groups, 0, sizeof(*groups));

   /* nir_input_to_output_deps is heap-allocated here; presumably it is too
    * large for the stack — keep it that way.
    */
   nir_input_to_output_deps *deps = calloc(1, sizeof(*deps));
   if (!deps)
      return; /* OOM: report empty groups instead of dereferencing NULL */

   nir_gather_input_to_output_dependencies(nir, deps);

   const uint32_t clipper_outputs = VARYING_BIT_POS |
                                    VARYING_BIT_CLIP_VERTEX |
                                    VARYING_BIT_CLIP_DIST0 |
                                    VARYING_BIT_CLIP_DIST1 |
                                    VARYING_BIT_CULL_DIST0 |
                                    VARYING_BIT_CULL_DIST1;

   /* OR-reduce the per-output sets. */
   u_foreach_bit(i, clipper_outputs) {
      if (deps->output[i].defined) {
         BITSET_OR(groups->pos_only, groups->pos_only,
                   deps->output[i].inputs);
      }
   }

   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      /* clipper_outputs only covers slots 0..31; any higher slot is
       * always a non-clipper output.
       */
      if (deps->output[i].defined &&
          (i >= 32 || !(clipper_outputs & BITFIELD_BIT(i)))) {
         BITSET_OR(groups->var_only, groups->var_only,
                   deps->output[i].inputs);
      }
   }

   /* Compute the intersection of the above and make them disjoint. */
   BITSET_AND(groups->both, groups->pos_only, groups->var_only);
   BITSET_ANDNOT(groups->pos_only, groups->pos_only, groups->both);
   BITSET_ANDNOT(groups->var_only, groups->var_only, groups->both);

   free(deps);
}