| /* |
| * Copyright © 2024 Advanced Micro Devices, Inc. |
| * |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| /* For each output slot, gather which input components are used to compute it. |
| * Component-wise ALU instructions must be scalar. |
| */ |
| |
| #include "nir_builder.h" |
| #include "util/hash_table.h" |
| #include "util/u_dynarray.h" |
| #include "util/u_memory.h" |
| |
| static void |
| accum_deps(BITSET_WORD *dst, BITSET_WORD *src, unsigned num_bitset_words) |
| { |
| __bitset_or(dst, dst, src, num_bitset_words); |
| } |
| |
| typedef struct { |
| BITSET_WORD **instr_deps; |
| BITSET_WORD *tmp; |
| unsigned num_bitset_words; |
| } foreach_src_data; |
| |
| static bool |
| accum_src_deps(nir_src *src, void *opaque) |
| { |
| foreach_src_data *data = (foreach_src_data *)opaque; |
| nir_instr *src_instr = src->ssa->parent_instr; |
| |
| if (src_instr->type == nir_instr_type_load_const || |
| src_instr->type == nir_instr_type_undef) |
| return true; |
| |
| nir_instr *dst_instr = nir_src_parent_instr(src); |
| accum_deps(data->instr_deps[dst_instr->index], |
| data->instr_deps[src_instr->index], data->num_bitset_words); |
| return true; |
| } |
| |
| typedef struct { |
| nir_block *start_block; /* the first block of the loop */ |
| nir_block *exit_block; /* the first block after the loop */ |
| bool header_phi_changed; |
| } loop_entry; |
| |
| static loop_entry * |
| get_current_loop(struct util_dynarray *loop_stack) |
| { |
| assert(util_dynarray_num_elements(loop_stack, loop_entry)); |
| return util_dynarray_last_ptr(loop_stack, loop_entry); |
| } |
| |
| /* For each output slot, gather which instructions are used to compute it. |
| * The result is that each output slot will have the list of all instructions |
| * that must execute to compute that output. |
| * |
| * If there are memory operations that affect other memory operations, those |
| * dependencies are not gathered. |
| * |
| * Required: |
| * - The shader must be in LCSSA form. |
| * |
| * Recommended: |
| * - IO intrinsics and component-wise ALU instructions should be scalar, and |
| * vecN opcodes should have their components copy-propagated. If not, |
| * the results will have false dependencies. |
| * |
| * Algorithm: |
| * - For each instruction, compute a bitset of instruction indices whose |
| * results are needed to compute the result of the instruction. The final |
| * bitset is the instruction index OR'd with bitsets of all its sources and |
| * also all if-conditions used to enter the block, recursively. |
| * - Since every instruction inherits instruction bitsets from its sources, |
| * every instruction contains the list of all instructions that must execute |
| * before the instruction can execute. |
| * - At the end, output stores contain the list of instructions that must |
| * execute to compute their results. This may be any subset of instructions |
| * from the shader, including all instructions. |
| * |
| * Control flow notes: |
| * - There is a stack of "if" conditions for entered ifs. |
| * - The dependencies of instructions are the union of dependencies of all |
| * their sources and all if conditions on the if-condition stack. |
| * - For each continue, all loop-header phis receive the dependencies of all |
| * if-conditions on the if-condition stack at the continue. |
| * - For each break, all loop-exit phis receive the dependencies of all |
| * if-conditions on the if-condition stack at the break. |
| * - If there is any change to loop-header phis while iterating over a loop, |
| * we iterate over the loop again after the current iteration is finished. |
| */ |
| void |
| nir_gather_output_dependencies(nir_shader *nir, nir_output_deps *deps) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(nir); |
| nir_metadata_require(impl, nir_metadata_instr_index); |
| unsigned num_instr = nir_impl_last_block(impl)->end_ip; |
| |
| /* Allocate bitsets of instruction->instruction dependencies. */ |
| unsigned num_bitset_words = BITSET_WORDS(num_instr); |
| BITSET_WORD **instr_deps = rzalloc_array(NULL, BITSET_WORD*, num_instr); |
| void *mem_ctx = instr_deps; |
| for (unsigned i = 0; i < num_instr; i++) |
| instr_deps[i] = rzalloc_array(mem_ctx, BITSET_WORD, num_bitset_words); |
| |
| /* Allocate bitsets of instruction->output dependencies. */ |
| BITSET_WORD **out_deps = rzalloc_array(mem_ctx, BITSET_WORD*, |
| NUM_TOTAL_VARYING_SLOTS); |
| |
| /* Allocate stacks. */ |
| struct util_dynarray loop_stack, if_cond_stack; |
| util_dynarray_init(&loop_stack, mem_ctx); |
| util_dynarray_init(&if_cond_stack, mem_ctx); |
| |
| /* Gather dependencies of every instruction. |
| * Dependencies of each instruction are OR'd dependencies of its sources and |
| * control flow conditions. |
| */ |
| nir_foreach_block(block, impl) { |
| nir_cf_node *parent_cf = block->cf_node.parent; |
| bool is_loop_first_block = parent_cf->type == nir_cf_node_loop && |
| block == nir_cf_node_cf_tree_first(parent_cf); |
| if (is_loop_first_block) { |
| loop_entry loop = { |
| .start_block = block, |
| .exit_block = nir_cf_node_cf_tree_next(parent_cf), |
| }; |
| util_dynarray_append(&loop_stack, loop_entry, loop); |
| } |
| |
| if (parent_cf->type == nir_cf_node_if && |
| block == nir_if_first_then_block(nir_cf_node_as_if(parent_cf))) { |
| util_dynarray_append(&if_cond_stack, nir_def *, |
| nir_cf_node_as_if(parent_cf)->condition.ssa); |
| } |
| |
| loop_again: |
| nir_foreach_instr(instr, block) { |
| /* Add self as a dependency. */ |
| BITSET_WORD *this_instr_deps = instr_deps[instr->index]; |
| BITSET_SET(this_instr_deps, instr->index); |
| |
| /* Add sources as dependencies. */ |
| nir_foreach_src(instr, accum_src_deps, |
| &(foreach_src_data){instr_deps, NULL, num_bitset_words}); |
| |
| /* Add parent if-conditions as dependencies. |
| * |
| * Note that phis with sources inside conditional blocks don't need |
| * this because the phi sources already contain if-conditions. |
| */ |
| util_dynarray_foreach(&if_cond_stack, nir_def *, cond) { |
| accum_deps(this_instr_deps, |
| instr_deps[(*cond)->parent_instr->index], |
| num_bitset_words); |
| } |
| |
| /* Gather the current instruction. */ |
| switch (instr->type) { |
| case nir_instr_type_jump: |
| switch (nir_instr_as_jump(instr)->type) { |
| case nir_jump_continue: |
| case nir_jump_break: { |
| loop_entry *loop = get_current_loop(&loop_stack); |
| /* Iterate over all loop-header phis (for continue) or all |
| * loop-exit phis (for break). |
| * |
| * Assumption: Only the loop-start block can have loop-header |
| * phis. |
| */ |
| bool is_continue = |
| nir_instr_as_jump(instr)->type == nir_jump_continue; |
| nir_block *iter_block = |
| is_continue ? loop->start_block : loop->exit_block; |
| assert(iter_block); |
| |
| nir_foreach_phi(phi, iter_block) { |
| /* We need to track whether any header phi of the current |
| * loop has changed because we need to walk such loops |
| * again. Use the bitset bitcount to determine whether |
| * any instruction has been added to header phis as |
| * a dependency. |
| */ |
| unsigned old_count = 0; |
| if (is_continue) { |
| old_count = __bitset_count(instr_deps[phi->instr.index], |
| num_bitset_words); |
| } |
| |
| /* Add dependencies of all if-conditions affecting the |
| * jump statement to phis at the loop header / exit. |
| */ |
| util_dynarray_foreach(&if_cond_stack, nir_def *, cond) { |
| accum_deps(instr_deps[phi->instr.index], |
| instr_deps[(*cond)->parent_instr->index], |
| num_bitset_words); |
| } |
| |
| if (is_continue && |
| old_count != __bitset_count(instr_deps[phi->instr.index], |
| num_bitset_words)) |
| loop->header_phi_changed = true; |
| } |
| break; |
| } |
| default: |
| unreachable("unexpected jump type"); |
| } |
| break; |
| |
| case nir_instr_type_intrinsic: { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| switch (intr->intrinsic) { |
| case nir_intrinsic_store_output: |
| case nir_intrinsic_store_per_vertex_output: |
| case nir_intrinsic_store_per_primitive_output: |
| case nir_intrinsic_store_per_view_output: { |
| /* The write mask must be contiguous starting from x. */ |
| ASSERTED unsigned writemask = nir_intrinsic_write_mask(intr); |
| assert(writemask == BITFIELD_MASK(util_bitcount(writemask))); |
| |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| assert(sem.num_slots >= 1); |
| |
| for (unsigned i = 0; i < sem.num_slots; i++) { |
| unsigned slot = sem.location + i; |
| if (!out_deps[slot]) { |
| out_deps[slot] = rzalloc_array(mem_ctx, BITSET_WORD, |
| num_bitset_words); |
| } |
| accum_deps(out_deps[slot], this_instr_deps, num_bitset_words); |
| } |
| break; |
| } |
| |
| default: |
| break; |
| } |
| break; |
| } |
| |
| default: |
| break; |
| } |
| } |
| |
| if (parent_cf->type == nir_cf_node_if && |
| block == nir_if_last_else_block(nir_cf_node_as_if(parent_cf))) { |
| /* Add the current if stack to the phis after the if node because |
| * this can happen: |
| * |
| * a = load_const true |
| * b = load_const false |
| * if (cond) { |
| * } else { |
| * } |
| * c = phi a, b |
| * |
| * c depends on cond, but doesn't use any defs from then/else blocks. |
| */ |
| nir_foreach_phi(phi, nir_cf_node_cf_tree_next(parent_cf)) { |
| util_dynarray_foreach(&if_cond_stack, nir_def *, cond) { |
| accum_deps(instr_deps[phi->instr.index], |
| instr_deps[(*cond)->parent_instr->index], |
| num_bitset_words); |
| } |
| } |
| |
| assert(util_dynarray_num_elements(&if_cond_stack, nir_def *)); |
| (void)util_dynarray_pop_ptr(&if_cond_stack, nir_def *); |
| } |
| |
| if (parent_cf->type == nir_cf_node_loop && |
| block == nir_cf_node_cf_tree_last(parent_cf)) { |
| assert(util_dynarray_num_elements(&loop_stack, loop_entry)); |
| loop_entry *loop = get_current_loop(&loop_stack); |
| |
| /* Check if any loop header phis would be changed by iterating over |
| * the loop again. |
| */ |
| nir_foreach_phi(phi, loop->start_block) { |
| unsigned old_count = __bitset_count(instr_deps[phi->instr.index], |
| num_bitset_words); |
| nir_foreach_src(&phi->instr, accum_src_deps, |
| &(foreach_src_data){instr_deps, NULL, num_bitset_words}); |
| if (old_count != __bitset_count(instr_deps[phi->instr.index], |
| num_bitset_words)) { |
| loop->header_phi_changed = true; |
| break; |
| } |
| } |
| |
| if (loop->header_phi_changed) { |
| loop->header_phi_changed = false; |
| /* Iterate over the loop again: */ |
| is_loop_first_block = true; |
| block = loop->start_block; |
| assert(block); |
| goto loop_again; |
| } |
| |
| (void)util_dynarray_pop_ptr(&loop_stack, loop_entry); |
| } |
| } |
| |
| /* Gather instructions that affect each output from bitsets. */ |
| memset(deps, 0, sizeof(*deps)); |
| |
| for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) { |
| if (!out_deps[i]) |
| continue; |
| |
| unsigned total = __bitset_count(out_deps[i], num_bitset_words); |
| unsigned added = 0; |
| deps->output[i].num_instr = total; |
| deps->output[i].instr_list = malloc(total * sizeof(nir_instr*)); |
| |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (BITSET_TEST(out_deps[i], instr->index)) { |
| assert(added < total); |
| deps->output[i].instr_list[added++] = instr; |
| } |
| } |
| } |
| assert(added == total); |
| } |
| |
| ralloc_free(mem_ctx); |
| } |
| |
| void |
| nir_free_output_dependencies(nir_output_deps *deps) |
| { |
| for (unsigned i = 0; i < ARRAY_SIZE(deps->output); i++) { |
| assert(!!deps->output[i].instr_list == !!deps->output[i].num_instr); |
| if (deps->output[i].instr_list) |
| free(deps->output[i].instr_list); |
| } |
| } |
| |
| static unsigned |
| get_slot_index(nir_intrinsic_instr *intr, unsigned slot_offset) |
| { |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| return (sem.location + slot_offset) * 8 + nir_intrinsic_component(intr) * 2 + |
| sem.high_16bits; |
| } |
| |
| /* For each output slot, gather which inputs are used to compute it. |
| * The shader must be in LCSSA form. |
| * |
| * If there are memory operations that affect other memory operations, those |
| * dependencies are not gathered. |
| */ |
| void |
| nir_gather_input_to_output_dependencies(nir_shader *nir, |
| nir_input_to_output_deps *out_deps) |
| { |
| nir_output_deps deps; |
| nir_gather_output_dependencies(nir, &deps); |
| |
| memset(out_deps, 0, sizeof(*out_deps)); |
| |
| for (unsigned out = 0; out < ARRAY_SIZE(deps.output); out++) { |
| unsigned num_instr = deps.output[out].num_instr; |
| |
| if (!num_instr) |
| continue; |
| |
| out_deps->output[out].defined = true; |
| |
| for (unsigned i = 0; i < num_instr; i++) { |
| nir_instr *instr = deps.output[out].instr_list[i]; |
| if (instr->type == nir_instr_type_tex) { |
| if (!nir_tex_instr_is_query(nir_instr_as_tex(instr))) |
| out_deps->output[out].uses_image_reads = true; |
| } |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| switch (intr->intrinsic) { |
| case nir_intrinsic_load_input: |
| case nir_intrinsic_load_input_vertex: |
| case nir_intrinsic_load_per_vertex_input: |
| case nir_intrinsic_load_per_primitive_input: |
| case nir_intrinsic_load_interpolated_input: { |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| assert(intr->def.num_components == 1); |
| assert(sem.num_slots >= 1); |
| |
| for (unsigned index = 0; index < sem.num_slots; index++) { |
| unsigned slot = get_slot_index(intr, index); |
| BITSET_SET(out_deps->output[out].inputs, slot); |
| } |
| break; |
| } |
| default: { |
| const char *name = nir_intrinsic_infos[intr->intrinsic].name; |
| |
| if (strstr(name, "load_ssbo") || strstr(name, "ssbo_atomic")) |
| out_deps->output[out].uses_ssbo_reads = true; |
| |
| if (strstr(name, "image") && |
| (strstr(name, "load") || strstr(name, "atomic"))) |
| out_deps->output[out].uses_image_reads = true; |
| break; |
| } |
| } |
| } |
| } |
| |
| nir_free_output_dependencies(&deps); |
| } |
| |
| void |
| nir_print_input_to_output_deps(nir_input_to_output_deps *deps, |
| nir_shader *nir, FILE *f) |
| { |
| for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) { |
| if (!deps->output[i].defined) |
| continue; |
| |
| fprintf(f, "%s(->%s): %s =", |
| _mesa_shader_stage_to_abbrev(nir->info.stage), |
| nir->info.next_stage != MESA_SHADER_NONE ? |
| _mesa_shader_stage_to_abbrev(nir->info.next_stage) : |
| "NONE", |
| gl_varying_slot_name_for_stage(i, nir->info.stage)); |
| |
| unsigned in; |
| BITSET_FOREACH_SET(in, deps->output[i].inputs, NUM_TOTAL_VARYING_SLOTS * 8) { |
| fprintf(f, " %u.%c%s", in / 8, "xyzw"[(in % 8) / 2], in % 2 ? ".hi" : ""); |
| } |
| fprintf(f, "%s%s", |
| deps->output[i].uses_ssbo_reads ? " (ssbo read)" : "", |
| deps->output[i].uses_image_reads ? " (image read)" : ""); |
| fprintf(f, "\n"); |
| } |
| } |
| |
| /* Gather 3 disjoint sets: |
| * - the set of input components only used to compute outputs for the clipper |
| * (those that are only used to compute the position and clip outputs) |
| * - the set of input components only used to compute all other outputs |
| * - the set of input components that are used to compute BOTH outputs for |
| * the clipper and all other outputs |
| * |
| * If there are memory operations that affect other memory operations, those |
| * dependencies are not gathered. |
| * |
| * The shader must be in LCSSA form. |
| * |
| * Patch outputs are not gathered because shaders feeding the clipper don't |
| * have patch outputs. |
| */ |
| void |
| nir_gather_output_clipper_var_groups(nir_shader *nir, |
| nir_output_clipper_var_groups *groups) |
| { |
| nir_input_to_output_deps *deps = calloc(1, sizeof(*deps)); |
| nir_gather_input_to_output_dependencies(nir, deps); |
| |
| uint32_t clipper_outputs = VARYING_BIT_POS | |
| VARYING_BIT_CLIP_VERTEX | |
| VARYING_BIT_CLIP_DIST0 | |
| VARYING_BIT_CLIP_DIST1 | |
| VARYING_BIT_CULL_DIST0 | |
| VARYING_BIT_CULL_DIST1; |
| |
| /* OR-reduce the per-output sets. */ |
| memset(groups, 0, sizeof(*groups)); |
| |
| u_foreach_bit(i, clipper_outputs) { |
| if (deps->output[i].defined) { |
| BITSET_OR(groups->pos_only, groups->pos_only, |
| deps->output[i].inputs); |
| } |
| } |
| |
| for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) { |
| if (deps->output[i].defined && |
| (i >= 32 || !(clipper_outputs & BITFIELD_BIT(i)))) { |
| BITSET_OR(groups->var_only, groups->var_only, |
| deps->output[i].inputs); |
| } |
| } |
| |
| /* Compute the intersection of the above and make them disjoint. */ |
| BITSET_AND(groups->both, groups->pos_only, groups->var_only); |
| BITSET_ANDNOT(groups->pos_only, groups->pos_only, groups->both); |
| BITSET_ANDNOT(groups->var_only, groups->var_only, groups->both); |
| free(deps); |
| } |