| /* |
| * Copyright (C) 2022 Collabora Ltd. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| * |
| * Authors (Collabora): |
| * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> |
| */ |
| |
| /* Bottom-up local scheduler to reduce register pressure */ |
| |
| #include "compiler.h" |
| #include "util/dag.h" |
| |
| struct sched_ctx { |
| /* Dependency graph */ |
| struct dag *dag; |
| |
| /* Live set */ |
| BITSET_WORD *live; |
| }; |
| |
| struct sched_node { |
| struct dag_node dag; |
| |
| /* Instruction this node represents */ |
| bi_instr *instr; |
| }; |
| |
| static void |
| add_dep(struct sched_node *a, struct sched_node *b) |
| { |
| if (a && b) |
| dag_add_edge(&a->dag, &b->dag, 0); |
| } |
| |
| static struct dag * |
| create_dag(bi_context *ctx, bi_block *block, void *memctx) |
| { |
| struct dag *dag = dag_create(ctx); |
| |
| struct sched_node **last_write = |
| calloc(ctx->ssa_alloc, sizeof(struct sched_node *)); |
| struct sched_node *coverage = NULL; |
| struct sched_node *preload = NULL; |
| |
| /* Last memory load, to serialize stores against */ |
| struct sched_node *memory_load = NULL; |
| |
| /* Last memory store, to serialize loads and stores against */ |
| struct sched_node *memory_store = NULL; |
| |
| bi_foreach_instr_in_block(block, I) { |
| /* Leave branches at the end */ |
| if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch) |
| break; |
| |
| assert(I->branch_target == NULL); |
| |
| struct sched_node *node = rzalloc(memctx, struct sched_node); |
| node->instr = I; |
| dag_init_node(dag, &node->dag); |
| |
| /* Reads depend on writes, no other hazards in SSA */ |
| bi_foreach_ssa_src(I, s) |
| add_dep(node, last_write[I->src[s].value]); |
| |
| bi_foreach_dest(I, d) |
| last_write[I->dest[d].value] = node; |
| |
| switch (bi_opcode_props[I->op].message) { |
| case BIFROST_MESSAGE_LOAD: |
| /* Regular memory loads needs to be serialized against |
| * other memory access. However, UBO memory is read-only |
| * so it can be moved around freely. |
| */ |
| if (I->seg != BI_SEG_UBO) { |
| add_dep(node, memory_store); |
| memory_load = node; |
| } |
| |
| break; |
| |
| case BIFROST_MESSAGE_ATTRIBUTE: |
| /* Regular attribute loads can be reordered, but |
| * writeable attributes can't be. Our one use of |
| * writeable attributes are images. |
| */ |
| if ((I->op == BI_OPCODE_LD_TEX) || |
| (I->op == BI_OPCODE_LD_TEX_IMM) || |
| (I->op == BI_OPCODE_LD_ATTR_TEX)) { |
| add_dep(node, memory_store); |
| memory_load = node; |
| } |
| |
| break; |
| |
| case BIFROST_MESSAGE_STORE: |
| assert(I->seg != BI_SEG_UBO); |
| add_dep(node, memory_load); |
| add_dep(node, memory_store); |
| memory_store = node; |
| break; |
| |
| case BIFROST_MESSAGE_ATOMIC: |
| case BIFROST_MESSAGE_BARRIER: |
| add_dep(node, memory_load); |
| add_dep(node, memory_store); |
| memory_load = node; |
| memory_store = node; |
| break; |
| |
| case BIFROST_MESSAGE_BLEND: |
| case BIFROST_MESSAGE_Z_STENCIL: |
| case BIFROST_MESSAGE_TILE: |
| add_dep(node, coverage); |
| coverage = node; |
| break; |
| |
| case BIFROST_MESSAGE_ATEST: |
| /* ATEST signals the end of shader side effects */ |
| add_dep(node, memory_store); |
| memory_store = node; |
| |
| /* ATEST also updates coverage */ |
| add_dep(node, coverage); |
| coverage = node; |
| break; |
| default: |
| break; |
| } |
| |
| add_dep(node, preload); |
| |
| if (I->op == BI_OPCODE_DISCARD_F32) { |
| /* Serialize against ATEST */ |
| add_dep(node, coverage); |
| coverage = node; |
| |
| /* Also serialize against memory and barriers */ |
| add_dep(node, memory_load); |
| add_dep(node, memory_store); |
| memory_load = node; |
| memory_store = node; |
| } else if ((I->op == BI_OPCODE_PHI) || |
| (I->op == BI_OPCODE_MOV_I32 && |
| I->src[0].type == BI_INDEX_REGISTER)) { |
| preload = node; |
| } |
| } |
| |
| free(last_write); |
| |
| return dag; |
| } |
| |
| /* |
| * Calculate the change in register pressure from scheduling a given |
| * instruction. Equivalently, calculate the difference in the number of live |
| * registers before and after the instruction, given the live set after the |
| * instruction. This calculation follows immediately from the dataflow |
| * definition of liveness: |
| * |
| * live_in = (live_out - KILL) + GEN |
| */ |
| static signed |
| calculate_pressure_delta(bi_instr *I, BITSET_WORD *live) |
| { |
| signed delta = 0; |
| |
| /* Destinations must be unique */ |
| bi_foreach_dest(I, d) { |
| if (BITSET_TEST(live, I->dest[d].value)) |
| delta -= bi_count_write_registers(I, d); |
| } |
| |
| bi_foreach_ssa_src(I, src) { |
| /* Filter duplicates */ |
| bool dupe = false; |
| |
| for (unsigned i = 0; i < src; ++i) { |
| if (bi_is_equiv(I->src[i], I->src[src])) { |
| dupe = true; |
| break; |
| } |
| } |
| |
| if (!dupe && !BITSET_TEST(live, I->src[src].value)) |
| delta += bi_count_read_registers(I, src); |
| } |
| |
| return delta; |
| } |
| |
| /* |
| * Choose the next instruction, bottom-up. For now we use a simple greedy |
| * heuristic: choose the instruction that has the best effect on liveness. |
| */ |
| static struct sched_node * |
| choose_instr(struct sched_ctx *s) |
| { |
| int32_t min_delta = INT32_MAX; |
| struct sched_node *best = NULL; |
| |
| list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) { |
| int32_t delta = calculate_pressure_delta(n->instr, s->live); |
| |
| if (delta < min_delta) { |
| best = n; |
| min_delta = delta; |
| } |
| } |
| |
| return best; |
| } |
| |
| static void |
| pressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s) |
| { |
| /* off by a constant, that's ok */ |
| signed pressure = 0; |
| signed orig_max_pressure = 0; |
| unsigned nr_ins = 0; |
| |
| memcpy(s->live, block->ssa_live_out, BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); |
| |
| bi_foreach_instr_in_block_rev(block, I) { |
| pressure += calculate_pressure_delta(I, s->live); |
| orig_max_pressure = MAX2(pressure, orig_max_pressure); |
| bi_liveness_ins_update_ssa(s->live, I); |
| nr_ins++; |
| } |
| |
| memcpy(s->live, block->ssa_live_out, BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD)); |
| |
| /* off by a constant, that's ok */ |
| signed max_pressure = 0; |
| pressure = 0; |
| |
| struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *)); |
| nr_ins = 0; |
| |
| while (!list_is_empty(&s->dag->heads)) { |
| struct sched_node *node = choose_instr(s); |
| pressure += calculate_pressure_delta(node->instr, s->live); |
| max_pressure = MAX2(pressure, max_pressure); |
| dag_prune_head(s->dag, &node->dag); |
| |
| schedule[nr_ins++] = node; |
| bi_liveness_ins_update_ssa(s->live, node->instr); |
| } |
| |
| /* Bail if it looks like it's worse */ |
| if (max_pressure >= orig_max_pressure) { |
| free(schedule); |
| return; |
| } |
| |
| /* Apply the schedule */ |
| for (unsigned i = 0; i < nr_ins; ++i) { |
| bi_remove_instruction(schedule[i]->instr); |
| list_add(&schedule[i]->instr->link, &block->instructions); |
| } |
| |
| free(schedule); |
| } |
| |
| void |
| bi_pressure_schedule(bi_context *ctx) |
| { |
| bi_compute_liveness_ssa(ctx); |
| void *memctx = ralloc_context(ctx); |
| BITSET_WORD *live = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(ctx->ssa_alloc)); |
| |
| bi_foreach_block(ctx, block) { |
| struct sched_ctx sctx = { |
| .dag = create_dag(ctx, block, memctx), |
| .live = live |
| }; |
| |
| pressure_schedule_block(ctx, block, &sctx); |
| } |
| |
| ralloc_free(memctx); |
| } |