| /* |
| * Copyright (C) 2020 Collabora Ltd. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| * |
| * Authors (Collabora): |
| * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> |
| */ |
| |
| #include "compiler.h" |
| #include "nodearray.h" |
| #include "bi_builder.h" |
| #include "util/u_memory.h" |
| |
| struct lcra_state { |
| unsigned node_count; |
| uint64_t *affinity; |
| |
| /* Linear constraints imposed. For each node there is a 'nodearray' |
| * structure, which switches between a sparse and a dense array |
| * depending on the number of elements. |
| * |
| * Each element is itself a bit field denoting whether (c_j - c_i) bias |
| * is present or not, including negative biases. |
| * |
| * We support up to 8 components so the bias is in range |
| * [-7, 7] encoded by a 16-bit field |
| */ |
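| /* For illustration (a sketch of the common case): if nodes i and j each |
| * occupy a single 32-bit component, so cmask_i == cmask_j == 0x1, then |
| * lcra_add_node_interference() matches only at D = 0 and both constraint |
| * words reduce to (1 << 7). Bit 7 encodes bias 0, so the only forbidden |
| * assignment is c_i == c_j: the two nodes may not share a register, but |
| * any pair of distinct registers is acceptable. */ |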
| nodearray *linear; |
| |
| /* Before solving, forced registers; after solving, solutions. */ |
| unsigned *solutions; |
| |
| /** Node which caused register allocation to fail */ |
| unsigned spill_node; |
| }; |
| |
| /* This module is an implementation of "Linearly Constrained |
| * Register Allocation". The paper is available in PDF form |
| * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX |
| * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md) |
| */ |
| |
| static struct lcra_state * |
| lcra_alloc_equations(unsigned node_count) |
| { |
| struct lcra_state *l = calloc(1, sizeof(*l)); |
| |
| l->node_count = node_count; |
| |
| l->linear = calloc(sizeof(l->linear[0]), node_count); |
| l->solutions = calloc(sizeof(l->solutions[0]), node_count); |
| l->affinity = calloc(sizeof(l->affinity[0]), node_count); |
| |
| memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); |
| |
| return l; |
| } |
| |
| static void |
| lcra_free(struct lcra_state *l) |
| { |
| for (unsigned i = 0; i < l->node_count; ++i) |
| nodearray_reset(&l->linear[i]); |
| |
| free(l->linear); |
| free(l->affinity); |
| free(l->solutions); |
| free(l); |
| } |
| |
| static void |
| lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j) |
| { |
| if (i == j) |
| return; |
| |
| nodearray_value constraint_fw = 0; |
| nodearray_value constraint_bw = 0; |
| |
| /* The constraint bits are reversed from lcra.c so that register |
| * allocation can be done in parallel for every possible solution, |
| * with lower-order bits representing smaller registers. */ |
| |
| for (unsigned D = 0; D < 8; ++D) { |
| if (cmask_i & (cmask_j << D)) { |
| constraint_fw |= (1 << (7 + D)); |
| constraint_bw |= (1 << (7 - D)); |
| } |
| |
| if (cmask_i & (cmask_j >> D)) { |
| constraint_bw |= (1 << (7 + D)); |
| constraint_fw |= (1 << (7 - D)); |
| } |
| } |
| |
| /* Use dense arrays after adding 256 elements */ |
| nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count); |
| nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count); |
| } |
| |
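| /* Check whether the tentative solution for node i is compatible with every |
| * already-solved neighbour j: a constraint bit at position (c_i - c_j) + 7 |
| * marks that bias as forbidden. Unsolved neighbours (solution ~0) are |
| * skipped. */ |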
| static bool |
| lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i) |
| { |
| signed constant = solutions[i]; |
| |
| if (nodearray_is_sparse(&l->linear[i])) { |
| nodearray_sparse_foreach(&l->linear[i], elem) { |
| unsigned j = nodearray_sparse_key(elem); |
| nodearray_value constraint = nodearray_sparse_value(elem); |
| |
| if (solutions[j] == ~0) continue; |
| |
| signed lhs = constant - solutions[j]; |
| |
| if (lhs < -7 || lhs > 7) |
| continue; |
| |
| if (constraint & (1 << (lhs + 7))) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| nodearray_value *row = l->linear[i].dense; |
| |
| for (unsigned j = 0; j < l->node_count; ++j) { |
| if (solutions[j] == ~0) continue; |
| |
| signed lhs = constant - solutions[j]; |
| |
| if (lhs < -7 || lhs > 7) |
| continue; |
| |
| if (row[j] & (1 << (lhs + 7))) |
| return false; |
| } |
| |
| return true; |
| } |
| |
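| /* Greedy solver: for each unsolved node, try candidate registers from its |
| * affinity mask in ascending order and keep the first one that satisfies |
| * the linear constraints against the nodes solved so far. If no candidate |
| * works, record the node so the caller can spill. */ |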
| static bool |
| lcra_solve(struct lcra_state *l) |
| { |
| for (unsigned step = 0; step < l->node_count; ++step) { |
| if (l->solutions[step] != ~0) continue; |
| if (l->affinity[step] == 0) continue; |
| |
| bool succ = false; |
| |
| u_foreach_bit64(r, l->affinity[step]) { |
| l->solutions[step] = r; |
| |
| if (lcra_test_linear(l, l->solutions, step)) { |
| succ = true; |
| break; |
| } |
| } |
| |
| /* Out of registers - prepare to spill */ |
| if (!succ) { |
| l->spill_node = step; |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| /* Register spilling is guided by a benefit heuristic calculated from the |
| * constraints: nodes imposing more constraints are better spill candidates. */ |
| |
| static unsigned |
| lcra_count_constraints(struct lcra_state *l, unsigned i) |
| { |
| unsigned count = 0; |
| nodearray *constraints = &l->linear[i]; |
| |
| if (nodearray_is_sparse(constraints)) { |
| nodearray_sparse_foreach(constraints, elem) |
| count += util_bitcount(nodearray_sparse_value(elem)); |
| } else { |
| nodearray_dense_foreach_64(constraints, elem) |
| count += util_bitcount64(*elem); |
| } |
| |
| return count; |
| } |
| |
| /* Liveness analysis is a backwards-may dataflow analysis pass. Within a block, |
| * we compute live_in from live_out by walking the instructions in reverse. The |
| * intrablock pass is linear-time and returns whether progress was made. */ |
| |
| static void |
| bi_liveness_ins_update_ra(uint8_t *live, bi_instr *ins) |
| { |
| /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */ |
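| /* For example, a 32-bit move first clears the written component of its |
| * destination (the KILL term below) and then sets the read component of |
| * its source (the GEN term). */ |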
| |
| bi_foreach_dest(ins, d) { |
| live[ins->dest[d].value] &= ~bi_writemask(ins, d); |
| } |
| |
| bi_foreach_ssa_src(ins, src) { |
| unsigned count = bi_count_read_registers(ins, src); |
| unsigned rmask = BITFIELD_MASK(count); |
| |
| live[ins->src[src].value] |= (rmask << ins->src[src].offset); |
| } |
| } |
| |
| static bool |
| liveness_block_update(bi_block *blk, unsigned temp_count) |
| { |
| bool progress = false; |
| |
| /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ |
| bi_foreach_successor(blk, succ) { |
| for (unsigned i = 0; i < temp_count; ++i) |
| blk->live_out[i] |= succ->live_in[i]; |
| } |
| |
| uint8_t *live = ralloc_array(blk, uint8_t, temp_count); |
| memcpy(live, blk->live_out, temp_count); |
| |
| bi_foreach_instr_in_block_rev(blk, ins) |
| bi_liveness_ins_update_ra(live, ins); |
| |
| /* To figure out progress, diff live_in */ |
| |
| for (unsigned i = 0; (i < temp_count) && !progress; ++i) |
| progress |= (blk->live_in[i] != live[i]); |
| |
| ralloc_free(blk->live_in); |
| blk->live_in = live; |
| |
| return progress; |
| } |
| |
| /* Globally, liveness analysis uses a fixed-point algorithm based on a |
| * worklist. We initialize a work list with the exit block. We iterate the work |
| * list to compute live_in from live_out for each block on the work list, |
| * adding the predecessors of the block to the work list if we made progress. |
| */ |
| |
| static void |
| bi_compute_liveness_ra(bi_context *ctx) |
| { |
| u_worklist worklist; |
| bi_worklist_init(ctx, &worklist); |
| |
| bi_foreach_block(ctx, block) { |
| if (block->live_in) |
| ralloc_free(block->live_in); |
| |
| if (block->live_out) |
| ralloc_free(block->live_out); |
| |
| block->live_in = rzalloc_array(block, uint8_t, ctx->ssa_alloc); |
| block->live_out = rzalloc_array(block, uint8_t, ctx->ssa_alloc); |
| |
| bi_worklist_push_tail(&worklist, block); |
| } |
| |
| while (!u_worklist_is_empty(&worklist)) { |
| /* Pop off in reverse order since liveness is backwards */ |
| bi_block *blk = bi_worklist_pop_tail(&worklist); |
| |
| /* Update liveness information. If we made progress, we need to |
| * reprocess the predecessors |
| */ |
| if (liveness_block_update(blk, ctx->ssa_alloc)) { |
| bi_foreach_predecessor(blk, pred) |
| bi_worklist_push_head(&worklist, *pred); |
| } |
| } |
| |
| u_worklist_fini(&worklist); |
| } |
| |
| /* Construct an affinity mask such that a vector with `count` elements does |
| * not intersect any of the registers in the bitset `clobber`. In other words, |
| * an allocated base register a needs to satisfy, for each i < count and each |
| * clobbered register b: a + i != b. Equivalently a != b - i, so we need |
| * a \notin { b - i : i < count }. For the entire clobber set B, we need |
| * a \notin union over b in B of { b - i : i < count }, where that union is |
| * the desired clobber set. That may be written equivalently as the union over |
| * i < count of (B - i), where subtraction is defined elementwise and |
| * corresponds to a right shift of the entire bitset. |
| * |
| * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted |
| * as a bit set, it is { x : 0 <= x < 64 if x is even } |
| */ |
| |
| #define EVEN_BITS_MASK (0x5555555555555555ull) |
| |
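| /* Worked example (a sketch): with clobber = BITFIELD64_BIT(4) and count = 2, |
| * the loop below sets bits 4 (i = 0) and 3 (i = 1) of `clobbered`, since a |
| * two-register vector based at r3 or r4 would overlap r4. The returned |
| * affinity therefore excludes r3 and r4 as base registers, on top of the |
| * end-of-file (and split-file) guards added afterwards. */ |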
| static uint64_t |
| bi_make_affinity(uint64_t clobber, unsigned count, bool split_file) |
| { |
| uint64_t clobbered = 0; |
| |
| for (unsigned i = 0; i < count; ++i) |
| clobbered |= (clobber >> i); |
| |
| /* Don't allocate past the end of the register file */ |
| if (count > 1) { |
| unsigned excess = count - 1; |
| uint64_t mask = BITFIELD_MASK(excess); |
| clobbered |= mask << (64 - excess); |
| |
| if (split_file) |
| clobbered |= mask << (16 - excess); |
| } |
| |
| /* Don't allocate the middle if we split out the middle */ |
| if (split_file) |
| clobbered |= BITFIELD64_MASK(32) << 16; |
| |
| /* We can use a register iff it's not clobbered */ |
| return ~clobbered; |
| } |
| |
| static void |
| bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr) |
| { |
| bi_foreach_instr_in_block_rev(block, ins) { |
| /* Mark all registers live after the instruction as |
| * interfering with the destination */ |
| |
| bi_foreach_dest(ins, d) { |
| unsigned node = ins->dest[d].value; |
| |
| /* Don't allocate to anything that's read later as a |
| * preloaded register. The affinity is the intersection |
| * of affinity masks for each write. Since writes have |
| * offsets, but the affinity is for the whole node, we |
| * need to offset the affinity opposite the write |
| * offset, so we shift right. */ |
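| /* For instance (a sketch): a write at offset 2 whose per-write mask allows |
| * a base of r12 ends up, after the shift, allowing the node to be based at |
| * r10, so that the written components again start at r10 + 2 = r12. */ |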
| unsigned count = bi_count_write_registers(ins, d); |
| unsigned offset = ins->dest[d].offset; |
| uint64_t affinity = bi_make_affinity(preload_live, count, split_file) >> offset; |
| /* Valhall needs >= 64-bit staging writes to be pair-aligned */ |
| if (aligned_sr && (count >= 2 || offset)) |
| affinity &= EVEN_BITS_MASK; |
| |
| l->affinity[node] &= affinity; |
| |
| for (unsigned i = 0; i < node_count; ++i) { |
| uint8_t r = live[i]; |
| |
| /* Nodes only interfere if they occupy |
| * /different values/ at the same time |
| * (Boissinot). In particular, sources of |
| * moves do not interfere with their |
| * destinations. This enables a limited form of |
| * coalescing. |
| */ |
| if (ins->op == BI_OPCODE_MOV_I32 && |
| bi_is_ssa(ins->src[0]) && |
| i == ins->src[0].value) { |
| |
| r &= ~BITFIELD_BIT(ins->src[0].offset); |
| } |
| |
| if (r) { |
| lcra_add_node_interference(l, node, |
| bi_writemask(ins, d), i, r); |
| } |
| } |
| |
| unsigned node_first = ins->dest[0].value; |
| if (d == 1) { |
| lcra_add_node_interference(l, node, bi_writemask(ins, 1), |
| node_first, bi_writemask(ins, 0)); |
| } |
| } |
| |
| /* Valhall needs >= 64-bit reads to be pair-aligned */ |
| if (aligned_sr) { |
| bi_foreach_ssa_src(ins, s) { |
| if (bi_count_read_registers(ins, s) >= 2) |
| l->affinity[ins->src[s].value] &= EVEN_BITS_MASK; |
| } |
| } |
| |
| if (!is_blend && ins->op == BI_OPCODE_BLEND) { |
| /* Blend shaders might clobber r0-r15, r48. */ |
| uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48); |
| |
| for (unsigned i = 0; i < node_count; ++i) { |
| if (live[i]) |
| l->affinity[i] &= ~clobber; |
| } |
| } |
| |
| /* Update live_in */ |
| preload_live = bi_postra_liveness_ins(preload_live, ins); |
| bi_liveness_ins_update_ra(live, ins); |
| } |
| |
| block->reg_live_in = preload_live; |
| } |
| |
| static void |
| bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs) |
| { |
| bi_compute_liveness_ra(ctx); |
| bi_postra_liveness(ctx); |
| |
| bi_foreach_block_rev(ctx, blk) { |
| uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc); |
| |
| bi_mark_interference(blk, l, live, blk->reg_live_out, |
| ctx->ssa_alloc, ctx->inputs->is_blend, |
| !full_regs, ctx->arch >= 9); |
| |
| free(live); |
| } |
| } |
| |
| static struct lcra_state * |
| bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) |
| { |
| struct lcra_state *l = lcra_alloc_equations(ctx->ssa_alloc); |
| |
| /* Blend shaders are restricted to R0-R15. Other shaders at full |
| * occupancy can also access R48-R63; at half occupancy they can access |
| * the whole file. */ |
| |
| uint64_t default_affinity = |
| ctx->inputs->is_blend ? BITFIELD64_MASK(16) : |
| full_regs ? BITFIELD64_MASK(64) : |
| (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)); |
| |
| /* To test spilling, mimic a small register file */ |
| if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend) |
| default_affinity &= BITFIELD64_MASK(48) << 8; |
| |
| bi_foreach_instr_global(ctx, ins) { |
| bi_foreach_dest(ins, d) |
| l->affinity[ins->dest[d].value] = default_affinity; |
| |
| /* Blend shaders expect the src colour to be in r0-r3 */ |
| if (ins->op == BI_OPCODE_BLEND && |
| !ctx->inputs->is_blend) { |
| assert(bi_is_ssa(ins->src[0])); |
| l->solutions[ins->src[0].value] = 0; |
| |
| /* Dual source blend input in r4-r7 */ |
| if (bi_is_ssa(ins->src[4])) |
| l->solutions[ins->src[4].value] = 4; |
| |
| /* Writes to R48 */ |
| if (!bi_is_null(ins->dest[0])) |
| l->solutions[ins->dest[0].value] = 48; |
| } |
| |
| /* Coverage mask writes stay in R60 */ |
| if ((ins->op == BI_OPCODE_ATEST || |
| ins->op == BI_OPCODE_ZS_EMIT) && |
| !bi_is_null(ins->dest[0])) { |
| l->solutions[ins->dest[0].value] = 60; |
| } |
| |
| /* Experimentally, it seems coverage masks inputs to ATEST must |
| * be in R60. Otherwise coverage mask writes do not work with |
| * early-ZS with pixel-frequency-shading (this combination of |
| * settings is legal if depth/stencil writes are disabled). |
| */ |
| if (ins->op == BI_OPCODE_ATEST) { |
| assert(bi_is_ssa(ins->src[0])); |
| l->solutions[ins->src[0].value] = 60; |
| } |
| } |
| |
| bi_compute_interference(ctx, l, full_regs); |
| |
| /* Coalesce register moves if we're allowed. We need to be careful due |
| * to the restricted affinity induced by the blend shader ABI. |
| */ |
| bi_foreach_instr_global(ctx, I) { |
| if (I->op != BI_OPCODE_MOV_I32) continue; |
| if (I->src[0].type != BI_INDEX_REGISTER) continue; |
| |
| unsigned reg = I->src[0].value; |
| unsigned node = I->dest[0].value; |
| |
| if (l->solutions[node] != ~0) continue; |
| |
| uint64_t affinity = l->affinity[node]; |
| |
| if (ctx->inputs->is_blend) { |
| /* We're allowed to coalesce the moves to these */ |
| affinity |= BITFIELD64_BIT(48); |
| affinity |= BITFIELD64_BIT(60); |
| } |
| |
| /* Try to coalesce */ |
| if (affinity & BITFIELD64_BIT(reg)) { |
| l->solutions[node] = reg; |
| |
| if (!lcra_test_linear(l, l->solutions, node)) |
| l->solutions[node] = ~0; |
| } |
| } |
| |
| *success = lcra_solve(l); |
| |
| return l; |
| } |
| |
| static bi_index |
| bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index) |
| { |
| /* Offsets can only be applied to indices that we register allocated, or |
| * alternatively to FAU indices, where the offset is part of the encoding */ |
| |
| ASSERTED bool is_offset = (index.offset > 0) && |
| (index.type != BI_INDEX_FAU); |
| |
| /* Did we run RA for this index at all */ |
| if (!bi_is_ssa(index)) { |
| assert(!is_offset); |
| return index; |
| } |
| |
| /* LCRA didn't bother solving this index (how lazy!) */ |
| signed solution = l->solutions[index.value]; |
| if (solution < 0) { |
| assert(!is_offset); |
| return index; |
| } |
| |
| /* todo: do we want to compose with the subword swizzle? */ |
| bi_index new_index = bi_register(solution + index.offset); |
| new_index.swizzle = index.swizzle; |
| new_index.abs = index.abs; |
| new_index.neg = index.neg; |
| return new_index; |
| } |
| |
| /* Dual texture instructions write to two sets of staging registers, modeled as |
| * two destinations in the IR. The first set is communicated with the usual |
| * staging register mechanism. The second set is encoded in the texture |
| * operation descriptor. This is quite unusual, and requires the following late |
| * fixup. |
| */ |
| static void |
| bi_fixup_dual_tex_register(bi_instr *I) |
| { |
| assert(I->dest[1].type == BI_INDEX_REGISTER); |
| assert(I->src[3].type == BI_INDEX_CONSTANT); |
| |
| struct bifrost_dual_texture_operation desc = { |
| .secondary_register = I->dest[1].value |
| }; |
| |
| I->src[3].value |= bi_dual_tex_as_u32(desc); |
| } |
| |
| static void |
| bi_install_registers(bi_context *ctx, struct lcra_state *l) |
| { |
| bi_foreach_instr_global(ctx, ins) { |
| bi_foreach_dest(ins, d) |
| ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]); |
| |
| bi_foreach_src(ins, s) |
| ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]); |
| |
| if (ins->op == BI_OPCODE_TEXC_DUAL) |
| bi_fixup_dual_tex_register(ins); |
| } |
| } |
| |
| static void |
| bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new) |
| { |
| bi_foreach_src(ins, i) { |
| if (bi_is_equiv(ins->src[i], old)) { |
| ins->src[i].type = new.type; |
| ins->src[i].value = new.value; |
| } |
| } |
| } |
| |
| /* If register allocation fails, find the best spill node */ |
| |
| static signed |
| bi_choose_spill_node(bi_context *ctx, struct lcra_state *l) |
| { |
| /* Pick a node satisfying bi_spill_register's preconditions */ |
| BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count)); |
| |
| bi_foreach_instr_global(ctx, ins) { |
| bi_foreach_dest(ins, d) { |
| /* Don't allow spilling coverage mask writes because the |
| * register preload logic assumes it will stay in R60. |
| * This could be optimized. |
| */ |
| if (ins->no_spill || |
| ins->op == BI_OPCODE_ATEST || |
| ins->op == BI_OPCODE_ZS_EMIT || |
| (ins->op == BI_OPCODE_MOV_I32 && |
| ins->src[0].type == BI_INDEX_REGISTER && |
| ins->src[0].value == 60)) { |
| BITSET_SET(no_spill, ins->dest[d].value); |
| } |
| } |
| } |
| |
| unsigned best_benefit = 0; |
| signed best_node = -1; |
| |
| if (nodearray_is_sparse(&l->linear[l->spill_node])) { |
| nodearray_sparse_foreach(&l->linear[l->spill_node], elem) { |
| unsigned i = nodearray_sparse_key(elem); |
| unsigned constraint = nodearray_sparse_value(elem); |
| |
| /* Only spill nodes that interfere with the node failing |
| * register allocation. It's pointless to spill anything else */ |
| if (!constraint) continue; |
| |
| if (BITSET_TEST(no_spill, i)) continue; |
| |
| unsigned benefit = lcra_count_constraints(l, i); |
| |
| if (benefit > best_benefit) { |
| best_benefit = benefit; |
| best_node = i; |
| } |
| } |
| } else { |
| nodearray_value *row = l->linear[l->spill_node].dense; |
| |
| for (unsigned i = 0; i < l->node_count; ++i) { |
| /* Only spill nodes that interfere with the node failing |
| * register allocation. It's pointless to spill anything else */ |
| if (!row[i]) continue; |
| |
| if (BITSET_TEST(no_spill, i)) continue; |
| |
| unsigned benefit = lcra_count_constraints(l, i); |
| |
| if (benefit > best_benefit) { |
| best_benefit = benefit; |
| best_node = i; |
| } |
| } |
| } |
| |
| free(no_spill); |
| return best_node; |
| } |
| |
| static unsigned |
| bi_count_read_index(bi_instr *I, bi_index index) |
| { |
| unsigned max = 0; |
| |
| bi_foreach_src(I, s) { |
| if (bi_is_equiv(I->src[s], index)) { |
| unsigned count = bi_count_read_registers(I, s); |
| max = MAX2(max, count + I->src[s].offset); |
| } |
| } |
| |
| return max; |
| } |
| |
| /* |
| * Wrappers to emit loads/stores to thread-local storage in an appropriate way |
| * for the target, so the spill/fill code becomes architecture-independent. |
| */ |
| |
| static bi_index |
| bi_tls_ptr(bool hi) |
| { |
| return bi_fau(BIR_FAU_TLS_PTR, hi); |
| } |
| |
| static bi_instr * |
| bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) |
| { |
| if (b->shader->arch >= 9) { |
| return bi_load_to(b, bits, src, bi_tls_ptr(false), |
| bi_tls_ptr(true), BI_SEG_TL, offset); |
| } else { |
| return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(), |
| BI_SEG_TL, 0); |
| } |
| } |
| |
| static void |
| bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset) |
| { |
| if (b->shader->arch >= 9) { |
| bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL, offset); |
| } else { |
| bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0); |
| } |
| } |
| |
| /* Once we've chosen a spill node, spill it and return the number of bytes |
| * spilled */ |
| |
| static unsigned |
| bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset) |
| { |
| bi_builder b = { .shader = ctx }; |
| unsigned channels = 0; |
| |
| /* Spill after every store, fill before every load */ |
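| /* Concretely (a sketch): each definition of the spilled node is redirected |
| * to a fresh temporary and followed by a store of that temporary to TLS at |
| * `offset` (plus 4 bytes per destination component offset); each use is |
| * preceded by a load from TLS into a fresh temporary that replaces the |
| * spilled node in the instruction's sources. */ |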
| bi_foreach_instr_global_safe(ctx, I) { |
| bi_foreach_dest(I, d) { |
| if (!bi_is_equiv(I->dest[d], index)) continue; |
| |
| unsigned extra = I->dest[d].offset; |
| bi_index tmp = bi_temp(ctx); |
| |
| I->dest[d] = bi_replace_index(I->dest[d], tmp); |
| I->no_spill = true; |
| |
| unsigned count = bi_count_write_registers(I, d); |
| unsigned bits = count * 32; |
| |
| b.cursor = bi_after_instr(I); |
| bi_store_tl(&b, bits, tmp, offset + 4 * extra); |
| |
| ctx->spills++; |
| channels = MAX2(channels, extra + count); |
| } |
| |
| if (bi_has_arg(I, index)) { |
| b.cursor = bi_before_instr(I); |
| bi_index tmp = bi_temp(ctx); |
| |
| unsigned bits = bi_count_read_index(I, index) * 32; |
| bi_rewrite_index_src_single(I, index, tmp); |
| |
| bi_instr *ld = bi_load_tl(&b, bits, tmp, offset); |
| ld->no_spill = true; |
| ctx->fills++; |
| } |
| } |
| |
| return (channels * 4); |
| } |
| |
| /* |
| * For transition, lower collects and splits before RA, rather than after RA. |
| * LCRA knows how to deal with offsets (broken SSA), but not how to coalesce |
| * these vector moves. |
| */ |
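| /* For example (a sketch): a two-source COLLECT_I32 becomes one 32-bit move |
| * per component, writing the destination at offsets 0 and 1; a SPLIT_I32 |
| * likewise becomes one move per destination, reading the source at offsets |
| * 0 and 1. The remap table then forwards SSA reads of split results to the |
| * offset source directly. */ |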
| static void |
| bi_lower_vector(bi_context *ctx, unsigned first_reg) |
| { |
| bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index)); |
| |
| bi_foreach_instr_global_safe(ctx, I) { |
| bi_builder b = bi_init_builder(ctx, bi_after_instr(I)); |
| |
| if (I->op == BI_OPCODE_SPLIT_I32) { |
| bi_index src = I->src[0]; |
| assert(src.offset == 0); |
| |
| bi_foreach_dest(I, i) { |
| src.offset = i; |
| bi_mov_i32_to(&b, I->dest[i], src); |
| |
| if (I->dest[i].value < first_reg) |
| remap[I->dest[i].value] = src; |
| } |
| |
| bi_remove_instruction(I); |
| } else if (I->op == BI_OPCODE_COLLECT_I32) { |
| bi_index dest = I->dest[0]; |
| assert(dest.offset == 0); |
| assert(((dest.value < first_reg) || I->nr_srcs == 1) && "nir_lower_phis_to_scalar"); |
| |
| bi_foreach_src(I, i) { |
| if (bi_is_null(I->src[i])) |
| continue; |
| |
| dest.offset = i; |
| bi_mov_i32_to(&b, dest, I->src[i]); |
| } |
| |
| bi_remove_instruction(I); |
| } |
| } |
| |
| bi_foreach_instr_global(ctx, I) { |
| bi_foreach_ssa_src(I, s) { |
| if (I->src[s].value < first_reg && !bi_is_null(remap[I->src[s].value])) |
| bi_replace_src(I, s, remap[I->src[s].value]); |
| } |
| } |
| |
| free(remap); |
| |
| /* After generating a pile of moves, clean up */ |
| bi_compute_liveness_ra(ctx); |
| |
| bi_foreach_block_rev(ctx, block) { |
| uint8_t *live = rzalloc_array(block, uint8_t, ctx->ssa_alloc); |
| |
| bi_foreach_successor(block, succ) { |
| for (unsigned i = 0; i < ctx->ssa_alloc; ++i) |
| live[i] |= succ->live_in[i]; |
| } |
| |
| bi_foreach_instr_in_block_safe_rev(block, ins) { |
| bool all_null = true; |
| |
| bi_foreach_dest(ins, d) { |
| if (live[ins->dest[d].value] & bi_writemask(ins, d)) |
| all_null = false; |
| } |
| |
| if (all_null && !bi_side_effects(ins)) |
| bi_remove_instruction(ins); |
| else |
| bi_liveness_ins_update_ra(live, ins); |
| } |
| |
| ralloc_free(block->live_in); |
| block->live_in = live; |
| } |
| } |
| |
| /* |
| * Check if the instruction requires a "tied" operand. Such instructions MUST |
| * allocate their source and destination to the same register. This is a |
| * constraint on RA, and may require extra moves. |
| * |
| * In particular, this is the case for Bifrost instructions that both read and |
| * write with the staging register mechanism. |
| */ |
| static bool |
| bi_is_tied(const bi_instr *I) |
| { |
| return (I->op == BI_OPCODE_TEXC || |
| I->op == BI_OPCODE_TEXC_DUAL || |
| I->op == BI_OPCODE_ATOM_RETURN_I32 || |
| I->op == BI_OPCODE_AXCHG_I32 || |
| I->op == BI_OPCODE_ACMPXCHG_I32) && |
| !bi_is_null(I->src[0]); |
| } |
| |
| /* |
| * For transition, coalesce tied operands together, as LCRA knows how to handle |
| * non-SSA operands but doesn't know about tied operands. |
| * |
| * This breaks the SSA form of the program, but that doesn't matter for LCRA. |
| */ |
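| /* Concretely (a sketch): a tied instruction such as TEXC with staging source |
| * src and staging destination dst is preceded by a 32-bit move of each |
| * component of src into dst, and then reads dst as its staging source, so |
| * that the tied operands trivially share registers. */ |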
| static void |
| bi_coalesce_tied(bi_context *ctx) |
| { |
| bi_foreach_instr_global(ctx, I) { |
| if (!bi_is_tied(I)) continue; |
| |
| bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); |
| unsigned n = bi_count_read_registers(I, 0); |
| |
| for (unsigned i = 0; i < n; ++i) { |
| bi_index dst = I->dest[0], src = I->src[0]; |
| |
| assert(dst.offset == 0 && src.offset == 0); |
| dst.offset = src.offset = i; |
| |
| bi_mov_i32_to(&b, dst, src); |
| } |
| |
| bi_replace_src(I, 0, I->dest[0]); |
| } |
| } |
| |
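| /* Look up `value` in the remap table, allocating the next free index on |
| * first use. Entries are stored biased by one so that zero means "not yet |
| * mapped". */ |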
| static unsigned |
| find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc) |
| { |
| if (!map[value]) |
| map[value] = ++(*alloc); |
| |
| assert(map[value]); |
| return map[value] - 1; |
| } |
| |
| /* Reassigns numbering to get rid of gaps in the indices and to prioritize |
| * smaller register classes */ |
| |
| static void |
| squeeze_index(bi_context *ctx) |
| { |
| unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc); |
| ctx->ssa_alloc = 0; |
| |
| bi_foreach_instr_global(ctx, I) { |
| bi_foreach_dest(I, d) |
| I->dest[d].value = find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc); |
| |
| bi_foreach_ssa_src(I, s) |
| I->src[s].value = find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc); |
| } |
| |
| ralloc_free(map); |
| } |
| |
| /* |
| * Brainless out-of-SSA pass. The eventual goal is to go out-of-SSA after RA and |
| * coalesce implicitly with biased colouring in a tree scan allocator. For now, |
| * this should be good enough for LCRA. |
| */ |
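| /* The phi lowering below is a sketch of the standard construction: for each |
| * phi, allocate a fresh node `reg`, emit a 32-bit move into `reg` at the |
| * logical end of every predecessor, and replace the phi itself with a move |
| * from `reg` into its destination. */ |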
| static unsigned |
| bi_out_of_ssa(bi_context *ctx) |
| { |
| bi_index zero = bi_fau(BIR_FAU_IMMEDIATE | 0, false); |
| unsigned first_reg = ctx->ssa_alloc; |
| |
| /* Trivially lower phis */ |
| bi_foreach_block(ctx, block) { |
| bi_foreach_instr_in_block_safe(block, I) { |
| if (I->op != BI_OPCODE_PHI) |
| break; |
| |
| /* Assign a register for the phi */ |
| bi_index reg = bi_temp(ctx); |
| assert(reg.value >= first_reg); |
| |
| /* Lower to a move in each predecessor. The destinations |
| * cannot interfere so these can be sequentialized |
| * in arbitrary order. |
| */ |
| bi_foreach_predecessor(block, pred) { |
| bi_builder b = bi_init_builder(ctx, bi_after_block_logical(*pred)); |
| unsigned i = bi_predecessor_index(block, *pred); |
| |
| assert(!I->src[i].abs); |
| assert(!I->src[i].neg); |
| assert(I->src[i].swizzle == BI_SWIZZLE_H01); |
| |
| /* MOV of immediate needs lowering on Valhall */ |
| if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT) |
| bi_iadd_imm_i32_to(&b, reg, zero, I->src[i].value); |
| else |
| bi_mov_i32_to(&b, reg, I->src[i]); |
| } |
| |
| /* Replace the phi with a move */ |
| bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); |
| bi_mov_i32_to(&b, I->dest[0], reg); |
| bi_remove_instruction(I); |
| |
| /* Propagate that move within the block. The destination |
| * is SSA and the source is not written in this block, |
| * so this is legal. The move itself will be DCE'd if |
| * possible in the next pass. |
| */ |
| bi_foreach_instr_in_block_rev(block, prop) { |
| if (prop->op == BI_OPCODE_PHI) |
| break; |
| |
| bi_foreach_src(prop, s) { |
| if (bi_is_equiv(prop->src[s], I->dest[0])) { |
| bi_replace_src(prop, s, reg); |
| } |
| } |
| } |
| } |
| } |
| |
| /* Try to locally propagate the moves we created. We need to be extra |
| * careful because we're not in SSA at this point, as such this |
| * algorithm is quadratic. This will go away when we go out of SSA after |
| * RA. |
| */ |
| BITSET_WORD *used = calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); |
| BITSET_WORD *multiple_uses = calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc)); |
| |
| bi_foreach_instr_global(ctx, I) { |
| bi_foreach_ssa_src(I, s) { |
| if (BITSET_TEST(used, I->src[s].value)) |
| BITSET_SET(multiple_uses, I->src[s].value); |
| else |
| BITSET_SET(used, I->src[s].value); |
| } |
| } |
| |
| bi_foreach_block(ctx, block) { |
| bi_foreach_instr_in_block_safe_rev(block, mov) { |
| /* Match "reg = ssa" */ |
| if (mov->op != BI_OPCODE_MOV_I32) continue; |
| if (mov->dest[0].type != BI_INDEX_NORMAL) continue; |
| if (mov->dest[0].value < first_reg) continue; |
| if (!bi_is_ssa(mov->src[0])) continue; |
| if (mov->src[0].value >= first_reg) continue; |
| if (BITSET_TEST(multiple_uses, mov->src[0].value)) continue; |
| |
| bool found = false; |
| |
| /* Look locally for the write of the SSA */ |
| bi_foreach_instr_in_block_rev(block, I) { |
| bool bail = false; |
| |
| bi_foreach_src(I, s) { |
| /* Bail: write-after-read */ |
| if (bi_is_equiv(I->src[s], mov->dest[0])) |
| bail = true; |
| } |
| |
| if (bail) |
| break; |
| |
| bi_foreach_dest(I, d) { |
| /* Bail: write-after-write */ |
| if (bi_is_equiv(I->dest[d], mov->dest[0])) |
| break; |
| |
| if (!bi_is_equiv(I->dest[d], mov->src[0])) |
| continue; |
| |
| /* We found it, replace */ |
| I->dest[d] = bi_replace_index(I->dest[d], mov->dest[0]); |
| found = true; |
| break; |
| } |
| |
| if (found) |
| break; |
| } |
| |
| if (found) |
| bi_remove_instruction(mov); |
| } |
| } |
| |
| free(used); |
| free(multiple_uses); |
| return first_reg; |
| } |
| |
| void |
| bi_register_allocate(bi_context *ctx) |
| { |
| struct lcra_state *l = NULL; |
| bool success = false; |
| |
| unsigned iter_count = 1000; /* max iterations */ |
| |
| /* Number of bytes of memory we've spilled into */ |
| unsigned spill_count = ctx->info.tls_size; |
| |
| if (ctx->arch >= 9) |
| va_lower_split_64bit(ctx); |
| |
| /* Lower tied operands. SSA is broken from here on. */ |
| unsigned first_reg = bi_out_of_ssa(ctx); |
| bi_lower_vector(ctx, first_reg); |
| bi_coalesce_tied(ctx); |
| squeeze_index(ctx); |
| |
| /* Try with reduced register pressure to improve thread count */ |
| if (ctx->arch >= 7) { |
| l = bi_allocate_registers(ctx, &success, false); |
| |
| if (success) { |
| ctx->info.work_reg_count = 32; |
| } else { |
| lcra_free(l); |
| l = NULL; |
| } |
| } |
| |
| /* Otherwise, use the register file and spill until we succeed */ |
| while (!success && ((iter_count--) > 0)) { |
| l = bi_allocate_registers(ctx, &success, true); |
| |
| if (success) { |
| ctx->info.work_reg_count = 64; |
| } else { |
| signed spill_node = bi_choose_spill_node(ctx, l); |
| lcra_free(l); |
| l = NULL; |
| |
| if (spill_node == -1) |
| unreachable("Failed to choose spill node\n"); |
| |
| if (ctx->inputs->is_blend) |
| unreachable("Blend shaders may not spill"); |
| |
| /* By default, we use packed TLS addressing on Valhall. |
| * We cannot cross 16 byte boundaries with packed TLS |
| * addressing. Align to ensure this doesn't happen. This |
| * could be optimized a bit. |
| */ |
| if (ctx->arch >= 9) |
| spill_count = ALIGN_POT(spill_count, 16); |
| |
| spill_count += bi_spill_register(ctx, |
| bi_get_index(spill_node), spill_count); |
| |
| /* In case the spill affected an instruction with tied |
| * operands, we need to fix up. |
| */ |
| bi_coalesce_tied(ctx); |
| } |
| } |
| |
| assert(success); |
| assert(l != NULL); |
| |
| ctx->info.tls_size = spill_count; |
| bi_install_registers(ctx, l); |
| |
| lcra_free(l); |
| } |