/*
* Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "agx_compiler.h"
#include "agx_builder.h"
/* SSA-based register allocator */
struct ra_ctx {
   agx_context *shader;
   agx_block *block;

   /* Register assigned to the base of each SSA value */
   uint8_t *ssa_to_reg;

   /* Number of 16-bit registers occupied by each SSA value */
   uint8_t *ncomps;

   /* SSA values that have had registers assigned so far */
   BITSET_WORD *visited;

   /* Bitset of registers currently in use */
   BITSET_WORD *used_regs;

   /* Maximum number of registers that RA is allowed to use */
   unsigned bound;
};

/** Returns the number of 16-bit registers written to destination d of an instruction */
unsigned
agx_write_registers(agx_instr *I, unsigned d)
{
unsigned size = agx_size_align_16(I->dest[d].size);
switch (I->op) {
case AGX_OPCODE_ITER:
assert(1 <= I->channels && I->channels <= 4);
return I->channels * size;
case AGX_OPCODE_DEVICE_LOAD:
case AGX_OPCODE_TEXTURE_LOAD:
case AGX_OPCODE_TEXTURE_SAMPLE:
case AGX_OPCODE_LD_TILE:
return util_bitcount(I->mask) * size;
case AGX_OPCODE_LDCF:
return 6;
case AGX_OPCODE_COLLECT:
return I->nr_srcs * size;
default:
return size;
}
}

static inline enum agx_size
agx_split_width(const agx_instr *I)
{
enum agx_size width = ~0;
agx_foreach_dest(I, d) {
if (agx_is_null(I->dest[d]))
continue;
else if (width != ~0)
assert(width == I->dest[d].size);
else
width = I->dest[d].size;
}
assert(width != ~0 && "should have been DCE'd");
return width;
}
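
/*
 * Find the lowest-numbered contiguous run of `count` free 16-bit registers,
 * trying candidates at multiples of `align`. This is a first-fit scan over
 * the used_regs bitset.
 */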
static unsigned
agx_assign_regs(BITSET_WORD *used_regs, unsigned count, unsigned align, unsigned max)
{
for (unsigned reg = 0; reg < max; reg += align) {
bool conflict = false;
for (unsigned j = 0; j < count; ++j)
conflict |= BITSET_TEST(used_regs, reg + j);
if (!conflict) {
for (unsigned j = 0; j < count; ++j)
BITSET_SET(used_regs, reg + j);
return reg;
}
}
   /* Couldn't find a free register; dump the state of the register file */
fprintf(stderr, "Failed to find register of size %u aligned %u max %u.\n",
count, align, max);
fprintf(stderr, "Register file:\n");
for (unsigned i = 0; i < BITSET_WORDS(max); ++i)
fprintf(stderr, " %08X\n", used_regs[i]);
unreachable("Could not find a free register");
}

/*
* Loop over live-in values at the start of the block and mark their registers
* as in-use. We process blocks in dominance order, so this handles everything
* but loop headers.
*
* For loop headers, this handles the forward edges but not the back edge.
* However, that's okay: we don't want to reserve the registers that are
* defined within the loop, because then we'd get a contradiction. Instead we
* leave them available and then they become fixed points of a sort.
*/
static void
reserve_live_in(struct ra_ctx *rctx)
{
int i;
BITSET_FOREACH_SET(i, rctx->block->live_in, rctx->shader->alloc) {
/* Skip values defined in loops when processing the loop header */
if (!BITSET_TEST(rctx->visited, i))
continue;
for (unsigned j = 0; j < rctx->ncomps[i]; ++j)
BITSET_SET(rctx->used_regs, rctx->ssa_to_reg[i] + j);
}
}

/** Assign registers to SSA values in a block. */
static void
agx_ra_assign_local(struct ra_ctx *rctx)
{
BITSET_DECLARE(used_regs, AGX_NUM_REGS) = { 0 };
agx_block *block = rctx->block;
uint8_t *ssa_to_reg = rctx->ssa_to_reg;
uint8_t *ncomps = rctx->ncomps;
rctx->used_regs = used_regs;
reserve_live_in(rctx);
/* Force the nesting counter r0l live throughout shaders using control flow.
* This could be optimized (sync with agx_calc_register_demand).
*/
if (rctx->shader->any_cf)
BITSET_SET(used_regs, 0);
agx_foreach_instr_in_block(block, I) {
/* Optimization: if a split contains the last use of a vector, the split
* can be removed by assigning the destinations overlapping the source.
*/
if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
unsigned reg = ssa_to_reg[I->src[0].value];
unsigned length = ncomps[I->src[0].value];
unsigned width = agx_size_align_16(agx_split_width(I));
unsigned count = length / width;
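         /* The killed source vector occupies `length` consecutive registers,
          * so destination d may safely alias the `width` registers at offset
          * d * width.
          */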
agx_foreach_dest(I, d) {
/* Skip excess components */
if (d >= count) {
assert(agx_is_null(I->dest[d]));
continue;
}
         /* The source of the split is killed, so a null destination means
          * that channel has no further uses. Free its registers.
          */
if (agx_is_null(I->dest[d])) {
for (unsigned i = 0; i < width; ++i)
BITSET_CLEAR(used_regs, reg + (width * d) + i);
continue;
}
/* Otherwise, transfer the liveness */
unsigned offset = d * width;
assert(I->dest[d].type == AGX_INDEX_NORMAL);
assert(offset < length);
ssa_to_reg[I->dest[d].value] = reg + offset;
BITSET_SET(rctx->visited, I->dest[d].value);
}
continue;
} else if (I->op == AGX_OPCODE_PRELOAD) {
/* We must coalesce all preload moves */
assert(I->dest[0].type == AGX_INDEX_NORMAL);
assert(I->dest[0].size == I->src[0].size);
assert(I->src[0].type == AGX_INDEX_REGISTER);
unsigned base = I->src[0].value;
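         /* Reserve the preloaded register range, which must still be free */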
for (unsigned i = 0; i < agx_size_align_16(I->src[0].size); ++i) {
assert(!BITSET_TEST(used_regs, base + i));
BITSET_SET(used_regs, base + i);
}
ssa_to_reg[I->dest[0].value] = base;
BITSET_SET(rctx->visited, I->dest[0].value);
continue;
}
/* First, free killed sources */
agx_foreach_src(I, s) {
if (I->src[s].type == AGX_INDEX_NORMAL && I->src[s].kill) {
unsigned reg = ssa_to_reg[I->src[s].value];
unsigned count = ncomps[I->src[s].value];
for (unsigned i = 0; i < count; ++i)
BITSET_CLEAR(used_regs, reg + i);
}
}
/* Next, assign destinations one at a time. This is always legal
* because of the SSA form.
*/
agx_foreach_dest(I, d) {
if (I->dest[d].type == AGX_INDEX_NORMAL) {
unsigned count = agx_write_registers(I, d);
unsigned align = agx_size_align_16(I->dest[d].size);
unsigned reg = agx_assign_regs(used_regs, count, align, rctx->bound);
ssa_to_reg[I->dest[d].value] = reg;
BITSET_SET(rctx->visited, I->dest[d].value);
}
}
}
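
   /* Record the registers in use at the end of the block */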
STATIC_ASSERT(sizeof(block->regs_out) == sizeof(used_regs));
memcpy(block->regs_out, used_regs, sizeof(used_regs));
}

/*
* Resolve an agx_index of type NORMAL or REGISTER to a physical register, once
* registers have been allocated for all SSA values.
*/
static unsigned
agx_index_to_reg(uint8_t *ssa_to_reg, agx_index idx)
{
if (idx.type == AGX_INDEX_NORMAL) {
return ssa_to_reg[idx.value];
} else {
assert(idx.type == AGX_INDEX_REGISTER);
return idx.value;
}
}

/*
* Lower phis to parallel copies at the logical end of a given block. If a block
* needs parallel copies inserted, a successor of the block has a phi node. To
* have a (nontrivial) phi node, a block must have multiple predecessors. So the
* edge from the block to the successor (with phi) is not the only edge entering
* the successor. Because the control flow graph has no critical edges, this
* edge must therefore be the only edge leaving the block, so the block must
* have only a single successor.
*/
static void
agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
{
bool any_succ = false;
unsigned nr_phi = 0;
/* Phi nodes logically happen on the control flow edge, so parallel copies
* are added at the end of the predecessor */
agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
agx_foreach_successor(block, succ) {
assert(nr_phi == 0 && "control flow graph has a critical edge");
agx_foreach_phi_in_block(succ, phi) {
assert(!any_succ && "control flow graph has a critical edge");
nr_phi++;
}
any_succ = true;
/* Nothing to do if there are no phi nodes */
if (nr_phi == 0)
continue;
unsigned pred_index = agx_predecessor_index(succ, block);
/* Create a parallel copy lowering all the phi nodes */
      struct agx_copy *copies = calloc(nr_phi, sizeof(*copies));
unsigned i = 0;
agx_foreach_phi_in_block(succ, phi) {
agx_index dest = phi->dest[0];
agx_index src = phi->src[pred_index];
assert(dest.type == AGX_INDEX_REGISTER);
assert(src.type == AGX_INDEX_REGISTER);
assert(dest.size == src.size);
copies[i++] = (struct agx_copy) {
.dest = dest.value,
.src = src.value,
.size = src.size
};
}
agx_emit_parallel_copies(&b, copies, nr_phi);
free(copies);
}
}

void
agx_ra(agx_context *ctx)
{
agx_compute_liveness(ctx);
uint8_t *ssa_to_reg = calloc(ctx->alloc, sizeof(uint8_t));
uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
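
   /* Record the number of 16-bit registers occupied by each SSA value. In
    * SSA form, each value is written exactly once.
    */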
agx_foreach_instr_global(ctx, I) {
agx_foreach_dest(I, d) {
if (I->dest[d].type != AGX_INDEX_NORMAL) continue;
unsigned v = I->dest[d].value;
assert(ncomps[v] == 0 && "broken SSA");
ncomps[v] = agx_write_registers(I, d);
}
}
   /* Assign registers in dominance order. This coincides with source order
    * due to a NIR invariant, so we do not need special handling for this.
    */
agx_foreach_block(ctx, block) {
agx_ra_assign_local(&(struct ra_ctx) {
.shader = ctx,
.block = block,
.ssa_to_reg = ssa_to_reg,
.ncomps = ncomps,
.visited = visited,
.bound = AGX_NUM_REGS
});
}
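
   /* Track the maximum register index used by any SSA value */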
for (unsigned i = 0; i < ctx->alloc; ++i) {
if (ncomps[i])
ctx->max_reg = MAX2(ctx->max_reg, ssa_to_reg[i] + ncomps[i] - 1);
}
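
   /* Rewrite SSA sources and destinations as the assigned physical registers */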
agx_foreach_instr_global(ctx, ins) {
agx_foreach_src(ins, s) {
if (ins->src[s].type == AGX_INDEX_NORMAL) {
unsigned v = ssa_to_reg[ins->src[s].value];
ins->src[s] = agx_replace_index(ins->src[s], agx_register(v, ins->src[s].size));
}
}
agx_foreach_dest(ins, d) {
if (ins->dest[d].type == AGX_INDEX_NORMAL) {
unsigned v = ssa_to_reg[ins->dest[d].value];
ins->dest[d] = agx_replace_index(ins->dest[d], agx_register(v, ins->dest[d].size));
}
}
}
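
   /* With registers assigned, the vector pseudo-instructions (collect, split)
    * can be lowered to parallel copies.
    */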
agx_foreach_instr_global_safe(ctx, ins) {
/* Lower away RA pseudo-instructions */
agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
if (ins->op == AGX_OPCODE_COLLECT) {
unsigned base = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
unsigned width = agx_size_align_16(ins->dest[0].size);
struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
unsigned n = 0;
/* Move the sources */
agx_foreach_src(ins, i) {
if (agx_is_null(ins->src[i])) continue;
assert(ins->src[i].size == ins->dest[0].size);
bool is_uniform = ins->src[i].type == AGX_INDEX_UNIFORM;
copies[n++] = (struct agx_copy) {
.dest = base + (i * width),
.is_uniform = is_uniform,
.src = is_uniform ? ins->src[i].value : agx_index_to_reg(ssa_to_reg, ins->src[i]),
.size = ins->src[i].size
};
}
agx_emit_parallel_copies(&b, copies, n);
agx_remove_instruction(ins);
continue;
} else if (ins->op == AGX_OPCODE_SPLIT) {
unsigned base = agx_index_to_reg(ssa_to_reg, ins->src[0]);
unsigned width = agx_size_align_16(agx_split_width(ins));
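         /* A split has at most four destinations, one per vector channel */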
struct agx_copy copies[4];
assert(ins->nr_dests <= ARRAY_SIZE(copies));
unsigned n = 0;
         /* Copy each channel of the source vector to its destination */
agx_foreach_dest(ins, i) {
if (agx_is_null(ins->dest[i])) continue;
copies[n++] = (struct agx_copy) {
.dest = agx_index_to_reg(ssa_to_reg, ins->dest[i]),
.src = base + (i * width),
.size = ins->dest[i].size
};
}
         /* Lower away, reusing the builder from the top of the loop */
agx_emit_parallel_copies(&b, copies, n);
agx_remove_instruction(ins);
continue;
}
}
/* Insert parallel copies lowering phi nodes */
agx_foreach_block(ctx, block) {
agx_insert_parallel_copies(ctx, block);
}
agx_foreach_instr_global_safe(ctx, I) {
switch (I->op) {
/* Pseudoinstructions for RA must be removed now */
case AGX_OPCODE_PHI:
case AGX_OPCODE_LOGICAL_END:
case AGX_OPCODE_PRELOAD:
agx_remove_instruction(I);
break;
/* Coalesced moves can be removed */
case AGX_OPCODE_MOV:
if (I->src[0].type == AGX_INDEX_REGISTER &&
I->dest[0].size == I->src[0].size &&
I->src[0].value == I->dest[0].value) {
assert(I->dest[0].type == AGX_INDEX_REGISTER);
agx_remove_instruction(I);
}
break;
      /* Writes to the nesting counter are lowered to moves into the real
       * register, r0l.
       */
case AGX_OPCODE_NEST: {
agx_builder b = agx_init_builder(ctx, agx_before_instr(I));
agx_mov_to(&b, agx_register(0, AGX_SIZE_16), I->src[0]);
agx_remove_instruction(I);
break;
}
default:
break;
}
}
free(ssa_to_reg);
free(ncomps);
free(visited);
}