blob: 2a472e8edbc402007e0a2e0e5e846c3f12a1e596 [file]
/*
* Copyright 2022 Alyssa Rosenzweig
* Copyright 2021 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "agx_builder.h"
#include "agx_compiler.h"
UNUSED static void
print_copy(const struct agx_copy *cp)
{
   /* Debug helper: print one copy as "rN = src" (or "mrN" for a memory
    * register destination).
    */
   const char *class_prefix = cp->dest_mem ? "m" : "";

   printf("%sr%u = ", class_prefix, cp->dest);
   agx_print_index(cp->src, false, stdout);
   printf("\n");
}
UNUSED static void
print_copies(const struct agx_copy *copies, unsigned nr)
{
   /* Debug helper: print a whole parallel copy as a bracketed list */
   printf("[\n");

   for (unsigned idx = 0; idx < nr; ++idx) {
      printf(" ");
      print_copy(&copies[idx]);
   }

   printf("]\n");
}
/*
* Emits code for
*
* for (int i = 0; i < n; ++i)
* registers[dests[i]] = registers[srcs[i]];
*
* ...with all copies happening in parallel.
*
* That is, emit machine instructions equivalent to a parallel copy. This is
* used to lower not only parallel copies but also collects and splits, which
* also have parallel copy semantics.
*
* We only handle register-register copies, not general agx_index sources. This
* suffices for its internal use for register allocation.
*/
static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   /* Emit the machine move(s) for a single (already-sequenced) copy */
   agx_index dst = copy->dest_mem
                      ? agx_memory_register(copy->dest, copy->src.size)
                      : agx_register(copy->dest, copy->src.size);

   if (copy->dest_mem && copy->src.memory) {
      /* Memory-memory copies need to be lowered to memory-register and
       * register-memory, using a reserved scratch register.
       */
      agx_index bounce = agx_register(2, copy->src.size);
      agx_mov_to(b, bounce, copy->src);
      agx_mov_to(b, dst, bounce);
      return;
   }

   if (copy->src.type == AGX_INDEX_IMMEDIATE)
      agx_mov_imm_to(b, dst, copy->src.value);
   else
      agx_mov_to(b, dst, copy->src);
}
static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   assert(copy->src.type == AGX_INDEX_REGISTER && "only GPRs are swapped");

   /* Swapping a register with itself is a no-op */
   if (copy->dest == copy->src.value)
      return;

   agx_index lhs = copy->dest_mem
                      ? agx_memory_register(copy->dest, copy->src.size)
                      : agx_register(copy->dest, copy->src.size);
   agx_index rhs = copy->src;

   assert(lhs.memory == rhs.memory);

   /* Memory-memory swaps lowered here, GPR swaps lowered later */
   if (!lhs.memory) {
      agx_swap(b, lhs, rhs);
      return;
   }

   /* Bounce both sides through reserved scratch GPRs, then write back
    * crosswise to effect the swap.
    */
   agx_index tmp_a = agx_register(4, copy->src.size);
   agx_index tmp_b = agx_register(6, copy->src.size);

   agx_mov_to(b, tmp_a, lhs);
   agx_mov_to(b, tmp_b, rhs);
   agx_mov_to(b, rhs, tmp_a);
   agx_mov_to(b, lhs, tmp_b);
}
/* Scratch bookkeeping state for lowering one parallel copy. Registers are
 * tracked at 16-bit granularity (one slot per 16-bit "physreg" slice).
 */
struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];

   /* Local mutable copies of the input; splitting appends new entries here */
   struct agx_copy entries[AGX_NUM_MODELED_REGS];
};
/* A copy is blocked while any 16-bit slice of its destination is still
 * pending as the source of some other copy.
 */
static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   unsigned halves = agx_size_align_16(entry->src.size);

   for (unsigned h = 0; h < halves; ++h) {
      if (ctx->physreg_use_count[entry->dest + h] != 0)
         return true;
   }

   return false;
}
/* Whether the copy is register-to-register within a single class (GPR->GPR
 * or memory->memory); immediates, uniforms and cross-class copies are not
 * "real" for the purposes of source-use tracking.
 */
static bool
is_real(struct agx_copy *entry)
{
   if (entry->src.type != AGX_INDEX_REGISTER)
      return false;

   return entry->dest_mem == entry->src.memory;
}
/* TODO: Generalize to other bit sizes */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->src.size) == 2);

   /* Append a fresh entry for the high 16-bit half */
   struct agx_copy *hi = &ctx->entries[ctx->entry_count++];

   hi->dest = entry->dest + 1;
   hi->dest_mem = entry->dest_mem;
   hi->src = entry->src;
   hi->src.value += 1;
   hi->done = false;

   /* Narrow both halves to 16-bit copies */
   entry->src.size = AGX_SIZE_16;
   hi->src.size = AGX_SIZE_16;

   /* The high destination slice is now written by the new entry */
   ctx->physreg_dest[entry->dest + 1] = hi;
}
/*
 * Lower the subset of a parallel copy whose destinations belong to one
 * register class (GPRs when cls is false, memory registers when cls is true)
 * into sequential machine moves/swaps emitted through the builder.
 */
static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
                                   unsigned num_copies, bool cls)
{
   /* First, lower away 64-bit copies to smaller chunks, since we don't have
    * 64-bit ALU so we always want to split. Each copy splits into at most
    * two, hence the num_copies * 2 allocation.
    *
    * NOTE(review): calloc's arguments are conventionally (nmemb, size) --
    * they appear swapped here, though the zeroed allocation is the same
    * size either way. The result is not NULL-checked.
    */
   struct agx_copy *copies2 = calloc(sizeof(copies[0]), num_copies * 2);
   unsigned num_copies2 = 0;

   for (unsigned i = 0; i < num_copies; ++i) {
      struct agx_copy copy = copies[i];

      /* Filter by class */
      if (copy.dest_mem != cls)
         continue;

      assert(copy.dest < AGX_NUM_MODELED_REGS);

      if (copy.src.size == AGX_SIZE_64) {
         /* Low 32-bit half */
         copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;

         if (copy.src.type == AGX_INDEX_IMMEDIATE) {
            /* The value field is only 32 bits wide, so the high half of a
             * 64-bit immediate is necessarily zero.
             */
            static_assert(sizeof(copy.src.value) * 8 == 32, "known size");
            copy.src.value = 0;
         } else {
            assert(copy.src.type == AGX_INDEX_REGISTER ||
                   copy.src.type == AGX_INDEX_UNIFORM);

            copy.src.value += 2;
         }

         /* High 32-bit half, two 16-bit units above the low half */
         copy.dest += 2;
         copies2[num_copies2++] = copy;
      } else {
         copies2[num_copies2++] = copy;
      }
   }

   /* From here on, work on the filtered/split local list only */
   copies = copies2;
   num_copies = num_copies2;

   /* Set up the bookkeeping */
   struct copy_ctx _ctx = {.entry_count = num_copies};
   struct copy_ctx *ctx = &_ctx;

   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];
      ctx->entries[i] = *entry;

      /* Per 16-bit slice: count readers of each source register, and record
       * which pending entry writes each destination register.
       */
      for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src.value + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }

   /* Try to vectorize aligned 16-bit copies to use 32-bit operations instead */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->src.size != AGX_SIZE_16)
         continue;

      /* Both source and destination must be 32-bit aligned to merge */
      if ((entry->dest & 1) || (entry->src.value & 1))
         continue;

      if (entry->src.type != AGX_INDEX_UNIFORM &&
          entry->src.type != AGX_INDEX_REGISTER)
         continue;

      unsigned next_dest = entry->dest + 1;
      assert(next_dest < ARRAY_SIZE(ctx->physreg_dest) && "aligned reg");

      struct agx_copy *next_copy = ctx->physreg_dest[next_dest];
      if (!next_copy)
         continue;

      assert(next_copy->dest == next_dest && "data structure invariant");
      assert(next_copy->src.size == AGX_SIZE_16 && "unaligned copy");

      if (next_copy->src.type != entry->src.type)
         continue;

      /* The two sources must be contiguous to form one 32-bit source */
      if (next_copy->src.value != (entry->src.value + 1))
         continue;

      /* Vectorize the copies: widen this entry to 32-bit and retire the
       * neighbouring entry.
       */
      ctx->physreg_dest[next_dest] = entry;
      entry->src.size = AGX_SIZE_32;
      next_copy->done = true;
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);

            /* Update the bookkeeping: each destination slice is no longer
             * pending and each source slice loses one reader.
             */
            for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src.value + j]--;

               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can split 32-bit copies which are only blocked
       * on one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (entry->done || (agx_size_align_16(entry->src.size) != 2))
            continue;

         /* Split when exactly-one-half progress is possible: at least one
          * destination half is unblocked.
          */
         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    * n_1 -> n_2 -> ... -> n_i
    * ^ |
    * |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    * n_1 -> n_2 -> ... -> n_i
    * ^ |
    * |---------------------|
    *
    * and this must be only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    * n_1 -> ... -> n_i
    * ^ |
    * |--------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src.value) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->src.size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];
            if (blocking->done)
               continue;

            if (blocking->src.value <= entry->dest &&
                blocking->src.value + 1 >= entry->dest &&
                agx_size_align_16(blocking->src.size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];
         if (blocking->src.value >= entry->dest &&
             blocking->src.value <
                entry->dest + agx_size_align_16(entry->src.size)) {
            /* The swap moved entry's old dest value into entry's old src
             * slot, so redirect readers of dest to read src instead.
             */
            blocking->src.value =
               entry->src.value + (blocking->src.value - entry->dest);
         }
      }

      entry->done = true;
   }

   free(copies2);
}
/* Public entry point: lower a full parallel copy (see the comment above
 * do_copy for semantics) to machine instructions emitted through b.
 */
void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   /* Emit copies for each register class separately because we don't have
    * register class awareness in the parallel copy lowering data structure.
    */
   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}