/*
* Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "util/register_allocate.h"
#include "compiler_defines.h"
#include "bifrost_sched.h"
#include "bifrost_compile.h"
#include "bifrost_print.h"
#define BI_DEBUG
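/* Register-class layout: the first 64 entries of the flat register
 * namespace are the scalar (primary) registers; the vec2, vec3 and
 * vec4 classes live at vec2_base/vec3_base/vec4_base in the same
 * namespace and alias onto the scalar registers through the
 * conflicts registered in allocate_registers() below. */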
const unsigned max_primary_reg = 64; /* Overestimate because of special regs */
const unsigned max_vec2_reg = 32;
const unsigned max_vec3_reg = 16; // XXX: Do we need to align vec3 to vec4 boundary?
const unsigned max_vec4_reg = 16;
const unsigned max_registers = 128; /* Sum of classes */
const unsigned primary_base = 0;
const unsigned vec2_base = 64;
const unsigned vec3_base = 96; /* above base + max_class_reg */
const unsigned vec4_base = 112;
const unsigned vec4_end = 128;
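/* Map an SSA index to a densely packed temp index. Entries are
 * stored offset by one in the hash table so that 0 (the table's
 * "not found" return value) stays distinguishable from temp 0. */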
static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
if (hash >= SSA_FIXED_MINIMUM)
return hash;
unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->hash_to_temp, hash + 1);
if (temp)
return temp - 1;
/* If no temp is found, allocate one */
temp = ctx->num_temps++;
ctx->max_hash = MAX2(ctx->max_hash, hash);
_mesa_hash_table_u64_insert(ctx->hash_to_temp, hash + 1, (void *) ((uintptr_t) temp + 1));
return temp;
}
static bool
is_live_in_instr(bifrost_instruction *instr, unsigned temp)
{
if (instr->ssa_args.src0 == temp) return true;
if (instr->ssa_args.src1 == temp) return true;
if (instr->ssa_args.src2 == temp) return true;
if (instr->ssa_args.src3 == temp) return true;
return false;
}
static bool
is_live_after_instr(compiler_context *ctx, bifrost_block *blk, bifrost_instruction *instr, unsigned temp)
{
// Scan forward in the block from this location to see if we are still live.
mir_foreach_instr_in_block_from(blk, ins, mir_next_instr(instr)) {
if (is_live_in_instr(ins, temp))
return true;
}
// XXX: Walk all successor blocks and ensure the value isn't used there
return false;
}
static uint32_t
ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
for (unsigned i = primary_base; i < vec4_end; ++i) {
if (BITSET_TEST(regs, i)) {
return i;
}
}
assert(0);
return 0;
}
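/* Map an RA node back to a physical scalar register index. Scalar
 * nodes map 1:1; a vecN node is rebased against its class base and
 * scaled by the class alignment (2 for vec2, 4 for vec3/vec4).
 * E.g. a vec2 node given r = 70 lands on scalar register
 * (70 - vec2_base) * 2 = 12. */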
static uint32_t
ra_get_phys_reg(compiler_context *ctx, struct ra_graph *g, unsigned temp, unsigned max_reg)
{
if (temp == SSA_INVALID_VALUE ||
temp >= SSA_FIXED_UREG_MINIMUM ||
temp == SSA_FIXED_CONST_0)
return temp;
if (temp >= SSA_FIXED_MINIMUM)
return SSA_REG_FROM_FIXED(temp);
assert(temp < max_reg);
uint32_t r = ra_get_node_reg(g, temp);
if (r >= vec4_base)
return (r - vec4_base) * 4;
else if (r >= vec3_base)
return (r - vec3_base) * 4;
else if (r >= vec2_base)
return (r - vec2_base) * 2;
return r;
}
static void
allocate_registers(compiler_context *ctx)
{
struct ra_regs *regs = ra_alloc_reg_set(NULL, max_registers, true);
int primary_class = ra_alloc_reg_class(regs);
int vec2_class = ra_alloc_reg_class(regs);
int vec3_class = ra_alloc_reg_class(regs);
int vec4_class = ra_alloc_reg_class(regs);
// Allocate our register classes and conflicts
{
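/* The conflict math below: vec2 register i occupies scalar slots
 * 2i and 2i+1, while vec3 and vec4 register i are both vec4-aligned
 * and occupy scalar slots 4i..4i+2 and 4i..4i+3 respectively, so a
 * vec3 and a vec4 with the same index conflict outright. */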
unsigned reg = 0;
// Add all of our primary scalar registers
for (unsigned i = 0; i < max_primary_reg; ++i) {
ra_class_add_reg(regs, primary_class, reg);
reg++;
}
// Add all of our vec2 class registers
// These alias with the scalar registers
for (unsigned i = 0; i < max_vec2_reg; ++i) {
ra_class_add_reg(regs, vec2_class, reg);
// Tell RA that this conflicts with primary class registers
// Make sure to tell the RA utility all conflict slots
ra_add_reg_conflict(regs, reg, primary_base + i*2 + 0);
ra_add_reg_conflict(regs, reg, primary_base + i*2 + 1);
reg++;
}
// Add all of our vec3 class registers
// These alias with the scalar registers
for (unsigned i = 0; i < max_vec3_reg; ++i) {
ra_class_add_reg(regs, vec3_class, reg);
// Tell RA that this conflicts with primary class registers
// Make sure to tell the RA utility all conflict slots
// These are aligned to vec4 even though they only conflict with a vec3 wide slot
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 0);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 1);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 2);
// State that this class conflicts with the vec2 class
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 0);
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 1);
reg++;
}
// Add all of our vec4 class registers
// These alias with the scalar registers
for (unsigned i = 0; i < max_vec4_reg; ++i) {
ra_class_add_reg(regs, vec4_class, reg);
// Tell RA that this conflicts with primary class registers
// Make sure to tell the RA utility all conflict slots
// These are vec4 aligned and conflict with the full vec4-wide slot
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 0);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 1);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 2);
ra_add_reg_conflict(regs, reg, primary_base + i*4 + 3);
// State that this class conflicts with the vec2 class
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 0);
ra_add_reg_conflict(regs, reg, vec2_base + i*2 + 1);
// State that this class conflicts with the vec3 class
// They conflict on the exact same location due to alignments
ra_add_reg_conflict(regs, reg, vec3_base + i);
reg++;
}
}
ra_set_finalize(regs, NULL);
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
instr->ssa_args.src0 = find_or_allocate_temp(ctx, instr->ssa_args.src0);
instr->ssa_args.src1 = find_or_allocate_temp(ctx, instr->ssa_args.src1);
instr->ssa_args.src2 = find_or_allocate_temp(ctx, instr->ssa_args.src2);
instr->ssa_args.src3 = find_or_allocate_temp(ctx, instr->ssa_args.src3);
instr->ssa_args.dest = find_or_allocate_temp(ctx, instr->ssa_args.dest);
}
}
uint32_t nodes = ctx->num_temps;
struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
if (instr->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
if (instr->dest_components == 4)
ra_set_node_class(g, instr->ssa_args.dest, vec4_class);
else if (instr->dest_components == 3)
ra_set_node_class(g, instr->ssa_args.dest, vec3_class);
else if (instr->dest_components == 2)
ra_set_node_class(g, instr->ssa_args.dest, vec2_class);
else
ra_set_node_class(g, instr->ssa_args.dest, primary_class);
}
}
uint32_t *live_start = malloc(nodes * sizeof(uint32_t));
uint32_t *live_end = malloc(nodes * sizeof(uint32_t));
memset(live_start, 0xFF, nodes * sizeof(uint32_t));
memset(live_end, 0xFF, nodes * sizeof(uint32_t));
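/* Compute coarse live ranges in one linear walk: every instruction
 * gets a monotonically increasing location, a temp becomes live at
 * its first write and dies after its last use within the block
 * (cross-block liveness is still the XXX in is_live_after_instr). */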
uint32_t location = 0;
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
if (instr->ssa_args.dest < SSA_FIXED_MINIMUM) {
// If the destination isn't live yet at this point then this is
// where it becomes live, since this instruction writes it
if (live_start[instr->ssa_args.dest] == ~0U) {
live_start[instr->ssa_args.dest] = location;
}
}
uint32_t sources[4] = {
instr->ssa_args.src0,
instr->ssa_args.src1,
instr->ssa_args.src2,
instr->ssa_args.src3,
};
for (unsigned i = 0; i < 4; ++i) {
if (sources[i] >= SSA_FIXED_MINIMUM)
continue;
// If the source is no longer live after this instruction then we can end its liveness
if (!is_live_after_instr(ctx, block, instr, sources[i])) {
live_end[sources[i]] = location;
}
}
++location;
}
}
// Spin through the nodes quickly and ensure they are all killed by the end of the program
for (unsigned i = 0; i < nodes; ++i) {
if (live_end[i] == ~0U)
live_end[i] = location;
}
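/* Two nodes interfere when their live ranges overlap. The ranges are
 * treated as half-open intervals, so a range ending exactly where
 * another starts does not conflict. This is the naive O(n^2)
 * pairwise test. */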
for (unsigned i = 0; i < nodes; ++i) {
for (unsigned j = i + 1; j < nodes; ++j) {
if (!(live_start[i] >= live_end[j] || live_start[j] >= live_end[i])) {
ra_add_node_interference(g, i, j);
}
}
}
ra_set_select_reg_callback(g, ra_select_callback, NULL);
if (!ra_allocate(g)) {
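// Allocation failed; there is no spill handling yet, so this is fatal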
assert(0);
}
free(live_start);
free(live_end);
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, instr) {
instr->args.src0 = ra_get_phys_reg(ctx, g, instr->ssa_args.src0, nodes);
instr->args.src1 = ra_get_phys_reg(ctx, g, instr->ssa_args.src1, nodes);
instr->args.src2 = ra_get_phys_reg(ctx, g, instr->ssa_args.src2, nodes);
instr->args.src3 = ra_get_phys_reg(ctx, g, instr->ssa_args.src3, nodes);
instr->args.dest = ra_get_phys_reg(ctx, g, instr->ssa_args.dest, nodes);
}
}
}
static void
bundle_block(compiler_context *ctx, bifrost_block *block)
{
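// XXX: clause bundling is not implemented yet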
}
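/* Lower op_create_vector after RA: each instruction that produced a
 * component of the vector is rewritten to write straight into the
 * corresponding component of the vector's allocated register
 * (args.dest + i), after which the create_vector is deleted. */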
static void
remove_create_vectors(compiler_context *ctx, bifrost_block *block)
{
mir_foreach_instr_in_block_safe(block, instr) {
if (instr->op != op_create_vector) continue;
uint32_t vector_ssa_sources[4] = {
instr->ssa_args.src0,
instr->ssa_args.src1,
instr->ssa_args.src2,
instr->ssa_args.src3,
};
mir_foreach_instr_in_block_from_rev(block, next_instr, instr) {
// Walk our block backwards and find the producers of this vector's components
for (unsigned i = 0; i < instr->dest_components; ++i) {
// If this instruction is the one that writes this register then forward it to the real register
if (vector_ssa_sources[i] == next_instr->ssa_args.dest) {
// Source instruction destination is a vector register of size dest_components
// So dest + i gets the components of it
next_instr->args.dest = instr->args.dest + i;
}
}
}
// Remove the instruction now that we have copied over all the sources
mir_remove_instr(instr);
}
}
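/* Lower op_extract_element after RA: every later use of the
 * extracted value is rewritten to read the physical register at
 * vector base + element index (args.src0 + literal_args[0]), after
 * which the extract itself is deleted. */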
static void
remove_extract_elements(compiler_context *ctx, bifrost_block *block)
{
mir_foreach_instr_in_block_safe(block, instr) {
if (instr->op != op_extract_element) continue;
mir_foreach_instr_in_block_from(block, next_instr, instr) {
// Walk our block forward to replace uses of this register with a real register
// src0 = vector
// src1 = index in to vector
uint32_t vector_ssa_sources[4] = {
next_instr->ssa_args.src0,
next_instr->ssa_args.src1,
next_instr->ssa_args.src2,
next_instr->ssa_args.src3,
};
uint32_t *vector_sources[4] = {
&next_instr->args.src0,
&next_instr->args.src1,
&next_instr->args.src2,
&next_instr->args.src3,
};
for (unsigned i = 0; i < 4; ++i) {
if (vector_ssa_sources[i] == instr->ssa_args.dest) {
// This source uses this vector extraction
// Replace its usage with the real register
// src0 is a vector register and src1 is the constant element of the vector
*vector_sources[i] = instr->args.src0 + instr->literal_args[0];
}
}
}
// Remove the instruction now that we have copied over all the sources
mir_remove_instr(instr);
}
}
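/* Top-level scheduling entry point: allocate registers, lower the
 * vector pseudo-ops that only existed to carry SSA vector semantics,
 * then hand each block to the (currently stubbed) bundler. */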
void
schedule_program(compiler_context *ctx)
{
// XXX: before RA we should group instructions that feed into each other so they can be scheduled in the same clause
allocate_registers(ctx);
mir_foreach_block(ctx, block) {
remove_create_vectors(ctx, block);
remove_extract_elements(ctx, block);
}
mir_foreach_block(ctx, block) {
#ifdef BI_DEBUG
print_mir_block(block, true);
#endif
bundle_block(ctx, block);
}
}