blob: d26d4815132988ccef336279af5815b1990669d3 [file] [log] [blame]
/*
* Copyright © 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
namespace aco {
namespace {
/* Shared state for the branch lowering pass. */
struct branch_ctx {
   Program* program;
   /* Per-block flag (indexed by block index): whether the block still needs
    * the exec mask computed by its predecessors. Initialized to true for all
    * blocks so that back-edges (loops) are handled conservatively.
    */
   std::vector<bool> blocks_incoming_exec_used;

   branch_ctx(Program* program_)
       : program(program_), blocks_incoming_exec_used(program_->blocks.size(), true)
   {}
};
/* Removes the linear CFG edge between block and the successor with index
 * succ_index. If the successor thereby becomes unreachable (no remaining
 * linear predecessors), its instructions are discarded and all of its own
 * successor edges are removed recursively.
 */
void
remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index)
{
   Block& succ = ctx.program->blocks[succ_index];
   /* The edge must exist exactly once in each direction. */
   ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index);
   assert(std::next(it) == succ.linear_preds.end());
   succ.linear_preds.pop_back();
   it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index);
   assert(std::next(it) == block.linear_succs.end());
   block.linear_succs.pop_back();

   if (succ.linear_preds.empty()) {
      /* This block became unreachable - Recursively remove successors. */
      succ.instructions.clear();
      /* Each recursive call erases an element from succ.linear_succs, so a
       * range-for over the vector would be invalidated mid-iteration. Drain
       * the vector from the back instead.
       */
      while (!succ.linear_succs.empty())
         remove_linear_successor(ctx, succ, succ.linear_succs.back());
   }
}
/* Tries to remove a block that contains at most a single s_branch instruction
 * by redirecting all of its predecessors to its single linear successor.
 * The caller guarantees block.linear_succs.size() == 1 and at most one
 * logical successor.
 */
void
try_remove_simple_block(branch_ctx& ctx, Block& block)
{
   /* Only empty blocks or blocks whose only instruction is an s_branch qualify. */
   if (!block.instructions.empty() && block.instructions.front()->opcode != aco_opcode::s_branch)
      return;

   /* Don't remove the preheader as it might be needed as convergence point
    * in order to insert code (e.g. for loop alignment, wait states, etc.).
    */
   if (block.kind & block_kind_loop_preheader)
      return;

   unsigned succ_idx = block.linear_succs[0];
   Block& succ = ctx.program->blocks[succ_idx];

   /* Predecessors that could not be redirected keep branching to this block
    * and are collected here as the block's new predecessor list.
    */
   Block::edge_vec new_preds;
   for (unsigned pred_idx : block.linear_preds) {
      Block& pred = ctx.program->blocks[pred_idx];
      assert(pred.index < block.index);
      assert(!pred.instructions.empty() && pred.instructions.back()->isBranch());
      Instruction* branch = pred.instructions.back().get();
      if (branch->opcode == aco_opcode::p_branch) {
         /* The predecessor unconditionally jumps to this block. Redirect to successor. */
         pred.linear_succs[0] = succ_idx;
         succ.linear_preds.push_back(pred_idx);
      } else if (pred.linear_succs[0] == succ_idx || pred.linear_succs[1] == succ_idx) {
         /* The predecessor's alternative target is this block's successor.
          * Both edges lead to the same place, so make the branch unconditional.
          */
         pred.linear_succs[0] = succ_idx;
         pred.linear_succs[1] = pred.linear_succs.back(); /* In case of discard */
         pred.linear_succs.pop_back();
         branch->opcode = aco_opcode::p_branch;
         branch->branch().never_taken = false;
         branch->branch().rarely_taken = false;
      } else if (pred.linear_succs[1] == block.index) {
         /* The predecessor jumps to this block. Redirect to successor. */
         pred.linear_succs[1] = succ_idx;
         succ.linear_preds.push_back(pred_idx);
      } else {
         /* This block is the fall-through target of the predecessor. */
         assert(pred_idx == block.index - 1);
         if (block.instructions.empty()) {
            /* If this block is empty, just fall-through to the successor. */
            pred.linear_succs[0] = succ_idx;
            succ.linear_preds.push_back(pred_idx);
            continue;
         }
         /* Otherwise, check if there is a fall-through path for the jump target. */
         bool can_fallthrough = block.index < pred.linear_succs[1];
         for (unsigned j = block.index + 1; can_fallthrough && j < pred.linear_succs[1]; j++) {
            if (!ctx.program->blocks[j].instructions.empty())
               can_fallthrough = false;
         }
         if (!can_fallthrough) {
            /* This predecessor must keep branching to this block. */
            new_preds.push_back(pred_idx);
            continue;
         }
         /* Swap taken and fall-through targets of the predecessor's branch. */
         pred.linear_succs[0] = pred.linear_succs[1];
         pred.linear_succs[1] = succ_idx;
         succ.linear_preds.push_back(pred_idx);

         /* Invert the condition. This branch now falls through to its original target.
          * However, we don't update the fall-through target since this instruction
          * gets lowered in the next step, anyway.
          */
         if (branch->opcode == aco_opcode::p_cbranch_nz)
            branch->opcode = aco_opcode::p_cbranch_z;
         else
            branch->opcode = aco_opcode::p_cbranch_nz;
         branch->branch().never_taken = false;
         branch->branch().rarely_taken = false;
      }

      /* Update the branch target. */
      branch->branch().target[0] = succ_idx;
   }

   /* If this block is part of the logical CFG, also connect pre- and successors. */
   if (!block.logical_succs.empty()) {
      assert(block.logical_succs.size() == 1);
      unsigned logical_succ_idx = block.logical_succs[0];
      Block& logical_succ = ctx.program->blocks[logical_succ_idx];
      /* Remove this block from the logical successor's predecessor list;
       * the edge must exist exactly once.
       */
      ASSERTED auto it = std::remove(logical_succ.logical_preds.begin(),
                                     logical_succ.logical_preds.end(), block.index);
      assert(std::next(it) == logical_succ.logical_preds.end());
      logical_succ.logical_preds.pop_back();

      for (unsigned pred_idx : block.logical_preds) {
         Block& pred = ctx.program->blocks[pred_idx];
         std::replace(pred.logical_succs.begin(), pred.logical_succs.end(), block.index,
                      logical_succ_idx);
         if (pred.logical_succs.size() == 2 && pred.logical_succs[0] == pred.logical_succs[1])
            pred.logical_succs.pop_back(); /* This should have been optimized in NIR! */
         else
            logical_succ.logical_preds.push_back(pred_idx);
      }
      block.logical_succs.clear();
      block.logical_preds.clear();
   }

   /* Only the predecessors that couldn't be redirected remain. */
   block.linear_preds = new_preds;
   if (block.linear_preds.empty()) {
      /* The block became unreachable: disconnect it and drop its instructions. */
      remove_linear_successor(ctx, block, succ_idx);
      block.instructions.clear();
   }
}
/* Returns true if any definition or operand of the instruction overlaps the
 * register range [reg, reg + size).
 */
bool
instr_uses_reg(aco_ptr<Instruction>& instr, PhysReg reg, uint32_t size)
{
   /* Two register ranges overlap iff each one begins before the other ends. */
   auto overlaps = [reg, size](PhysReg start, unsigned count) -> bool
   { return start + count > reg && reg + size > start; };

   for (const Definition& def : instr->definitions) {
      if (overlaps(def.physReg(), def.size()))
         return true;
   }
   for (const Operand& op : instr->operands) {
      if (overlaps(op.physReg(), op.size()))
         return true;
   }
   return false;
}
/* Folds a loop-break block's exec update into the continue/merge block,
 * turning the merge block's unconditional back-edge into a conditional one
 * (and, on GFX9+, combining the s_andn2 with the exec copy into
 * s_andn2_wrexec).
 *
 * Look for this:
 * BB1:
 * ...
 * p_branch_z exec BB3, BB2
 * BB2:
 * ...
 * s[0:1], scc = s_andn2 s[0:1], exec
 * s_cbranch_scc0 BB4
 * BB3:
 * exec = s_mov_b64 s[0:1]
 * s_branch BB1
 * BB4:
 * ...
 *
 * And turn it into this:
 * BB1:
 * ...
 * p_branch_z exec BB3, BB2
 * BB2:
 * ...
 * BB3:
 * s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec
 * s_cbranch_scc1 BB1, BB4
 * BB4:
 * ...
 */
void
try_merge_break_with_continue(branch_ctx& ctx, Block& block)
{
   /* The break block must end in a conditional branch with two successors. */
   if (block.linear_succs.size() != 2 || block.instructions.size() < 2)
      return;

   Instruction* branch = block.instructions.back().get();
   if (branch->opcode != aco_opcode::s_cbranch_scc0)
      return;

   Block& merge = ctx.program->blocks[block.linear_succs[0]];
   Block& loopexit = ctx.program->blocks[block.linear_succs[1]];

   /* Just a jump to the loop header. */
   if (merge.linear_succs.size() != 1)
      return;

   /* Every other predecessor must enter the merge block via an exec-zero
    * branch, because the moved exec update will now execute for them too.
    */
   for (unsigned merge_pred : merge.linear_preds) {
      if (merge_pred == block.index)
         continue;

      Block& pred = ctx.program->blocks[merge_pred];
      Instruction* pred_branch = pred.instructions.back().get();

      /* The branch needs to be exec zero only, otherwise we corrupt exec. */
      if (pred_branch->opcode != aco_opcode::p_cbranch_z ||
          pred_branch->operands[0].physReg() != exec)
         return;
   }

   /* merge block: copy to exec, branch */
   if (merge.instructions.size() != 2 || merge.instructions.back()->opcode != aco_opcode::s_branch)
      return;

   Builder bld(ctx.program);
   Instruction* execwrite = merge.instructions[0].get();
   if (execwrite->opcode != bld.w64or32(Builder::s_mov) || !execwrite->writes_exec())
      return;

   /* break block: find s_andn2 */
   PhysReg exec_temp = execwrite->operands[0].physReg();
   Instruction* execsrc = nullptr;
   for (auto rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
      aco_ptr<Instruction>& instr = *rit;
      if (instr->opcode == bld.w64or32(Builder::s_andn2) &&
          instr->definitions[0].physReg() == exec_temp &&
          instr->operands[0].physReg() == exec_temp && instr->operands[1].physReg() == exec) {
         /* Detach the s_andn2 from the break block; it gets re-inserted in the
          * merge block below.
          */
         execsrc = instr.release();
         block.instructions.erase(std::next(rit).base());
         break;
      }

      /* There might be copies for phis after the execsrc instructions,
       * but these must not read / write the same register.
       */
      if (instr->writes_exec() || instr_uses_reg(instr, exec_temp, bld.lm.size()) ||
          instr_uses_reg(instr, scc, s1))
         break;
   }
   if (execsrc == nullptr)
      return;

   /* Use conditional branch in merge block. */
   merge.instructions.back()->opcode = aco_opcode::s_cbranch_scc1;

   /* The break block now falls through to the merge block, which takes over
    * the edge to the loop exit (ordered as: fall-through, then exit).
    */
   block.linear_succs.pop_back();
   block.linear_succs[0] = merge.index;
   merge.linear_succs.push_back(loopexit.index);
   std::swap(merge.linear_succs[0], merge.linear_succs[1]);
   std::replace(loopexit.linear_preds.begin(), loopexit.linear_preds.end(), block.index,
                merge.index);

   /* Check if we can use the loopexit as the fallthrough block.
    * Otherwise, we'll need an extra branch instruction.
    */
   for (unsigned i = merge.index + 1; i < loopexit.index; i++) {
      if (!ctx.program->blocks[i].instructions.empty()) {
         /* Re-purpose the old break branch as an unconditional jump to the
          * loop exit at the end of the merge block.
          */
         branch->opcode = aco_opcode::s_branch;
         merge.instructions.emplace_back(std::move(block.instructions.back()));
         break;
      }
   }
   /* Drop the (possibly moved-from) branch from the break block. */
   block.instructions.pop_back();

   if (ctx.program->gfx_level >= GFX9) {
      /* Combine s_andn2 and copy to exec to s_andn2_wrexec. */
      Instruction* wr_exec =
         bld.sop1(Builder::s_andn2_wrexec, execsrc->definitions[0], execsrc->definitions[1],
                  Definition(exec, bld.lm), execsrc->operands[0], execsrc->operands[1]);
      merge.instructions[0].reset(wr_exec);
   } else {
      /* Move s_andn2 to the merge block. */
      merge.instructions.emplace(merge.instructions.begin(), execsrc);
   }

   /* The merge block now reads exec, so predecessors' exec writes must be
    * kept alive by eliminate_useless_exec_writes_in_block.
    */
   ctx.blocks_incoming_exec_used[merge.index] = true;
}
/* Walks the block backwards and removes writes to exec whose value is never
 * needed before the next exec write. Also records in
 * ctx.blocks_incoming_exec_used whether this block depends on the exec mask
 * produced by its predecessors (consumed when those are processed later).
 */
void
eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block)
{
   /* Determine whether the exec mask at the end of this block is observed. */
   bool exec_write_used = false;
   if (block.kind & block_kind_end_with_regs) {
      /* Last block of a program with succeed shader part should respect final exec write. */
      exec_write_used = true;
   } else if (!block.linear_succs.empty()) {
      /* Check if the successor needs the outgoing exec mask from the current block. */
      exec_write_used = ctx.blocks_incoming_exec_used[block.linear_succs[0]];
   }

   /* Go through all instructions and eliminate useless exec writes. */
   for (int i = block.instructions.size() - 1; i >= 0; --i) {
      aco_ptr<Instruction>& instr = block.instructions[i];

      /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
      if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1) {
         /* The branch target block (salu().imm) may also observe exec. */
         exec_write_used |= ctx.blocks_incoming_exec_used[instr->salu().imm];
      }

      /* See if the current instruction needs or writes exec. */
      bool needs_exec = needs_exec_mask(instr.get());
      bool writes_exec =
         instr->writes_exec() && instr->definitions[0].regClass() == ctx.program->lane_mask;

      /* See if we found an unused exec write. */
      if (writes_exec && !exec_write_used) {
         /* Don't eliminate an instruction that writes registers other than exec and scc.
          * It is possible that this is eg. an s_and_saveexec and the saved value is
          * used by a later branch.
          */
         bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(),
                                         [](const Definition& def) -> bool
                                         { return def.physReg() != exec && def.physReg() != scc; });
         if (!writes_other) {
            /* Mark for deletion; the nullptrs are compacted away below. */
            instr.reset();
            continue;
         }
      }

      /* For a newly encountered exec write, clear the used flag. */
      if (writes_exec)
         exec_write_used = false;

      /* If the current instruction needs exec, mark it as used. */
      exec_write_used |= needs_exec;
   }

   /* Remember if the current block needs an incoming exec mask from its predecessors. */
   ctx.blocks_incoming_exec_used[block.index] = exec_write_used;

   /* Cleanup: remove deleted instructions from the vector. */
   auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
   block.instructions.resize(new_end - block.instructions.begin());
}
/**
 * Check if the branch instruction can be removed:
 * This is beneficial when executing the next block with an empty exec mask
 * is faster than the branch instruction itself.
 *
 * Override this judgement when:
 * - The application prefers to remove control flow
 * - The compiler stack knows that it's a divergent branch never taken
 */
bool
can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
{
   const uint32_t target = branch->target[0];
   /* A branch is uniform unless it is a conditional branch on the exec mask. */
   const bool uniform_branch =
      !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) &&
        branch->operands[0].physReg() == exec);

   if (branch->never_taken) {
      /* A never-taken uniform branch may only be removed if it skips nothing. */
      assert(!uniform_branch || std::all_of(std::next(ctx.program->blocks.begin(), block.index + 1),
                                            std::next(ctx.program->blocks.begin(), target),
                                            [](Block& b) { return b.instructions.empty(); }));
      return true;
   }

   /* Cannot remove back-edges. */
   if (block.index >= target)
      return false;

   const bool prefer_remove = branch->rarely_taken;
   unsigned num_scalar = 0;
   unsigned num_vector = 0;

   /* Check the instructions between branch and target */
   for (unsigned i = block.index + 1; i < target; i++) {
      /* Uniform conditional branches must not be ignored if they
       * are about to jump over actual instructions */
      if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
         return false;

      for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
         if (instr->isSOPP()) {
            /* Discard early exits and loop breaks and continues should work fine with
             * an empty exec mask.
             */
            if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
                instr->opcode == aco_opcode::s_cbranch_scc1 ||
                instr->opcode == aco_opcode::s_cbranch_execz ||
                instr->opcode == aco_opcode::s_cbranch_execnz) {
               bool is_break_continue =
                  ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue);
               bool discard_early_exit =
                  ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
               if (is_break_continue || discard_early_exit)
                  continue;
            }
            /* Any other SOPP (e.g. a branch elsewhere) prevents removal. */
            return false;
         } else if (instr->isSALU()) {
            num_scalar++;
         } else if (instr->isVALU() || instr->isVINTRP()) {
            if (instr->opcode == aco_opcode::v_writelane_b32 ||
                instr->opcode == aco_opcode::v_writelane_b32_e64) {
               /* writelane ignores exec, writing inactive lanes results in UB. */
               return false;
            }
            num_vector++;
            /* VALU which writes SGPRs are always executed on GFX10+ */
            if (ctx.program->gfx_level >= GFX10) {
               for (Definition& def : instr->definitions) {
                  if (def.regClass().type() == RegType::sgpr)
                     num_scalar++;
               }
            }
         } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
            /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
             * SMEM might be an invalid access, and barriers are probably expensive. */
            return false;
         } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
            // TODO: GFX6-9 can use vskip
            if (!prefer_remove)
               return false;
         } else if (instr->opcode != aco_opcode::p_debug_info) {
            assert(false && "Pseudo instructions should be lowered by this point.");
            return false;
         }

         /* Re-check the running cost estimate after each counted instruction. */
         if (!prefer_remove) {
            /* Under these conditions, we shouldn't remove the branch.
             * Don't care about the estimated cycles when the shader prefers flattening.
             */
            unsigned est_cycles;
            if (ctx.program->gfx_level >= GFX10)
               est_cycles = num_scalar * 2 + num_vector;
            else
               est_cycles = num_scalar * 4 + num_vector * 4;

            if (est_cycles > 16)
               return false;
         }
      }
   }

   return true;
}
/* Replaces the trailing pseudo branch of a block with the matching hardware
 * SOPP branch, or removes it entirely when can_remove_branch() allows it.
 */
void
lower_branch_instruction(branch_ctx& ctx, Block& block)
{
   if (block.instructions.empty() || !block.instructions.back()->isBranch())
      return;

   aco_ptr<Instruction> branch = std::move(block.instructions.back());
   const uint32_t target = branch->branch().target[0];
   block.instructions.pop_back();

   if (can_remove_branch(ctx, block, &branch->branch())) {
      /* Removing a conditional branch also removes the taken edge. */
      if (branch->opcode != aco_opcode::p_branch)
         remove_linear_successor(ctx, block, target);
      return;
   }

   /* Select the hardware opcode that implements the pseudo branch. */
   aco_opcode hw_op;
   if (branch->opcode == aco_opcode::p_branch) {
      assert(block.linear_succs[0] == target);
      hw_op = aco_opcode::s_branch;
   } else {
      const bool taken_if_nonzero = branch->opcode == aco_opcode::p_cbranch_nz;
      assert(taken_if_nonzero || branch->opcode == aco_opcode::p_cbranch_z);
      assert(block.linear_succs[1] == target);

      /* The condition must live in exec, vcc or scc. */
      const PhysReg cond = branch->operands[0].physReg();
      if (cond == exec) {
         hw_op = taken_if_nonzero ? aco_opcode::s_cbranch_execnz : aco_opcode::s_cbranch_execz;
      } else if (cond == vcc) {
         hw_op = taken_if_nonzero ? aco_opcode::s_cbranch_vccnz : aco_opcode::s_cbranch_vccz;
      } else {
         assert(cond == scc);
         hw_op = taken_if_nonzero ? aco_opcode::s_cbranch_scc1 : aco_opcode::s_cbranch_scc0;
      }
   }

   /* Emit the hardware branch at the end of the block. */
   Builder bld(ctx.program, &block.instructions);
   bld.sopp(hw_op, target);
}
/* Merges a purely linear block (one not participating in the logical CFG)
 * into its unique predecessor or unique successor, leaving the block empty
 * and disconnected.
 */
void
try_stitch_linear_block(branch_ctx& ctx, Block& block)
{
   /* Don't stitch blocks that are part of the logical CFG. */
   if (block.linear_preds.empty() || block.linear_succs.empty() || !block.logical_preds.empty())
      return;

   /* Try to stitch this block with the predecessor:
    * This block must have exactly one predecessor and
    * the predecessor must have exactly one successor.
    */
   Block& pred = ctx.program->blocks[block.linear_preds[0]];
   if (block.linear_preds.size() == 1 && pred.linear_succs.size() == 1 &&
       (pred.instructions.empty() || !pred.instructions.back()->isSOPP())) {
      /* Insert the instructions at the end of the predecessor and fixup edges. */
      pred.instructions.insert(pred.instructions.end(),
                               std::move_iterator(block.instructions.begin()),
                               std::move_iterator(block.instructions.end()));
      /* This block's successors become successors of the predecessor. */
      for (unsigned succ_idx : block.linear_succs) {
         Block& s = ctx.program->blocks[succ_idx];
         std::replace(s.linear_preds.begin(), s.linear_preds.end(), block.index, pred.index);
      }
      pred.linear_succs = std::move(block.linear_succs);
      /* Leave the block empty and fully disconnected. */
      block.instructions.clear();
      block.linear_preds.clear();
      block.linear_succs.clear();
      return;
   }

   /* Try to stitch this block with the successor:
    * This block must have exactly one successor and
    * the successor must have exactly one predecessor.
    */
   Block& succ = ctx.program->blocks[block.linear_succs[0]];
   if (block.linear_succs.size() == 1 && succ.linear_preds.size() == 1 &&
       (block.instructions.empty() || !block.instructions.back()->isSOPP())) {
      /* Insert the instructions at the beginning of the successor. */
      succ.instructions.insert(succ.instructions.begin(),
                               std::move_iterator(block.instructions.begin()),
                               std::move_iterator(block.instructions.end()));
      /* Retarget predecessors' branch immediates and edges to the successor. */
      for (unsigned pred_idx : block.linear_preds) {
         Block& p = ctx.program->blocks[pred_idx];
         if (!p.instructions.empty() &&
             instr_info.classes[(int)p.instructions.back()->opcode] == instr_class::branch &&
             p.instructions.back()->salu().imm == block.index) {
            p.instructions.back()->salu().imm = succ.index;
         }
         std::replace(p.linear_succs.begin(), p.linear_succs.end(), block.index, succ.index);
      }
      succ.linear_preds = std::move(block.linear_preds);
      /* Leave the block empty and fully disconnected. */
      block.instructions.clear();
      block.linear_preds.clear();
      block.linear_succs.clear();
   }
}
} /* end namespace */
/* Pass entry point: lowers pseudo branches to hardware branches and cleans up
 * the linear CFG (useless exec writes, removable blocks, break/continue
 * merging, block stitching).
 */
void
lower_branches(Program* program)
{
   branch_ctx ctx(program);

   /* Walk blocks in reverse so every successor (which always has a higher
    * index) is processed before its predecessors.
    */
   for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
      Block& block = *it;

      lower_branch_instruction(ctx, block);
      eliminate_useless_exec_writes_in_block(ctx, block);

      if (block.kind & block_kind_break)
         try_merge_break_with_continue(ctx, block);

      const bool single_linear_succ = block.linear_succs.size() == 1;
      if (single_linear_succ && block.logical_succs.size() <= 1)
         try_remove_simple_block(ctx, block);
   }

   /* Finally, merge adjacent purely-linear blocks. */
   for (Block& block : program->blocks)
      try_stitch_linear_block(ctx, block);
}
} // namespace aco