| /* |
| * Copyright © 2024 Valve Corporation |
| * |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "aco_builder.h" |
| #include "aco_ir.h" |
| |
| namespace aco { |
| namespace { |
| |
| struct branch_ctx { |
| Program* program; |
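/* For each block: whether the exec mask at the block's start may still be
* needed. Initialized to true so that not-yet-processed blocks (e.g. reached
* through loop back-edges) are treated conservatively.
*/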
| std::vector<bool> blocks_incoming_exec_used; |
| |
| branch_ctx(Program* program_) |
| : program(program_), blocks_incoming_exec_used(program_->blocks.size(), true) |
| {} |
| }; |
| |
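/* Removes the linear CFG edge from block to the successor with index succ_index.
* If the successor thereby becomes unreachable, its instructions and outgoing
* edges are removed recursively.
*/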
| void |
| remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index) |
| { |
| Block& succ = ctx.program->blocks[succ_index]; |
| ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index); |
| assert(std::next(it) == succ.linear_preds.end()); |
| succ.linear_preds.pop_back(); |
| it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index); |
| assert(std::next(it) == block.linear_succs.end()); |
| block.linear_succs.pop_back(); |
| |
| if (succ.linear_preds.empty()) { |
/* This block became unreachable: recursively remove its successor edges. */
| succ.instructions.clear(); |
| for (unsigned i : succ.linear_succs) |
| remove_linear_successor(ctx, succ, i); |
| } |
| } |
| |
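/* Removes a block that is empty or only contains an unconditional branch by
* redirecting all of its predecessors to its single linear successor.
* Predecessors that rely on falling through to this block and cannot be
* redirected keep the block alive.
*/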
| void |
| try_remove_simple_block(branch_ctx& ctx, Block& block) |
| { |
| if (!block.instructions.empty() && block.instructions.front()->opcode != aco_opcode::s_branch) |
| return; |
| |
/* Don't remove the preheader: it might be needed as a convergence point for
* inserting code (e.g. for loop alignment, wait states, etc.).
*/
| if (block.kind & block_kind_loop_preheader) |
| return; |
| |
| unsigned succ_idx = block.linear_succs[0]; |
| Block& succ = ctx.program->blocks[succ_idx]; |
| Block::edge_vec new_preds; |
| for (unsigned pred_idx : block.linear_preds) { |
| Block& pred = ctx.program->blocks[pred_idx]; |
| assert(pred.index < block.index); |
| assert(!pred.instructions.empty() && pred.instructions.back()->isBranch()); |
| Instruction* branch = pred.instructions.back().get(); |
| if (branch->opcode == aco_opcode::p_branch) { |
| /* The predecessor unconditionally jumps to this block. Redirect to successor. */ |
| pred.linear_succs[0] = succ_idx; |
| succ.linear_preds.push_back(pred_idx); |
| } else if (pred.linear_succs[0] == succ_idx || pred.linear_succs[1] == succ_idx) { |
/* The predecessor's alternative target is this block's successor, so the branch becomes unconditional. */
| pred.linear_succs[0] = succ_idx; |
| pred.linear_succs[1] = pred.linear_succs.back(); /* In case of discard */ |
| pred.linear_succs.pop_back(); |
| branch->opcode = aco_opcode::p_branch; |
| branch->branch().never_taken = false; |
| branch->branch().rarely_taken = false; |
| } else if (pred.linear_succs[1] == block.index) { |
| /* The predecessor jumps to this block. Redirect to successor. */ |
| pred.linear_succs[1] = succ_idx; |
| succ.linear_preds.push_back(pred_idx); |
| } else { |
| /* This block is the fall-through target of the predecessor. */ |
| assert(pred_idx == block.index - 1); |
| if (block.instructions.empty()) { |
| /* If this block is empty, just fall-through to the successor. */ |
| pred.linear_succs[0] = succ_idx; |
| succ.linear_preds.push_back(pred_idx); |
| continue; |
| } |
| |
| /* Otherwise, check if there is a fall-through path for the jump target. */ |
| bool can_fallthrough = block.index < pred.linear_succs[1]; |
| for (unsigned j = block.index + 1; can_fallthrough && j < pred.linear_succs[1]; j++) { |
| if (!ctx.program->blocks[j].instructions.empty()) |
| can_fallthrough = false; |
| } |
| if (!can_fallthrough) { |
| new_preds.push_back(pred_idx); |
| continue; |
| } |
| |
| pred.linear_succs[0] = pred.linear_succs[1]; |
| pred.linear_succs[1] = succ_idx; |
| succ.linear_preds.push_back(pred_idx); |
| |
| /* Invert the condition. This branch now falls through to its original target. |
| * However, we don't update the fall-through target since this instruction |
| * gets lowered in the next step, anyway. |
| */ |
| if (branch->opcode == aco_opcode::p_cbranch_nz) |
| branch->opcode = aco_opcode::p_cbranch_z; |
| else |
| branch->opcode = aco_opcode::p_cbranch_nz; |
| branch->branch().never_taken = false; |
| branch->branch().rarely_taken = false; |
| } |
| |
| /* Update the branch target. */ |
| branch->branch().target[0] = succ_idx; |
| } |
| |
| /* If this block is part of the logical CFG, also connect pre- and successors. */ |
| if (!block.logical_succs.empty()) { |
| assert(block.logical_succs.size() == 1); |
| unsigned logical_succ_idx = block.logical_succs[0]; |
| Block& logical_succ = ctx.program->blocks[logical_succ_idx]; |
| ASSERTED auto it = std::remove(logical_succ.logical_preds.begin(), |
| logical_succ.logical_preds.end(), block.index); |
| assert(std::next(it) == logical_succ.logical_preds.end()); |
| logical_succ.logical_preds.pop_back(); |
| for (unsigned pred_idx : block.logical_preds) { |
| Block& pred = ctx.program->blocks[pred_idx]; |
| std::replace(pred.logical_succs.begin(), pred.logical_succs.end(), block.index, |
| logical_succ_idx); |
| |
| if (pred.logical_succs.size() == 2 && pred.logical_succs[0] == pred.logical_succs[1]) |
| pred.logical_succs.pop_back(); /* This should have been optimized in NIR! */ |
| else |
| logical_succ.logical_preds.push_back(pred_idx); |
| } |
| |
| block.logical_succs.clear(); |
| block.logical_preds.clear(); |
| } |
| |
| block.linear_preds = new_preds; |
| if (block.linear_preds.empty()) { |
| remove_linear_successor(ctx, block, succ_idx); |
| block.instructions.clear(); |
| } |
| } |
| |
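/* Returns true if any operand or definition of the instruction overlaps the
* register range [reg, reg + size).
*/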
| bool |
| instr_uses_reg(aco_ptr<Instruction>& instr, PhysReg reg, uint32_t size) |
| { |
| auto intersects = [=](auto src) -> bool |
| { return src.physReg() + src.size() > reg && reg + size > src.physReg(); }; |
| |
| return std::any_of(instr->definitions.begin(), instr->definitions.end(), |
| [=](Definition def) { return intersects(def); }) || |
| std::any_of(instr->operands.begin(), instr->operands.end(), |
| [=](Operand op) { return intersects(op); }); |
| } |
| |
| void |
| try_merge_break_with_continue(branch_ctx& ctx, Block& block) |
| { |
| /* Look for this: |
| * BB1: |
| * ... |
| * p_branch_z exec BB3, BB2 |
| * BB2: |
| * ... |
| * s[0:1], scc = s_andn2 s[0:1], exec |
| * s_cbranch_scc0 BB4 |
| * BB3: |
| * exec = s_mov_b64 s[0:1] |
| * s_branch BB1 |
| * BB4: |
| * ... |
| * |
| * And turn it into this: |
| * BB1: |
| * ... |
| * p_branch_z exec BB3, BB2 |
| * BB2: |
| * ... |
| * BB3: |
| * s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec |
| * s_cbranch_scc1 BB1, BB4 |
| * BB4: |
| * ... |
| */ |
| if (block.linear_succs.size() != 2 || block.instructions.size() < 2) |
| return; |
| |
| Instruction* branch = block.instructions.back().get(); |
| if (branch->opcode != aco_opcode::s_cbranch_scc0) |
| return; |
| |
| Block& merge = ctx.program->blocks[block.linear_succs[0]]; |
| Block& loopexit = ctx.program->blocks[block.linear_succs[1]]; |
| |
/* The merge block must end in a single unconditional jump back to the loop header. */
| if (merge.linear_succs.size() != 1) |
| return; |
| |
| for (unsigned merge_pred : merge.linear_preds) { |
| if (merge_pred == block.index) |
| continue; |
| |
| Block& pred = ctx.program->blocks[merge_pred]; |
| Instruction* pred_branch = pred.instructions.back().get(); |
/* The other predecessors must enter the merge block only with exec == 0
* (p_cbranch_z on exec); otherwise replacing the copy to exec with s_andn2
* below would produce a different exec mask.
*/
| if (pred_branch->opcode != aco_opcode::p_cbranch_z || |
| pred_branch->operands[0].physReg() != exec) |
| return; |
| } |
| |
| /* merge block: copy to exec, branch */ |
| if (merge.instructions.size() != 2 || merge.instructions.back()->opcode != aco_opcode::s_branch) |
| return; |
| |
| Builder bld(ctx.program); |
| Instruction* execwrite = merge.instructions[0].get(); |
| if (execwrite->opcode != bld.w64or32(Builder::s_mov) || !execwrite->writes_exec()) |
| return; |
| |
| /* break block: find s_andn2 */ |
| PhysReg exec_temp = execwrite->operands[0].physReg(); |
| Instruction* execsrc = nullptr; |
| for (auto rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { |
| aco_ptr<Instruction>& instr = *rit; |
| if (instr->opcode == bld.w64or32(Builder::s_andn2) && |
| instr->definitions[0].physReg() == exec_temp && |
| instr->operands[0].physReg() == exec_temp && instr->operands[1].physReg() == exec) { |
| execsrc = instr.release(); |
| block.instructions.erase(std::next(rit).base()); |
| break; |
| } |
| |
/* There might be copies for phis between the s_andn2 and the branch,
* but they must not write exec or touch the s_andn2 result register or scc.
*/
| if (instr->writes_exec() || instr_uses_reg(instr, exec_temp, bld.lm.size()) || |
| instr_uses_reg(instr, scc, s1)) |
| break; |
| } |
| |
| if (execsrc == nullptr) |
| return; |
| |
| /* Use conditional branch in merge block. */ |
| merge.instructions.back()->opcode = aco_opcode::s_cbranch_scc1; |
| block.linear_succs.pop_back(); |
| block.linear_succs[0] = merge.index; |
| merge.linear_succs.push_back(loopexit.index); |
| std::swap(merge.linear_succs[0], merge.linear_succs[1]); |
| std::replace(loopexit.linear_preds.begin(), loopexit.linear_preds.end(), block.index, |
| merge.index); |
| |
| /* Check if we can use the loopexit as the fallthrough block. |
| * Otherwise, we'll need an extra branch instruction. |
| */ |
| for (unsigned i = merge.index + 1; i < loopexit.index; i++) { |
| if (!ctx.program->blocks[i].instructions.empty()) { |
| branch->opcode = aco_opcode::s_branch; |
| merge.instructions.emplace_back(std::move(block.instructions.back())); |
| break; |
| } |
| } |
| block.instructions.pop_back(); |
| |
| if (ctx.program->gfx_level >= GFX9) { |
/* Combine the s_andn2 and the copy to exec into a single s_andn2_wrexec. */
| Instruction* wr_exec = |
| bld.sop1(Builder::s_andn2_wrexec, execsrc->definitions[0], execsrc->definitions[1], |
| Definition(exec, bld.lm), execsrc->operands[0], execsrc->operands[1]); |
| merge.instructions[0].reset(wr_exec); |
| } else { |
| /* Move s_andn2 to the merge block. */ |
| merge.instructions.emplace(merge.instructions.begin(), execsrc); |
| } |
| ctx.blocks_incoming_exec_used[merge.index] = true; |
| } |
| |
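/* Walks the block bottom-up and removes writes to exec whose result is never
* used, neither by a later instruction in this block nor by any linear
* successor.
*/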
| void |
| eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block) |
| { |
| bool exec_write_used = false; |
| if (block.kind & block_kind_end_with_regs) { |
/* The last block of a shader part that is followed by another part must keep its final exec write. */
| exec_write_used = true; |
| } else if (!block.linear_succs.empty()) { |
| /* Check if the successor needs the outgoing exec mask from the current block. */ |
| exec_write_used = ctx.blocks_incoming_exec_used[block.linear_succs[0]]; |
| } |
| |
| /* Go through all instructions and eliminate useless exec writes. */ |
| for (int i = block.instructions.size() - 1; i >= 0; --i) { |
| aco_ptr<Instruction>& instr = block.instructions[i]; |
| |
| /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */ |
| if (instr->opcode == aco_opcode::s_cbranch_scc0 || |
| instr->opcode == aco_opcode::s_cbranch_scc1) { |
| exec_write_used |= ctx.blocks_incoming_exec_used[instr->salu().imm]; |
| } |
| |
| /* See if the current instruction needs or writes exec. */ |
| bool needs_exec = needs_exec_mask(instr.get()); |
| bool writes_exec = |
| instr->writes_exec() && instr->definitions[0].regClass() == ctx.program->lane_mask; |
| |
| /* See if we found an unused exec write. */ |
| if (writes_exec && !exec_write_used) { |
| /* Don't eliminate an instruction that writes registers other than exec and scc. |
| * It is possible that this is eg. an s_and_saveexec and the saved value is |
| * used by a later branch. |
| */ |
| bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(), |
| [](const Definition& def) -> bool |
| { return def.physReg() != exec && def.physReg() != scc; }); |
| if (!writes_other) { |
| instr.reset(); |
| continue; |
| } |
| } |
| |
| /* For a newly encountered exec write, clear the used flag. */ |
| if (writes_exec) |
| exec_write_used = false; |
| |
| /* If the current instruction needs exec, mark it as used. */ |
| exec_write_used |= needs_exec; |
| } |
| |
| /* Remember if the current block needs an incoming exec mask from its predecessors. */ |
| ctx.blocks_incoming_exec_used[block.index] = exec_write_used; |
| |
| /* Cleanup: remove deleted instructions from the vector. */ |
| auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr); |
| block.instructions.resize(new_end - block.instructions.begin()); |
| } |
| |
| /** |
| * Check if the branch instruction can be removed: |
| * This is beneficial when executing the next block with an empty exec mask |
| * is faster than the branch instruction itself. |
| * |
| * Override this judgement when: |
| * - The application prefers to remove control flow |
| * - The compiler stack knows that it's a divergent branch never taken |
| */ |
| bool |
| can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch) |
| { |
| const uint32_t target = branch->target[0]; |
| const bool uniform_branch = |
| !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) && |
| branch->operands[0].physReg() == exec); |
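/* Only branches conditional on exec are treated as divergent here: the skipped
* instructions would merely execute with an empty exec mask. Code skipped by a
* uniform branch must not execute at all.
*/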
| |
| if (branch->never_taken) { |
| assert(!uniform_branch || std::all_of(std::next(ctx.program->blocks.begin(), block.index + 1), |
| std::next(ctx.program->blocks.begin(), target), |
| [](Block& b) { return b.instructions.empty(); })); |
| return true; |
| } |
| |
| /* Cannot remove back-edges. */ |
| if (block.index >= target) |
| return false; |
| |
| const bool prefer_remove = branch->rarely_taken; |
| unsigned num_scalar = 0; |
| unsigned num_vector = 0; |
| |
| /* Check the instructions between branch and target */ |
| for (unsigned i = block.index + 1; i < target; i++) { |
/* A uniform conditional branch must not be removed if it jumps over actual
* instructions, since those would then execute regardless of the condition. */
| if (uniform_branch && !ctx.program->blocks[i].instructions.empty()) |
| return false; |
| |
| for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) { |
| if (instr->isSOPP()) { |
/* Discard early exits as well as loop breaks and continues should work fine
* with an empty exec mask.
*/
| if (instr->opcode == aco_opcode::s_cbranch_scc0 || |
| instr->opcode == aco_opcode::s_cbranch_scc1 || |
| instr->opcode == aco_opcode::s_cbranch_execz || |
| instr->opcode == aco_opcode::s_cbranch_execnz) { |
| bool is_break_continue = |
| ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue); |
| bool discard_early_exit = |
| ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit; |
| if (is_break_continue || discard_early_exit) |
| continue; |
| } |
| return false; |
| } else if (instr->isSALU()) { |
| num_scalar++; |
| } else if (instr->isVALU() || instr->isVINTRP()) { |
| if (instr->opcode == aco_opcode::v_writelane_b32 || |
| instr->opcode == aco_opcode::v_writelane_b32_e64) { |
| /* writelane ignores exec, writing inactive lanes results in UB. */ |
| return false; |
| } |
| num_vector++; |
/* VALU instructions which write SGPRs are always executed on GFX10+. */
| if (ctx.program->gfx_level >= GFX10) { |
| for (Definition& def : instr->definitions) { |
| if (def.regClass().type() == RegType::sgpr) |
| num_scalar++; |
| } |
| } |
| } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) { |
| /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs), |
| * SMEM might be an invalid access, and barriers are probably expensive. */ |
| return false; |
| } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) { |
| // TODO: GFX6-9 can use vskip |
| if (!prefer_remove) |
| return false; |
| } else if (instr->opcode != aco_opcode::p_debug_info) { |
| assert(false && "Pseudo instructions should be lowered by this point."); |
| return false; |
| } |
| |
| if (!prefer_remove) { |
/* Estimate the cost of executing the skipped instructions with an empty exec
* mask and keep the branch if it exceeds the cost of the branch itself.
* The estimated cycles don't matter when the shader prefers flattening.
*/
| unsigned est_cycles; |
| if (ctx.program->gfx_level >= GFX10) |
| est_cycles = num_scalar * 2 + num_vector; |
| else |
| est_cycles = num_scalar * 4 + num_vector * 4; |
| |
| if (est_cycles > 16) |
| return false; |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
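/* Replaces the pseudo branch at the end of the block with the matching hardware
* SOPP branch, or drops it entirely if can_remove_branch() decides that falling
* through is preferable.
*/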
| void |
| lower_branch_instruction(branch_ctx& ctx, Block& block) |
| { |
| if (block.instructions.empty() || !block.instructions.back()->isBranch()) |
| return; |
| |
| aco_ptr<Instruction> branch = std::move(block.instructions.back()); |
| const uint32_t target = branch->branch().target[0]; |
| block.instructions.pop_back(); |
| |
| if (can_remove_branch(ctx, block, &branch->branch())) { |
| if (branch->opcode != aco_opcode::p_branch) |
| remove_linear_successor(ctx, block, target); |
| return; |
| } |
| |
| /* emit branch instruction */ |
| Builder bld(ctx.program, &block.instructions); |
| switch (branch->opcode) { |
| case aco_opcode::p_branch: |
| assert(block.linear_succs[0] == target); |
| bld.sopp(aco_opcode::s_branch, target); |
| break; |
| case aco_opcode::p_cbranch_nz: |
| assert(block.linear_succs[1] == target); |
| if (branch->operands[0].physReg() == exec) |
| bld.sopp(aco_opcode::s_cbranch_execnz, target); |
| else if (branch->operands[0].physReg() == vcc) |
| bld.sopp(aco_opcode::s_cbranch_vccnz, target); |
| else { |
| assert(branch->operands[0].physReg() == scc); |
| bld.sopp(aco_opcode::s_cbranch_scc1, target); |
| } |
| break; |
| case aco_opcode::p_cbranch_z: |
| assert(block.linear_succs[1] == target); |
| if (branch->operands[0].physReg() == exec) |
| bld.sopp(aco_opcode::s_cbranch_execz, target); |
| else if (branch->operands[0].physReg() == vcc) |
| bld.sopp(aco_opcode::s_cbranch_vccz, target); |
| else { |
| assert(branch->operands[0].physReg() == scc); |
| bld.sopp(aco_opcode::s_cbranch_scc0, target); |
| } |
| break; |
| default: unreachable("Unknown Pseudo branch instruction!"); |
| } |
| } |
| |
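/* Merges a purely linear block (one with no logical predecessors) into its
* single predecessor or single successor when no SOPP instruction sits at the
* boundary, leaving this block empty.
*/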
| void |
| try_stitch_linear_block(branch_ctx& ctx, Block& block) |
| { |
| /* Don't stitch blocks that are part of the logical CFG. */ |
| if (block.linear_preds.empty() || block.linear_succs.empty() || !block.logical_preds.empty()) |
| return; |
| |
| /* Try to stitch this block with the predecessor: |
| * This block must have exactly one predecessor and |
| * the predecessor must have exactly one successor. |
| */ |
| Block& pred = ctx.program->blocks[block.linear_preds[0]]; |
| if (block.linear_preds.size() == 1 && pred.linear_succs.size() == 1 && |
| (pred.instructions.empty() || !pred.instructions.back()->isSOPP())) { |
| /* Insert the instructions at the end of the predecessor and fixup edges. */ |
| pred.instructions.insert(pred.instructions.end(), |
| std::move_iterator(block.instructions.begin()), |
| std::move_iterator(block.instructions.end())); |
| for (unsigned succ_idx : block.linear_succs) { |
| Block& s = ctx.program->blocks[succ_idx]; |
| std::replace(s.linear_preds.begin(), s.linear_preds.end(), block.index, pred.index); |
| } |
| pred.linear_succs = std::move(block.linear_succs); |
| |
| block.instructions.clear(); |
| block.linear_preds.clear(); |
| block.linear_succs.clear(); |
| return; |
| } |
| |
| /* Try to stitch this block with the successor: |
| * This block must have exactly one successor and |
| * the successor must have exactly one predecessor. |
| */ |
| Block& succ = ctx.program->blocks[block.linear_succs[0]]; |
| if (block.linear_succs.size() == 1 && succ.linear_preds.size() == 1 && |
| (block.instructions.empty() || !block.instructions.back()->isSOPP())) { |
| /* Insert the instructions at the beginning of the successor. */ |
| succ.instructions.insert(succ.instructions.begin(), |
| std::move_iterator(block.instructions.begin()), |
| std::move_iterator(block.instructions.end())); |
| for (unsigned pred_idx : block.linear_preds) { |
| Block& p = ctx.program->blocks[pred_idx]; |
| if (!p.instructions.empty() && |
| instr_info.classes[(int)p.instructions.back()->opcode] == instr_class::branch && |
| p.instructions.back()->salu().imm == block.index) { |
| p.instructions.back()->salu().imm = succ.index; |
| } |
| std::replace(p.linear_succs.begin(), p.linear_succs.end(), block.index, succ.index); |
| } |
| succ.linear_preds = std::move(block.linear_preds); |
| |
| block.instructions.clear(); |
| block.linear_preds.clear(); |
| block.linear_succs.clear(); |
| } |
| } |
| |
| } /* end namespace */ |
| |
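/* Lowers pseudo branches to hardware branches and cleans up the linear CFG.
* Blocks are processed in reverse order so that successor information
* (blocks_incoming_exec_used) is already available for each block.
*/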
| void |
| lower_branches(Program* program) |
| { |
| branch_ctx ctx(program); |
| |
| for (int i = program->blocks.size() - 1; i >= 0; i--) { |
| Block& block = program->blocks[i]; |
| lower_branch_instruction(ctx, block); |
| eliminate_useless_exec_writes_in_block(ctx, block); |
| |
| if (block.kind & block_kind_break) |
| try_merge_break_with_continue(ctx, block); |
| |
| if (block.linear_succs.size() == 1 && block.logical_succs.size() <= 1) |
| try_remove_simple_block(ctx, block); |
| } |
| |
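/* With branches lowered and removed blocks emptied, merge chains of trivially
* linear blocks.
*/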
| for (Block& block : program->blocks) { |
| try_stitch_linear_block(ctx, block); |
| } |
| } |
| |
| } // namespace aco |