blob: d26d4815132988ccef336279af5815b1990669d3 [file] [log] [blame]
/*
* Copyright © 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
namespace aco {
namespace {
/* Shared state for the branch lowering pass. */
struct branch_ctx {
   Program* program;
   /* Per-block flag (indexed by block index): whether the block still needs
    * the exec mask computed by its predecessors. Initialized to true for all
    * blocks so that back-edges (loops) are handled conservatively.
    */
   std::vector<bool> blocks_incoming_exec_used;

   branch_ctx(Program* program_)
       : program(program_), blocks_incoming_exec_used(program_->blocks.size(), true)
   {}
};
/* Removes the linear CFG edge between block and the successor with index
 * succ_index. If the successor thereby becomes unreachable (no remaining
 * linear predecessors), its instructions are discarded and all of its own
 * successor edges are removed recursively.
 */
void
remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index)
{
   Block& succ = ctx.program->blocks[succ_index];
   /* The edge must exist exactly once in each direction. */
   ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index);
   assert(std::next(it) == succ.linear_preds.end());
   succ.linear_preds.pop_back();
   it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index);
   assert(std::next(it) == block.linear_succs.end());
   block.linear_succs.pop_back();

   if (succ.linear_preds.empty()) {
      /* This block became unreachable - Recursively remove successors. */
      succ.instructions.clear();
      /* Each recursive call erases an element from succ.linear_succs, so a
       * range-for over the vector would be invalidated mid-iteration. Drain
       * the vector from the back instead.
       */
      while (!succ.linear_succs.empty())
         remove_linear_successor(ctx, succ, succ.linear_succs.back());
   }
}
/* Tries to remove a block that contains at most a single s_branch instruction
 * by redirecting all of its predecessors to its single linear successor.
 * The caller guarantees block.linear_succs.size() == 1 and at most one
 * logical successor.
 */
void
try_remove_simple_block(branch_ctx& ctx, Block& block)
{
   /* Only empty blocks or blocks whose only instruction is an s_branch qualify. */
   if (!block.instructions.empty() && block.instructions.front()->opcode != aco_opcode::s_branch)
      return;

   /* Don't remove the preheader as it might be needed as convergence point
    * in order to insert code (e.g. for loop alignment, wait states, etc.).
    */
   if (block.kind & block_kind_loop_preheader)
      return;

   unsigned succ_idx = block.linear_succs[0];
   Block& succ = ctx.program->blocks[succ_idx];

   /* Predecessors that could not be redirected keep branching to this block
    * and are collected here as the block's new predecessor list.
    */
   Block::edge_vec new_preds;
   for (unsigned pred_idx : block.linear_preds) {
      Block& pred = ctx.program->blocks[pred_idx];
      assert(pred.index < block.index);
      assert(!pred.instructions.empty() && pred.instructions.back()->isBranch());
      Instruction* branch = pred.instructions.back().get();
      if (branch->opcode == aco_opcode::p_branch) {
         /* The predecessor unconditionally jumps to this block. Redirect to successor. */
         pred.linear_succs[0] = succ_idx;
         succ.linear_preds.push_back(pred_idx);
      } else if (pred.linear_succs[0] == succ_idx || pred.linear_succs[1] == succ_idx) {
         /* The predecessor's alternative target is this block's successor.
          * Both edges lead to the same place, so make the branch unconditional.
          */
         pred.linear_succs[0] = succ_idx;
         pred.linear_succs[1] = pred.linear_succs.back(); /* In case of discard */
         pred.linear_succs.pop_back();
         branch->opcode = aco_opcode::p_branch;
         branch->branch().never_taken = false;
         branch->branch().rarely_taken = false;
      } else if (pred.linear_succs[1] == block.index) {
         /* The predecessor jumps to this block. Redirect to successor. */
         pred.linear_succs[1] = succ_idx;
         succ.linear_preds.push_back(pred_idx);
      } else {
         /* This block is the fall-through target of the predecessor. */
         assert(pred_idx == block.index - 1);
         if (block.instructions.empty()) {
            /* If this block is empty, just fall-through to the successor. */
            pred.linear_succs[0] = succ_idx;
            succ.linear_preds.push_back(pred_idx);
            continue;
         }
         /* Otherwise, check if there is a fall-through path for the jump target. */
         bool can_fallthrough = block.index < pred.linear_succs[1];
         for (unsigned j = block.index + 1; can_fallthrough && j < pred.linear_succs[1]; j++) {
            if (!ctx.program->blocks[j].instructions.empty())
               can_fallthrough = false;
         }
         if (!can_fallthrough) {
            /* This predecessor must keep branching to this block. */
            new_preds.push_back(pred_idx);
            continue;
         }
         /* Swap taken and fall-through targets of the predecessor's branch. */
         pred.linear_succs[0] = pred.linear_succs[1];
         pred.linear_succs[1] = succ_idx;
         succ.linear_preds.push_back(pred_idx);

         /* Invert the condition. This branch now falls through to its original target.
          * However, we don't update the fall-through target since this instruction
          * gets lowered in the next step, anyway.
          */
         if (branch->opcode == aco_opcode::p_cbranch_nz)
            branch->opcode = aco_opcode::p_cbranch_z;
         else
            branch->opcode = aco_opcode::p_cbranch_nz;
         branch->branch().never_taken = false;
         branch->branch().rarely_taken = false;
      }

      /* Update the branch target. */
      branch->branch().target[0] = succ_idx;
   }

   /* If this block is part of the logical CFG, also connect pre- and successors. */
   if (!block.logical_succs.empty()) {
      assert(block.logical_succs.size() == 1);
      unsigned logical_succ_idx = block.logical_succs[0];
      Block& logical_succ = ctx.program->blocks[logical_succ_idx];
      /* Remove this block from the logical successor's predecessor list;
       * the edge must exist exactly once.
       */
      ASSERTED auto it = std::remove(logical_succ.logical_preds.begin(),
                                     logical_succ.logical_preds.end(), block.index);
      assert(std::next(it) == logical_succ.logical_preds.end());
      logical_succ.logical_preds.pop_back();

      for (unsigned pred_idx : block.logical_preds) {
         Block& pred = ctx.program->blocks[pred_idx];
         std::replace(pred.logical_succs.begin(), pred.logical_succs.end(), block.index,
                      logical_succ_idx);
         if (pred.logical_succs.size() == 2 && pred.logical_succs[0] == pred.logical_succs[1])
            pred.logical_succs.pop_back(); /* This should have been optimized in NIR! */
         else
            logical_succ.logical_preds.push_back(pred_idx);
      }
      block.logical_succs.clear();
      block.logical_preds.clear();
   }

   /* Only the predecessors that couldn't be redirected remain. */
   block.linear_preds = new_preds;
   if (block.linear_preds.empty()) {
      /* The block became unreachable: disconnect it and drop its instructions. */
      remove_linear_successor(ctx, block, succ_idx);
      block.instructions.clear();
   }
}
/* Returns true if any definition or operand of the instruction overlaps the
 * register range [reg, reg + size).
 */
bool
instr_uses_reg(aco_ptr<Instruction>& instr, PhysReg reg, uint32_t size)
{
   /* Two register ranges overlap iff each one begins before the other ends. */
   auto overlaps = [reg, size](PhysReg start, unsigned count) -> bool
   { return start + count > reg && reg + size > start; };

   for (const Definition& def : instr->definitions) {
      if (overlaps(def.physReg(), def.size()))
         return true;
   }
   for (const Operand& op : instr->operands) {
      if (overlaps(op.physReg(), op.size()))
         return true;
   }
   return false;
}
/* Folds a loop-break block's exec update into the continue/merge block,
 * turning the merge block's unconditional back-edge into a conditional one
 * (and, on GFX9+, combining the s_andn2 with the exec copy into
 * s_andn2_wrexec).
 *
 * Look for this:
 * BB1:
 * ...
 * p_branch_z exec BB3, BB2
 * BB2:
 * ...
 * s[0:1], scc = s_andn2 s[0:1], exec
 * s_cbranch_scc0 BB4
 * BB3:
 * exec = s_mov_b64 s[0:1]
 * s_branch BB1
 * BB4:
 * ...
 *
 * And turn it into this:
 * BB1:
 * ...
 * p_branch_z exec BB3, BB2
 * BB2:
 * ...
 * BB3:
 * s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec
 * s_cbranch_scc1 BB1, BB4
 * BB4:
 * ...
 */
void
try_merge_break_with_continue(branch_ctx& ctx, Block& block)
{
   /* The break block must end in a conditional branch with two successors. */
   if (block.linear_succs.size() != 2 || block.instructions.size() < 2)
      return;

   Instruction* branch = block.instructions.back().get();
   if (branch->opcode != aco_opcode::s_cbranch_scc0)
      return;

   Block& merge = ctx.program->blocks[block.linear_succs[0]];
   Block& loopexit = ctx.program->blocks[block.linear_succs[1]];

   /* Just a jump to the loop header. */
   if (merge.linear_succs.size() != 1)
      return;

   /* Every other predecessor must enter the merge block via an exec-zero
    * branch, because the moved exec update will now execute for them too.
    */
   for (unsigned merge_pred : merge.linear_preds) {
      if (merge_pred == block.index)
         continue;

      Block& pred = ctx.program->blocks[merge_pred];
      Instruction* pred_branch = pred.instructions.back().get();

      /* The branch needs to be exec zero only, otherwise we corrupt exec. */
      if (pred_branch->opcode != aco_opcode::p_cbranch_z ||
          pred_branch->operands[0].physReg() != exec)
         return;
   }

   /* merge block: copy to exec, branch */
   if (merge.instructions.size() != 2 || merge.instructions.back()->opcode != aco_opcode::s_branch)
      return;

   Builder bld(ctx.program);
   Instruction* execwrite = merge.instructions[0].get();
   if (execwrite->opcode != bld.w64or32(Builder::s_mov) || !execwrite->writes_exec())
      return;

   /* break block: find s_andn2 */
   PhysReg exec_temp = execwrite->operands[0].physReg();
   Instruction* execsrc = nullptr;
   for (auto rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
      aco_ptr<Instruction>& instr = *rit;
      if (instr->opcode == bld.w64or32(Builder::s_andn2) &&
          instr->definitions[0].physReg() == exec_temp &&
          instr->operands[0].physReg() == exec_temp && instr->operands[1].physReg() == exec) {
         /* Detach the s_andn2 from the break block; it gets re-inserted in the
          * merge block below.
          */
         execsrc = instr.release();
         block.instructions.erase(std::next(rit).base());
         break;
      }

      /* There might be copies for phis after the execsrc instructions,
       * but these must not read / write the same register.
       */
      if (instr->writes_exec() || instr_uses_reg(instr, exec_temp, bld.lm.size()) ||
          instr_uses_reg(instr, scc, s1))
         break;
   }
   if (execsrc == nullptr)
      return;

   /* Use conditional branch in merge block. */
   merge.instructions.back()->opcode = aco_opcode::s_cbranch_scc1;

   /* The break block now falls through to the merge block, which takes over
    * the edge to the loop exit (ordered as: fall-through, then exit).
    */
   block.linear_succs.pop_back();
   block.linear_succs[0] = merge.index;
   merge.linear_succs.push_back(loopexit.index);
   std::swap(merge.linear_succs[0], merge.linear_succs[1]);
   std::replace(loopexit.linear_preds.begin(), loopexit.linear_preds.end(), block.index,
                merge.index);

   /* Check if we can use the loopexit as the fallthrough block.
    * Otherwise, we'll need an extra branch instruction.
    */
   for (unsigned i = merge.index + 1; i < loopexit.index; i++) {
      if (!ctx.program->blocks[i].instructions.empty()) {
         /* Re-purpose the old break branch as an unconditional jump to the
          * loop exit at the end of the merge block.
          */
         branch->opcode = aco_opcode::s_branch;
         merge.instructions.emplace_back(std::move(block.instructions.back()));
         break;
      }
   }
   /* Drop the (possibly moved-from) branch from the break block. */
   block.instructions.pop_back();

   if (ctx.program->gfx_level >= GFX9) {
      /* Combine s_andn2 and copy to exec to s_andn2_wrexec. */
      Instruction* wr_exec =
         bld.sop1(Builder::s_andn2_wrexec, execsrc->definitions[0], execsrc->definitions[1],
                  Definition(exec, bld.lm), execsrc->operands[0], execsrc->operands[1]);
      merge.instructions[0].reset(wr_exec);
   } else {
      /* Move s_andn2 to the merge block. */
      merge.instructions.emplace(merge.instructions.begin(), execsrc);
   }

   /* The merge block now reads exec, so predecessors' exec writes must be
    * kept alive by eliminate_useless_exec_writes_in_block.
    */
   ctx.blocks_incoming_exec_used[merge.index] = true;
}
/* Walks the block backwards and removes writes to exec whose value is never
 * needed before the next exec write. Also records in
 * ctx.blocks_incoming_exec_used whether this block depends on the exec mask
 * produced by its predecessors (consumed when those are processed later).
 */
void
eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block)
{
   /* Determine whether the exec mask at the end of this block is observed. */
   bool exec_write_used = false;
   if (block.kind & block_kind_end_with_regs) {
      /* Last block of a program with succeed shader part should respect final exec write. */
      exec_write_used = true;
   } else if (!block.linear_succs.empty()) {
      /* Check if the successor needs the outgoing exec mask from the current block. */
      exec_write_used = ctx.blocks_incoming_exec_used[block.linear_succs[0]];
   }

   /* Go through all instructions and eliminate useless exec writes. */
   for (int i = block.instructions.size() - 1; i >= 0; --i) {
      aco_ptr<Instruction>& instr = block.instructions[i];

      /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
      if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1) {
         /* The branch target block (salu().imm) may also observe exec. */
         exec_write_used |= ctx.blocks_incoming_exec_used[instr->salu().imm];
      }

      /* See if the current instruction needs or writes exec. */
      bool needs_exec = needs_exec_mask(instr.get());
      bool writes_exec =
         instr->writes_exec() && instr->definitions[0].regClass() == ctx.program->lane_mask;

      /* See if we found an unused exec write. */
      if (writes_exec && !exec_write_used) {
         /* Don't eliminate an instruction that writes registers other than exec and scc.
          * It is possible that this is eg. an s_and_saveexec and the saved value is
          * used by a later branch.
          */
         bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(),
                                         [](const Definition& def) -> bool
                                         { return def.physReg() != exec && def.physReg() != scc; });
         if (!writes_other) {
            /* Mark for deletion; the nullptrs are compacted away below. */
            instr.reset();
            continue;
         }
      }

      /* For a newly encountered exec write, clear the used flag. */
      if (writes_exec)
         exec_write_used = false;

      /* If the current instruction needs exec, mark it as used. */
      exec_write_used |= needs_exec;
   }

   /* Remember if the current block needs an incoming exec mask from its predecessors. */
   ctx.blocks_incoming_exec_used[block.index] = exec_write_used;

   /* Cleanup: remove deleted instructions from the vector. */
   auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
   block.instructions.resize(new_end - block.instructions.begin());
}
/**
 * Check if the branch instruction can be removed:
 * This is beneficial when executing the next block with an empty exec mask
 * is faster than the branch instruction itself.
 *
 * Override this judgement when:
 * - The application prefers to remove control flow
 * - The compiler stack knows that it's a divergent branch never taken
 */
bool
can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
{
   const uint32_t target = branch->target[0];
   /* A branch is uniform unless it is a conditional branch on the exec mask. */
   const bool uniform_branch =
      !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) &&
        branch->operands[0].physReg() == exec);

   if (branch->never_taken) {
      /* A never-taken uniform branch may only be removed if it skips nothing. */
      assert(!uniform_branch || std::all_of(std::next(ctx.program->blocks.begin(), block.index + 1),
                                            std::next(ctx.program->blocks.begin(), target),
                                            [](Block& b) { return b.instructions.empty(); }));
      return true;
   }

   /* Cannot remove back-edges. */
   if (block.index >= target)
      return false;

   const bool prefer_remove = branch->rarely_taken;
   unsigned num_scalar = 0;
   unsigned num_vector = 0;

   /* Check the instructions between branch and target */
   for (unsigned i = block.index + 1; i < target; i++) {
      /* Uniform conditional branches must not be ignored if they
       * are about to jump over actual instructions */
      if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
         return false;

      for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
         if (instr->isSOPP()) {
            /* Discard early exits and loop breaks and continues should work fine with
             * an empty exec mask.
             */
            if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
                instr->opcode == aco_opcode::s_cbranch_scc1 ||
                instr->opcode == aco_opcode::s_cbranch_execz ||
                instr->opcode == aco_opcode::s_cbranch_execnz) {
               bool is_break_continue =
                  ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue);
               bool discard_early_exit =
                  ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
               if (is_break_continue || discard_early_exit)
                  continue;
            }
            /* Any other SOPP (e.g. a branch elsewhere) prevents removal. */
            return false;
         } else if (instr->isSALU()) {
            num_scalar++;
         } else if (instr->isVALU() || instr->isVINTRP()) {
            if (instr->opcode == aco_opcode::v_writelane_b32 ||
                instr->opcode == aco_opcode::v_writelane_b32_e64) {
               /* writelane ignores exec, writing inactive lanes results in UB. */
               return false;
            }
            num_vector++;
            /* VALU which writes SGPRs are always executed on GFX10+ */
            if (ctx.program->gfx_level >= GFX10) {
               for (Definition& def : instr->definitions) {
                  if (def.regClass().type() == RegType::sgpr)
                     num_scalar++;
               }
            }
         } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
            /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
             * SMEM might be an invalid access, and barriers are probably expensive. */
            return false;
         } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
            // TODO: GFX6-9 can use vskip
            if (!prefer_remove)
               return false;
         } else if (instr->opcode != aco_opcode::p_debug_info) {
            assert(false && "Pseudo instructions should be lowered by this point.");
            return false;
         }

         /* Re-check the running cost estimate after each counted instruction. */
         if (!prefer_remove) {
            /* Under these conditions, we shouldn't remove the branch.
             * Don't care about the estimated cycles when the shader prefers flattening.
             */
            unsigned est_cycles;
            if (ctx.program->gfx_level >= GFX10)
               est_cycles = num_scalar * 2 + num_vector;
            else
               est_cycles = num_scalar * 4 + num_vector * 4;

            if (est_cycles > 16)
               return false;
         }
      }
   }

   return true;
}
/* Replaces the trailing pseudo branch of a block with the matching hardware
 * SOPP branch, or removes it entirely when can_remove_branch() allows it.
 */
void
lower_branch_instruction(branch_ctx& ctx, Block& block)
{
   if (block.instructions.empty() || !block.instructions.back()->isBranch())
      return;

   aco_ptr<Instruction> branch = std::move(block.instructions.back());
   const uint32_t target = branch->branch().target[0];
   block.instructions.pop_back();

   if (can_remove_branch(ctx, block, &branch->branch())) {
      /* Removing a conditional branch also removes the taken edge. */
      if (branch->opcode != aco_opcode::p_branch)
         remove_linear_successor(ctx, block, target);
      return;
   }

   /* Select the hardware opcode that implements the pseudo branch. */
   aco_opcode hw_op;
   if (branch->opcode == aco_opcode::p_branch) {
      assert(block.linear_succs[0] == target);
      hw_op = aco_opcode::s_branch;
   } else {
      const bool taken_if_nonzero = branch->opcode == aco_opcode::p_cbranch_nz;
      assert(taken_if_nonzero || branch->opcode == aco_opcode::p_cbranch_z);
      assert(block.linear_succs[1] == target);

      /* The condition must live in exec, vcc or scc. */
      const PhysReg cond = branch->operands[0].physReg();
      if (cond == exec) {
         hw_op = taken_if_nonzero ? aco_opcode::s_cbranch_execnz : aco_opcode::s_cbranch_execz;
      } else if (cond == vcc) {
         hw_op = taken_if_nonzero ? aco_opcode::s_cbranch_vccnz : aco_opcode::s_cbranch_vccz;
      } else {
         assert(cond == scc);
         hw_op = taken_if_nonzero ? aco_opcode::s_cbranch_scc1 : aco_opcode::s_cbranch_scc0;
      }
   }

   /* Emit the hardware branch at the end of the block. */
   Builder bld(ctx.program, &block.instructions);
   bld.sopp(hw_op, target);
}
/* Merges a purely linear block (one not participating in the logical CFG)
 * into its unique predecessor or unique successor, leaving the block empty
 * and disconnected.
 */
void
try_stitch_linear_block(branch_ctx& ctx, Block& block)
{
   /* Don't stitch blocks that are part of the logical CFG. */
   if (block.linear_preds.empty() || block.linear_succs.empty() || !block.logical_preds.empty())
      return;

   /* Try to stitch this block with the predecessor:
    * This block must have exactly one predecessor and
    * the predecessor must have exactly one successor.
    */
   Block& pred = ctx.program->blocks[block.linear_preds[0]];
   if (block.linear_preds.size() == 1 && pred.linear_succs.size() == 1 &&
       (pred.instructions.empty() || !pred.instructions.back()->isSOPP())) {
      /* Insert the instructions at the end of the predecessor and fixup edges. */
      pred.instructions.insert(pred.instructions.end(),
                               std::move_iterator(block.instructions.begin()),
                               std::move_iterator(block.instructions.end()));
      /* This block's successors become successors of the predecessor. */
      for (unsigned succ_idx : block.linear_succs) {
         Block& s = ctx.program->blocks[succ_idx];
         std::replace(s.linear_preds.begin(), s.linear_preds.end(), block.index, pred.index);
      }
      pred.linear_succs = std::move(block.linear_succs);
      /* Leave the block empty and fully disconnected. */
      block.instructions.clear();
      block.linear_preds.clear();
      block.linear_succs.clear();
      return;
   }

   /* Try to stitch this block with the successor:
    * This block must have exactly one successor and
    * the successor must have exactly one predecessor.
    */
   Block& succ = ctx.program->blocks[block.linear_succs[0]];
   if (block.linear_succs.size() == 1 && succ.linear_preds.size() == 1 &&
       (block.instructions.empty() || !block.instructions.back()->isSOPP())) {
      /* Insert the instructions at the beginning of the successor. */
      succ.instructions.insert(succ.instructions.begin(),
                               std::move_iterator(block.instructions.begin()),
                               std::move_iterator(block.instructions.end()));
      /* Retarget predecessors' branch immediates and edges to the successor. */
      for (unsigned pred_idx : block.linear_preds) {
         Block& p = ctx.program->blocks[pred_idx];
         if (!p.instructions.empty() &&
             instr_info.classes[(int)p.instructions.back()->opcode] == instr_class::branch &&
             p.instructions.back()->salu().imm == block.index) {
            p.instructions.back()->salu().imm = succ.index;
         }
         std::replace(p.linear_succs.begin(), p.linear_succs.end(), block.index, succ.index);
      }
      succ.linear_preds = std::move(block.linear_preds);
      /* Leave the block empty and fully disconnected. */
      block.instructions.clear();
      block.linear_preds.clear();
      block.linear_succs.clear();
   }
}
} /* end namespace */
/* Pass entry point: lowers pseudo branches to hardware branches and cleans up
 * the linear CFG (useless exec writes, removable blocks, break/continue
 * merging, block stitching).
 */
void
lower_branches(Program* program)
{
   branch_ctx ctx(program);

   /* Walk blocks in reverse so every successor (which always has a higher
    * index) is processed before its predecessors.
    */
   for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
      Block& block = *it;

      lower_branch_instruction(ctx, block);
      eliminate_useless_exec_writes_in_block(ctx, block);

      if (block.kind & block_kind_break)
         try_merge_break_with_continue(ctx, block);

      const bool single_linear_succ = block.linear_succs.size() == 1;
      if (single_linear_succ && block.logical_succs.size() <= 1)
         try_remove_simple_block(ctx, block);
   }

   /* Finally, merge adjacent purely-linear blocks. */
   for (Block& block : program->blocks)
      try_stitch_linear_block(ctx, block);
}
} // namespace aco