//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
/// Additionally, it unpacks packed FP32 instructions (V_PK_MUL_F32,
/// V_PK_ADD_F32, V_PK_FMA_F32) that follow MFMAs so that they can be
/// co-issued. This helps overlap MFMAs with vector ALU instructions in the
/// machine schedule and is expected to improve performance. Only packed
/// instructions that fall within the MFMA's latency window are unpacked; the
/// rest are left untouched.
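///
/// For example (registers and exact asm syntax are illustrative only; default
/// op_sel/op_sel_hi modifiers assumed), a sequence such as
///   v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15]
///   v_pk_mul_f32 v[4:5], v[6:7], v[8:9]
/// is rewritten so that the packed multiply becomes two co-issuable VOP3
/// instructions:
///   v_mul_f32_e64 v4, v6, v8
///   v_mul_f32_e64 v5, v7, v9
///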
/// TODO: Add support for F16 packed instructions
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;
#define DEBUG_TYPE "si-pre-emit-peephole"
namespace {
class SIPreEmitPeephole {
private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
bool getBlockDestinations(MachineBasicBlock &SrcMBB,
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
bool mustRetainExeczBranch(const MachineInstr &Branch,
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles);
// v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
// op_sel_hi:[0,0,0]
// ==>
// v_fma_f32 v0, v1, v3, v3
// v_fma_f32 v1, v0, v2, v2
// Here, we have overwritten v0 before we use it. This function checks if
// unpacking can lead to such a situation.
bool canUnpackingClobberRegister(const MachineInstr &MI);
  // Unpack F32 packed instructions and insert the unpacked instructions in
  // their place. Currently only V_PK_MUL_F32, V_PK_ADD_F32, and V_PK_FMA_F32
  // are supported for this transformation.
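  // A minimal illustrative example (registers are hypothetical; default
  // op_sel/op_sel_hi modifiers assumed):
  //   v_pk_add_f32 v[4:5], v[0:1], v[2:3]
  //   ==>
  //   v_add_f32_e64 v4, v0, v2
  //   v_add_f32_e64 v5, v1, v3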
void performF32Unpacking(MachineInstr &I);
  // Select the unpacked opcode corresponding to a packed instruction.
uint16_t mapToUnpackedOpcode(MachineInstr &I);
// Creates the unpacked instruction to be inserted. Adds source modifiers to
// the unpacked instructions based on the source modifiers in the packed
// instruction.
MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
bool IsHiBits);
// Process operands/source modifiers from packed instructions and insert the
  // appropriate source modifiers and operands into the unpacked instructions.
void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
bool IsHiBits, const MachineOperand &SrcMO);
public:
bool run(MachineFunction &MF);
};
class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
static char ID;
SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override {
return SIPreEmitPeephole().run(MF);
}
};
} // End anonymous namespace.
INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
"SI peephole optimizations", false, false)
char SIPreEmitPeepholeLegacy::ID = 0;
char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
// Match:
// sreg = -1 or 0
// vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
// S_CBRANCH_VCC[N]Z
// =>
// S_CBRANCH_EXEC[N]Z
// We end up with this pattern sometimes after basic block placement.
// It happens while combining a block which assigns -1 or 0 to a saved mask
// and another block which consumes that saved mask and then a branch.
//
// While searching this also performs the following substitution:
// vcc = V_CMP
// vcc = S_AND exec, vcc
// S_CBRANCH_VCC[N]Z
// =>
// vcc = V_CMP
// S_CBRANCH_VCC[N]Z
bool Changed = false;
MachineBasicBlock &MBB = *MI.getParent();
const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
const bool IsWave32 = ST.isWave32();
const unsigned CondReg = TRI->getVCC();
const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
E = MBB.rend();
bool ReadsCond = false;
unsigned Threshold = 5;
for (++A; A != E; ++A) {
if (!--Threshold)
return false;
if (A->modifiesRegister(ExecReg, TRI))
return false;
if (A->modifiesRegister(CondReg, TRI)) {
if (!A->definesRegister(CondReg, TRI) ||
(A->getOpcode() != And && A->getOpcode() != AndN2))
return false;
break;
}
ReadsCond |= A->readsRegister(CondReg, TRI);
}
if (A == E)
return false;
MachineOperand &Op1 = A->getOperand(1);
MachineOperand &Op2 = A->getOperand(2);
if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
TII->commuteInstruction(*A);
Changed = true;
}
if (Op1.getReg() != ExecReg)
return Changed;
if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
return Changed;
int64_t MaskValue = 0;
Register SReg;
if (Op2.isReg()) {
SReg = Op2.getReg();
auto M = std::next(A);
bool ReadsSreg = false;
bool ModifiesExec = false;
for (; M != E; ++M) {
if (M->definesRegister(SReg, TRI))
break;
if (M->modifiesRegister(SReg, TRI))
return Changed;
ReadsSreg |= M->readsRegister(SReg, TRI);
ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
}
if (M == E)
return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
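    // For example (wave64; operands are illustrative):
    //   vcc = V_CMP_CLASS_F32 v0, v1   ; VOPC, caught by isVOPC
    //   vcc = S_AND_B64 exec, vcc      ; erased here
    //   S_CBRANCH_VCCNZ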
if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
TII->isVOPC(*M)) {
A->eraseFromParent();
return true;
}
if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
(M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
return Changed;
MaskValue = M->getOperand(1).getImm();
    // If SReg is only used in the AND instruction, fold the immediate into the
    // AND and erase the move.
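    // For example (wave64; registers are illustrative):
    //   s[2:3] = S_MOV_B64 -1            ; erased
    //   vcc = S_AND_B64 exec, s[2:3]     ; becomes vcc = S_AND_B64 exec, -1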
if (!ReadsSreg && Op2.isKill()) {
A->getOperand(2).ChangeToImmediate(MaskValue);
M->eraseFromParent();
}
} else if (Op2.isImm()) {
MaskValue = Op2.getImm();
} else {
llvm_unreachable("Op2 must be register or immediate");
}
// Invert mask for s_andn2
assert(MaskValue == 0 || MaskValue == -1);
if (A->getOpcode() == AndN2)
MaskValue = ~MaskValue;
if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
if (!MI.killsRegister(CondReg, TRI)) {
// Replace AND with MOV
if (MaskValue == 0) {
BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
.addImm(0);
} else {
BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
.addReg(ExecReg);
}
}
// Remove AND instruction
A->eraseFromParent();
}
bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
if (SReg == ExecReg) {
// EXEC is updated directly
if (IsVCCZ) {
MI.eraseFromParent();
return true;
}
MI.setDesc(TII->get(AMDGPU::S_BRANCH));
} else if (IsVCCZ && MaskValue == 0) {
// Will always branch
// Remove all successors shadowed by new unconditional branch
MachineBasicBlock *Parent = MI.getParent();
SmallVector<MachineInstr *, 4> ToRemove;
bool Found = false;
for (MachineInstr &Term : Parent->terminators()) {
if (Found) {
if (Term.isBranch())
ToRemove.push_back(&Term);
} else {
Found = Term.isIdenticalTo(MI);
}
}
assert(Found && "conditional branch is not terminator");
for (auto *BranchMI : ToRemove) {
MachineOperand &Dst = BranchMI->getOperand(0);
assert(Dst.isMBB() && "destination is not basic block");
Parent->removeSuccessor(Dst.getMBB());
BranchMI->eraseFromParent();
}
if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
Parent->removeSuccessor(Succ);
}
// Rewrite to unconditional branch
MI.setDesc(TII->get(AMDGPU::S_BRANCH));
} else if (!IsVCCZ && MaskValue == 0) {
// Will never branch
MachineOperand &Dst = MI.getOperand(0);
assert(Dst.isMBB() && "destination is not basic block");
MI.getParent()->removeSuccessor(Dst.getMBB());
MI.eraseFromParent();
return true;
} else if (MaskValue == -1) {
// Depends only on EXEC
MI.setDesc(
TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
}
MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
MI.addImplicitDefUseOperands(*MBB.getParent());
return true;
}
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
const MachineFunction &MF = *MBB.getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
SmallVector<MachineInstr *, 4> ToRemove;
bool IdxOn = true;
if (!MI.isIdenticalTo(First))
return false;
  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to check
  // whether the second one is redundant.
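  // For example (operands and syntax are illustrative):
  //   s_set_gpr_idx_on s2, gpr_idx(SRC0)   ; First
  //   v_mov_b32 v0, v1                     ; indirect read, kept
  //   s_set_gpr_idx_off                    ; removed
  //   s_set_gpr_idx_on s2, gpr_idx(SRC0)   ; MI, removed
  //   v_mov_b32 v3, v4                     ; indirect read, kept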
for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
E = MI.getIterator();
I != E; ++I) {
if (I->isBundle())
continue;
switch (I->getOpcode()) {
case AMDGPU::S_SET_GPR_IDX_MODE:
return false;
case AMDGPU::S_SET_GPR_IDX_OFF:
IdxOn = false;
ToRemove.push_back(&*I);
break;
default:
if (I->modifiesRegister(AMDGPU::M0, TRI))
return false;
if (IdxReg && I->modifiesRegister(IdxReg, TRI))
return false;
if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
})) {
// The only exception allowed here is another indirect vector move
// with the same mode.
if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
return false;
}
}
}
MI.eraseFromBundle();
for (MachineInstr *RI : ToRemove)
RI->eraseFromBundle();
return true;
}
bool SIPreEmitPeephole::getBlockDestinations(
MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
return false;
if (!FalseMBB)
FalseMBB = SrcMBB.getNextNode();
return true;
}
namespace {
class BranchWeightCostModel {
const SIInstrInfo &TII;
const TargetSchedModel &SchedModel;
BranchProbability BranchProb;
static constexpr uint64_t BranchNotTakenCost = 1;
uint64_t BranchTakenCost;
uint64_t ThenCyclesCost = 0;
public:
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
const MachineBasicBlock &Succ)
: TII(TII), SchedModel(TII.getSchedModel()) {
const MachineBasicBlock &Head = *Branch.getParent();
const auto *FromIt = find(Head.successors(), &Succ);
assert(FromIt != Head.succ_end());
BranchProb = Head.getSuccProbability(FromIt);
if (BranchProb.isUnknown())
BranchProb = BranchProbability::getZero();
BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
}
bool isProfitable(const MachineInstr &MI) {
if (TII.isWaitcnt(MI.getOpcode()))
return false;
ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
    // Consider `P = N/D` to be the probability of execz being false, i.e. of
    // falling through into the 'then' block. The transformation is profitable
    // if always executing the 'then' block is cheaper than sometimes executing
    // 'then' and always executing s_cbranch_execz:
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
// BranchNotTakenCost
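    // A worked example with illustrative numbers: if P = 3/4 (N = 3, D = 4),
    // BranchTakenCost = 4 and BranchNotTakenCost = 1, this reduces to
    // 1 * ThenCyclesCost <= 1 * 4 + 3 * 1, so removing the branch stays
    // profitable while ThenCyclesCost <= 7.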
uint64_t Numerator = BranchProb.getNumerator();
uint64_t Denominator = BranchProb.getDenominator();
return (Denominator - Numerator) * ThenCyclesCost <=
((Denominator - Numerator) * BranchTakenCost +
Numerator * BranchNotTakenCost);
}
};
bool SIPreEmitPeephole::mustRetainExeczBranch(
const MachineInstr &Branch, const MachineBasicBlock &From,
const MachineBasicBlock &To) const {
assert(is_contained(Branch.getParent()->successors(), &From));
BranchWeightCostModel CostModel{*TII, Branch, From};
const MachineFunction *MF = From.getParent();
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;
for (const MachineInstr &MI : MBB) {
// When a uniform loop is inside non-uniform control flow, the branch
// leaving the loop might never be taken when EXEC = 0.
// Hence we should retain cbranch out of the loop lest it become infinite.
if (MI.isConditionalBranch())
return true;
if (MI.isUnconditionalBranch() &&
TII->getBranchDestBlock(MI) != MBB.getNextNode())
return true;
if (MI.isMetaInstruction())
continue;
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
return true;
if (!CostModel.isProfitable(MI))
return true;
}
}
return false;
}
} // namespace
// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
if (!TII->getSchedModel().hasInstrSchedModel())
return false;
MachineBasicBlock *TrueMBB = nullptr;
MachineBasicBlock *FalseMBB = nullptr;
SmallVector<MachineOperand, 1> Cond;
if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
return false;
// Consider only the forward branches.
if (SrcMBB.getNumber() >= TrueMBB->getNumber())
return false;
  // Remove the branch only if doing so is both legal and profitable.
if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
return false;
LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
MI.eraseFromParent();
SrcMBB.removeSuccessor(TrueMBB);
return true;
}
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
// Only the first register in the register pair needs to be checked due to the
// unpacking order. Packed instructions are unpacked such that the lower 32
// bits (i.e., the first register in the pair) are written first. This can
// introduce dependencies if the first register is written in one instruction
// and then read as part of the higher 32 bits in the subsequent instruction.
// Such scenarios can arise due to specific combinations of op_sel and
// op_sel_hi modifiers.
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
if (Src0MO && Src0MO->isReg()) {
Register SrcReg0 = Src0MO->getReg();
unsigned Src0Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
// Check if the register selected by op_sel_hi is the same as the first
// register in the destination register pair.
if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
return true;
}
const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (Src1MO && Src1MO->isReg()) {
Register SrcReg1 = Src1MO->getReg();
unsigned Src1Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
: TRI->getSubReg(SrcReg1, AMDGPU::sub0);
if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
return true;
}
// Applicable for packed instructions with 3 source operands, such as
// V_PK_FMA.
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
const MachineOperand *Src2MO =
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2MO && Src2MO->isReg()) {
Register SrcReg2 = Src2MO->getReg();
unsigned Src2Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
return true;
}
}
return false;
}
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
  // Use the 64-bit encoding so that VOP3 instructions are selected: VOP3 (e64)
  // instructions allow source modifiers, whereas e32 instructions do not.
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
return AMDGPU::V_ADD_F32_e64;
case AMDGPU::V_PK_MUL_F32:
return AMDGPU::V_MUL_F32_e64;
case AMDGPU::V_PK_FMA_F32:
return AMDGPU::V_FMA_F32_e64;
default:
return std::numeric_limits<uint16_t>::max();
}
llvm_unreachable("Fully covered switch");
}
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
unsigned SrcMods, bool IsHiBits,
const MachineOperand &SrcMO) {
unsigned NewSrcMods = 0;
unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
// Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
// for ABS modifiers.
// If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
// lane.
// NEG_HI shares the same bit position with ABS. But packed instructions do
// not support ABS. Therefore, NEG_HI must be translated to NEG source
// modifier for the higher 32 bits. Unpacked VOP3 instructions support
// ABS, but do not support NEG_HI. Therefore we need to explicitly add the
// NEG modifier if present in the packed instruction.
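  // For example (registers are illustrative):
  //   v_pk_mul_f32 v[4:5], v[0:1], v[2:3] neg_hi:[1,0]
  //   ==>
  //   v_mul_f32_e64 v4, v0, v2
  //   v_mul_f32_e64 v5, -v1, v3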
if (SrcMods & NegModifier)
NewSrcMods |= SISrcMods::NEG;
  // Add the source modifiers; only the NEG modifier is propagated when needed.
  // Unpacked operations do not have op_sel, so it must be handled explicitly,
  // as done below.
NewMI.addImm(NewSrcMods);
if (SrcMO.isImm()) {
NewMI.addImm(SrcMO.getImm());
return;
}
  // If the op_sel bit is 0, select sub0 of the source register pair; otherwise
  // select sub1.
Register UnpackedSrcReg = (SrcMods & OpSelModifier)
? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
: TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
MachineOperand UnpackedSrcMO =
MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
if (SrcMO.isKill()) {
// For each unpacked instruction, mark its source registers as killed if the
// corresponding source register in the original packed instruction was
// marked as killed.
//
// Exception:
// If the op_sel and op_sel_hi modifiers require both unpacked instructions
// to use the same register (e.g., due to overlapping access to low/high
// bits of the same packed register), then only the *second* (latter)
// instruction should mark the register as killed. This is because the
// second instruction handles the higher bits and is effectively the last
// user of the full register pair.
bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
bool KillState = true;
if ((OpSel == OpSelHi) && !IsHiBits)
KillState = false;
UnpackedSrcMO.setIsKill(KillState);
}
NewMI.add(UnpackedSrcMO);
}
void SIPreEmitPeephole::collectUnpackingCandidates(
MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
auto E = BB->end();
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
Register MFMADef = BeginMI.getOperand(0).getReg();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    bool IsUnpackable =
        UnpackedOpCode != std::numeric_limits<uint16_t>::max();
if (Instr.isMetaInstruction())
continue;
if ((Instr.isTerminator()) ||
(TII->isNeverCoissue(Instr) && !IsUnpackable) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
const MCSchedClassDesc *InstrSchedClassDesc =
SchedModel.resolveSchedClass(&Instr);
uint16_t Latency =
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
TotalCyclesBetweenCandidates += Latency;
if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
return;
// Identify register dependencies between those used by the MFMA
// instruction and the following packed instructions. Also checks for
// transitive dependencies between the MFMA def and candidate instruction
// def and uses. Conservatively ensures that we do not incorrectly
// read/write registers.
for (const MachineOperand &InstrMO : Instr.operands()) {
if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
continue;
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
}
if (!IsUnpackable)
continue;
if (canUnpackingClobberRegister(Instr))
return;
// If it's a packed instruction, adjust latency: remove the packed
// latency, add latency of two unpacked instructions (currently estimated
// as 2 cycles).
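    // For example (cycle counts are illustrative): if the packed instruction's
    // ReleaseAtCycle is 4, the 4 cycles added above are removed and 2 cycles
    // are added for the pair of unpacked instructions.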
TotalCyclesBetweenCandidates -= Latency;
// TODO: improve latency handling based on instruction modeling.
TotalCyclesBetweenCandidates += 2;
// Subtract 1 to account for MFMA issue latency.
if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
InstrsToUnpack.insert(&Instr);
}
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
MachineOperand DstOp = I.getOperand(0);
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
"Unsupported Opcode");
MachineInstrBuilder Op0LOp1L =
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand &LoDstOp = Op0LOp1L->getOperand(0);
LoDstOp.setIsUndef(DstOp.isUndef());
MachineInstrBuilder Op0HOp1H =
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand &HiDstOp = Op0HOp1H->getOperand(0);
uint32_t IFlags = I.getFlags();
Op0LOp1L->setFlags(IFlags);
Op0HOp1H->setFlags(IFlags);
LoDstOp.setIsRenamable(DstOp.isRenamable());
HiDstOp.setIsRenamable(DstOp.isRenamable());
I.eraseFromParent();
}
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
uint16_t UnpackedOpcode,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
: TRI->getSubReg(DstReg, AMDGPU::sub0);
int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
unsigned Src0Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
unsigned Src1Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
const MachineOperand *SrcMO2 =
TII->getNamedOperand(I, AMDGPU::OpName::src2);
unsigned Src2Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
}
NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers, so it is safe to set
  // omod to 0 here.
NewMI.addImm(0); // omod
return NewMI;
}
PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
if (!SIPreEmitPeephole().run(MF))
return PreservedAnalyses::all();
return getMachineFunctionPassPreservedAnalyses();
}
bool SIPreEmitPeephole::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
bool Changed = false;
MF.RenumberBlocks();
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
// Check first terminator for branches to optimize
if (TermI != MBB.end()) {
MachineInstr &MI = *TermI;
switch (MI.getOpcode()) {
case AMDGPU::S_CBRANCH_VCCZ:
case AMDGPU::S_CBRANCH_VCCNZ:
Changed |= optimizeVccBranch(MI);
break;
case AMDGPU::S_CBRANCH_EXECZ:
Changed |= removeExeczBranch(MI, MBB);
break;
}
}
if (!ST.hasVGPRIndexMode())
continue;
MachineInstr *SetGPRMI = nullptr;
const unsigned Threshold = 20;
unsigned Count = 0;
    // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see whether
    // the second one is redundant. The expensive checks are done in
    // optimizeSetGPR(), and the scan distance is limited to 20 instructions
    // for compile-time reasons.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
for (auto &MI : make_early_inc_range(MBB.instrs())) {
if (Count == Threshold)
SetGPRMI = nullptr;
else
++Count;
if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
continue;
Count = 0;
if (!SetGPRMI) {
SetGPRMI = &MI;
continue;
}
if (optimizeSetGPR(*SetGPRMI, MI))
Changed = true;
else
SetGPRMI = &MI;
}
}
  // TODO: Fold this into the loop above, if possible. Evaluate and handle any
  // side effects.
  // Perform the extra MF scan only on supported architectures.
if (!ST.hasGFX940Insts())
return Changed;
for (MachineBasicBlock &MBB : MF) {
    // Unpack packed instructions overlapped by MFMAs. This allows the
    // compiler to co-issue the unpacked instructions with the MFMA.
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
if (!SIInstrInfo::isMFMA(MI))
continue;
const MCSchedClassDesc *SchedClassDesc =
SchedModel.resolveSchedClass(&MI);
uint16_t NumMFMACycles =
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
}
for (MachineInstr *MI : InstrsToUnpack) {
performF32Unpacking(*MI);
}
}
return Changed;
}