| //===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Merge the offset of address calculation into the offset field |
| // of instructions in a global address lowering sequence. |
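//
// e.g. (illustrative):
//   pcalau12i $a0, %pc_hi20(sym)
//   addi.d    $a0, $a0, %pc_lo12(sym)
//   ld.w      $a0, $a0, 8
// =>
//   pcalau12i $a0, %pc_hi20(sym+8)
//   ld.w      $a0, $a0, %pc_lo12(sym+8)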
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "LoongArch.h" |
| #include "LoongArchTargetMachine.h" |
| #include "llvm/CodeGen/MachineFunctionPass.h" |
| #include "llvm/CodeGen/Passes.h" |
| #include "llvm/MC/TargetRegistry.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Target/TargetOptions.h" |
| #include <optional> |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "loongarch-merge-base-offset" |
| #define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset" |
| |
| namespace { |
| |
| class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass { |
| const LoongArchSubtarget *ST = nullptr; |
  MachineRegisterInfo *MRI = nullptr;
| |
| public: |
| static char ID; |
| bool runOnMachineFunction(MachineFunction &Fn) override; |
| bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12, |
| MachineInstr *&Lo20, MachineInstr *&Hi12, |
| MachineInstr *&Last); |
| bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add, |
| MachineInstr *&Lo12); |
| |
| bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12, |
| MachineInstr *&Lo20, MachineInstr *&Hi12, |
| MachineInstr *&Last); |
| void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, |
| MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, |
| int64_t Offset); |
| bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12, |
| MachineInstr *&Lo20, MachineInstr *&Hi12, |
| MachineInstr *&Last, MachineInstr &TailAdd, |
| Register GAReg); |
| |
| bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12, |
| MachineInstr *&Lo20, MachineInstr *&Hi12, |
| MachineInstr *&Last); |
| |
| LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {} |
| |
| MachineFunctionProperties getRequiredProperties() const override { |
| return MachineFunctionProperties().set( |
| MachineFunctionProperties::Property::IsSSA); |
| } |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| AU.setPreservesCFG(); |
| MachineFunctionPass::getAnalysisUsage(AU); |
| } |
| |
| StringRef getPassName() const override { |
| return LoongArch_MERGE_BASE_OFFSET_NAME; |
| } |
| }; |
| } // end anonymous namespace |
| |
| char LoongArchMergeBaseOffsetOpt::ID = 0; |
| INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE, |
| LoongArch_MERGE_BASE_OFFSET_NAME, false, false) |
| |
| // Detect either of the patterns: |
| // |
| // 1. (small/medium): |
| // pcalau12i vreg1, %pc_hi20(s) |
| // addi.d vreg2, vreg1, %pc_lo12(s) |
| // |
| // 2. (large): |
| // pcalau12i vreg1, %pc_hi20(s) |
| // addi.d vreg2, $zero, %pc_lo12(s) |
| // lu32i.d vreg3, vreg2, %pc64_lo20(s) |
| // lu52i.d vreg4, vreg3, %pc64_hi12(s) |
| // add.d vreg5, vreg4, vreg1 |
| |
// The pattern is only accepted if:
//    1) For the small and medium patterns, the first instruction has only one
//    use, which is the ADDI.
//    2) For the large pattern, the first four instructions each have only one
//    use, and the user of the fourth instruction is the ADD.
//    3) The address operands have the appropriate type, reflecting the
//    lowering of a global address, block address or constant pool using the
//    pattern.
//    4) The offset value in the global address, block address or constant
//    pool is 0.
| bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, |
| MachineInstr *&Lo12, |
| MachineInstr *&Lo20, |
| MachineInstr *&Hi12, |
| MachineInstr *&Last) { |
| if (Hi20.getOpcode() != LoongArch::PCALAU12I) |
| return false; |
| |
| const MachineOperand &Hi20Op1 = Hi20.getOperand(1); |
| if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI) |
| return false; |
| |
| auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) { |
| return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress(); |
| }; |
| |
| if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0) |
| return false; |
| |
| Register HiDestReg = Hi20.getOperand(0).getReg(); |
| if (!MRI->hasOneUse(HiDestReg)) |
| return false; |
| |
| MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg); |
| if (UseInst->getOpcode() != LoongArch::ADD_D) { |
| Lo12 = UseInst; |
| if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) || |
| (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W)) |
| return false; |
| } else { |
    assert(ST->is64Bit() && "Large pattern is only supported on LA64");
| Last = UseInst; |
| |
| Register LastOp1Reg = Last->getOperand(1).getReg(); |
| if (!LastOp1Reg.isVirtual()) |
| return false; |
| Hi12 = MRI->getVRegDef(LastOp1Reg); |
| const MachineOperand &Hi12Op2 = Hi12->getOperand(2); |
| if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI) |
| return false; |
| if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0) |
| return false; |
| if (!MRI->hasOneUse(Hi12->getOperand(0).getReg())) |
| return false; |
| |
| Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg()); |
| const MachineOperand &Lo20Op2 = Lo20->getOperand(2); |
| if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO) |
| return false; |
| if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0) |
| return false; |
| if (!MRI->hasOneUse(Lo20->getOperand(0).getReg())) |
| return false; |
| |
| Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg()); |
| if (!MRI->hasOneUse(Lo12->getOperand(0).getReg())) |
| return false; |
| } |
| |
| const MachineOperand &Lo12Op2 = Lo12->getOperand(2); |
| assert(Hi20.getOpcode() == LoongArch::PCALAU12I); |
| if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO || |
| !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) || |
| Lo12Op2.getOffset() != 0) |
| return false; |
| |
| if (Hi20Op1.isGlobal()) { |
| LLVM_DEBUG(dbgs() << " Found lowered global address: " |
| << *Hi20Op1.getGlobal() << "\n"); |
| } else if (Hi20Op1.isBlockAddress()) { |
    LLVM_DEBUG(dbgs() << "  Found lowered block address: "
| << *Hi20Op1.getBlockAddress() << "\n"); |
| } else if (Hi20Op1.isCPI()) { |
| LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex() |
| << "\n"); |
| } |
| |
| return true; |
| } |
| |
| // Detect the pattern: |
| // |
| // (small/medium): |
| // lu12i.w vreg1, %le_hi20_r(s) |
| // add.w/d vreg2, vreg1, r2, %le_add_r(s) |
| // addi.w/d vreg3, vreg2, %le_lo12_r(s) |
| |
// The pattern is only accepted if:
//    1) The first instruction has only one use, which is the PseudoAddTPRel.
//    The second instruction has only one use, which is the ADDI. The second
//    source operand of the PseudoAddTPRel is the tp register.
//    2) The address operands have the appropriate type, reflecting the
//    lowering of a thread-local global address using the pattern.
//    3) The offset value in the thread-local global address is 0.
| bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, |
| MachineInstr *&Add, |
| MachineInstr *&Lo12) { |
| if (Hi20.getOpcode() != LoongArch::LU12I_W) |
| return false; |
| |
| auto isGlobalOrCPI = [](const MachineOperand &Op) { |
| return Op.isGlobal() || Op.isCPI(); |
| }; |
| |
| const MachineOperand &Hi20Op1 = Hi20.getOperand(1); |
| if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R || |
| !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0) |
| return false; |
| |
| Register HiDestReg = Hi20.getOperand(0).getReg(); |
| if (!MRI->hasOneUse(HiDestReg)) |
| return false; |
| |
| Add = &*MRI->use_instr_begin(HiDestReg); |
| if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) || |
| (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W)) |
| return false; |
| |
| if (Add->getOperand(2).getReg() != LoongArch::R2) |
| return false; |
| |
| const MachineOperand &AddOp3 = Add->getOperand(3); |
| if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R || |
| !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) || |
| AddOp3.getOffset() != 0) |
| return false; |
| |
| Register AddDestReg = Add->getOperand(0).getReg(); |
| if (!MRI->hasOneUse(AddDestReg)) |
| return false; |
| |
| Lo12 = &*MRI->use_instr_begin(AddDestReg); |
| if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) || |
| (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W)) |
| return false; |
| |
| const MachineOperand &Lo12Op2 = Lo12->getOperand(2); |
| if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R || |
| !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) || |
| Lo12Op2.getOffset() != 0) |
| return false; |
| |
| if (Hi20Op1.isGlobal()) { |
| LLVM_DEBUG(dbgs() << " Found lowered global address: " |
| << *Hi20Op1.getGlobal() << "\n"); |
| } else if (Hi20Op1.isCPI()) { |
| LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex() |
| << "\n"); |
| } |
| |
| return true; |
| } |
| |
// Update the offset in the Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
// Delete the tail instruction and rewrite all uses of its result to use the
// output of the last instruction in the sequence (Last if present, otherwise
// Lo12).
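// e.g. (illustrative, small/medium pcala):
//      Hi20: pcalau12i vreg1, %pc_hi20(s)
//      Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
//      Tail: addi.d vreg3, vreg2, 16
// =>
//      Hi20: pcalau12i vreg1, %pc_hi20(s+16)
//      Lo12: addi.d vreg2, vreg1, %pc_lo12(s+16)
//      (Tail is erased; uses of vreg3 are rewritten to vreg2.)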
| void LoongArchMergeBaseOffsetOpt::foldOffset( |
| MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, |
| MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, |
| int64_t Offset) { |
  // Put the offset into the Hi20 and Lo12 (and, for the large pattern, Lo20
  // and Hi12) instructions.
| Hi20.getOperand(1).setOffset(Offset); |
| Lo12.getOperand(2).setOffset(Offset); |
| if (Lo20 && Hi12) { |
| Lo20->getOperand(2).setOffset(Offset); |
| Hi12->getOperand(2).setOffset(Offset); |
| } |
| |
  // For tls-le, the offset of the second instruction (PseudoAddTPRel) should
  // also be updated.
| MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); |
| if (Hi20.getOpcode() == LoongArch::LU12I_W) |
| Add->getOperand(3).setOffset(Offset); |
| |
| // Delete the tail instruction. |
| MachineInstr *Def = Last ? Last : &Lo12; |
| MRI->constrainRegClass(Def->getOperand(0).getReg(), |
| MRI->getRegClass(Tail.getOperand(0).getReg())); |
| MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg()); |
| Tail.eraseFromParent(); |
| |
| LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n" |
| << " " << Hi20;); |
| if (Hi20.getOpcode() == LoongArch::LU12I_W) { |
| LLVM_DEBUG(dbgs() << " " << *Add;); |
| } |
| LLVM_DEBUG(dbgs() << " " << Lo12;); |
| if (Lo20 && Hi12) { |
| LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;); |
| } |
| } |
| |
// Detect patterns for large offsets that are passed into an ADD instruction.
// If the pattern is found, update the offset in the Hi20, (Add), Lo12,
// (Lo20 and Hi12) instructions, then delete TailAdd and the instructions
// that produced the offset.
| // |
| // (The instructions marked with "!" are not necessarily present) |
| // |
// Base address lowering is of the form:
//   1) pcala:
//      Hi20: pcalau12i vreg1, %pc_hi20(s)
// +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
// |    Lo20: lu32i.d vreg2, %pc64_lo20(s)        !
// +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
// |
// | 2) tls-le:
// |    Hi20: lu12i.w vreg1, %le_hi20_r(s)
// |    Add:  add.w/d vreg1, vreg1, r2, %le_add_r(s)
// +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
// |
// | The large offset can be one of the forms:
// |
// +-> 1) Offset that has non-zero bits in Hi20 and Lo12 bits:
// |      OffsetHi20: lu12i.w vreg3, 4
// |      OffsetLo12: ori voff, vreg3, 188 --------------------+
// |                                                           |
// +-> 2) Offset that has non-zero bits in Hi20 bits only:     |
// |      OffsetHi20: lu12i.w voff, 128 -----------------------+
// |                                                           |
// +-> 3) Offset that has non-zero bits in Lo20 bits:          |
// |      OffsetHi20: lu12i.w vreg3, 121 !                     |
// |      OffsetLo12: ori voff, vreg3, 122 !                   |
// |      OffsetLo20: lu32i.d voff, 123 -----------------------+
// +-> 4) Offset that has non-zero bits in Hi12 bits:          |
//        OffsetHi20: lu12i.w vreg3, 121 !                     |
//        OffsetLo12: ori voff, vreg3, 122 !                   |
//        OffsetLo20: lu32i.d vreg3, 123 !                     |
//        OffsetHi12: lu52i.d voff, vreg3, 124 ----------------+
//                                                             |
// TailAdd: add.d vreg4, vreg2, voff <-------------------------+
| // |
| bool LoongArchMergeBaseOffsetOpt::foldLargeOffset( |
| MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, |
| MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd, |
| Register GAReg) { |
| assert((TailAdd.getOpcode() == LoongArch::ADD_W || |
| TailAdd.getOpcode() == LoongArch::ADD_D) && |
| "Expected ADD instruction!"); |
| Register Rs = TailAdd.getOperand(1).getReg(); |
| Register Rt = TailAdd.getOperand(2).getReg(); |
| Register Reg = Rs == GAReg ? Rt : Rs; |
| SmallVector<MachineInstr *, 4> Instrs; |
| int64_t Offset = 0; |
| int64_t Mask = -1; |
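  // Mask excludes the bit ranges already supplied by LU52I.D/LU32I.D, which
  // overwrite (rather than add to) those bits of their source register.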
| |
  // Walk the chain of at most four instructions that can define Reg:
  // [ORI, LU12I.W, LU32I.D, LU52I.D].
| for (int i = 0; i < 4; i++) { |
    // Stop when the chain reaches R0; the whole offset has been collected.
| if (Reg == LoongArch::R0) |
| break; |
| |
| // Can't fold if the register has more than one use. |
| if (!Reg.isVirtual() || !MRI->hasOneUse(Reg)) |
| return false; |
| |
| MachineInstr *Curr = MRI->getVRegDef(Reg); |
| if (!Curr) |
| break; |
| |
| switch (Curr->getOpcode()) { |
| default: |
| // Can't fold if the instruction opcode is unexpected. |
| return false; |
| case LoongArch::ORI: { |
| MachineOperand ImmOp = Curr->getOperand(2); |
| if (ImmOp.getTargetFlags() != LoongArchII::MO_None) |
| return false; |
| Offset += ImmOp.getImm(); |
| Reg = Curr->getOperand(1).getReg(); |
| Instrs.push_back(Curr); |
| break; |
| } |
| case LoongArch::LU12I_W: { |
| MachineOperand ImmOp = Curr->getOperand(1); |
| if (ImmOp.getTargetFlags() != LoongArchII::MO_None) |
| return false; |
| Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask; |
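      // e.g. an immediate of 4 contributes 4 << 12 = 16384 (before masking).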
| Reg = LoongArch::R0; |
| Instrs.push_back(Curr); |
| break; |
| } |
| case LoongArch::LU32I_D: { |
| MachineOperand ImmOp = Curr->getOperand(2); |
| if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20) |
| return false; |
| Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask; |
| Mask ^= 0x000FFFFF00000000ULL; |
| Reg = Curr->getOperand(1).getReg(); |
| Instrs.push_back(Curr); |
| break; |
| } |
| case LoongArch::LU52I_D: { |
| MachineOperand ImmOp = Curr->getOperand(2); |
| if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12) |
| return false; |
| Offset += ImmOp.getImm() << 52; |
| Mask ^= 0xFFF0000000000000ULL; |
| Reg = Curr->getOperand(1).getReg(); |
| Instrs.push_back(Curr); |
| break; |
| } |
| } |
| } |
| |
  // Can't fold if no offset was extracted.
| if (!Offset) |
| return false; |
| |
| foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset); |
| LLVM_DEBUG(dbgs() << " Offset Instrs:\n"); |
  for (MachineInstr *I : Instrs) {
| LLVM_DEBUG(dbgs() << " " << *I); |
| I->eraseFromParent(); |
| } |
| |
| return true; |
| } |
| |
| bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20, |
| MachineInstr &Lo12, |
| MachineInstr *&Lo20, |
| MachineInstr *&Hi12, |
| MachineInstr *&Last) { |
| Register DestReg = |
| Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); |
| |
| // Look for arithmetic instructions we can get an offset from. |
| // We might be able to remove the arithmetic instructions by folding the |
| // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or |
| // LU12I_W+PseudoAddTPRel+ADDI. |
| if (!MRI->hasOneUse(DestReg)) |
| return false; |
| |
| // DestReg has only one use. |
| MachineInstr &Tail = *MRI->use_instr_begin(DestReg); |
| switch (Tail.getOpcode()) { |
| default: |
| LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:" |
| << Tail); |
| break; |
| case LoongArch::ADDI_W: |
| if (ST->is64Bit()) |
| return false; |
| [[fallthrough]]; |
| case LoongArch::ADDI_D: |
| case LoongArch::ADDU16I_D: { |
| // Offset is simply an immediate operand. |
| int64_t Offset = Tail.getOperand(2).getImm(); |
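    // e.g. an ADDU16I_D immediate of 2 encodes an offset of 2 << 16 = 0x20000.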
| if (Tail.getOpcode() == LoongArch::ADDU16I_D) |
| Offset = SignExtend64<32>(Offset << 16); |
| |
| // We might have two ADDIs in a row. |
| Register TailDestReg = Tail.getOperand(0).getReg(); |
| if (MRI->hasOneUse(TailDestReg)) { |
| MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg); |
| if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W) |
| return false; |
| if (TailTail.getOpcode() == LoongArch::ADDI_W || |
| TailTail.getOpcode() == LoongArch::ADDI_D) { |
| Offset += TailTail.getOperand(2).getImm(); |
| LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail); |
| foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset); |
| Tail.eraseFromParent(); |
| return true; |
| } |
| } |
| |
| LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail); |
| foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset); |
| return true; |
| } |
| case LoongArch::ADD_W: |
| if (ST->is64Bit()) |
| return false; |
| [[fallthrough]]; |
  case LoongArch::ADD_D:
    // The offset is too large to fit in the immediate field of ADDI.
    return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
| } |
| |
| return false; |
| } |
| |
// Memory access opcode mapping for transforms. For the large pattern
// (isLarge) the final address is produced by an ADD of two registers, so
// memory accesses are rewritten to their indexed (reg+reg) forms, e.g.
// LD_W -> LDX_W; otherwise the 12-bit immediate forms are kept.
| static unsigned getNewOpc(unsigned Op, bool isLarge) { |
| switch (Op) { |
| case LoongArch::LD_B: |
| return isLarge ? LoongArch::LDX_B : LoongArch::LD_B; |
| case LoongArch::LD_H: |
| return isLarge ? LoongArch::LDX_H : LoongArch::LD_H; |
| case LoongArch::LD_W: |
| case LoongArch::LDPTR_W: |
| return isLarge ? LoongArch::LDX_W : LoongArch::LD_W; |
| case LoongArch::LD_D: |
| case LoongArch::LDPTR_D: |
| return isLarge ? LoongArch::LDX_D : LoongArch::LD_D; |
| case LoongArch::LD_BU: |
| return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU; |
| case LoongArch::LD_HU: |
| return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU; |
| case LoongArch::LD_WU: |
| return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU; |
| case LoongArch::FLD_S: |
| return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S; |
| case LoongArch::FLD_D: |
| return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D; |
| case LoongArch::VLD: |
| return isLarge ? LoongArch::VLDX : LoongArch::VLD; |
| case LoongArch::XVLD: |
| return isLarge ? LoongArch::XVLDX : LoongArch::XVLD; |
| case LoongArch::VLDREPL_B: |
| return LoongArch::VLDREPL_B; |
| case LoongArch::XVLDREPL_B: |
| return LoongArch::XVLDREPL_B; |
| case LoongArch::ST_B: |
| return isLarge ? LoongArch::STX_B : LoongArch::ST_B; |
| case LoongArch::ST_H: |
| return isLarge ? LoongArch::STX_H : LoongArch::ST_H; |
| case LoongArch::ST_W: |
| case LoongArch::STPTR_W: |
| return isLarge ? LoongArch::STX_W : LoongArch::ST_W; |
| case LoongArch::ST_D: |
| case LoongArch::STPTR_D: |
| return isLarge ? LoongArch::STX_D : LoongArch::ST_D; |
| case LoongArch::FST_S: |
| return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S; |
| case LoongArch::FST_D: |
| return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D; |
| case LoongArch::VST: |
| return isLarge ? LoongArch::VSTX : LoongArch::VST; |
| case LoongArch::XVST: |
| return isLarge ? LoongArch::XVSTX : LoongArch::XVST; |
| default: |
| llvm_unreachable("Unexpected opcode for replacement"); |
| } |
| } |
| |
| bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, |
| MachineInstr &Lo12, |
| MachineInstr *&Lo20, |
| MachineInstr *&Hi12, |
| MachineInstr *&Last) { |
| Register DestReg = |
| Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); |
| |
| // If all the uses are memory ops with the same offset, we can transform: |
| // |
| // 1. (small/medium): |
| // 1.1. pcala |
| // pcalau12i vreg1, %pc_hi20(s) |
| // addi.d vreg2, vreg1, %pc_lo12(s) |
| // ld.w vreg3, 8(vreg2) |
| // |
| // => |
| // |
| // pcalau12i vreg1, %pc_hi20(s+8) |
| // ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1) |
| // |
| // 1.2. tls-le |
| // lu12i.w vreg1, %le_hi20_r(s) |
| // add.w/d vreg2, vreg1, r2, %le_add_r(s) |
| // addi.w/d vreg3, vreg2, %le_lo12_r(s) |
| // ld.w vreg4, 8(vreg3) |
| // |
| // => |
| // |
| // lu12i.w vreg1, %le_hi20_r(s+8) |
| // add.w/d vreg2, vreg1, r2, %le_add_r(s+8) |
| // ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2) |
| // |
| // 2. (large): |
| // pcalau12i vreg1, %pc_hi20(s) |
| // addi.d vreg2, $zero, %pc_lo12(s) |
| // lu32i.d vreg3, vreg2, %pc64_lo20(s) |
| // lu52i.d vreg4, vreg3, %pc64_hi12(s) |
| // add.d vreg5, vreg4, vreg1 |
| // ld.w vreg6, 8(vreg5) |
| // |
| // => |
| // |
| // pcalau12i vreg1, %pc_hi20(s+8) |
| // addi.d vreg2, $zero, %pc_lo12(s+8) |
| // lu32i.d vreg3, vreg2, %pc64_lo20(s+8) |
| // lu52i.d vreg4, vreg3, %pc64_hi12(s+8) |
| // ldx.w vreg6, vreg4, vreg1 |
| |
| std::optional<int64_t> CommonOffset; |
| DenseMap<const MachineInstr *, SmallVector<unsigned>> |
| InlineAsmMemoryOpIndexesMap; |
| for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) { |
| switch (UseMI.getOpcode()) { |
| default: |
| LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI); |
| return false; |
| case LoongArch::VLDREPL_B: |
| case LoongArch::XVLDREPL_B: |
      // We can't do this for the large pattern: VLDREPL/XVLDREPL have no
      // indexed (reg+reg) form.
| if (Last) |
| return false; |
| [[fallthrough]]; |
| case LoongArch::LD_B: |
| case LoongArch::LD_H: |
| case LoongArch::LD_W: |
| case LoongArch::LD_D: |
| case LoongArch::LD_BU: |
| case LoongArch::LD_HU: |
| case LoongArch::LD_WU: |
| case LoongArch::LDPTR_W: |
| case LoongArch::LDPTR_D: |
| case LoongArch::FLD_S: |
| case LoongArch::FLD_D: |
| case LoongArch::VLD: |
| case LoongArch::XVLD: |
| case LoongArch::ST_B: |
| case LoongArch::ST_H: |
| case LoongArch::ST_W: |
| case LoongArch::ST_D: |
| case LoongArch::STPTR_W: |
| case LoongArch::STPTR_D: |
| case LoongArch::FST_S: |
| case LoongArch::FST_D: |
| case LoongArch::VST: |
| case LoongArch::XVST: { |
| if (UseMI.getOperand(1).isFI()) |
| return false; |
      // The register defined by the lowering sequence must only be used as
      // the base address, not as the value register.
| if (DestReg == UseMI.getOperand(0).getReg()) |
| return false; |
| assert(DestReg == UseMI.getOperand(1).getReg() && |
| "Expected base address use"); |
| // All load/store instructions must use the same offset. |
| int64_t Offset = UseMI.getOperand(2).getImm(); |
| if (CommonOffset && Offset != CommonOffset) |
| return false; |
| CommonOffset = Offset; |
| break; |
| } |
| case LoongArch::INLINEASM: |
| case LoongArch::INLINEASM_BR: { |
      // We can't do this for the large pattern.
| if (Last) |
| return false; |
| SmallVector<unsigned> InlineAsmMemoryOpIndexes; |
| unsigned NumOps = 0; |
| for (unsigned I = InlineAsm::MIOp_FirstOperand; |
| I < UseMI.getNumOperands(); I += 1 + NumOps) { |
| const MachineOperand &FlagsMO = UseMI.getOperand(I); |
| // Should be an imm. |
| if (!FlagsMO.isImm()) |
| continue; |
| |
| const InlineAsm::Flag Flags(FlagsMO.getImm()); |
| NumOps = Flags.getNumOperandRegisters(); |
| |
| // Memory constraints have two operands. |
| if (NumOps != 2 || !Flags.isMemKind()) { |
          // If the register is used by something other than a memory
          // constraint, we should not fold.
| for (unsigned J = 0; J < NumOps; ++J) { |
| const MachineOperand &MO = UseMI.getOperand(I + 1 + J); |
| if (MO.isReg() && MO.getReg() == DestReg) |
| return false; |
| } |
| continue; |
| } |
| |
| // We can only do this for constraint m. |
| if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m) |
| return false; |
| |
| const MachineOperand &AddrMO = UseMI.getOperand(I + 1); |
| if (!AddrMO.isReg() || AddrMO.getReg() != DestReg) |
| continue; |
| |
| const MachineOperand &OffsetMO = UseMI.getOperand(I + 2); |
| if (!OffsetMO.isImm()) |
| continue; |
| |
| // All inline asm memory operands must use the same offset. |
| int64_t Offset = OffsetMO.getImm(); |
| if (CommonOffset && Offset != CommonOffset) |
| return false; |
| CommonOffset = Offset; |
| InlineAsmMemoryOpIndexes.push_back(I + 1); |
| } |
| InlineAsmMemoryOpIndexesMap.insert( |
| std::make_pair(&UseMI, InlineAsmMemoryOpIndexes)); |
| break; |
| } |
| } |
| } |
| |
| // We found a common offset. |
| // Update the offsets in global address lowering. |
| // We may have already folded some arithmetic so we need to add to any |
| // existing offset. |
| int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset; |
| // LA32 ignores the upper 32 bits. |
| if (!ST->is64Bit()) |
| NewOffset = SignExtend64<32>(NewOffset); |
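  // e.g. an accumulated offset of 0x100000004 becomes 4 on LA32.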
| // We can only fold simm32 offsets. |
| if (!isInt<32>(NewOffset)) |
| return false; |
| |
  // If this pass optimizes the code sequence successfully, the MO_RELAX
  // target-flag should be removed from the pcala code sequence. A tls-le
  // code sequence can still be relaxed after being optimized.
| // |
| // For example: |
| // pcalau12i $a0, %pc_hi20(symbol) |
| // addi.d $a0, $a0, %pc_lo12(symbol) |
| // ld.w $a0, $a0, 0 |
| // |
| // => |
| // |
| // pcalau12i $a0, %pc_hi20(symbol) |
| // ld.w $a0, $a0, %pc_lo12(symbol) |
| // |
  // The unoptimized code sequence can be relaxed by the linker. Once
  // optimized, it can no longer be relaxed, so the instructions must not
  // carry the MO_RELAX flag.
| Hi20.getOperand(1).setOffset(NewOffset); |
| MachineOperand &ImmOp = Lo12.getOperand(2); |
| ImmOp.setOffset(NewOffset); |
| if (Lo20 && Hi12) { |
| Lo20->getOperand(2).setOffset(NewOffset); |
| Hi12->getOperand(2).setOffset(NewOffset); |
| } |
| if (Hi20.getOpcode() == LoongArch::PCALAU12I) { |
| Hi20.getOperand(1).setTargetFlags( |
| LoongArchII::getDirectFlags(Hi20.getOperand(1))); |
| ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp)); |
| } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { |
| MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); |
| Add->getOperand(3).setOffset(NewOffset); |
| } |
| |
| // Update the immediate in the load/store instructions to add the offset. |
| const LoongArchInstrInfo &TII = *ST->getInstrInfo(); |
| for (MachineInstr &UseMI : |
| llvm::make_early_inc_range(MRI->use_instructions(DestReg))) { |
| if (UseMI.getOpcode() == LoongArch::INLINEASM || |
| UseMI.getOpcode() == LoongArch::INLINEASM_BR) { |
| auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI]; |
| for (unsigned I : InlineAsmMemoryOpIndexes) { |
| MachineOperand &MO = UseMI.getOperand(I + 1); |
| switch (ImmOp.getType()) { |
| case MachineOperand::MO_GlobalAddress: |
| MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(), |
| LoongArchII::getDirectFlags(ImmOp)); |
| break; |
| case MachineOperand::MO_MCSymbol: |
| MO.ChangeToMCSymbol(ImmOp.getMCSymbol(), |
| LoongArchII::getDirectFlags(ImmOp)); |
| MO.setOffset(ImmOp.getOffset()); |
| break; |
| case MachineOperand::MO_BlockAddress: |
| MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(), |
| LoongArchII::getDirectFlags(ImmOp)); |
| break; |
| default: |
| report_fatal_error("unsupported machine operand type"); |
| break; |
| } |
| } |
| } else { |
| UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last))); |
| if (Last) { |
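        // Rewrite to the indexed form: replace the base register and the
        // immediate offset with the two source registers of Last (the ADD).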
| UseMI.removeOperand(2); |
| UseMI.removeOperand(1); |
| UseMI.addOperand(Last->getOperand(1)); |
| UseMI.addOperand(Last->getOperand(2)); |
| UseMI.getOperand(1).setIsKill(false); |
| UseMI.getOperand(2).setIsKill(false); |
| } else { |
| UseMI.removeOperand(2); |
| UseMI.addOperand(ImmOp); |
| } |
| } |
| } |
| |
| if (Last) { |
| Last->eraseFromParent(); |
| return true; |
| } |
| |
| if (Hi20.getOpcode() == LoongArch::PCALAU12I) { |
| MRI->replaceRegWith(Lo12.getOperand(0).getReg(), |
| Hi20.getOperand(0).getReg()); |
| } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { |
| MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); |
| MRI->replaceRegWith(Lo12.getOperand(0).getReg(), |
| Add->getOperand(0).getReg()); |
| } |
| Lo12.eraseFromParent(); |
| return true; |
| } |
| |
| bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) { |
| if (skipFunction(Fn.getFunction())) |
| return false; |
| |
| ST = &Fn.getSubtarget<LoongArchSubtarget>(); |
| |
| bool MadeChange = false; |
| MRI = &Fn.getRegInfo(); |
| for (MachineBasicBlock &MBB : Fn) { |
| LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); |
| for (MachineInstr &Hi20 : MBB) { |
| MachineInstr *Lo12 = nullptr; |
| MachineInstr *Lo20 = nullptr; |
| MachineInstr *Hi12 = nullptr; |
| MachineInstr *Last = nullptr; |
| if (Hi20.getOpcode() == LoongArch::PCALAU12I) { |
        // Detect a foldable pcala code sequence in the small/medium/large
        // code models.
| if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last)) |
| continue; |
| } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { |
| MachineInstr *Add = nullptr; |
        // Detect a foldable tls-le code sequence in the small/medium code
        // models.
| if (!detectFoldable(Hi20, Add, Lo12)) |
| continue; |
| } else { |
| continue; |
| } |
      // For tls-le, we do not pass the second instruction (PseudoAddTPRel) in
      // order to reuse the existing hooks; the last three parameters (Lo20,
      // Hi12, Last) should always be nullptr.
| MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last); |
| MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last); |
| } |
| } |
| |
| return MadeChange; |
| } |
| |
| /// Returns an instance of the Merge Base Offset Optimization pass. |
| FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() { |
| return new LoongArchMergeBaseOffsetOpt(); |
| } |