//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask, and the number of intrinsics and the number of
//   vaddr/vdata dword transfers are reduced by the combine.
//
// Examples of the tradeoff (all assuming 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// |  (dmask) |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics, fewer vaddr dwords, and fewer total transfers
// between SP and TX, but more vdata dwords. We start by erring on the side of
// converting these to MSAA_LOAD.
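//
// As a worked example, take the "yes?" row: the two single-channel image_loads
// move 2*3 = 6 vaddr dwords plus 2*1 = 2 vdata dwords (8 in total), while the
// one image_msaa_load moves 3 vaddr dwords plus 4 vdata dwords (7 in total).
// The combine saves one transfer overall, at the cost of doubling the vdata
// traffic.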
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
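//
// Each original load is then replaced by an extractelement from lane
// (FragId % 4) of the combined result; a sketch for the first load above,
// with %msaa naming the new call's result:
//  %v0 = extractelement <4 x float> %msaa, i64 0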
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;
void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // Check FragId group.
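        // Sample ids come in groups of four (0-3, 4-7, ...); udiv(4) maps a
        // FragId to its group, and only loads whose FragIds share a group can
        // be served by one image_msaa_load.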
        auto *FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto *FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

// Collect a list of all the instructions we know how to merge in a subset of
// the block. Returns an iterator to the instruction after the last one
// analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
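      // End this section after the side-effecting instruction; the caller
      // restarts scanning from the returned iterator.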
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
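    // All counts below are in dwords: the model charges 3 vaddr dwords per
    // operation, needs one 4-sample msaa_load per enabled dmask channel, and
    // packs two d16 channels per vdata dword (the divideCeil below).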
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

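    // Round the FragId down to a multiple of 4, i.e. to the first sample of
    // its group; the replacement msaa_load fetches that sample and the next
    // three.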
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto *FragId =
        cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsics, one single-channel call per
    // bit set in the dmask.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
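    // Each merged load reads its sample back from lane (FragId % 4) of the
    // new call(s), rebuilding a vector when the dmask has several channels.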
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto *Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(
            NewCalls[0], Idx->getValue().urem(4).getZExtValue());
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = PoisonValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I],
                                     Idx->getValue().urem(4).getZExtValue()),
              I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto *I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
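  // It is also disabled on subtargets that have the MSAA-load dst_sel bug.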
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}