| //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// \file |
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPUSubtarget.h" |
| #include "AMDGPUCallLowering.h" |
| #include "AMDGPUInstructionSelector.h" |
| #include "AMDGPULegalizerInfo.h" |
| #include "AMDGPURegisterBankInfo.h" |
| #include "R600Subtarget.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "Utils/AMDGPUBaseInfo.h" |
| #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" |
| #include "llvm/CodeGen/MachineScheduler.h" |
| #include "llvm/CodeGen/TargetFrameLowering.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/IntrinsicsAMDGPU.h" |
| #include "llvm/IR/IntrinsicsR600.h" |
| #include "llvm/IR/MDBuilder.h" |
| #include <algorithm> |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "amdgpu-subtarget" |
| |
| AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} |
| |
| bool AMDGPUSubtarget::useRealTrue16Insts() const { |
| return hasTrue16BitInsts() && EnableRealTrue16Insts; |
| } |
| |
| // Returns the maximum per-workgroup LDS allocation size (in bytes) that still |
| // allows the given function to achieve an occupancy of NWaves waves per |
| // SIMD / EU, taking into account only the function's *maximum* workgroup size. |
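// For example, on a hypothetical subtarget with a 64-wide wavefront, 4 EUs
// per CU, 64 KiB of LDS, and a maximum flat workgroup size of 256 (4 waves
// per workgroup), requesting NWaves = 8 gives WorkGroupsPerCU =
// (8 * 4) / 4 = 8, and therefore 65536 / 8 = 8192 bytes of LDS per workgroup.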
| unsigned |
| AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, |
| const Function &F) const { |
| const unsigned WaveSize = getWavefrontSize(); |
| const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; |
  const unsigned WavesPerWorkgroup =
      std::max(1u, divideCeil(WorkGroupSize, WaveSize));
| |
| const unsigned WorkGroupsPerCU = |
| std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup); |
| |
| return getLocalMemorySize() / WorkGroupsPerCU; |
| } |
| |
| std::pair<unsigned, unsigned> |
| AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, |
| const Function &F) const { |
| // FIXME: We should take into account the LDS allocation granularity. |
| const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u); |
| |
  // The queried LDS size may exceed what is available on a CU, in which case
  // we consider the only achievable occupancy to be 1. This is in line with
  // how we treat occupancy when the number of requested registers in a
  // particular bank exceeds the number available in that bank.
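  // For example, querying 128 KiB of LDS on a hypothetical subtarget with
  // 64 KiB per CU yields MaxWGsLDS = 0.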
| if (!MaxWGsLDS) |
| return {1, 1}; |
| |
| const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU(); |
| |
| auto PropsFromWGSize = [=](unsigned WGSize) |
| -> std::tuple<const unsigned, const unsigned, unsigned> { |
| unsigned WavesPerWG = divideCeil(WGSize, WaveSize); |
| unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS); |
| return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU}; |
| }; |
| |
  // The maximum group size will generally yield the minimum number of
  // workgroups, the maximum number of waves, and the minimum occupancy. The
  // opposite is generally true for the minimum group size. LDS or barrier
  // resource limitations can flip those minimums/maximums.
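  // For example (hypothetical numbers), with a 64-wide wavefront and flat
  // workgroup sizes [64, 1024], a maximum-size group needs 16 waves while a
  // minimum-size group needs only 1; whether more total waves fit with large
  // or small groups then depends on how many groups LDS and barrier usage
  // admit per CU.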
| const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F); |
| auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize); |
| auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize); |
| |
| // It is possible that we end up with flipped minimum and maximum number of |
| // waves per CU when the number of minimum/maximum concurrent groups on the CU |
| // is limited by LDS usage or barrier resources. |
| if (MinWavesPerCU >= MaxWavesPerCU) { |
| std::swap(MinWavesPerCU, MaxWavesPerCU); |
| } else { |
| const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU(); |
| |
    // Look for a group size smaller than the maximum which decreases the
    // number of concurrent waves on the CU for the same number of concurrent
    // workgroups on the CU.
| unsigned MinWavesPerCUForWGSize = |
| divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU; |
| if (MinWavesPerCU > MinWavesPerCUForWGSize) { |
| unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize; |
| if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) { |
| // There may exist a smaller group size than the maximum that achieves |
| // the minimum number of waves per CU. This group size is the largest |
| // possible size that requires MaxWavesPerWG - E waves where E is |
| // maximized under the following constraints. |
| // 1. 0 <= E <= ExcessSlotsPerWG |
| // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize |
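        // For example (hypothetical numbers): with WaveSlotsPerCU = 40,
        // MinWGsPerCU = 2, MaxWavesPerWG = 16, and MinWavesPerWG = 1, we get
        // MinWavesPerCUForWGSize = ceil(40 / 3) * 2 = 28, ExcessSlots =
        // 32 - 28 = 4, ExcessSlotsPerWG = 2, and MinWavesPerCU drops from 32
        // to 28 (2 groups of 14 waves each).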
| MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG, |
| MaxWavesPerWG - MinWavesPerWG); |
| } |
| } |
| |
    // Look for a group size larger than the minimum which increases the
    // number of concurrent waves on the CU for the same number of concurrent
    // workgroups on the CU.
| unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG; |
| if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) { |
| // There may exist a larger group size than the minimum that achieves the |
| // maximum number of waves per CU. This group size is the smallest |
| // possible size that requires MinWavesPerWG + L waves where L is |
| // maximized under the following constraints. |
| // 1. 0 <= L <= LeftoverSlotsPerWG |
| // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize |
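      // For example (hypothetical numbers): with WaveSlotsPerCU = 40,
      // MaxWGsPerCU = 16, MinWavesPerWG = 1, and MaxWGSize = 1024 on a
      // 64-wide wavefront, LeftoverSlots = 40 - 16 = 24, LeftoverSlotsPerWG =
      // 1, and MaxWavesPerCU grows from 16 to 32 (16 groups of 2 waves each).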
| MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG, |
| ((MaxWGSize - 1) / WaveSize) + 1 - |
| MinWavesPerWG); |
| } |
| } |
| |
| // Return the minimum/maximum number of waves on any EU, assuming that all |
| // wavefronts are spread across all EUs as evenly as possible. |
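  // For example, 28 concurrent waves spread over 4 EUs yield 7 waves on each
  // EU.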
| return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU), |
| std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)}; |
| } |
| |
| std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( |
| const MachineFunction &MF) const { |
| const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction()); |
| } |
| |
| std::pair<unsigned, unsigned> |
| AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { |
| switch (CC) { |
| case CallingConv::AMDGPU_VS: |
| case CallingConv::AMDGPU_LS: |
| case CallingConv::AMDGPU_HS: |
| case CallingConv::AMDGPU_ES: |
| case CallingConv::AMDGPU_GS: |
| case CallingConv::AMDGPU_PS: |
    return std::pair(1u, getWavefrontSize());
| default: |
| return std::pair(1u, getMaxFlatWorkGroupSize()); |
| } |
| } |
| |
| std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( |
| const Function &F) const { |
| // Default minimum/maximum flat work group sizes. |
| std::pair<unsigned, unsigned> Default = |
| getDefaultFlatWorkGroupSize(F.getCallingConv()); |
| |
| // Requested minimum/maximum flat work group sizes. |
| std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( |
| F, "amdgpu-flat-work-group-size", Default); |
| |
  // Make sure the requested minimum does not exceed the requested maximum.
| if (Requested.first > Requested.second) |
| return Default; |
| |
| // Make sure requested values do not violate subtarget's specifications. |
| if (Requested.first < getMinFlatWorkGroupSize()) |
| return Default; |
| if (Requested.second > getMaxFlatWorkGroupSize()) |
| return Default; |
| |
| return Requested; |
| } |
| |
| std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU( |
| std::pair<unsigned, unsigned> Requested, |
| std::pair<unsigned, unsigned> FlatWorkGroupSizes) const { |
| // Default minimum/maximum number of waves per execution unit. |
| std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); |
| |
  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
| unsigned MinImpliedByFlatWorkGroupSize = |
| getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second); |
| Default.first = MinImpliedByFlatWorkGroupSize; |
| |
  // Make sure the requested minimum does not exceed the requested maximum.
| if (Requested.second && Requested.first > Requested.second) |
| return Default; |
| |
| // Make sure requested values do not violate subtarget's specifications. |
| if (Requested.first < getMinWavesPerEU() || |
| Requested.second > getMaxWavesPerEU()) |
| return Default; |
| |
| // Make sure requested values are compatible with values implied by requested |
| // minimum/maximum flat work group sizes. |
| if (Requested.first < MinImpliedByFlatWorkGroupSize) |
| return Default; |
| |
| return Requested; |
| } |
| |
| std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( |
| const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const { |
| // Default minimum/maximum number of waves per execution unit. |
| std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); |
| |
| // Requested minimum/maximum number of waves per execution unit. |
| std::pair<unsigned, unsigned> Requested = |
| AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true); |
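  // For example (illustrative values), "amdgpu-waves-per-eu"="2,4" requests
  // between 2 and 4 waves per EU (the trailing "true" permits a single-value
  // form such as "2"); the request is then validated against the subtarget
  // limits and the flat workgroup sizes.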
| return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes); |
| } |
| |
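// Frontends such as OpenCL attach the required workgroup size as kernel
// metadata, e.g. (illustrative sizes):
//   !reqd_work_group_size !{i32 64, i32 1, i32 1}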
| static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { |
| auto *Node = Kernel.getMetadata("reqd_work_group_size"); |
| if (Node && Node->getNumOperands() == 3) |
| return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue(); |
| return std::numeric_limits<unsigned>::max(); |
| } |
| |
| bool AMDGPUSubtarget::isMesaKernel(const Function &F) const { |
| return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); |
| } |
| |
| unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel, |
| unsigned Dimension) const { |
| unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension); |
| if (ReqdSize != std::numeric_limits<unsigned>::max()) |
| return ReqdSize - 1; |
| return getFlatWorkGroupSizes(Kernel).second - 1; |
| } |
| |
| bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const { |
| for (int I = 0; I < 3; ++I) { |
| if (getMaxWorkitemID(Func, I) > 0) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { |
| Function *Kernel = I->getParent()->getParent(); |
| unsigned MinSize = 0; |
| unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; |
| bool IdQuery = false; |
| |
  // If reqd_work_group_size is present, it narrows the range down.
| if (auto *CI = dyn_cast<CallInst>(I)) { |
| const Function *F = CI->getCalledFunction(); |
| if (F) { |
| unsigned Dim = UINT_MAX; |
| switch (F->getIntrinsicID()) { |
| case Intrinsic::amdgcn_workitem_id_x: |
| case Intrinsic::r600_read_tidig_x: |
| IdQuery = true; |
| [[fallthrough]]; |
| case Intrinsic::r600_read_local_size_x: |
| Dim = 0; |
| break; |
| case Intrinsic::amdgcn_workitem_id_y: |
| case Intrinsic::r600_read_tidig_y: |
| IdQuery = true; |
| [[fallthrough]]; |
| case Intrinsic::r600_read_local_size_y: |
| Dim = 1; |
| break; |
| case Intrinsic::amdgcn_workitem_id_z: |
| case Intrinsic::r600_read_tidig_z: |
| IdQuery = true; |
| [[fallthrough]]; |
| case Intrinsic::r600_read_local_size_z: |
| Dim = 2; |
| break; |
| default: |
| break; |
| } |
| |
      if (Dim <= 2) {
| unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim); |
| if (ReqdSize != std::numeric_limits<unsigned>::max()) |
| MinSize = MaxSize = ReqdSize; |
| } |
| } |
| } |
| |
| if (!MaxSize) |
| return false; |
| |
  // Range metadata is [Lo, Hi). For an ID query the result lies in
  // [0, MaxSize), so MaxSize is already the correct exclusive upper bound.
  // For a size query the result lies in [MinSize, MaxSize], so we must pass
  // MaxSize + 1 as the exclusive upper bound.
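  // For example, a workitem ID query on a kernel with a required workgroup
  // size of 64 in that dimension gets the range [0, 64), while the
  // corresponding local size query gets [64, 65).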
| if (IdQuery) |
| MinSize = 0; |
| else |
| ++MaxSize; |
| |
| APInt Lower{32, MinSize}; |
| APInt Upper{32, MaxSize}; |
| if (auto *CI = dyn_cast<CallBase>(I)) { |
| ConstantRange Range(Lower, Upper); |
| CI->addRangeRetAttr(Range); |
| } else { |
| MDBuilder MDB(I->getContext()); |
| MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper); |
| I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); |
| } |
| return true; |
| } |
| |
| unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { |
| assert(AMDGPU::isKernel(F.getCallingConv())); |
| |
| // We don't allocate the segment if we know the implicit arguments weren't |
| // used, even if the ABI implies we need them. |
| if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr")) |
| return 0; |
| |
| if (isMesaKernel(F)) |
| return 16; |
| |
  // Assume all implicit inputs are used by default.
| const Module *M = F.getParent(); |
| unsigned NBytes = |
| AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56; |
| return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes", |
| NBytes); |
| } |
| |
| uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, |
| Align &MaxAlign) const { |
| assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
| F.getCallingConv() == CallingConv::SPIR_KERNEL); |
| |
| const DataLayout &DL = F.getDataLayout(); |
| uint64_t ExplicitArgBytes = 0; |
| MaxAlign = Align(1); |
| |
| for (const Argument &Arg : F.args()) { |
| if (Arg.hasAttribute("amdgpu-hidden-argument")) |
| continue; |
| |
| const bool IsByRef = Arg.hasByRefAttr(); |
| Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); |
| Align Alignment = DL.getValueOrABITypeAlignment( |
| IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy); |
| uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); |
| ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; |
| MaxAlign = std::max(MaxAlign, Alignment); |
| } |
| |
| return ExplicitArgBytes; |
| } |
| |
| unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, |
| Align &MaxAlign) const { |
| if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL && |
| F.getCallingConv() != CallingConv::SPIR_KERNEL) |
| return 0; |
| |
| uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); |
| |
| unsigned ExplicitOffset = getExplicitKernelArgOffset(); |
| |
| uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; |
| unsigned ImplicitBytes = getImplicitArgNumBytes(F); |
| if (ImplicitBytes != 0) { |
| const Align Alignment = getAlignmentForImplicitArgPtr(); |
| TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; |
| MaxAlign = std::max(MaxAlign, Alignment); |
| } |
| |
| // Being able to dereference past the end is useful for emitting scalar loads. |
| return alignTo(TotalSize, 4); |
| } |
| |
| AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const { |
| return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32 |
| : AMDGPUDwarfFlavour::Wave64; |
| } |
| |
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(
      MF.getSubtarget<R600Subtarget>());
}
| |
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}
| |
| // FIXME: This has no reason to be in subtarget |
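// For example (illustrative values), a kernel annotated with
// "amdgpu-max-num-workgroups"="16,1,1" yields {16, 1, 1}; an absent or
// malformed attribute falls back to UINT32_MAX entries.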
| SmallVector<unsigned> |
| AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { |
| return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3, |
| std::numeric_limits<uint32_t>::max()); |
| } |