llvm/lib/Target/AMDGPU/GCNSubtarget.h - third_party/llvm-project - Git at Google

 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //==-----------------------------------------------------------------------===//
 //
 /// \file
 /// AMD GCN specific subclass of TargetSubtarget.
 //
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H

 #include "AMDGPUCallLowering.h"
 #include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "SIFrameLowering.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Support/ErrorHandling.h"

 #define GET_SUBTARGETINFO_HEADER
 #include "AMDGPUGenSubtargetInfo.inc"

 namespace llvm {

 class GCNTargetMachine;

 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                            public AMDGPUSubtarget {
 public:
   using AMDGPUSubtarget::getMaxWavesPerEU;

   // Following 2 enums are documented at:
   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
   enum class TrapHandlerAbi {
     NONE   = 0x00,
     AMDHSA = 0x01,
   };

   enum class TrapID {
     LLVMAMDHSATrap      = 0x02,
     LLVMAMDHSADebugTrap = 0x03,
   };

 private:
   /// SelectionDAGISel related APIs.
   std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

   /// GlobalISel related APIs.
   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
   std::unique_ptr<InstructionSelector> InstSelector;
   std::unique_ptr<LegalizerInfo> Legalizer;
   std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;

 protected:
   // Basic subtarget description.
   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
   unsigned Gen = INVALID;
   InstrItineraryData InstrItins;
   int LDSBankCount = 0;
   unsigned MaxPrivateElementSize = 0;

   // Possibly statically set by tablegen, but may want to be overridden.
   bool FastDenormalF32 = false;
   bool HalfRate64Ops = false;
   bool FullRate64Ops = false;

   // Dynamically set bits that enable features.
   bool FlatForGlobal = false;
   bool AutoWaitcntBeforeBarrier = false;
   bool BackOffBarrier = false;
   bool UnalignedScratchAccess = false;
   bool UnalignedAccessMode = false;
   bool RelaxedBufferOOBMode = false;
   bool HasApertureRegs = false;
   bool SupportsXNACK = false;
   bool KernargPreload = false;

   // This should not be used directly. 'TargetID' tracks the dynamic settings
   // for XNACK.
   bool EnableXNACK = false;

   bool EnableTgSplit = false;
   bool EnableCuMode = false;
   bool TrapHandler = false;
   bool EnablePreciseMemory = false;

   // Used as options.
   bool EnableLoadStoreOpt = false;
   bool EnableUnsafeDSOffsetFolding = false;
   bool EnableSIScheduler = false;
   bool EnableDS128 = false;
   bool EnablePRTStrictNull = false;
   bool DumpCode = false;
   bool AssemblerPermissiveWavesize = false;

   // Subtarget statically properties set by tablegen
   bool FP64 = false;
   bool FMA = false;
   bool MIMG_R128 = false;
   bool CIInsts = false;
   bool GFX8Insts = false;
   bool GFX9Insts = false;
   bool GFX90AInsts = false;
   bool GFX940Insts = false;
   bool GFX950Insts = false;
   bool GFX10Insts = false;
   bool GFX11Insts = false;
   bool GFX12Insts = false;
   bool GFX1250Insts = false;
   bool GFX10_3Insts = false;
   bool GFX7GFX8GFX9Insts = false;
   bool SGPRInitBug = false;
   bool UserSGPRInit16Bug = false;
   bool NegativeScratchOffsetBug = false;
   bool NegativeUnalignedScratchOffsetBug = false;
   bool HasSMemRealTime = false;
   bool HasIntClamp = false;
   bool HasFmaMixInsts = false;
   bool HasFmaMixBF16Insts = false;
   bool HasMovrel = false;
   bool HasVGPRIndexMode = false;
   bool HasScalarDwordx3Loads = false;
   bool HasScalarStores = false;
   bool HasScalarAtomics = false;
   bool HasSDWAOmod = false;
   bool HasSDWAScalar = false;
   bool HasSDWASdst = false;
   bool HasSDWAMac = false;
   bool HasSDWAOutModsVOPC = false;
   bool HasDPP = false;
   bool HasDPP8 = false;
   bool HasDPALU_DPP = false;
   bool HasDPPSrc1SGPR = false;
   bool HasPackedFP32Ops = false;
   bool HasImageInsts = false;
   bool HasExtendedImageInsts = false;
   bool HasR128A16 = false;
   bool HasA16 = false;
   bool HasG16 = false;
   bool HasNSAEncoding = false;
   bool HasPartialNSAEncoding = false;
   bool GFX10_AEncoding = false;
   bool GFX10_BEncoding = false;
   bool HasDLInsts = false;
   bool HasFmacF64Inst = false;
   bool HasDot1Insts = false;
   bool HasDot2Insts = false;
   bool HasDot3Insts = false;
   bool HasDot4Insts = false;
   bool HasDot5Insts = false;
   bool HasDot6Insts = false;
   bool HasDot7Insts = false;
   bool HasDot8Insts = false;
   bool HasDot9Insts = false;
   bool HasDot10Insts = false;
   bool HasDot11Insts = false;
   bool HasDot12Insts = false;
   bool HasDot13Insts = false;
   bool HasMAIInsts = false;
   bool HasFP8Insts = false;
   bool HasFP8ConversionInsts = false;
   bool HasFP8E5M3Insts = false;
   bool HasCvtFP8Vop1Bug = false;
   bool HasPkFmacF16Inst = false;
   bool HasAtomicFMinFMaxF32GlobalInsts = false;
   bool HasAtomicFMinFMaxF64GlobalInsts = false;
   bool HasAtomicFMinFMaxF32FlatInsts = false;
   bool HasAtomicFMinFMaxF64FlatInsts = false;
   bool HasAtomicDsPkAdd16Insts = false;
   bool HasAtomicFlatPkAdd16Insts = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
   bool HasMemoryAtomicFaddF32DenormalSupport = false;
   bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicCSubNoRtnInsts = false;
   bool HasAtomicGlobalPkAddBF16Inst = false;
   bool HasAtomicBufferPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
   bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
   bool HasDefaultComponentZero = false;
   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
   bool HasEmulatedSystemScopeAtomics = false;
   bool HasDefaultComponentBroadcast = false;
   bool HasXF32Insts = false;
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
   /// indicates a lack of S_CLAUSE support.
   unsigned MaxHardClauseLength = 0;
   bool SupportsSRAMECC = false;
   bool DynamicVGPR = false;
   bool DynamicVGPRBlockSize32 = false;
   bool HasVMemToLDSLoad = false;
   bool RequiresAlignVGPR = false;

   // This should not be used directly. 'TargetID' tracks the dynamic settings
   // for SRAMECC.
   bool EnableSRAMECC = false;

   bool HasNoSdstCMPX = false;
   bool HasVscnt = false;
   bool HasWaitXcnt = false;
   bool HasGetWaveIdInst = false;
   bool HasSMemTimeInst = false;
   bool HasShaderCyclesRegister = false;
   bool HasShaderCyclesHiLoRegisters = false;
   bool HasVOP3Literal = false;
   bool HasNoDataDepHazard = false;
   bool FlatAddressSpace = false;
   bool FlatInstOffsets = false;
   bool FlatGlobalInsts = false;
   bool FlatScratchInsts = false;
   bool FlatGVSMode = false;
   bool ScalarFlatScratchInsts = false;
   bool HasArchitectedFlatScratch = false;
   bool EnableFlatScratch = false;
   bool HasArchitectedSGPRs = false;
   bool HasGDS = false;
   bool HasGWS = false;
   bool AddNoCarryInsts = false;
   bool HasUnpackedD16VMem = false;
   bool LDSMisalignedBug = false;
   bool HasMFMAInlineLiteralBug = false;
   bool UnalignedBufferAccess = false;
   bool UnalignedDSAccess = false;
   bool HasPackedTID = false;
   bool ScalarizeGlobal = false;
   bool HasSALUFloatInsts = false;
   bool HasPseudoScalarTrans = false;
   bool HasRestrictedSOffset = false;
   bool Has64BitLiterals = false;
   bool Has1024AddressableVGPRs = false;
   bool HasBitOp3Insts = false;
   bool HasTanhInsts = false;
   bool HasTensorCvtLutInsts = false;
   bool HasTransposeLoadF4F6Insts = false;
   bool HasPrngInst = false;
   bool HasBVHDualAndBVH8Insts = false;
   bool HasPermlane16Swap = false;
   bool HasPermlane32Swap = false;
   bool HasVcmpxPermlaneHazard = false;
   bool HasVMEMtoScalarWriteHazard = false;
   bool HasSMEMtoVectorWriteHazard = false;
   bool HasInstFwdPrefetchBug = false;
   bool HasVmemPrefInsts = false;
   bool HasSafeSmemPrefetch = false;
   bool HasSafeCUPrefetch = false;
   bool HasVcmpxExecWARHazard = false;
   bool HasLdsBranchVmemWARHazard = false;
   bool HasNSAtoVMEMBug = false;
   bool HasNSAClauseBug = false;
   bool HasOffset3fBug = false;
   bool HasFlatSegmentOffsetBug = false;
   bool HasImageStoreD16Bug = false;
   bool HasImageGather4D16Bug = false;
   bool HasMSAALoadDstSelBug = false;
   bool HasPrivEnabledTrap2NopBug = false;
   bool Has1_5xVGPRs = false;
   bool HasMADIntraFwdBug = false;
   bool HasVOPDInsts = false;
   bool HasVALUTransUseHazard = false;
   bool HasRequiredExportPriority = false;
   bool HasVmemWriteVgprInOrder = false;
   bool HasAshrPkInsts = false;
   bool HasIEEEMinimumMaximumInsts = false;
   bool HasMinimum3Maximum3F32 = false;
   bool HasMinimum3Maximum3F16 = false;
   bool HasMin3Max3PKF16 = false;
   bool HasMinimum3Maximum3PKF16 = false;
   bool HasLshlAddU64Inst = false;
   bool HasAddSubU64Insts = false;
   bool HasMadU32Inst = false;
   bool HasAddMinMaxInsts = false;
   bool HasPkAddMinMaxInsts = false;
   bool HasPointSampleAccel = false;
   bool HasLdsBarrierArriveAtomic = false;
   bool HasSetPrioIncWgInst = false;

   bool RequiresCOV6 = false;
   bool UseBlockVGPROpsForCSR = false;
   bool HasGloballyAddressableScratch = false;

   bool Has45BitNumRecordsBufferResource = false;

   bool HasClusters = false;
   bool RequiresWaitsBeforeSystemScopeStores = false;

   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable = false;

 private:
   SIInstrInfo InstrInfo;
   SITargetLowering TLInfo;
   SIFrameLowering FrameLowering;

 public:
   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                const GCNTargetMachine &TM);
   ~GCNSubtarget() override;

   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                                    StringRef GPU, StringRef FS);

   /// Diagnose inconsistent subtarget features before attempting to codegen
   /// function \p F.
   void checkSubtargetFeatures(const Function &F) const;

   const SIInstrInfo *getInstrInfo() const override {
     return &InstrInfo;
   }

   const SIFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }

   const SITargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }

   const SIRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }

   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

   const CallLowering *getCallLowering() const override {
     return CallLoweringInfo.get();
   }

   const InlineAsmLowering *getInlineAsmLowering() const override {
     return InlineAsmLoweringInfo.get();
   }

   InstructionSelector *getInstructionSelector() const override {
     return InstSelector.get();
   }

   const LegalizerInfo *getLegalizerInfo() const override {
     return Legalizer.get();
   }

   const AMDGPURegisterBankInfo *getRegBankInfo() const override {
     return RegBankInfo.get();
   }

   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
     return TargetID;
   }

   const InstrItineraryData *getInstrItineraryData() const override {
     return &InstrItins;
   }

   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

   Generation getGeneration() const {
     return (Generation)Gen;
   }

   unsigned getMaxWaveScratchSize() const {
     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
     if (getGeneration() >= GFX12) {
       // 18-bit field in units of 64-dword.
       return (64 * 4) * ((1 << 18) - 1);
     }
     if (getGeneration() == GFX11) {
       // 15-bit field in units of 64-dword.
       return (64 * 4) * ((1 << 15) - 1);
     }
     // 13-bit field in units of 256-dword.
     return (256 * 4) * ((1 << 13) - 1);
   }

   /// Return the number of high bits known to be zero for a frame index.
   unsigned getKnownHighZeroBitsForFrameIndex() const {
     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
   }

   int getLDSBankCount() const {
     return LDSBankCount;
   }

   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
   }

   unsigned getConstantBusLimit(unsigned Opcode) const;

   /// Returns if the result of this instruction with a 16-bit result returned in
   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
   /// the original value.
   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;

   bool supportsWGP() const {
     if (GFX1250Insts)
       return false;
     return getGeneration() >= GFX10;
   }

   bool hasIntClamp() const {
     return HasIntClamp;
   }

   bool hasFP64() const {
     return FP64;
   }

   bool hasMIMG_R128() const {
     return MIMG_R128;
   }

   bool hasHWFP64() const {
     return FP64;
   }

   bool hasHalfRate64Ops() const {
     return HalfRate64Ops;
   }

   bool hasFullRate64Ops() const {
     return FullRate64Ops;
   }

   bool hasAddr64() const {
     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
   }

   bool hasFlat() const {
     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
   }

   // Return true if the target only has the reverse operand versions of VALU
   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
   bool hasOnlyRevVALUShifts() const {
     return getGeneration() >= VOLCANIC_ISLANDS;
   }

   bool hasFractBug() const {
     return getGeneration() == SOUTHERN_ISLANDS;
   }

   bool hasBFE() const {
     return true;
   }

   bool hasBFI() const {
     return true;
   }

   bool hasBFM() const {
     return hasBFE();
   }

   bool hasBCNT(unsigned Size) const {
     return true;
   }

   bool hasFFBL() const {
     return true;
   }

   bool hasFFBH() const {
     return true;
   }

   bool hasMed3_16() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }

   bool hasMin3Max3_16() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }

   bool hasFmaMixInsts() const {
     return HasFmaMixInsts;
   }

   bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }

   bool hasCARRY() const {
     return true;
   }

   bool hasFMA() const {
     return FMA;
   }

   bool hasSwap() const {
     return GFX9Insts;
   }

   bool hasScalarPackInsts() const {
     return GFX9Insts;
   }

   bool hasScalarMulHiInsts() const {
     return GFX9Insts;
   }

   bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }

   TrapHandlerAbi getTrapHandlerAbi() const {
     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
   }

   bool supportsGetDoorbellID() const {
     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
     return getGeneration() >= GFX9;
   }

   /// True if the offset field of DS instructions works as expected. On SI, the
   /// offset uses a 16-bit adder and does not always wrap properly.
   bool hasUsableDSOffset() const {
     return getGeneration() >= SEA_ISLANDS;
   }

   bool unsafeDSOffsetFoldingEnabled() const {
     return EnableUnsafeDSOffsetFolding;
   }

   /// Condition output from div_scale is usable.
   bool hasUsableDivScaleConditionOutput() const {
     return getGeneration() != SOUTHERN_ISLANDS;
   }

   /// Extra wait hazard is needed in some cases before
   /// s_cbranch_vccnz/s_cbranch_vccz.
   bool hasReadVCCZBug() const {
     return getGeneration() <= SEA_ISLANDS;
   }

   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
   bool partialVCCWritesUpdateVCCZ() const {
     return getGeneration() >= GFX10;
   }

   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
   /// was written by a VALU instruction.
   bool hasSMRDReadVALUDefHazard() const {
     return getGeneration() == SOUTHERN_ISLANDS;
   }

   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
   /// SGPR was written by a VALU Instruction.
   bool hasVMEMReadSGPRVALUDefHazard() const {
     return getGeneration() >= VOLCANIC_ISLANDS;
   }

   bool hasRFEHazards() const {
     return getGeneration() >= VOLCANIC_ISLANDS;
   }

   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
   unsigned getSetRegWaitStates() const {
     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
   }

   bool dumpCode() const {
     return DumpCode;
   }

   /// Return the amount of LDS that can be used that will not restrict the
   /// occupancy lower than WaveCount.
   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                            const Function &) const;

   bool supportsMinMaxDenormModes() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }

   /// \returns If target supports S_DENORM_MODE.
   bool hasDenormModeInst() const {
     return getGeneration() >= AMDGPUSubtarget::GFX10;
   }

   bool useFlatForGlobal() const {
     return FlatForGlobal;
   }

   /// \returns If target supports ds_read/write_b128 and user enables generation
   /// of ds_read/write_b128.
   bool useDS128() const {
     return CIInsts && EnableDS128;
   }

   /// \return If target supports ds_read/write_b96/128.
   bool hasDS96AndDS128() const {
     return CIInsts;
   }

   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
   bool haveRoundOpsF64() const {
     return CIInsts;
   }

   /// \returns If MUBUF instructions always perform range checking, even for
   /// buffer resources used for private memory access.
   bool privateMemoryResourceIsRangeChecked() const {
     return getGeneration() < AMDGPUSubtarget::GFX9;
   }

   /// \returns If target requires PRT Struct NULL support (zero result registers
   /// for sparse texture support).
   bool usePRTStrictNull() const {
     return EnablePRTStrictNull;
   }

   bool hasAutoWaitcntBeforeBarrier() const {
     return AutoWaitcntBeforeBarrier;
   }

   /// \returns true if the target supports backing off of s_barrier instructions
   /// when an exception is raised.
   bool supportsBackOffBarrier() const {
     return BackOffBarrier;
   }

   bool hasUnalignedBufferAccess() const {
     return UnalignedBufferAccess;
   }

   bool hasUnalignedBufferAccessEnabled() const {
     return UnalignedBufferAccess && UnalignedAccessMode;
   }

   bool hasUnalignedDSAccess() const {
     return UnalignedDSAccess;
   }

   bool hasUnalignedDSAccessEnabled() const {
     return UnalignedDSAccess && UnalignedAccessMode;
   }

   bool hasUnalignedScratchAccess() const {
     return UnalignedScratchAccess;
   }

   bool hasUnalignedScratchAccessEnabled() const {
     return UnalignedScratchAccess && UnalignedAccessMode;
   }

   bool hasUnalignedAccessMode() const {
     return UnalignedAccessMode;
   }

   bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }

   bool hasApertureRegs() const {
     return HasApertureRegs;
   }

   bool isTrapHandlerEnabled() const {
     return TrapHandler;
   }

   bool isXNACKEnabled() const {
     return TargetID.isXnackOnOrAny();
   }

   bool isTgSplitEnabled() const {
     return EnableTgSplit;
   }

   bool isCuModeEnabled() const {
     return EnableCuMode;
   }

   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }

   bool hasFlatAddressSpace() const {
     return FlatAddressSpace;
   }

   bool hasFlatScrRegister() const {
     return hasFlatAddressSpace();
   }

   bool hasFlatInstOffsets() const {
     return FlatInstOffsets;
   }

   bool hasFlatGlobalInsts() const {
     return FlatGlobalInsts;
   }

   bool hasFlatScratchInsts() const {
     return FlatScratchInsts;
   }

   // Check if target supports ST addressing mode with FLAT scratch instructions.
   // The ST addressing mode means no registers are used, either VGPR or SGPR,
   // but only immediate offset is swizzled and added to the FLAT scratch base.
   bool hasFlatScratchSTMode() const {
     return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
   }

   bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }

   bool hasScalarFlatScratchInsts() const {
     return ScalarFlatScratchInsts;
   }

   bool enableFlatScratch() const {
     return flatScratchIsArchitected() ||
            (EnableFlatScratch && hasFlatScratchInsts());
   }

   bool hasGlobalAddTidInsts() const {
     return GFX10_BEncoding;
   }

   bool hasAtomicCSub() const {
     return GFX10_BEncoding;
   }

   bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }

   bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }

   bool hasExportInsts() const {
     return !hasGFX940Insts() && !hasGFX1250Insts();
   }

   bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }

   // DS_ADD_F64/DS_ADD_RTN_F64
   bool hasLdsAtomicAddF64() const {
     return hasGFX90AInsts() || hasGFX1250Insts();
   }

   bool hasMultiDwordFlatScratchAddressing() const {
     return getGeneration() >= GFX9;
   }

   bool hasFlatSegmentOffsetBug() const {
     return HasFlatSegmentOffsetBug;
   }

   bool hasFlatLgkmVMemCountInOrder() const {
     return getGeneration() > GFX9;
   }

   bool hasD16LoadStore() const {
     return getGeneration() >= GFX9;
   }

   bool d16PreservesUnusedBits() const {
     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
   }

   bool hasD16Images() const {
     return getGeneration() >= VOLCANIC_ISLANDS;
   }

   /// Return if most LDS instructions have an m0 use that require m0 to be
   /// initialized.
   bool ldsRequiresM0Init() const {
     return getGeneration() < GFX9;
   }

   // True if the hardware rewinds and replays GWS operations if a wave is
   // preempted.
   //
   // If this is false, a GWS operation requires testing if a nack set the
   // MEM_VIOL bit, and repeating if so.
   bool hasGWSAutoReplay() const {
     return getGeneration() >= GFX9;
   }

   /// \returns if target has ds_gws_sema_release_all instruction.
   bool hasGWSSemaReleaseAll() const {
     return CIInsts;
   }

   /// \returns true if the target has integer add/sub instructions that do not
   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
   /// for saturation.
   bool hasAddNoCarry() const {
     return AddNoCarryInsts;
   }

   bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }

   bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }

   bool hasUnpackedD16VMem() const {
     return HasUnpackedD16VMem;
   }

   // Covers VS/PS/CS graphics shaders
   bool isMesaGfxShader(const Function &F) const {
     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
   }

   bool hasMad64_32() const {
     return getGeneration() >= SEA_ISLANDS;
   }

   bool hasSDWAOmod() const {
     return HasSDWAOmod;
   }

   bool hasSDWAScalar() const {
     return HasSDWAScalar;
   }

   bool hasSDWASdst() const {
     return HasSDWASdst;
   }

   bool hasSDWAMac() const {
     return HasSDWAMac;
   }

   bool hasSDWAOutModsVOPC() const {
     return HasSDWAOutModsVOPC;
   }

   bool hasDLInsts() const {
     return HasDLInsts;
   }

   bool hasFmacF64Inst() const { return HasFmacF64Inst; }

   bool hasDot1Insts() const {
     return HasDot1Insts;
   }

   bool hasDot2Insts() const {
     return HasDot2Insts;
   }

   bool hasDot3Insts() const {
     return HasDot3Insts;
   }

   bool hasDot4Insts() const {
     return HasDot4Insts;
   }

   bool hasDot5Insts() const {
     return HasDot5Insts;
   }

   bool hasDot6Insts() const {
     return HasDot6Insts;
   }

   bool hasDot7Insts() const {
     return HasDot7Insts;
   }

   bool hasDot8Insts() const {
     return HasDot8Insts;
   }

   bool hasDot9Insts() const {
     return HasDot9Insts;
   }

   bool hasDot10Insts() const {
     return HasDot10Insts;
   }

   bool hasDot11Insts() const {
     return HasDot11Insts;
   }

   bool hasDot12Insts() const {
     return HasDot12Insts;
   }

   bool hasDot13Insts() const {
     return HasDot13Insts;
   }

   bool hasMAIInsts() const {
     return HasMAIInsts;
   }

   bool hasFP8Insts() const {
     return HasFP8Insts;
   }

   bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }

   bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }

   bool hasPkFmacF16Inst() const {
     return HasPkFmacF16Inst;
   }

   bool hasAtomicFMinFMaxF32GlobalInsts() const {
     return HasAtomicFMinFMaxF32GlobalInsts;
   }

   bool hasAtomicFMinFMaxF64GlobalInsts() const {
     return HasAtomicFMinFMaxF64GlobalInsts;
   }

   bool hasAtomicFMinFMaxF32FlatInsts() const {
     return HasAtomicFMinFMaxF32FlatInsts;
   }

   bool hasAtomicFMinFMaxF64FlatInsts() const {
     return HasAtomicFMinFMaxF64FlatInsts;
   }

   bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }

   bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }

   bool hasAtomicFaddInsts() const {
     return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
   }

   bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }

   bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }

   bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
     return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
   }

   bool hasAtomicBufferGlobalPkAddF16Insts() const {
     return HasAtomicBufferGlobalPkAddF16Insts;
   }

   bool hasAtomicGlobalPkAddBF16Inst() const {
     return HasAtomicGlobalPkAddBF16Inst;
   }

   bool hasAtomicBufferPkAddBF16Inst() const {
     return HasAtomicBufferPkAddBF16Inst;
   }

   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }

   /// \return true if the target has flat, global, and buffer atomic fadd for
   /// double.
   bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
     return HasFlatBufferGlobalAtomicFaddF64Inst;
   }

   /// \return true if the target's flat, global, and buffer atomic fadd for
   /// float supports denormal handling.
   bool hasMemoryAtomicFaddF32DenormalSupport() const {
     return HasMemoryAtomicFaddF32DenormalSupport;
   }

   /// \return true if atomic operations targeting fine-grained memory work
   /// correctly at device scope, in allocations in host or peer PCIe device
   /// memory.
   bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
     return HasAgentScopeFineGrainedRemoteMemoryAtomics;
   }

   /// \return true is HW emulates system scope atomics unsupported by the PCI-e
   /// via CAS loop.
   bool hasEmulatedSystemScopeAtomics() const {
     return HasEmulatedSystemScopeAtomics;
   }

   bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }

   bool hasDefaultComponentBroadcast() const {
     return HasDefaultComponentBroadcast;
   }

   bool hasNoSdstCMPX() const {
     return HasNoSdstCMPX;
   }

   bool hasVscnt() const {
     return HasVscnt;
   }

   bool hasGetWaveIdInst() const {
     return HasGetWaveIdInst;
   }

   bool hasSMemTimeInst() const {
     return HasSMemTimeInst;
   }

   bool hasShaderCyclesRegister() const {
     return HasShaderCyclesRegister;
   }

   bool hasShaderCyclesHiLoRegisters() const {
     return HasShaderCyclesHiLoRegisters;
   }

   bool hasVOP3Literal() const {
     return HasVOP3Literal;
   }

   bool hasNoDataDepHazard() const {
     return HasNoDataDepHazard;
   }

   bool vmemWriteNeedsExpWaitcnt() const {
     return getGeneration() < SEA_ISLANDS;
   }

   bool hasInstPrefetch() const {
     return getGeneration() == GFX10 || getGeneration() == GFX11;
   }

   bool hasPrefetch() const { return GFX12Insts; }

   bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }

   bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }

   bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }

   // Has s_cmpk_* instructions.
   bool hasSCmpK() const { return getGeneration() < GFX12; }

   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspective of an arbitrary workitem, this
   // is 4-byte aligned.
   //
   // Only 4-byte alignment is really needed to access anything. Transformations
   // on the pointer value itself may rely on the alignment / known low bits of
   // the pointer. Set this to something above the minimum to avoid needing
   // dynamic realignment in common cases.
   Align getStackAlignment() const { return Align(16); }

   bool enableMachineScheduler() const override {
     return true;
   }

   bool useAA() const override;

   bool enableSubRegLiveness() const override {
     return true;
   }

   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }

   // static wrappers
   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);

   // XXX - Why is this here if it isn't in the default pass set?
   bool enableEarlyIfConversion() const override {
     return true;
   }

   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            const SchedRegion &Region) const override;

   void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                  const SchedRegion &Region) const override;

   void mirFileLoaded(MachineFunction &MF) const override;

   unsigned getMaxNumUserSGPRs() const {
     return AMDGPU::getMaxNumUserSGPRs(*this);
   }

   bool hasSMemRealTime() const {
     return HasSMemRealTime;
   }

   bool hasMovrel() const {
     return HasMovrel;
   }

   bool hasVGPRIndexMode() const {
     return HasVGPRIndexMode;
   }

   bool useVGPRIndexMode() const;

   bool hasScalarCompareEq64() const {
     return getGeneration() >= VOLCANIC_ISLANDS;
   }

   bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }

   bool hasScalarStores() const {
     return HasScalarStores;
   }

   bool hasScalarAtomics() const {
     return HasScalarAtomics;
   }

   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }

   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

   /// \returns true if the subtarget has the v_permlane64_b32 instruction.
   bool hasPermLane64() const { return getGeneration() >= GFX11; }

   bool hasDPP() const {
     return HasDPP;
   }

   bool hasDPPBroadcasts() const {
     return HasDPP && getGeneration() < GFX10;
   }

   bool hasDPPWavefrontShifts() const {
     return HasDPP && getGeneration() < GFX10;
   }

   bool hasDPP8() const {
     return HasDPP8;
   }

   bool hasDPALU_DPP() const {
     return HasDPALU_DPP;
   }

   bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }

   bool hasPackedFP32Ops() const {
     return HasPackedFP32Ops;
   }

   // Has V_PK_MOV_B32 opcode
   bool hasPkMovB32() const {
     return GFX90AInsts;
   }

   bool hasFmaakFmamkF32Insts() const {
     return getGeneration() >= GFX10 || hasGFX940Insts();
   }

   bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }

   bool hasImageInsts() const {
     return HasImageInsts;
   }

   bool hasExtendedImageInsts() const {
     return HasExtendedImageInsts;
   }

   bool hasR128A16() const {
     return HasR128A16;
   }

   bool hasA16() const { return HasA16; }

   bool hasG16() const { return HasG16; }

   bool hasOffset3fBug() const {
     return HasOffset3fBug;
   }

   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }

   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }

   bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }

   bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }

   bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }

   bool hasNSAEncoding() const { return HasNSAEncoding; }

   bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }

   bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }

   unsigned getNSAMaxSize(bool HasSampler = false) const {
     return AMDGPU::getNSAMaxSize(*this, HasSampler);
   }

   bool hasGFX10_AEncoding() const {
     return GFX10_AEncoding;
   }

   bool hasGFX10_BEncoding() const {
     return GFX10_BEncoding;
   }

   bool hasGFX10_3Insts() const {
     return GFX10_3Insts;
   }

   bool hasMadF16() const;

   bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }

   bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }

   // Scalar and global loads support scale_offset bit.
   bool hasScaleOffset() const { return GFX1250Insts; }

   bool hasFlatGVSMode() const { return FlatGVSMode; }

   // FLAT GLOBAL VOffset is signed
   bool hasSignedGVSOffset() const { return GFX1250Insts; }

   bool enableSIScheduler() const {
     return EnableSIScheduler;
   }

   bool loadStoreOptEnabled() const {
     return EnableLoadStoreOpt;
   }

   bool hasSGPRInitBug() const {
     return SGPRInitBug;
   }

   bool hasUserSGPRInit16Bug() const {
     return UserSGPRInit16Bug && isWave32();
   }

   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }

   bool hasNegativeUnalignedScratchOffsetBug() const {
     return NegativeUnalignedScratchOffsetBug;
   }

   bool hasMFMAInlineLiteralBug() const {
     return HasMFMAInlineLiteralBug;
   }

   bool has12DWordStoreHazard() const {
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }

   // \returns true if the subtarget supports DWORDX3 load/store instructions.
   bool hasDwordx3LoadStores() const {
     return CIInsts;
   }

   bool hasReadM0MovRelInterpHazard() const {
     return getGeneration() == AMDGPUSubtarget::GFX9;
   }

   bool hasReadM0SendMsgHazard() const {
     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
            getGeneration() <= AMDGPUSubtarget::GFX9;
   }

   bool hasReadM0LdsDmaHazard() const {
     return getGeneration() == AMDGPUSubtarget::GFX9;
   }

   bool hasReadM0LdsDirectHazard() const {
     return getGeneration() == AMDGPUSubtarget::GFX9;
   }

   bool hasVcmpxPermlaneHazard() const {
     return HasVcmpxPermlaneHazard;
   }

   bool hasVMEMtoScalarWriteHazard() const {
     return HasVMEMtoScalarWriteHazard;
   }

   bool hasSMEMtoVectorWriteHazard() const {
     return HasSMEMtoVectorWriteHazard;
   }

   bool hasLDSMisalignedBug() const {
     return LDSMisalignedBug && !EnableCuMode;
   }

   bool hasInstFwdPrefetchBug() const {
     return HasInstFwdPrefetchBug;
   }

   bool hasVcmpxExecWARHazard() const {
     return HasVcmpxExecWARHazard;
   }

   bool hasLdsBranchVmemWARHazard() const {
     return HasLdsBranchVmemWARHazard;
   }

   // Shift amount of a 64 bit shift cannot be a highest allocated register
   // if also at the end of the allocation block.
   bool hasShift64HighRegBug() const {
     return GFX90AInsts && !GFX940Insts;
   }

   // Has one cycle hazard on transcendental instruction feeding a
   // non transcendental VALU.
   bool hasTransForwardingHazard() const { return GFX940Insts; }

   // Has one cycle hazard on a VALU instruction partially writing dst with
   // a shift of result bits feeding another VALU instruction.
   bool hasDstSelForwardingHazard() const { return GFX940Insts; }

   // Cannot use op_sel with v_dot instructions.
   bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }

   // Does not have HW interlocs for VALU writing and then reading SGPRs.
   bool hasVDecCoExecHazard() const {
     return GFX940Insts;
   }

   bool hasNSAtoVMEMBug() const {
     return HasNSAtoVMEMBug;
   }

   bool hasNSAClauseBug() const { return HasNSAClauseBug; }

   bool hasHardClauses() const { return MaxHardClauseLength > 0; }

   bool hasGFX90AInsts() const { return GFX90AInsts; }

   bool hasFPAtomicToDenormModeHazard() const {
     return getGeneration() == GFX10;
   }

   bool hasVOP3DPP() const { return getGeneration() >= GFX11; }

   bool hasLdsDirect() const { return getGeneration() >= GFX11; }

   bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }

   bool hasVALUPartialForwardingHazard() const {
     return getGeneration() == GFX11;
   }

   bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }

   bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }

   bool requiresCodeObjectV6() const { return RequiresCOV6; }

   bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }

   bool hasGloballyAddressableScratch() const {
     return HasGloballyAddressableScratch;
   }

   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

   bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }

   bool setRegModeNeedsVNOPs() const {
     return GFX1250Insts && getGeneration() == GFX12;
   }

   /// Return if operations acting on VGPR tuples require even alignment.
   bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }

   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
   bool hasSPackHL() const { return GFX11Insts; }

   /// Return true if the target's EXP instruction has the COMPR flag, which
   /// affects the meaning of the EN (enable) bits.
   bool hasCompressedExport() const { return !GFX11Insts; }

   /// Return true if the target's EXP instruction supports the NULL export
   /// target.
   bool hasNullExportTarget() const { return !GFX11Insts; }

   bool has1_5xVGPRs() const { return Has1_5xVGPRs; }

   bool hasVOPDInsts() const { return HasVOPDInsts; }

   bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }

   /// Return true if the target has the S_DELAY_ALU instruction.
   bool hasDelayAlu() const { return GFX11Insts; }

   bool hasPackedTID() const { return HasPackedTID; }

   // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that
   // hasGFX90AInsts is also true.
   bool hasGFX940Insts() const { return GFX940Insts; }

   // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that
   // hasGFX940Insts and hasGFX90AInsts are also true.
   bool hasGFX950Insts() const { return GFX950Insts; }

   /// Returns true if the target supports
   /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
   /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
   bool hasLDSLoadB96_B128() const {
     return hasGFX950Insts();
   }

   bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }

   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }

   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }

   bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }

   bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }

   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }

   /// \returns true if inline constants are not supported for F16 pseudo
   /// scalar transcendentals.
   bool hasNoF16PseudoScalarTransInlineConstants() const {
     return getGeneration() == GFX12;
   }

   /// \returns true if the target has instructions with xf32 format support.
   bool hasXF32Insts() const { return HasXF32Insts; }

   bool hasBitOp3Insts() const { return HasBitOp3Insts; }

   bool hasPermlane16Swap() const { return HasPermlane16Swap; }
   bool hasPermlane32Swap() const { return HasPermlane32Swap; }
   bool hasAshrPkInsts() const { return HasAshrPkInsts; }

   bool hasMinimum3Maximum3F32() const {
     return HasMinimum3Maximum3F32;
   }

   bool hasMinimum3Maximum3F16() const {
     return HasMinimum3Maximum3F16;
   }

   bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }

   bool hasTanhInsts() const { return HasTanhInsts; }

   bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }

   bool hasAddPC64Inst() const { return GFX1250Insts; }

   bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }

   bool hasMinimum3Maximum3PKF16() const {
     return HasMinimum3Maximum3PKF16;
   }

   bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }

   /// \returns true if the target has s_wait_xcnt insertion. Supported for
   /// GFX1250.
   bool hasWaitXCnt() const { return HasWaitXcnt; }

   // A single DWORD instructions can use a 64-bit literal.
   bool has64BitLiterals() const { return Has64BitLiterals; }

   bool hasPointSampleAccel() const { return HasPointSampleAccel; }

   bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }

   /// \returns The maximum number of instructions that can be enclosed in an
   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
   /// instruction.
   unsigned maxHardClauseLength() const { return MaxHardClauseLength; }

   bool hasPrngInst() const { return HasPrngInst; }

   bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }

   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
   /// VGPRs
   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
                                     unsigned DynamicVGPRBlockSize) const;

   /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
   /// be achieved when the only function running on a CU is \p F, each workgroup
   /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
   /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
   /// range, so this returns a range as well.
   ///
   /// Note that occupancy can be affected by the scratch allocation as well, but
   /// we do not have enough information to compute it.
   std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
                                                  unsigned LDSSize = 0,
                                                  unsigned NumSGPRs = 0,
                                                  unsigned NumVGPRs = 0) const;

   /// \returns true if the flat_scratch register should be initialized with the
   /// pointer to the wave's scratch memory rather than a size and offset.
   bool flatScratchIsPointer() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }

   /// \returns true if the flat_scratch register is initialized by the HW.
   /// In this case it is readonly.
   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }

   /// \returns true if the architected SGPRs are enabled.
   bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }

   /// \returns true if Global Data Share is supported.
   bool hasGDS() const { return HasGDS; }

   /// \returns true if Global Wave Sync is supported.
   bool hasGWS() const { return HasGWS; }

   /// \returns true if the machine has merged shaders in which s0-s7 are
   /// reserved by the hardware and user SGPRs start at s8
   bool hasMergedShaders() const {
     return getGeneration() >= GFX9;
   }

   // \returns true if the target supports the pre-NGG legacy geometry path.
   bool hasLegacyGeometry() const { return getGeneration() < GFX11; }

   // \returns true if preloading kernel arguments is supported.
   bool hasKernargPreload() const { return KernargPreload; }

   // \returns true if the target has split barriers feature
   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }

   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
   bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }

   // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
   // no-return form.
   bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }

   // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
   bool hasDX10ClampMode() const { return getGeneration() < GFX12; }

   // \returns true if the target has IEEE kernel descriptor mode bit
   bool hasIEEEMode() const { return getGeneration() < GFX12; }

   // \returns true if the target has IEEE fminimum/fmaximum instructions
   bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; }

   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
   bool hasRrWGMode() const { return getGeneration() >= GFX12; }

   /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
   /// values.
   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }

   bool hasGFX1250Insts() const { return GFX1250Insts; }

   bool hasVOPD3() const { return GFX1250Insts; }

   // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
   bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }

   // \returns true if the target has V_MAD_U32 instruction.
   bool hasMadU32Inst() const { return HasMadU32Inst; }

   // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
   bool hasVectorMulU64() const { return GFX1250Insts; }

   // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
   // instructions.
   bool hasMadU64U32NoCarry() const { return GFX1250Insts; }

   // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
   bool hasIntMinMax64() const { return GFX1250Insts; }

   // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
   bool hasAddMinMaxInsts() const { return HasAddMinMaxInsts; }

   // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
   bool hasPkAddMinMaxInsts() const { return HasPkAddMinMaxInsts; }

   // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
   bool hasPkMinMax3Insts() const { return GFX1250Insts; }

   // \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction.
   bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }

   // \returns true if target has S_SETPRIO_INC_WG instruction.
   bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }

   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
   // of sign-extending. Note that GFX1250 has not only fixed the bug but also
   // extended VA to 57 bits.
   bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }

   // \returns true if the target needs to create a prolog for backward
   // compatibility when preloading kernel arguments.
   bool needsKernArgPreloadProlog() const {
     return hasKernargPreload() && !GFX1250Insts;
   }

   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
   }

   /// \returns SGPR encoding granularity supported by the subtarget.
   unsigned getSGPREncodingGranule() const {
     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
   }

   /// \returns Total number of SGPRs supported by the subtarget.
   unsigned getTotalNumSGPRs() const {
     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
   }

   /// \returns Addressable number of SGPRs supported by the subtarget.
   unsigned getAddressableNumSGPRs() const {
     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
   }

   /// \returns Minimum number of SGPRs that meets the given number of waves per
   /// execution unit requirement supported by the subtarget.
   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
   }

   /// \returns Maximum number of SGPRs that meets the given number of waves per
   /// execution unit requirement supported by the subtarget.
   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
   }

   /// \returns Reserved number of SGPRs. This is common
   /// utility function called by MachineFunction and
   /// Function variants of getReservedNumSGPRs.
   unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
   /// \returns Reserved number of SGPRs for given machine function \p MF.
   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

   /// \returns Reserved number of SGPRs for given function \p F.
   unsigned getReservedNumSGPRs(const Function &F) const;

   /// \returns Maximum number of preloaded SGPRs for the subtarget.
   unsigned getMaxNumPreloadedSGPRs() const;

   /// \returns max num SGPRs. This is the common utility
   /// function called by MachineFunction and Function
   /// variants of getMaxNumSGPRs.
   unsigned getBaseMaxNumSGPRs(const Function &F,
                               std::pair<unsigned, unsigned> WavesPerEU,
                               unsigned PreloadedSGPRs,
                               unsigned ReservedNumSGPRs) const;

   /// \returns Maximum number of SGPRs that meets number of waves per execution
   /// unit requirement for function \p MF, or number of SGPRs explicitly
   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
   ///
   /// \returns Value that meets number of waves per execution unit requirement
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

   /// \returns Maximum number of SGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of SGPRs explicitly
   /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
   ///
   /// \returns Value that meets number of waves per execution unit requirement
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
   unsigned getMaxNumSGPRs(const Function &F) const;

   /// \returns VGPR allocation granularity supported by the subtarget.
   unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
     return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
   }

   /// \returns VGPR encoding granularity supported by the subtarget.
   unsigned getVGPREncodingGranule() const {
     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
   }

   /// \returns Total number of VGPRs supported by the subtarget.
   unsigned getTotalNumVGPRs() const {
     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
   }

   /// \returns Addressable number of architectural VGPRs supported by the
   /// subtarget.
   unsigned getAddressableNumArchVGPRs() const {
     return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
   }

   /// \returns Addressable number of VGPRs supported by the subtarget.
   unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
   }

   /// \returns the minimum number of VGPRs that will prevent achieving more than
   /// the specified number of waves \p WavesPerEU.
   unsigned getMinNumVGPRs(unsigned WavesPerEU,
                           unsigned DynamicVGPRBlockSize) const {
     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
                                            DynamicVGPRBlockSize);
   }

   /// \returns the maximum number of VGPRs that can be used and still achieved
   /// at least the specified number of waves \p WavesPerEU.
   unsigned getMaxNumVGPRs(unsigned WavesPerEU,
                           unsigned DynamicVGPRBlockSize) const {
     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
                                            DynamicVGPRBlockSize);
   }

   /// \returns max num VGPRs. This is the common utility function
   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
   unsigned
   getBaseMaxNumVGPRs(const Function &F,
                      std::pair<unsigned, unsigned> NumVGPRBounds) const;

   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
   ///
   /// \returns Value that meets number of waves per execution unit requirement
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
   unsigned getMaxNumVGPRs(const Function &F) const;

   unsigned getMaxNumAGPRs(const Function &F) const {
     return getMaxNumVGPRs(F);
   }

   /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
   /// of waves per execution unit required for the function \p MF.
   std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;

   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p MF, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
   ///
   /// \returns Value that meets number of waves per execution unit requirement
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;

   bool supportsWave32() const { return getGeneration() >= GFX10; }

   bool supportsWave64() const { return !hasGFX1250Insts(); }

   bool isWave32() const {
     return getWavefrontSize() == 32;
   }

   bool isWave64() const {
     return getWavefrontSize() == 64;
   }

   /// Returns if the wavesize of this subtarget is known reliable. This is false
   /// only for the a default target-cpu that does not have an explicit
   /// +wavefrontsize target feature.
   bool isWaveSizeKnown() const {
     return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
            hasFeature(AMDGPU::FeatureWavefrontSize64);
   }

   const TargetRegisterClass *getBoolRC() const {
     return getRegisterInfo()->getBoolRC();
   }

   /// \returns Maximum number of work groups per compute unit supported by the
   /// subtarget and limited by given \p FlatWorkGroupSize.
   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
   }

   /// \returns Minimum flat work group size supported by the subtarget.
   unsigned getMinFlatWorkGroupSize() const override {
     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
   }

   /// \returns Maximum flat work group size supported by the subtarget.
   unsigned getMaxFlatWorkGroupSize() const override {
     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
   }

   /// \returns Number of waves per execution unit required to support the given
   /// \p FlatWorkGroupSize.
   unsigned
   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
   }

   /// \returns Minimum number of waves per execution unit supported by the
   /// subtarget.
   unsigned getMinWavesPerEU() const override {
     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
   }

   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                              SDep &Dep,
                              const TargetSchedModel *SchedModel) const override;

   // \returns true if it's beneficial on this subtarget for the scheduler to
   // cluster stores as well as loads.
   bool shouldClusterStores() const { return getGeneration() >= GFX11; }

   // \returns the number of address arguments from which to enable MIMG NSA
   // on supported architectures.
   unsigned getNSAThreshold(const MachineFunction &MF) const;

   // \returns true if the subtarget has a hazard requiring an "s_nop 0"
   // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
   bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }

   // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
   // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
   bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }

   bool isDynamicVGPREnabled() const { return DynamicVGPR; }
   unsigned getDynamicVGPRBlockSize() const {
     return DynamicVGPRBlockSize32 ? 32 : 16;
   }

   bool requiresDisjointEarlyClobberAndUndef() const override {
     // AMDGPU doesn't care if early-clobber and undef operands are allocated
     // to the same register.
     return false;
   }

   // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
   // and surronded by S_WAIT_ALU(0xFFE3).
   bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
     return getGeneration() == GFX12;
   }

   // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
   // read.
   bool hasScratchBaseForwardingHazard() const {
     return GFX1250Insts && getGeneration() == GFX12;
   }

   /// \returns true if the subtarget supports clusters of workgroups.
   bool hasClusters() const { return HasClusters; }

   /// \returns true if the subtarget requires a wait for xcnt before atomic
   /// flat/global stores & rmw.
   bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }

   /// \returns the number of significant bits in the immediate field of the
   /// S_NOP instruction.
   unsigned getSNopBits() const {
     if (getGeneration() >= AMDGPUSubtarget::GFX12)
       return 7;
     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
       return 4;
     return 3;
   }

   /// \returns true if the sub-target supports buffer resource (V#) with 45-bit
   /// num_records.
   bool has45BitNumRecordsBufferResource() const {
     return Has45BitNumRecordsBufferResource;
   }

   bool requiresWaitsBeforeSystemScopeStores() const {
     return RequiresWaitsBeforeSystemScopeStores;
   }
 };

 class GCNUserSGPRUsageInfo {
 public:
   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

   bool hasDispatchPtr() const { return DispatchPtr; }

   bool hasQueuePtr() const { return QueuePtr; }

   bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

   bool hasDispatchID() const { return DispatchID; }

   bool hasFlatScratchInit() const { return FlatScratchInit; }

   bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }

   unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

   unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

   unsigned getNumFreeUserSGPRs();

   void allocKernargPreloadSGPRs(unsigned NumSGPRs);

   enum UserSGPRID : unsigned {
     ImplicitBufferPtrID = 0,
     PrivateSegmentBufferID = 1,
     DispatchPtrID = 2,
     QueuePtrID = 3,
     KernargSegmentPtrID = 4,
     DispatchIdID = 5,
     FlatScratchInitID = 6,
     PrivateSegmentSizeID = 7
   };

   // Returns the size in number of SGPRs for preload user SGPR field.
   static unsigned getNumUserSGPRForField(UserSGPRID ID) {
     switch (ID) {
     case ImplicitBufferPtrID:
       return 2;
     case PrivateSegmentBufferID:
       return 4;
     case DispatchPtrID:
       return 2;
     case QueuePtrID:
       return 2;
     case KernargSegmentPtrID:
       return 2;
     case DispatchIdID:
       return 2;
     case FlatScratchInitID:
       return 2;
     case PrivateSegmentSizeID:
       return 1;
     }
     llvm_unreachable("Unknown UserSGPRID.");
   }

   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

 private:
   const GCNSubtarget &ST;

   // Private memory buffer
   // Compute directly in sgpr[0:1]
   // Other shaders indirect 64-bits at sgpr[0:1]
   bool ImplicitBufferPtr = false;

   bool PrivateSegmentBuffer = false;

   bool DispatchPtr = false;

   bool QueuePtr = false;

   bool KernargSegmentPtr = false;

   bool DispatchID = false;

   bool FlatScratchInit = false;

   bool PrivateSegmentSize = false;

   unsigned NumKernargPreloadSGPRs = 0;

   unsigned NumUsedUserSGPRs = 0;
 };

 } // end namespace llvm

 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H