|  | //===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | // | 
|  | // This pass lowers the local data store, LDS, uses in kernel and non-kernel | 
|  | // functions in module to use dynamically allocated global memory. | 
|  | // Packed LDS Layout is emulated in the global memory. | 
|  | // The lowered memory instructions from LDS to global memory are then | 
|  | // instrumented for address sanitizer, to catch addressing errors. | 
|  | // This pass only work when address sanitizer has been enabled and has | 
|  | // instrumented the IR. It identifies that IR has been instrumented using | 
|  | // "nosanitize_address" module flag. | 
|  | // | 
|  | // Replacement of Kernel LDS accesses: | 
|  | //    For a kernel, LDS access can be static or dynamic which are direct | 
|  | //    (accessed within kernel) and indirect (accessed through non-kernels). | 
|  | //    All these LDS accesses corresponding to kernel will be packed together, | 
|  | //    where all static LDS accesses will be allocated first and then dynamic | 
|  | //    LDS follows. The total size with alignment is calculated. A new LDS global | 
|  | //    will be created for the kernel called "SW LDS" and it will have the | 
|  | //    attribute "amdgpu-lds-size" attached with value of the size calculated. | 
|  | //    All the LDS accesses in the module will be replaced by GEP with offset | 
|  | //    into the "Sw LDS". | 
|  | //    A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing | 
|  | //    the dynamic LDS. This will be marked used by kernel and will have | 
|  | //    MD_absolue_symbol metadata set to total static LDS size, Since dynamic | 
|  | //    LDS allocation starts after all static LDS allocation. | 
|  | // | 
|  | //    A device global memory equal to the total LDS size will be allocated. | 
|  | //    At the prologue of the kernel, a single work-item from the | 
|  | //    work-group, does a "malloc" and stores the pointer of the | 
|  | //    allocation in "SW LDS". | 
|  | // | 
|  | //    To store the offsets corresponding to all LDS accesses, another global | 
|  | //    variable is created which will be called "SW LDS metadata" in this pass. | 
|  | //    - SW LDS Global: | 
|  | //        It is LDS global of ptr type with name | 
|  | //        "llvm.amdgcn.sw.lds.<kernel-name>". | 
|  | //    - Metadata Global: | 
|  | //        It is of struct type, with n members. n equals the number of LDS | 
|  | //        globals accessed by the kernel(direct and indirect). Each member of | 
|  | //        struct is another struct of type {i32, i32, i32}. First member | 
|  | //        corresponds to offset, second member corresponds to size of LDS global | 
|  | //        being replaced and third represents the total aligned size. It will | 
|  | //        have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have | 
|  | //        an intializer with static LDS related offsets and sizes initialized. | 
|  | //        But for dynamic LDS related entries, offsets will be intialized to | 
|  | //        previous static LDS allocation end offset. Sizes for them will be zero | 
|  | //        initially. These dynamic LDS offset and size values will be updated | 
|  | //        within the kernel, since kernel can read the dynamic LDS size | 
|  | //        allocation done at runtime with query to "hidden_dynamic_lds_size" | 
|  | //        hidden kernel argument. | 
|  | // | 
|  | //    At the epilogue of kernel, allocated memory would be made free by the same | 
|  | //    single work-item. | 
|  | // | 
|  | // Replacement of non-kernel LDS accesses: | 
|  | //    Multiple kernels can access the same non-kernel function. | 
|  | //    All the kernels accessing LDS through non-kernels are sorted and | 
|  | //    assigned a kernel-id. All the LDS globals accessed by non-kernels | 
|  | //    are sorted. This information is used to build two tables: | 
|  | //    - Base table: | 
|  | //        Base table will have single row, with elements of the row | 
|  | //        placed as per kernel ID. Each element in the row corresponds | 
|  | //        to ptr of "SW LDS" variable created for that kernel. | 
|  | //    - Offset table: | 
|  | //        Offset table will have multiple rows and columns. | 
|  | //        Rows are assumed to be from 0 to (n-1). n is total number | 
|  | //        of kernels accessing the LDS through non-kernels. | 
|  | //        Each row will have m elements. m is the total number of | 
|  | //        unique LDS globals accessed by all non-kernels. | 
|  | //        Each element in the row correspond to the ptr of | 
|  | //        the replacement of LDS global done by that particular kernel. | 
|  | //    A LDS variable in non-kernel will be replaced based on the information | 
|  | //    from base and offset tables. Based on kernel-id query, ptr of "SW | 
|  | //    LDS" for that corresponding kernel is obtained from base table. | 
|  | //    The Offset into the base "SW LDS" is obtained from | 
|  | //    corresponding element in offset table. With this information, replacement | 
|  | //    value is obtained. | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #include "AMDGPU.h" | 
|  | #include "AMDGPUAsanInstrumentation.h" | 
|  | #include "AMDGPUMemoryUtils.h" | 
|  | #include "AMDGPUTargetMachine.h" | 
|  | #include "llvm/ADT/DenseMap.h" | 
|  | #include "llvm/ADT/DenseSet.h" | 
|  | #include "llvm/ADT/SetVector.h" | 
|  | #include "llvm/ADT/StringExtras.h" | 
|  | #include "llvm/ADT/StringRef.h" | 
|  | #include "llvm/Analysis/CallGraph.h" | 
|  | #include "llvm/Analysis/DomTreeUpdater.h" | 
|  | #include "llvm/CodeGen/TargetPassConfig.h" | 
|  | #include "llvm/IR/Constants.h" | 
|  | #include "llvm/IR/DIBuilder.h" | 
|  | #include "llvm/IR/DebugInfo.h" | 
|  | #include "llvm/IR/DebugInfoMetadata.h" | 
|  | #include "llvm/IR/IRBuilder.h" | 
|  | #include "llvm/IR/Instructions.h" | 
|  | #include "llvm/IR/IntrinsicsAMDGPU.h" | 
|  | #include "llvm/IR/MDBuilder.h" | 
|  | #include "llvm/IR/ReplaceConstant.h" | 
|  | #include "llvm/Pass.h" | 
|  | #include "llvm/Support/raw_ostream.h" | 
|  | #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h" | 
|  | #include "llvm/Transforms/Utils/ModuleUtils.h" | 
|  |  | 
|  | #include <algorithm> | 
|  |  | 
|  | #define DEBUG_TYPE "amdgpu-sw-lower-lds" | 
|  | #define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15 | 
|  |  | 
|  | using namespace llvm; | 
|  | using namespace AMDGPU; | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | cl::opt<bool> | 
|  | AsanInstrumentLDS("amdgpu-asan-instrument-lds", | 
|  | cl::desc("Run asan instrumentation on LDS instructions " | 
|  | "lowered to global memory"), | 
|  | cl::init(true), cl::Hidden); | 
|  |  | 
|  | using DomTreeCallback = function_ref<DominatorTree *(Function &F)>; | 
|  |  | 
|  | struct LDSAccessTypeInfo { | 
|  | SetVector<GlobalVariable *> StaticLDSGlobals; | 
|  | SetVector<GlobalVariable *> DynamicLDSGlobals; | 
|  | }; | 
|  |  | 
|  | // Struct to hold all the Metadata required for a kernel | 
|  | // to replace a LDS global uses with corresponding offset | 
|  | // in to device global memory. | 
|  | struct KernelLDSParameters { | 
|  | GlobalVariable *SwLDS = nullptr; | 
|  | GlobalVariable *SwDynLDS = nullptr; | 
|  | GlobalVariable *SwLDSMetadata = nullptr; | 
|  | LDSAccessTypeInfo DirectAccess; | 
|  | LDSAccessTypeInfo IndirectAccess; | 
|  | DenseMap<GlobalVariable *, SmallVector<uint32_t, 3>> | 
|  | LDSToReplacementIndicesMap; | 
|  | uint32_t MallocSize = 0; | 
|  | uint32_t LDSSize = 0; | 
|  | SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector; | 
|  | }; | 
|  |  | 
|  | // Struct to store information for creation of offset table | 
|  | // for all the non-kernel LDS accesses. | 
|  | struct NonKernelLDSParameters { | 
|  | GlobalVariable *LDSBaseTable = nullptr; | 
|  | GlobalVariable *LDSOffsetTable = nullptr; | 
|  | SetVector<Function *> OrderedKernels; | 
|  | SetVector<GlobalVariable *> OrdereLDSGlobals; | 
|  | }; | 
|  |  | 
|  | struct AsanInstrumentInfo { | 
|  | int Scale = 0; | 
|  | uint32_t Offset = 0; | 
|  | SetVector<Instruction *> Instructions; | 
|  | }; | 
|  |  | 
|  | struct FunctionsAndLDSAccess { | 
|  | DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap; | 
|  | SetVector<Function *> KernelsWithIndirectLDSAccess; | 
|  | SetVector<Function *> NonKernelsWithLDSArgument; | 
|  | SetVector<GlobalVariable *> AllNonKernelLDSAccess; | 
|  | FunctionVariableMap NonKernelToLDSAccessMap; | 
|  | }; | 
|  |  | 
|  | class AMDGPUSwLowerLDS { | 
|  | public: | 
|  | AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM, | 
|  | DomTreeCallback Callback) | 
|  | : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {} | 
|  | bool run(); | 
|  | void getUsesOfLDSByNonKernels(); | 
|  | void getNonKernelsWithLDSArguments(const CallGraph &CG); | 
|  | SetVector<Function *> | 
|  | getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels); | 
|  | SetVector<GlobalVariable *> | 
|  | getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables); | 
|  | void buildSwLDSGlobal(Function *Func); | 
|  | void buildSwDynLDSGlobal(Function *Func); | 
|  | void populateSwMetadataGlobal(Function *Func); | 
|  | void populateSwLDSAttributeAndMetadata(Function *Func); | 
|  | void populateLDSToReplacementIndicesMap(Function *Func); | 
|  | void getLDSMemoryInstructions(Function *Func, | 
|  | SetVector<Instruction *> &LDSInstructions); | 
|  | void replaceKernelLDSAccesses(Function *Func); | 
|  | Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr); | 
|  | void translateLDSMemoryOperationsToGlobalMemory( | 
|  | Function *Func, Value *LoadMallocPtr, | 
|  | SetVector<Instruction *> &LDSInstructions); | 
|  | void poisonRedzones(Function *Func, Value *MallocPtr); | 
|  | void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU); | 
|  | void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams); | 
|  | void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams); | 
|  | Constant * | 
|  | getAddressesOfVariablesInKernel(Function *Func, | 
|  | SetVector<GlobalVariable *> &Variables); | 
|  | void lowerNonKernelLDSAccesses(Function *Func, | 
|  | SetVector<GlobalVariable *> &LDSGlobals, | 
|  | NonKernelLDSParameters &NKLDSParams); | 
|  | void | 
|  | updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize, | 
|  | Value *HiddenDynLDSSize, | 
|  | SetVector<GlobalVariable *> &DynamicLDSGlobals); | 
|  | void initAsanInfo(); | 
|  |  | 
|  | private: | 
|  | Module &M; | 
|  | const AMDGPUTargetMachine &AMDGPUTM; | 
|  | IRBuilder<> IRB; | 
|  | DomTreeCallback DTCallback; | 
|  | FunctionsAndLDSAccess FuncLDSAccessInfo; | 
|  | AsanInstrumentInfo AsanInfo; | 
|  | }; | 
|  |  | 
|  | template <typename T> SetVector<T> sortByName(std::vector<T> &&V) { | 
|  | // Sort the vector of globals or Functions based on their name. | 
|  | // Returns a SetVector of globals/Functions. | 
|  | sort(V, [](const auto *L, const auto *R) { | 
|  | return L->getName() < R->getName(); | 
|  | }); | 
|  | return {SetVector<T>(llvm::from_range, V)}; | 
|  | } | 
|  |  | 
|  | SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals( | 
|  | SetVector<GlobalVariable *> &Variables) { | 
|  | // Sort all the non-kernel LDS accesses based on their name. | 
|  | return sortByName( | 
|  | std::vector<GlobalVariable *>(Variables.begin(), Variables.end())); | 
|  | } | 
|  |  | 
|  | SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels( | 
|  | SetVector<Function *> &Kernels) { | 
|  | // Sort the non-kernels accessing LDS based on their name. | 
|  | // Also assign a kernel ID metadata based on the sorted order. | 
|  | LLVMContext &Ctx = M.getContext(); | 
|  | if (Kernels.size() > UINT32_MAX) { | 
|  | report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels"); | 
|  | } | 
|  | SetVector<Function *> OrderedKernels = | 
|  | sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end())); | 
|  | for (size_t i = 0; i < Kernels.size(); i++) { | 
|  | Metadata *AttrMDArgs[1] = { | 
|  | ConstantAsMetadata::get(IRB.getInt32(i)), | 
|  | }; | 
|  | Function *Func = OrderedKernels[i]; | 
|  | Func->setMetadata("llvm.amdgcn.lds.kernel.id", | 
|  | MDNode::get(Ctx, AttrMDArgs)); | 
|  | } | 
|  | return OrderedKernels; | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { | 
|  | // Among the kernels accessing LDS, get list of | 
|  | // Non-kernels to which a call is made and a ptr | 
|  | // to addrspace(3) is passed as argument. | 
|  | for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) { | 
|  | Function *Func = K.first; | 
|  | const CallGraphNode *CGN = CG[Func]; | 
|  | if (!CGN) | 
|  | continue; | 
|  | for (auto &I : *CGN) { | 
|  | CallGraphNode *CallerCGN = I.second; | 
|  | Function *CalledFunc = CallerCGN->getFunction(); | 
|  | if (!CalledFunc || CalledFunc->isDeclaration()) | 
|  | continue; | 
|  | if (AMDGPU::isKernelLDS(CalledFunc)) | 
|  | continue; | 
|  | for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); | 
|  | AI != E; ++AI) { | 
|  | Type *ArgTy = (*AI).getType(); | 
|  | if (!ArgTy->isPointerTy()) | 
|  | continue; | 
|  | if (ArgTy->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) | 
|  | continue; | 
|  | FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc); | 
|  | // Also add the Calling function to KernelsWithIndirectLDSAccess list | 
|  | // so that base table of LDS is generated. | 
|  | FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { | 
|  | for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { | 
|  | if (!AMDGPU::isLDSVariableToLower(*GV)) | 
|  | continue; | 
|  |  | 
|  | for (User *V : GV->users()) { | 
|  | if (auto *I = dyn_cast<Instruction>(V)) { | 
|  | Function *F = I->getFunction(); | 
|  | if (!isKernelLDS(F) && !F->isDeclaration()) | 
|  | FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV, | 
|  | uint32_t Address) { | 
|  | // Write the specified address into metadata where it can be retrieved by | 
|  | // the assembler. Format is a half open range, [Address Address+1) | 
|  | LLVMContext &Ctx = M.getContext(); | 
|  | auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); | 
|  | MDBuilder MDB(Ctx); | 
|  | MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address), | 
|  | ConstantInt::get(IntTy, Address + 1)); | 
|  | GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode); | 
|  | } | 
|  |  | 
|  | static void addLDSSizeAttribute(Function *Func, uint32_t Offset, | 
|  | bool IsDynLDS) { | 
|  | if (Offset != 0) { | 
|  | std::string Buffer; | 
|  | raw_string_ostream SS{Buffer}; | 
|  | SS << Offset; | 
|  | if (IsDynLDS) | 
|  | SS << "," << Offset; | 
|  | Func->addFnAttr("amdgpu-lds-size", Buffer); | 
|  | } | 
|  | } | 
|  |  | 
|  | static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { | 
|  | BasicBlock *Entry = &Func->getEntryBlock(); | 
|  | IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); | 
|  |  | 
|  | Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(), | 
|  | Intrinsic::donothing, {}); | 
|  |  | 
|  | Value *UseInstance[1] = { | 
|  | Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; | 
|  |  | 
|  | Builder.CreateCall(Decl, {}, | 
|  | {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)}); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) { | 
|  | // Create new LDS global required for each kernel to store | 
|  | // device global memory pointer. | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | // Create new global pointer variable | 
|  | LDSParams.SwLDS = new GlobalVariable( | 
|  | M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage, | 
|  | PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(), | 
|  | nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); | 
|  | GlobalValue::SanitizerMetadata MD; | 
|  | MD.NoAddress = true; | 
|  | LDSParams.SwLDS->setSanitizerMetadata(MD); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) { | 
|  | // Create new Dyn LDS global if kernel accesses dyn LDS. | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() && | 
|  | LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) | 
|  | return; | 
|  | // Create new global pointer variable | 
|  | auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0); | 
|  | LDSParams.SwDynLDS = new GlobalVariable( | 
|  | M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, | 
|  | "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr, | 
|  | GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); | 
|  | markUsedByKernel(Func, LDSParams.SwDynLDS); | 
|  | GlobalValue::SanitizerMetadata MD; | 
|  | MD.NoAddress = true; | 
|  | LDSParams.SwDynLDS->setSanitizerMetadata(MD); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) { | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | bool IsDynLDSUsed = LDSParams.SwDynLDS; | 
|  | uint32_t Offset = LDSParams.LDSSize; | 
|  | recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0); | 
|  | addLDSSizeAttribute(Func, Offset, IsDynLDSUsed); | 
|  | if (LDSParams.SwDynLDS) | 
|  | recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) { | 
|  | // Create new metadata global for every kernel and initialize the | 
|  | // start offsets and sizes corresponding to each LDS accesses. | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | auto &Ctx = M.getContext(); | 
|  | auto &DL = M.getDataLayout(); | 
|  | std::vector<Type *> Items; | 
|  | Type *Int32Ty = IRB.getInt32Ty(); | 
|  | std::vector<Constant *> Initializers; | 
|  | Align MaxAlignment(1); | 
|  | auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) { | 
|  | Align GVAlign = AMDGPU::getAlign(DL, GV); | 
|  | MaxAlignment = std::max(MaxAlignment, GVAlign); | 
|  | }; | 
|  |  | 
|  | for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals) | 
|  | UpdateMaxAlignment(GV); | 
|  |  | 
|  | for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals) | 
|  | UpdateMaxAlignment(GV); | 
|  |  | 
|  | for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals) | 
|  | UpdateMaxAlignment(GV); | 
|  |  | 
|  | for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals) | 
|  | UpdateMaxAlignment(GV); | 
|  |  | 
|  | //{StartOffset, AlignedSizeInBytes} | 
|  | SmallString<128> MDItemStr; | 
|  | raw_svector_ostream MDItemOS(MDItemStr); | 
|  | MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item"; | 
|  |  | 
|  | StructType *LDSItemTy = | 
|  | StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str()); | 
|  | uint32_t &MallocSize = LDSParams.MallocSize; | 
|  | SetVector<GlobalVariable *> UniqueLDSGlobals; | 
|  | int AsanScale = AsanInfo.Scale; | 
|  | auto buildInitializerForSwLDSMD = | 
|  | [&](SetVector<GlobalVariable *> &LDSGlobals) { | 
|  | for (auto &GV : LDSGlobals) { | 
|  | if (is_contained(UniqueLDSGlobals, GV)) | 
|  | continue; | 
|  | UniqueLDSGlobals.insert(GV); | 
|  |  | 
|  | Type *Ty = GV->getValueType(); | 
|  | const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); | 
|  | Items.push_back(LDSItemTy); | 
|  | Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize); | 
|  | Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes); | 
|  | // Get redzone size corresponding a size. | 
|  | const uint64_t RightRedzoneSize = | 
|  | AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes); | 
|  | // Update MallocSize with current size and redzone size. | 
|  | MallocSize += SizeInBytes; | 
|  | if (!AMDGPU::isDynamicLDS(*GV)) | 
|  | LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize, | 
|  | RightRedzoneSize); | 
|  | MallocSize += RightRedzoneSize; | 
|  | // Align current size plus redzone. | 
|  | uint64_t AlignedSize = | 
|  | alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment); | 
|  | Constant *AlignedSizeInBytesConst = | 
|  | ConstantInt::get(Int32Ty, AlignedSize); | 
|  | // Align MallocSize | 
|  | MallocSize = alignTo(MallocSize, MaxAlignment); | 
|  | Constant *InitItem = | 
|  | ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst, | 
|  | AlignedSizeInBytesConst}); | 
|  | Initializers.push_back(InitItem); | 
|  | } | 
|  | }; | 
|  | SetVector<GlobalVariable *> SwLDSVector; | 
|  | SwLDSVector.insert(LDSParams.SwLDS); | 
|  | buildInitializerForSwLDSMD(SwLDSVector); | 
|  | buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals); | 
|  | buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals); | 
|  | buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals); | 
|  | buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals); | 
|  |  | 
|  | // Update the LDS size used by the kernel. | 
|  | Type *Ty = LDSParams.SwLDS->getValueType(); | 
|  | const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); | 
|  | uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment); | 
|  | LDSParams.LDSSize = AlignedSize; | 
|  | SmallString<128> MDTypeStr; | 
|  | raw_svector_ostream MDTypeOS(MDTypeStr); | 
|  | MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type"; | 
|  | StructType *MetadataStructType = | 
|  | StructType::create(Ctx, Items, MDTypeOS.str()); | 
|  | SmallString<128> MDStr; | 
|  | raw_svector_ostream MDOS(MDStr); | 
|  | MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md"; | 
|  | LDSParams.SwLDSMetadata = new GlobalVariable( | 
|  | M, MetadataStructType, false, GlobalValue::InternalLinkage, | 
|  | PoisonValue::get(MetadataStructType), MDOS.str(), nullptr, | 
|  | GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, false); | 
|  | Constant *data = ConstantStruct::get(MetadataStructType, Initializers); | 
|  | LDSParams.SwLDSMetadata->setInitializer(data); | 
|  | assert(LDSParams.SwLDS); | 
|  | // Set the alignment to MaxAlignment for SwLDS. | 
|  | LDSParams.SwLDS->setAlignment(MaxAlignment); | 
|  | if (LDSParams.SwDynLDS) | 
|  | LDSParams.SwDynLDS->setAlignment(MaxAlignment); | 
|  | GlobalValue::SanitizerMetadata MD; | 
|  | MD.NoAddress = true; | 
|  | LDSParams.SwLDSMetadata->setSanitizerMetadata(MD); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) { | 
|  | // Fill the corresponding LDS replacement indices for each LDS access | 
|  | // related to this kernel. | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | SetVector<GlobalVariable *> UniqueLDSGlobals; | 
|  | auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals, | 
|  | uint32_t &Idx) { | 
|  | for (auto &GV : LDSGlobals) { | 
|  | if (is_contained(UniqueLDSGlobals, GV)) | 
|  | continue; | 
|  | UniqueLDSGlobals.insert(GV); | 
|  | LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0}; | 
|  | ++Idx; | 
|  | } | 
|  | }; | 
|  | uint32_t Idx = 0; | 
|  | SetVector<GlobalVariable *> SwLDSVector; | 
|  | SwLDSVector.insert(LDSParams.SwLDS); | 
|  | PopulateIndices(SwLDSVector, Idx); | 
|  | PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx); | 
|  | PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx); | 
|  | PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx); | 
|  | PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx); | 
|  | } | 
|  |  | 
|  | static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV, | 
|  | Value *Replacement) { | 
|  | // Replace all uses of LDS global in this Function with a Replacement. | 
|  | auto ReplaceUsesLambda = [Func](const Use &U) -> bool { | 
|  | auto *V = U.getUser(); | 
|  | if (auto *Inst = dyn_cast<Instruction>(V)) { | 
|  | auto *Func1 = Inst->getParent()->getParent(); | 
|  | if (Func == Func1) | 
|  | return true; | 
|  | } | 
|  | return false; | 
|  | }; | 
|  | GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) { | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | GlobalVariable *SwLDS = LDSParams.SwLDS; | 
|  | assert(SwLDS); | 
|  | GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; | 
|  | assert(SwLDSMetadata); | 
|  | StructType *SwLDSMetadataStructType = | 
|  | cast<StructType>(SwLDSMetadata->getValueType()); | 
|  | Type *Int32Ty = IRB.getInt32Ty(); | 
|  | auto &IndirectAccess = LDSParams.IndirectAccess; | 
|  | auto &DirectAccess = LDSParams.DirectAccess; | 
|  | // Replace all uses of LDS global in this Function with a Replacement. | 
|  | SetVector<GlobalVariable *> UniqueLDSGlobals; | 
|  | auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) { | 
|  | for (auto &GV : LDSGlobals) { | 
|  | // Do not generate instructions if LDS access is in non-kernel | 
|  | // i.e indirect-access. | 
|  | if ((IndirectAccess.StaticLDSGlobals.contains(GV) || | 
|  | IndirectAccess.DynamicLDSGlobals.contains(GV)) && | 
|  | (!DirectAccess.StaticLDSGlobals.contains(GV) && | 
|  | !DirectAccess.DynamicLDSGlobals.contains(GV))) | 
|  | continue; | 
|  | if (is_contained(UniqueLDSGlobals, GV)) | 
|  | continue; | 
|  | UniqueLDSGlobals.insert(GV); | 
|  | auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV]; | 
|  | assert(Indices.size() == 3); | 
|  | Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]), | 
|  | ConstantInt::get(Int32Ty, Indices[1]), | 
|  | ConstantInt::get(Int32Ty, Indices[2])}; | 
|  | Constant *GEP = ConstantExpr::getGetElementPtr( | 
|  | SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true); | 
|  | Value *Offset = IRB.CreateLoad(Int32Ty, GEP); | 
|  | Value *BasePlusOffset = | 
|  | IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset}); | 
|  | LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ", | 
|  | false)); | 
|  | replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset); | 
|  | } | 
|  | }; | 
|  | ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals); | 
|  | ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals); | 
|  | ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals); | 
|  | ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS( | 
|  | Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize, | 
|  | SetVector<GlobalVariable *> &DynamicLDSGlobals) { | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | Type *Int32Ty = IRB.getInt32Ty(); | 
|  |  | 
|  | GlobalVariable *SwLDS = LDSParams.SwLDS; | 
|  | GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; | 
|  | assert(SwLDS && SwLDSMetadata); | 
|  | StructType *MetadataStructType = | 
|  | cast<StructType>(SwLDSMetadata->getValueType()); | 
|  | unsigned MaxAlignment = SwLDS->getAlignment(); | 
|  | Value *MaxAlignValue = IRB.getInt32(MaxAlignment); | 
|  | Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1); | 
|  |  | 
|  | for (GlobalVariable *DynGV : DynamicLDSGlobals) { | 
|  | auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV]; | 
|  | // Update the Offset metadata. | 
|  | Constant *Index0 = ConstantInt::get(Int32Ty, 0); | 
|  | Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]); | 
|  |  | 
|  | Constant *Index2Offset = ConstantInt::get(Int32Ty, 0); | 
|  | auto *GEPForOffset = IRB.CreateInBoundsGEP( | 
|  | MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset}); | 
|  |  | 
|  | IRB.CreateStore(*CurrMallocSize, GEPForOffset); | 
|  | // Update the size and Aligned Size metadata. | 
|  | Constant *Index2Size = ConstantInt::get(Int32Ty, 1); | 
|  | auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata, | 
|  | {Index0, Index1, Index2Size}); | 
|  |  | 
|  | Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize); | 
|  | IRB.CreateStore(CurrDynLDSSize, GEPForSize); | 
|  | Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2); | 
|  | auto *GEPForAlignedSize = IRB.CreateInBoundsGEP( | 
|  | MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize}); | 
|  |  | 
|  | Value *AlignedDynLDSSize = | 
|  | IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne); | 
|  | AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue); | 
|  | AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue); | 
|  | IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize); | 
|  |  | 
|  | // Update the Current Malloc Size | 
|  | *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize); | 
|  | } | 
|  | } | 
|  |  | 
|  | static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, | 
|  | DISubprogram *SP) { | 
|  | assert(InsertBefore); | 
|  | if (InsertBefore->getDebugLoc()) | 
|  | return InsertBefore->getDebugLoc(); | 
|  | if (SP) | 
|  | return DILocation::get(SP->getContext(), SP->getLine(), 1, SP); | 
|  | return DebugLoc(); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::getLDSMemoryInstructions( | 
|  | Function *Func, SetVector<Instruction *> &LDSInstructions) { | 
|  | for (BasicBlock &BB : *Func) { | 
|  | for (Instruction &Inst : BB) { | 
|  | if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) { | 
|  | if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) | 
|  | LDSInstructions.insert(&Inst); | 
|  | } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) { | 
|  | if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) | 
|  | LDSInstructions.insert(&Inst); | 
|  | } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) { | 
|  | if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) | 
|  | LDSInstructions.insert(&Inst); | 
|  | } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) { | 
|  | if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) | 
|  | LDSInstructions.insert(&Inst); | 
|  | } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) { | 
|  | if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && | 
|  | ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) | 
|  | LDSInstructions.insert(&Inst); | 
|  | } else | 
|  | continue; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, | 
|  | Value *LDSPtr) { | 
|  | assert(LDSPtr && "Invalid LDS pointer operand"); | 
|  | Type *LDSPtrType = LDSPtr->getType(); | 
|  | LLVMContext &Ctx = M.getContext(); | 
|  | const DataLayout &DL = M.getDataLayout(); | 
|  | Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); | 
|  | if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) { | 
|  | // Handle vector of pointers | 
|  | ElementCount NumElements = VecPtrTy->getElementCount(); | 
|  | IntTy = VectorType::get(IntTy, NumElements); | 
|  | } | 
|  | Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy); | 
|  | return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex}); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( | 
|  | Function *Func, Value *LoadMallocPtr, | 
|  | SetVector<Instruction *> &LDSInstructions) { | 
|  | LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : " | 
|  | << Func->getName()); | 
|  | for (Instruction *Inst : LDSInstructions) { | 
|  | IRB.SetInsertPoint(Inst); | 
|  | if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { | 
|  | Value *LIOperand = LI->getPointerOperand(); | 
|  | Value *Replacement = | 
|  | getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand); | 
|  | LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement, | 
|  | LI->getAlign(), LI->isVolatile()); | 
|  | NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); | 
|  | AsanInfo.Instructions.insert(NewLI); | 
|  | LI->replaceAllUsesWith(NewLI); | 
|  | LI->eraseFromParent(); | 
|  | } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { | 
|  | Value *SIOperand = SI->getPointerOperand(); | 
|  | Value *Replacement = | 
|  | getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand); | 
|  | StoreInst *NewSI = IRB.CreateAlignedStore( | 
|  | SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile()); | 
|  | NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); | 
|  | AsanInfo.Instructions.insert(NewSI); | 
|  | SI->replaceAllUsesWith(NewSI); | 
|  | SI->eraseFromParent(); | 
|  | } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { | 
|  | Value *RMWPtrOperand = RMW->getPointerOperand(); | 
|  | Value *RMWValOperand = RMW->getValOperand(); | 
|  | Value *Replacement = | 
|  | getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand); | 
|  | AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW( | 
|  | RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(), | 
|  | RMW->getOrdering(), RMW->getSyncScopeID()); | 
|  | NewRMW->setVolatile(RMW->isVolatile()); | 
|  | AsanInfo.Instructions.insert(NewRMW); | 
|  | RMW->replaceAllUsesWith(NewRMW); | 
|  | RMW->eraseFromParent(); | 
|  | } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) { | 
|  | Value *XCHGPtrOperand = XCHG->getPointerOperand(); | 
|  | Value *Replacement = | 
|  | getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand); | 
|  | AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg( | 
|  | Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(), | 
|  | XCHG->getAlign(), XCHG->getSuccessOrdering(), | 
|  | XCHG->getFailureOrdering(), XCHG->getSyncScopeID()); | 
|  | NewXCHG->setVolatile(XCHG->isVolatile()); | 
|  | AsanInfo.Instructions.insert(NewXCHG); | 
|  | XCHG->replaceAllUsesWith(NewXCHG); | 
|  | XCHG->eraseFromParent(); | 
|  | } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) { | 
|  | Value *AIOperand = ASC->getPointerOperand(); | 
|  | Value *Replacement = | 
|  | getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand); | 
|  | Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType()); | 
|  | // Note: No need to add the instruction to AsanInfo instructions to be | 
|  | // instrumented list. FLAT_ADDRESS ptr would have been already | 
|  | // instrumented by asan pass prior to this pass. | 
|  | ASC->replaceAllUsesWith(NewAI); | 
|  | ASC->eraseFromParent(); | 
|  | } else | 
|  | report_fatal_error("Unimplemented LDS lowering instruction"); | 
|  | } | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) { | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | Type *Int64Ty = IRB.getInt64Ty(); | 
|  | Type *VoidTy = IRB.getVoidTy(); | 
|  | FunctionCallee AsanPoisonRegion = M.getOrInsertFunction( | 
|  | "__asan_poison_region", | 
|  | FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false)); | 
|  |  | 
|  | auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector; | 
|  | size_t VecSize = RedzonesVec.size(); | 
|  | for (unsigned i = 0; i < VecSize; i++) { | 
|  | auto &RedzonePair = RedzonesVec[i]; | 
|  | uint64_t RedzoneOffset = RedzonePair.first; | 
|  | uint64_t RedzoneSize = RedzonePair.second; | 
|  | Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP( | 
|  | IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)}); | 
|  | Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty); | 
|  | IRB.CreateCall(AsanPoisonRegion, | 
|  | {RedzoneAddress, IRB.getInt64(RedzoneSize)}); | 
|  | } | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, | 
|  | DomTreeUpdater &DTU) { | 
|  | LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName()); | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | auto &Ctx = M.getContext(); | 
|  | auto *PrevEntryBlock = &Func->getEntryBlock(); | 
|  | SetVector<Instruction *> LDSInstructions; | 
|  | getLDSMemoryInstructions(Func, LDSInstructions); | 
|  |  | 
|  | // Create malloc block. | 
|  | auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock); | 
|  |  | 
|  | // Create WIdBlock block which has instructions related to selection of | 
|  | // {0,0,0} indiex work item in the work group. | 
|  | auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock); | 
|  | IRB.SetInsertPoint(WIdBlock, WIdBlock->begin()); | 
|  | DebugLoc FirstDL = | 
|  | getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram()); | 
|  | IRB.SetCurrentDebugLocation(FirstDL); | 
|  | Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); | 
|  | Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}); | 
|  | Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}); | 
|  | Value *XYOr = IRB.CreateOr(WIdx, WIdy); | 
|  | Value *XYZOr = IRB.CreateOr(XYOr, WIdz); | 
|  | Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0)); | 
|  |  | 
|  | // All work items will branch to PrevEntryBlock except {0,0,0} index | 
|  | // work item which will branch to malloc block. | 
|  | IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock); | 
|  |  | 
|  | // Malloc block | 
|  | IRB.SetInsertPoint(MallocBlock, MallocBlock->begin()); | 
|  |  | 
|  | // If Dynamic LDS globals are accessed by the kernel, | 
|  | // Get the size of dyn lds from hidden dyn_lds_size kernel arg. | 
|  | // Update the corresponding metadata global entries for this dyn lds global. | 
|  | GlobalVariable *SwLDS = LDSParams.SwLDS; | 
|  | GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; | 
|  | assert(SwLDS && SwLDSMetadata); | 
|  | StructType *MetadataStructType = | 
|  | cast<StructType>(SwLDSMetadata->getValueType()); | 
|  | uint32_t MallocSize = 0; | 
|  | Value *CurrMallocSize; | 
|  | Type *Int32Ty = IRB.getInt32Ty(); | 
|  | Type *Int64Ty = IRB.getInt64Ty(); | 
|  |  | 
|  | SetVector<GlobalVariable *> UniqueLDSGlobals; | 
|  | auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) { | 
|  | for (auto &GV : LDSGlobals) { | 
|  | if (is_contained(UniqueLDSGlobals, GV)) | 
|  | continue; | 
|  | UniqueLDSGlobals.insert(GV); | 
|  | } | 
|  | }; | 
|  |  | 
|  | GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals); | 
|  | GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals); | 
|  | unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size(); | 
|  | UniqueLDSGlobals.clear(); | 
|  |  | 
|  | if (NumStaticLDS) { | 
|  | auto *GEPForEndStaticLDSOffset = | 
|  | IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata, | 
|  | {ConstantInt::get(Int32Ty, 0), | 
|  | ConstantInt::get(Int32Ty, NumStaticLDS - 1), | 
|  | ConstantInt::get(Int32Ty, 0)}); | 
|  |  | 
|  | auto *GEPForEndStaticLDSSize = | 
|  | IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata, | 
|  | {ConstantInt::get(Int32Ty, 0), | 
|  | ConstantInt::get(Int32Ty, NumStaticLDS - 1), | 
|  | ConstantInt::get(Int32Ty, 2)}); | 
|  |  | 
|  | Value *EndStaticLDSOffset = | 
|  | IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset); | 
|  | Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize); | 
|  | CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize); | 
|  | } else | 
|  | CurrMallocSize = IRB.getInt32(MallocSize); | 
|  |  | 
|  | if (LDSParams.SwDynLDS) { | 
|  | if (!(AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5)) | 
|  | report_fatal_error( | 
|  | "Dynamic LDS size query is only supported for CO V5 and later."); | 
|  | // Get size from hidden dyn_lds_size argument of kernel | 
|  | Value *ImplicitArg = | 
|  | IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}); | 
|  | Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP( | 
|  | ImplicitArg->getType(), ImplicitArg, | 
|  | {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)}); | 
|  | UniqueLDSGlobals.clear(); | 
|  | GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals); | 
|  | GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals); | 
|  | updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize, | 
|  | UniqueLDSGlobals); | 
|  | } | 
|  |  | 
|  | CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty); | 
|  |  | 
|  | // Create a call to malloc function which does device global memory allocation | 
|  | // with size equals to all LDS global accesses size in this kernel. | 
|  | Value *ReturnAddress = | 
|  | IRB.CreateIntrinsic(Intrinsic::returnaddress, {IRB.getInt32(0)}); | 
|  | FunctionCallee MallocFunc = M.getOrInsertFunction( | 
|  | StringRef("__asan_malloc_impl"), | 
|  | FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false)); | 
|  | Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty); | 
|  | Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt}); | 
|  |  | 
|  | Value *MallocPtr = | 
|  | IRB.CreateIntToPtr(MallocCall, IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS)); | 
|  |  | 
|  | // Create store of malloc to new global | 
|  | IRB.CreateStore(MallocPtr, SwLDS); | 
|  |  | 
|  | // Create calls to __asan_poison_region to poison redzones. | 
|  | poisonRedzones(Func, MallocPtr); | 
|  |  | 
|  | // Create branch to PrevEntryBlock | 
|  | IRB.CreateBr(PrevEntryBlock); | 
|  |  | 
|  | // Create wave-group barrier at the starting of Previous entry block | 
|  | Type *Int1Ty = IRB.getInt1Ty(); | 
|  | IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin()); | 
|  | auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond"); | 
|  | XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock); | 
|  | XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock); | 
|  |  | 
|  | IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}); | 
|  |  | 
|  | // Load malloc pointer from Sw LDS. | 
|  | Value *LoadMallocPtr = | 
|  | IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), SwLDS); | 
|  |  | 
|  | // Replace All uses of LDS globals with new LDS pointers. | 
|  | replaceKernelLDSAccesses(Func); | 
|  |  | 
|  | // Replace Memory Operations on LDS with corresponding | 
|  | // global memory pointers. | 
|  | translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr, | 
|  | LDSInstructions); | 
|  |  | 
|  | auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func); | 
|  | auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func); | 
|  | auto *EndBlock = BasicBlock::Create(Ctx, "End", Func); | 
|  | for (BasicBlock &BB : *Func) { | 
|  | if (!BB.empty()) { | 
|  | if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) { | 
|  | RI->eraseFromParent(); | 
|  | IRB.SetInsertPoint(&BB, BB.end()); | 
|  | IRB.CreateBr(CondFreeBlock); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Cond Free Block | 
|  | IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin()); | 
|  | IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}); | 
|  | IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock); | 
|  |  | 
|  | // Free Block | 
|  | IRB.SetInsertPoint(FreeBlock, FreeBlock->begin()); | 
|  |  | 
|  | // Free the previously allocate device global memory. | 
|  | FunctionCallee AsanFreeFunc = M.getOrInsertFunction( | 
|  | StringRef("__asan_free_impl"), | 
|  | FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); | 
|  | Value *ReturnAddr = | 
|  | IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0)); | 
|  | Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); | 
|  | Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); | 
|  | IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); | 
|  |  | 
|  | IRB.CreateBr(EndBlock); | 
|  |  | 
|  | // End Block | 
|  | IRB.SetInsertPoint(EndBlock, EndBlock->begin()); | 
|  | IRB.CreateRetVoid(); | 
|  | // Update the DomTree with corresponding links to basic blocks. | 
|  | DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock}, | 
|  | {DominatorTree::Insert, MallocBlock, PrevEntryBlock}, | 
|  | {DominatorTree::Insert, CondFreeBlock, FreeBlock}, | 
|  | {DominatorTree::Insert, FreeBlock, EndBlock}}); | 
|  | } | 
|  |  | 
|  | Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel( | 
|  | Function *Func, SetVector<GlobalVariable *> &Variables) { | 
|  | Type *Int32Ty = IRB.getInt32Ty(); | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  |  | 
|  | GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata; | 
|  | assert(SwLDSMetadata); | 
|  | auto *SwLDSMetadataStructType = | 
|  | cast<StructType>(SwLDSMetadata->getValueType()); | 
|  | ArrayType *KernelOffsetsType = | 
|  | ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), Variables.size()); | 
|  |  | 
|  | SmallVector<Constant *> Elements; | 
|  | for (auto *GV : Variables) { | 
|  | auto It = LDSParams.LDSToReplacementIndicesMap.find(GV); | 
|  | if (It == LDSParams.LDSToReplacementIndicesMap.end()) { | 
|  | Elements.push_back( | 
|  | PoisonValue::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS))); | 
|  | continue; | 
|  | } | 
|  | auto &Indices = It->second; | 
|  | Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]), | 
|  | ConstantInt::get(Int32Ty, Indices[1]), | 
|  | ConstantInt::get(Int32Ty, Indices[2])}; | 
|  | Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType, | 
|  | SwLDSMetadata, GEPIdx, true); | 
|  | Elements.push_back(GEP); | 
|  | } | 
|  | return ConstantArray::get(KernelOffsetsType, Elements); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable( | 
|  | NonKernelLDSParameters &NKLDSParams) { | 
|  | // Base table will have single row, with elements of the row | 
|  | // placed as per kernel ID. Each element in the row corresponds | 
|  | // to addresss of "SW LDS" global of the kernel. | 
|  | auto &Kernels = NKLDSParams.OrderedKernels; | 
|  | if (Kernels.empty()) | 
|  | return; | 
|  | Type *Int32Ty = IRB.getInt32Ty(); | 
|  | const size_t NumberKernels = Kernels.size(); | 
|  | ArrayType *AllKernelsOffsetsType = | 
|  | ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels); | 
|  | std::vector<Constant *> OverallConstantExprElts(NumberKernels); | 
|  | for (size_t i = 0; i < NumberKernels; i++) { | 
|  | Function *Func = Kernels[i]; | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | GlobalVariable *SwLDS = LDSParams.SwLDS; | 
|  | assert(SwLDS); | 
|  | Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)}; | 
|  | Constant *GEP = | 
|  | ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true); | 
|  | OverallConstantExprElts[i] = GEP; | 
|  | } | 
|  | Constant *init = | 
|  | ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts); | 
|  | NKLDSParams.LDSBaseTable = new GlobalVariable( | 
|  | M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init, | 
|  | "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal, | 
|  | AMDGPUAS::GLOBAL_ADDRESS); | 
|  | GlobalValue::SanitizerMetadata MD; | 
|  | MD.NoAddress = true; | 
|  | NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable( | 
|  | NonKernelLDSParameters &NKLDSParams) { | 
|  | // Offset table will have multiple rows and columns. | 
|  | // Rows are assumed to be from 0 to (n-1). n is total number | 
|  | // of kernels accessing the LDS through non-kernels. | 
|  | // Each row will have m elements. m is the total number of | 
|  | // unique LDS globals accessed by non-kernels. | 
|  | // Each element in the row correspond to the address of | 
|  | // the replacement of LDS global done by that particular kernel. | 
|  | auto &Variables = NKLDSParams.OrdereLDSGlobals; | 
|  | auto &Kernels = NKLDSParams.OrderedKernels; | 
|  | if (Variables.empty() || Kernels.empty()) | 
|  | return; | 
|  | const size_t NumberVariables = Variables.size(); | 
|  | const size_t NumberKernels = Kernels.size(); | 
|  |  | 
|  | ArrayType *KernelOffsetsType = | 
|  | ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables); | 
|  |  | 
|  | ArrayType *AllKernelsOffsetsType = | 
|  | ArrayType::get(KernelOffsetsType, NumberKernels); | 
|  | std::vector<Constant *> overallConstantExprElts(NumberKernels); | 
|  | for (size_t i = 0; i < NumberKernels; i++) { | 
|  | Function *Func = Kernels[i]; | 
|  | overallConstantExprElts[i] = | 
|  | getAddressesOfVariablesInKernel(Func, Variables); | 
|  | } | 
|  | Constant *Init = | 
|  | ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts); | 
|  | NKLDSParams.LDSOffsetTable = new GlobalVariable( | 
|  | M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init, | 
|  | "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal, | 
|  | AMDGPUAS::GLOBAL_ADDRESS); | 
|  | GlobalValue::SanitizerMetadata MD; | 
|  | MD.NoAddress = true; | 
|  | NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses( | 
|  | Function *Func, SetVector<GlobalVariable *> &LDSGlobals, | 
|  | NonKernelLDSParameters &NKLDSParams) { | 
|  | // Replace LDS access in non-kernel with replacement queried from | 
|  | // Base table and offset from offset table. | 
|  | LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : " | 
|  | << Func->getName()); | 
|  | auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); | 
|  | IRB.SetInsertPoint(InsertAt); | 
|  |  | 
|  | // Get LDS memory instructions. | 
|  | SetVector<Instruction *> LDSInstructions; | 
|  | getLDSMemoryInstructions(Func, LDSInstructions); | 
|  |  | 
|  | auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}); | 
|  | GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable; | 
|  | GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable; | 
|  | auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals; | 
|  | Value *BaseGEP = IRB.CreateInBoundsGEP( | 
|  | LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId}); | 
|  | Value *BaseLoad = | 
|  | IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP); | 
|  | Value *LoadMallocPtr = | 
|  | IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad); | 
|  |  | 
|  | for (GlobalVariable *GV : LDSGlobals) { | 
|  | const auto *GVIt = llvm::find(OrdereLDSGlobals, GV); | 
|  | assert(GVIt != OrdereLDSGlobals.end()); | 
|  | uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt); | 
|  |  | 
|  | Value *OffsetGEP = IRB.CreateInBoundsGEP( | 
|  | LDSOffsetTable->getValueType(), LDSOffsetTable, | 
|  | {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)}); | 
|  | Value *OffsetLoad = | 
|  | IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP); | 
|  | Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad); | 
|  | Value *BasePlusOffset = | 
|  | IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset}); | 
|  | LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for " | 
|  | << GV->getName()); | 
|  | replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset); | 
|  | } | 
|  | translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr, | 
|  | LDSInstructions); | 
|  | } | 
|  |  | 
|  | static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) { | 
|  | // Sort Static, dynamic LDS globals which are either | 
|  | // direct or indirect access on basis of name. | 
|  | auto &DirectAccess = LDSParams.DirectAccess; | 
|  | auto &IndirectAccess = LDSParams.IndirectAccess; | 
|  | LDSParams.DirectAccess.StaticLDSGlobals = sortByName( | 
|  | std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(), | 
|  | DirectAccess.StaticLDSGlobals.end())); | 
|  | LDSParams.DirectAccess.DynamicLDSGlobals = sortByName( | 
|  | std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(), | 
|  | DirectAccess.DynamicLDSGlobals.end())); | 
|  | LDSParams.IndirectAccess.StaticLDSGlobals = sortByName( | 
|  | std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(), | 
|  | IndirectAccess.StaticLDSGlobals.end())); | 
|  | LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName( | 
|  | std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(), | 
|  | IndirectAccess.DynamicLDSGlobals.end())); | 
|  | } | 
|  |  | 
|  | void AMDGPUSwLowerLDS::initAsanInfo() { | 
|  | // Get Shadow mapping scale and offset. | 
|  | unsigned LongSize = | 
|  | M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS); | 
|  | uint64_t Offset; | 
|  | int Scale; | 
|  | bool OrShadowOffset; | 
|  | llvm::getAddressSanitizerParams(AMDGPUTM.getTargetTriple(), LongSize, false, | 
|  | &Offset, &Scale, &OrShadowOffset); | 
|  | AsanInfo.Scale = Scale; | 
|  | AsanInfo.Offset = Offset; | 
|  | } | 
|  |  | 
|  | static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) { | 
|  | for (auto &K : LDSAccesses) { | 
|  | Function *F = K.first; | 
|  | if (!F) | 
|  | continue; | 
|  | if (F->hasFnAttribute(Attribute::SanitizeAddress)) | 
|  | return true; | 
|  | } | 
|  | return false; | 
|  | } | 
|  |  | 
|  | bool AMDGPUSwLowerLDS::run() { | 
|  | bool Changed = false; | 
|  |  | 
|  | CallGraph CG = CallGraph(M); | 
|  |  | 
|  | Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); | 
|  |  | 
|  | // Get all the direct and indirect access of LDS for all the kernels. | 
|  | LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); | 
|  |  | 
|  | // Flag to decide whether to lower all the LDS accesses | 
|  | // based on sanitize_address attribute. | 
|  | bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.direct_access) || | 
|  | hasFnWithSanitizeAddressAttr(LDSUsesInfo.indirect_access); | 
|  |  | 
|  | if (!LowerAllLDS) | 
|  | return Changed; | 
|  |  | 
|  | // Utility to group LDS access into direct, indirect, static and dynamic. | 
|  | auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses, | 
|  | bool DirectAccess) { | 
|  | for (auto &K : LDSAccesses) { | 
|  | Function *F = K.first; | 
|  | if (!F || K.second.empty()) | 
|  | continue; | 
|  |  | 
|  | assert(isKernelLDS(F)); | 
|  |  | 
|  | // Only inserts if key isn't already in the map. | 
|  | FuncLDSAccessInfo.KernelToLDSParametersMap.insert( | 
|  | {F, KernelLDSParameters()}); | 
|  |  | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F]; | 
|  | if (!DirectAccess) | 
|  | FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F); | 
|  | for (GlobalVariable *GV : K.second) { | 
|  | if (!DirectAccess) { | 
|  | if (AMDGPU::isDynamicLDS(*GV)) | 
|  | LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV); | 
|  | else | 
|  | LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV); | 
|  | FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV); | 
|  | } else { | 
|  | if (AMDGPU::isDynamicLDS(*GV)) | 
|  | LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV); | 
|  | else | 
|  | LDSParams.DirectAccess.StaticLDSGlobals.insert(GV); | 
|  | } | 
|  | } | 
|  | } | 
|  | }; | 
|  |  | 
|  | PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true); | 
|  | PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false); | 
|  |  | 
|  | // Get address sanitizer scale. | 
|  | initAsanInfo(); | 
|  |  | 
|  | for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) { | 
|  | Function *Func = K.first; | 
|  | auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func]; | 
|  | if (LDSParams.DirectAccess.StaticLDSGlobals.empty() && | 
|  | LDSParams.DirectAccess.DynamicLDSGlobals.empty() && | 
|  | LDSParams.IndirectAccess.StaticLDSGlobals.empty() && | 
|  | LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) { | 
|  | Changed = false; | 
|  | } else { | 
|  | removeFnAttrFromReachable( | 
|  | CG, Func, | 
|  | {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y", | 
|  | "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"}); | 
|  | if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() || | 
|  | !LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) | 
|  | removeFnAttrFromReachable(CG, Func, {"amdgpu-no-lds-kernel-id"}); | 
|  | reorderStaticDynamicIndirectLDSSet(LDSParams); | 
|  | buildSwLDSGlobal(Func); | 
|  | buildSwDynLDSGlobal(Func); | 
|  | populateSwMetadataGlobal(Func); | 
|  | populateSwLDSAttributeAndMetadata(Func); | 
|  | populateLDSToReplacementIndicesMap(Func); | 
|  | DomTreeUpdater DTU(DTCallback(*Func), | 
|  | DomTreeUpdater::UpdateStrategy::Lazy); | 
|  | lowerKernelLDSAccesses(Func, DTU); | 
|  | Changed = true; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Get the Uses of LDS from non-kernels. | 
|  | getUsesOfLDSByNonKernels(); | 
|  |  | 
|  | // Get non-kernels with LDS ptr as argument and called by kernels. | 
|  | getNonKernelsWithLDSArguments(CG); | 
|  |  | 
|  | // Lower LDS accesses in non-kernels. | 
|  | if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() || | 
|  | !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) { | 
|  | NonKernelLDSParameters NKLDSParams; | 
|  | NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels( | 
|  | FuncLDSAccessInfo.KernelsWithIndirectLDSAccess); | 
|  | NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals( | 
|  | FuncLDSAccessInfo.AllNonKernelLDSAccess); | 
|  | buildNonKernelLDSBaseTable(NKLDSParams); | 
|  | buildNonKernelLDSOffsetTable(NKLDSParams); | 
|  | for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) { | 
|  | Function *Func = K.first; | 
|  | DenseSet<GlobalVariable *> &LDSGlobals = K.second; | 
|  | SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName( | 
|  | std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end())); | 
|  | lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams); | 
|  | } | 
|  | for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) { | 
|  | auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap; | 
|  | if (K.contains(Func)) | 
|  | continue; | 
|  | SetVector<llvm::GlobalVariable *> Vec; | 
|  | lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams); | 
|  | } | 
|  | Changed = true; | 
|  | } | 
|  |  | 
|  | if (!Changed) | 
|  | return Changed; | 
|  |  | 
|  | for (auto &GV : make_early_inc_range(M.globals())) { | 
|  | if (AMDGPU::isLDSVariableToLower(GV)) { | 
|  | // probably want to remove from used lists | 
|  | GV.removeDeadConstantUsers(); | 
|  | if (GV.use_empty()) | 
|  | GV.eraseFromParent(); | 
|  | } | 
|  | } | 
|  |  | 
|  | if (AsanInstrumentLDS) { | 
|  | SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument; | 
|  | for (Instruction *Inst : AsanInfo.Instructions) { | 
|  | SmallVector<InterestingMemoryOperand, 1> InterestingOperands; | 
|  | getInterestingMemoryOperands(M, Inst, InterestingOperands); | 
|  | llvm::append_range(OperandsToInstrument, InterestingOperands); | 
|  | } | 
|  | for (auto &Operand : OperandsToInstrument) { | 
|  | Value *Addr = Operand.getPtr(); | 
|  | instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr, | 
|  | Operand.Alignment.valueOrOne(), Operand.TypeStoreSize, | 
|  | Operand.IsWrite, nullptr, false, false, AsanInfo.Scale, | 
|  | AsanInfo.Offset); | 
|  | Changed = true; | 
|  | } | 
|  | } | 
|  |  | 
|  | return Changed; | 
|  | } | 
|  |  | 
|  | class AMDGPUSwLowerLDSLegacy : public ModulePass { | 
|  | public: | 
|  | const AMDGPUTargetMachine *AMDGPUTM; | 
|  | static char ID; | 
|  | AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM) | 
|  | : ModulePass(ID), AMDGPUTM(TM) {} | 
|  | bool runOnModule(Module &M) override; | 
|  | void getAnalysisUsage(AnalysisUsage &AU) const override { | 
|  | AU.addPreserved<DominatorTreeWrapperPass>(); | 
|  | } | 
|  | }; | 
|  | } // namespace | 
|  |  | 
|  | char AMDGPUSwLowerLDSLegacy::ID = 0; | 
|  | char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID; | 
|  |  | 
|  | INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds", | 
|  | "AMDGPU Software lowering of LDS", false, false) | 
|  | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) | 
|  | INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds", | 
|  | "AMDGPU Software lowering of LDS", false, false) | 
|  |  | 
|  | bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) { | 
|  | // AddressSanitizer pass adds "nosanitize_address" module flag if it has | 
|  | // instrumented the IR. Return early if the flag is not present. | 
|  | if (!M.getModuleFlag("nosanitize_address")) | 
|  | return false; | 
|  | DominatorTreeWrapperPass *const DTW = | 
|  | getAnalysisIfAvailable<DominatorTreeWrapperPass>(); | 
|  | auto DTCallback = [&DTW](Function &F) -> DominatorTree * { | 
|  | return DTW ? &DTW->getDomTree() : nullptr; | 
|  | }; | 
|  | if (!AMDGPUTM) { | 
|  | auto &TPC = getAnalysis<TargetPassConfig>(); | 
|  | AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>(); | 
|  | } | 
|  | AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback); | 
|  | bool IsChanged = SwLowerLDSImpl.run(); | 
|  | return IsChanged; | 
|  | } | 
|  |  | 
|  | ModulePass * | 
|  | llvm::createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM) { | 
|  | return new AMDGPUSwLowerLDSLegacy(TM); | 
|  | } | 
|  |  | 
|  | PreservedAnalyses AMDGPUSwLowerLDSPass::run(Module &M, | 
|  | ModuleAnalysisManager &AM) { | 
|  | // AddressSanitizer pass adds "nosanitize_address" module flag if it has | 
|  | // instrumented the IR. Return early if the flag is not present. | 
|  | if (!M.getModuleFlag("nosanitize_address")) | 
|  | return PreservedAnalyses::all(); | 
|  | auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); | 
|  | auto DTCallback = [&FAM](Function &F) -> DominatorTree * { | 
|  | return &FAM.getResult<DominatorTreeAnalysis>(F); | 
|  | }; | 
|  | AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback); | 
|  | bool IsChanged = SwLowerLDSImpl.run(); | 
|  | if (!IsChanged) | 
|  | return PreservedAnalyses::all(); | 
|  |  | 
|  | PreservedAnalyses PA; | 
|  | PA.preserve<DominatorTreeAnalysis>(); | 
|  | return PA; | 
|  | } |