| //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This provides a class for OpenMP runtime code generation specialized to NVPTX |
| // targets. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "CGOpenMPRuntimeNVPTX.h" |
| #include "clang/AST/DeclOpenMP.h" |
| #include "CodeGenFunction.h" |
| #include "clang/AST/StmtOpenMP.h" |
| |
| using namespace clang; |
| using namespace CodeGen; |
| |
| namespace { |
| enum OpenMPRTLFunctionNVPTX { |
| /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit); |
| OMPRTL_NVPTX__kmpc_kernel_init, |
| /// \brief Call to void __kmpc_kernel_deinit(); |
| OMPRTL_NVPTX__kmpc_kernel_deinit, |
| /// \brief Call to void __kmpc_kernel_prepare_parallel(void |
| /// *outlined_function); |
| OMPRTL_NVPTX__kmpc_kernel_prepare_parallel, |
| /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function); |
| OMPRTL_NVPTX__kmpc_kernel_parallel, |
| /// \brief Call to void __kmpc_kernel_end_parallel(); |
| OMPRTL_NVPTX__kmpc_kernel_end_parallel, |
| /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 |
| /// global_tid); |
| OMPRTL_NVPTX__kmpc_serialized_parallel, |
| /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 |
| /// global_tid); |
| OMPRTL_NVPTX__kmpc_end_serialized_parallel, |
| }; |
| |
| /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. |
| class NVPTXActionTy final : public PrePostActionTy { |
| llvm::Value *EnterCallee; |
| ArrayRef<llvm::Value *> EnterArgs; |
| llvm::Value *ExitCallee; |
| ArrayRef<llvm::Value *> ExitArgs; |
| bool Conditional; |
| llvm::BasicBlock *ContBlock = nullptr; |
| |
| public: |
| NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs, |
| llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs, |
| bool Conditional = false) |
| : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), |
| ExitArgs(ExitArgs), Conditional(Conditional) {} |
| void Enter(CodeGenFunction &CGF) override { |
| llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); |
| if (Conditional) { |
| llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); |
| auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); |
| ContBlock = CGF.createBasicBlock("omp_if.end"); |
| // Generate the branch (If-stmt) |
| CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); |
| CGF.EmitBlock(ThenBlock); |
| } |
| } |
| void Done(CodeGenFunction &CGF) { |
| // Emit the rest of blocks/branches |
| CGF.EmitBranch(ContBlock); |
| CGF.EmitBlock(ContBlock, true); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| CGF.EmitRuntimeCall(ExitCallee, ExitArgs); |
| } |
| }; |
| } // anonymous namespace |
| |
| /// Get the GPU warp size. |
| static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| return Bld.CreateCall( |
| llvm::Intrinsic::getDeclaration( |
| &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), |
| llvm::None, "nvptx_warp_size"); |
| } |
| |
| /// Get the id of the current thread on the GPU. |
| static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| return Bld.CreateCall( |
| llvm::Intrinsic::getDeclaration( |
| &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), |
| llvm::None, "nvptx_tid"); |
| } |
| |
| /// Get the maximum number of threads in a block of the GPU. |
| static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| return Bld.CreateCall( |
| llvm::Intrinsic::getDeclaration( |
| &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), |
| llvm::None, "nvptx_num_threads"); |
| } |
| |
| /// Get barrier to synchronize all threads in a block. |
| static void getNVPTXCTABarrier(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| Bld.CreateCall(llvm::Intrinsic::getDeclaration( |
| &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); |
| } |
| |
| /// Synchronize all GPU threads in a block. |
| static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } |
| |
| /// Get the value of the thread_limit clause in the teams directive. |
| /// The runtime encodes thread_limit in the launch parameter, always starting |
| /// thread_limit+warpSize threads per team. |
| static llvm::Value *getThreadLimit(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), |
| "thread_limit"); |
| } |
| |
| /// Get the thread id of the OMP master thread. |
| /// The master thread id is the first thread (lane) of the last warp in the |
| /// GPU block. Warp size is assumed to be some power of 2. |
| /// Thread id is 0 indexed. |
| /// E.g: If NumThreads is 33, master id is 32. |
| /// If NumThreads is 64, master id is 32. |
| /// If NumThreads is 1024, master id is 992. |
| static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| llvm::Value *NumThreads = getNVPTXNumThreads(CGF); |
| |
| // We assume that the warp size is a power of 2. |
| llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); |
| |
| return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)), |
| Bld.CreateNot(Mask), "master_tid"); |
| } |
| |
| CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( |
| CodeGenModule &CGM) |
| : WorkerFn(nullptr), CGFI(nullptr) { |
| createWorkerFunction(CGM); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( |
| CodeGenModule &CGM) { |
| // Create an worker function with no arguments. |
| CGFI = &CGM.getTypes().arrangeNullaryFunction(); |
| |
| WorkerFn = llvm::Function::Create( |
| CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, |
| /* placeholder */ "_worker", &CGM.getModule()); |
| CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, |
| StringRef ParentName, |
| llvm::Function *&OutlinedFn, |
| llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, |
| const RegionCodeGenTy &CodeGen) { |
| EntryFunctionState EST; |
| WorkerFunctionState WST(CGM); |
| Work.clear(); |
| |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| CGOpenMPRuntimeNVPTX &RT; |
| CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; |
| CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; |
| |
| public: |
| NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, |
| CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, |
| CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) |
| : RT(RT), EST(EST), WST(WST) {} |
| void Enter(CodeGenFunction &CGF) override { |
| RT.emitGenericEntryHeader(CGF, EST, WST); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| RT.emitGenericEntryFooter(CGF, EST); |
| } |
| } Action(*this, EST, WST); |
| CodeGen.setAction(Action); |
| emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| IsOffloadEntry, CodeGen); |
| |
| // Create the worker function |
| emitWorkerFunction(WST); |
| |
| // Now change the name of the worker function to correspond to this target |
| // region's entry function. |
| WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); |
| } |
| |
| // Setup NVPTX threads for master-worker OpenMP scheme. |
| void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, |
| EntryFunctionState &EST, |
| WorkerFunctionState &WST) { |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); |
| llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); |
| llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); |
| EST.ExitBB = CGF.createBasicBlock(".exit"); |
| |
| auto *IsWorker = |
| Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); |
| Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); |
| |
| CGF.EmitBlock(WorkerBB); |
| CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); |
| CGF.EmitBranch(EST.ExitBB); |
| |
| CGF.EmitBlock(MasterCheckBB); |
| auto *IsMaster = |
| Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); |
| Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); |
| |
| CGF.EmitBlock(MasterBB); |
| // First action in sequential region: |
| // Initialize the state of the OpenMP runtime library on the GPU. |
| llvm::Value *Args[] = {getThreadLimit(CGF)}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, |
| EntryFunctionState &EST) { |
| if (!EST.ExitBB) |
| EST.ExitBB = CGF.createBasicBlock(".exit"); |
| |
| llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); |
| CGF.EmitBranch(TerminateBB); |
| |
| CGF.EmitBlock(TerminateBB); |
| // Signal termination condition. |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None); |
| // Barrier to terminate worker threads. |
| syncCTAThreads(CGF); |
| // Master thread jumps to exit point. |
| CGF.EmitBranch(EST.ExitBB); |
| |
| CGF.EmitBlock(EST.ExitBB); |
| EST.ExitBB = nullptr; |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { |
| auto &Ctx = CGM.getContext(); |
| |
| CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); |
| CGF.disableDebugInfo(); |
| CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); |
| emitWorkerLoop(CGF, WST); |
| CGF.FinishFunction(); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, |
| WorkerFunctionState &WST) { |
| // |
| // The workers enter this loop and wait for parallel work from the master. |
| // When the master encounters a parallel region it sets up the work + variable |
| // arguments, and wakes up the workers. The workers first check to see if |
| // they are required for the parallel region, i.e., within the # of requested |
| // parallel threads. The activated workers load the variable arguments and |
| // execute the parallel work. |
| // |
| |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); |
| llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); |
| llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); |
| llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); |
| llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); |
| llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); |
| |
| CGF.EmitBranch(AwaitBB); |
| |
| // Workers wait for work from master. |
| CGF.EmitBlock(AwaitBB); |
| // Wait for parallel work |
| syncCTAThreads(CGF); |
| |
| Address WorkFn = |
| CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); |
| Address ExecStatus = |
| CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); |
| CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); |
| CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); |
| |
| llvm::Value *Args[] = {WorkFn.getPointer()}; |
| llvm::Value *Ret = CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); |
| Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); |
| |
| // On termination condition (workid == 0), exit loop. |
| llvm::Value *ShouldTerminate = |
| Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); |
| Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); |
| |
| // Activate requested workers. |
| CGF.EmitBlock(SelectWorkersBB); |
| llvm::Value *IsActive = |
| Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); |
| Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); |
| |
| // Signal start of parallel region. |
| CGF.EmitBlock(ExecuteBB); |
| |
| // Process work items: outlined parallel functions. |
| for (auto *W : Work) { |
| // Try to match this outlined function. |
| auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); |
| |
| llvm::Value *WorkFnMatch = |
| Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); |
| |
| llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); |
| llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); |
| Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); |
| |
| // Execute this outlined function. |
| CGF.EmitBlock(ExecuteFNBB); |
| |
| // Insert call to work function. |
| // FIXME: Pass arguments to outlined function from master thread. |
| auto *Fn = cast<llvm::Function>(W); |
| Address ZeroAddr = |
| CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); |
| CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0)); |
| llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()}; |
| CGF.EmitCallOrInvoke(Fn, FnArgs); |
| |
| // Go to end of parallel region. |
| CGF.EmitBranch(TerminateBB); |
| |
| CGF.EmitBlock(CheckNextBB); |
| } |
| |
| // Signal end of parallel region. |
| CGF.EmitBlock(TerminateBB); |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel), |
| llvm::None); |
| CGF.EmitBranch(BarrierBB); |
| |
| // All active and inactive workers wait at a barrier after parallel region. |
| CGF.EmitBlock(BarrierBB); |
| // Barrier after parallel region. |
| syncCTAThreads(CGF); |
| CGF.EmitBranch(AwaitBB); |
| |
| // Exit target region. |
| CGF.EmitBlock(ExitBB); |
| } |
| |
| /// \brief Returns specified OpenMP runtime function for the current OpenMP |
| /// implementation. Specialized for the NVPTX device. |
| /// \param Function OpenMP runtime function. |
| /// \return Specified function. |
| llvm::Constant * |
| CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { |
| llvm::Constant *RTLFn = nullptr; |
| switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { |
| case OMPRTL_NVPTX__kmpc_kernel_init: { |
| // Build void __kmpc_kernel_init(kmp_int32 thread_limit); |
| llvm::Type *TypeParams[] = {CGM.Int32Ty}; |
| llvm::FunctionType *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_deinit: { |
| // Build void __kmpc_kernel_deinit(); |
| llvm::FunctionType *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { |
| /// Build void __kmpc_kernel_prepare_parallel( |
| /// void *outlined_function); |
| llvm::Type *TypeParams[] = {CGM.Int8PtrTy}; |
| llvm::FunctionType *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_parallel: { |
| /// Build bool __kmpc_kernel_parallel(void **outlined_function); |
| llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy}; |
| llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); |
| llvm::FunctionType *FnTy = |
| llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_end_parallel: { |
| /// Build void __kmpc_kernel_end_parallel(); |
| llvm::FunctionType *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_serialized_parallel: { |
| // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 |
| // global_tid); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| llvm::FunctionType *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_end_serialized_parallel: { |
| // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 |
| // global_tid); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| llvm::FunctionType *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel"); |
| break; |
| } |
| } |
| return RTLFn; |
| } |
| |
| void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, |
| llvm::Constant *Addr, |
| uint64_t Size, int32_t) { |
| auto *F = dyn_cast<llvm::Function>(Addr); |
| // TODO: Add support for global variables on the device after declare target |
| // support. |
| if (!F) |
| return; |
| llvm::Module *M = F->getParent(); |
| llvm::LLVMContext &Ctx = M->getContext(); |
| |
| // Get "nvvm.annotations" metadata node |
| llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); |
| |
| llvm::Metadata *MDVals[] = { |
| llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"), |
| llvm::ConstantAsMetadata::get( |
| llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; |
| // Append metadata to nvvm.annotations |
| MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( |
| const OMPExecutableDirective &D, StringRef ParentName, |
| llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { |
| if (!IsOffloadEntry) // Nothing to do. |
| return; |
| |
| assert(!ParentName.empty() && "Invalid target region parent name!"); |
| |
| emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
| CodeGen); |
| } |
| |
| CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) |
| : CGOpenMPRuntime(CGM) { |
| if (!CGM.getLangOpts().OpenMPIsDevice) |
| llvm_unreachable("OpenMP NVPTX can only handle device code."); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, |
| const Expr *NumTeams, |
| const Expr *ThreadLimit, |
| SourceLocation Loc) {} |
| |
| llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction( |
| const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
| OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
| |
| llvm::Function *OutlinedFun = nullptr; |
| if (isa<OMPTeamsDirective>(D)) { |
| llvm::Value *OutlinedFunVal = |
| CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( |
| D, ThreadIDVar, InnermostKind, CodeGen); |
| OutlinedFun = cast<llvm::Function>(OutlinedFunVal); |
| OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); |
| OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); |
| } else { |
| llvm::Value *OutlinedFunVal = |
| CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( |
| D, ThreadIDVar, InnermostKind, CodeGen); |
| OutlinedFun = cast<llvm::Function>(OutlinedFunVal); |
| } |
| |
| return OutlinedFun; |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, |
| const OMPExecutableDirective &D, |
| SourceLocation Loc, |
| llvm::Value *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars) { |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| Address ZeroAddr = |
| CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), |
| /*Name*/ ".zero.addr"); |
| CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
| llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitParallelCall( |
| CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); |
| } |
| |
| void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( |
| CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { |
| llvm::Function *Fn = cast<llvm::Function>(OutlinedFn); |
| |
| auto &&L0ParallelGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF, |
| PrePostActionTy &) { |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| // Prepare for parallel region. Indicate the outlined function. |
| llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), |
| Args); |
| |
| // Activate workers. This barrier is used by the master to signal |
| // work for the workers. |
| syncCTAThreads(CGF); |
| |
| // OpenMP [2.5, Parallel Construct, p.49] |
| // There is an implied barrier at the end of a parallel region. After the |
| // end of a parallel region, only the master thread of the team resumes |
| // execution of the enclosing task region. |
| // |
| // The master waits at this barrier until all workers are done. |
| syncCTAThreads(CGF); |
| |
| // Remember for post-processing in worker loop. |
| Work.push_back(Fn); |
| }; |
| |
| auto *RTLoc = emitUpdateLocation(CGF, Loc); |
| auto *ThreadID = getThreadID(CGF, Loc); |
| llvm::Value *Args[] = {RTLoc, ThreadID}; |
| |
| auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF, |
| PrePostActionTy &) { |
| auto &&CodeGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF, |
| PrePostActionTy &Action) { |
| Action.Enter(CGF); |
| |
| llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| OutlinedFnArgs.push_back( |
| llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); |
| OutlinedFnArgs.push_back( |
| llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); |
| OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs); |
| }; |
| |
| RegionCodeGenTy RCG(CodeGen); |
| NVPTXActionTy Action( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), |
| Args, |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), |
| Args); |
| RCG.setAction(Action); |
| RCG(CGF); |
| }; |
| |
| if (IfCond) |
| emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen); |
| else { |
| CodeGenFunction::RunCleanupsScope Scope(CGF); |
| RegionCodeGenTy ThenRCG(L0ParallelGen); |
| ThenRCG(CGF); |
| } |
| } |