| //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This provides a generalized class for OpenMP runtime code generation |
| // specialized by GPU targets NVPTX and AMDGCN. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "CGOpenMPRuntimeGPU.h" |
| #include "CGOpenMPRuntimeNVPTX.h" |
| #include "CodeGenFunction.h" |
| #include "clang/AST/Attr.h" |
| #include "clang/AST/DeclOpenMP.h" |
| #include "clang/AST/StmtOpenMP.h" |
| #include "clang/AST/StmtVisitor.h" |
| #include "clang/Basic/Cuda.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| #include "llvm/IR/IntrinsicsNVPTX.h" |
| |
| using namespace clang; |
| using namespace CodeGen; |
| using namespace llvm::omp; |
| |
| namespace { |
| enum OpenMPRTLFunctionNVPTX { |
| /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit, |
| /// int16_t RequiresOMPRuntime); |
| OMPRTL_NVPTX__kmpc_kernel_init, |
| /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); |
| OMPRTL_NVPTX__kmpc_kernel_deinit, |
| /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, |
| /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); |
| OMPRTL_NVPTX__kmpc_spmd_kernel_init, |
| /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); |
| OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2, |
| /// Call to void __kmpc_kernel_prepare_parallel(void |
| /// *outlined_function); |
| OMPRTL_NVPTX__kmpc_kernel_prepare_parallel, |
| /// Call to bool __kmpc_kernel_parallel(void **outlined_function); |
| OMPRTL_NVPTX__kmpc_kernel_parallel, |
| /// Call to void __kmpc_kernel_end_parallel(); |
| OMPRTL_NVPTX__kmpc_kernel_end_parallel, |
| /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 |
| /// global_tid); |
| OMPRTL_NVPTX__kmpc_serialized_parallel, |
| /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 |
| /// global_tid); |
| OMPRTL_NVPTX__kmpc_end_serialized_parallel, |
| /// Call to int32_t __kmpc_shuffle_int32(int32_t element, |
| /// int16_t lane_offset, int16_t warp_size); |
| OMPRTL_NVPTX__kmpc_shuffle_int32, |
| /// Call to int64_t __kmpc_shuffle_int64(int64_t element, |
| /// int16_t lane_offset, int16_t warp_size); |
| OMPRTL_NVPTX__kmpc_shuffle_int64, |
| /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32 |
| /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data, |
| /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t |
| /// lane_offset, int16_t shortCircuit), |
| /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); |
| OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2, |
| /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32 |
| /// global_tid, void *global_buffer, int32_t num_of_records, void* |
| /// reduce_data, |
| /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t |
| /// lane_offset, int16_t shortCircuit), |
| /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void |
| /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data), |
| /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx, |
| /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer, |
| /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void |
| /// *buffer, int idx, void *reduce_data)); |
| OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2, |
| /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); |
| OMPRTL_NVPTX__kmpc_end_reduce_nowait, |
| /// Call to void __kmpc_data_sharing_init_stack(); |
| OMPRTL_NVPTX__kmpc_data_sharing_init_stack, |
| /// Call to void __kmpc_data_sharing_init_stack_spmd(); |
| OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd, |
| /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size, |
| /// int16_t UseSharedMemory); |
| OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack, |
| /// Call to void* __kmpc_data_sharing_push_stack(size_t size, int16_t |
| /// UseSharedMemory); |
| OMPRTL_NVPTX__kmpc_data_sharing_push_stack, |
| /// Call to void __kmpc_data_sharing_pop_stack(void *a); |
| OMPRTL_NVPTX__kmpc_data_sharing_pop_stack, |
| /// Call to void __kmpc_begin_sharing_variables(void ***args, |
| /// size_t n_args); |
| OMPRTL_NVPTX__kmpc_begin_sharing_variables, |
| /// Call to void __kmpc_end_sharing_variables(); |
| OMPRTL_NVPTX__kmpc_end_sharing_variables, |
| /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs) |
| OMPRTL_NVPTX__kmpc_get_shared_variables, |
| /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 |
| /// global_tid); |
| OMPRTL_NVPTX__kmpc_parallel_level, |
| /// Call to int8_t __kmpc_is_spmd_exec_mode(); |
| OMPRTL_NVPTX__kmpc_is_spmd_exec_mode, |
| /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, |
| /// const void *buf, size_t size, int16_t is_shared, const void **res); |
| OMPRTL_NVPTX__kmpc_get_team_static_memory, |
| /// Call to void __kmpc_restore_team_static_memory(int16_t |
| /// isSPMDExecutionMode, int16_t is_shared); |
| OMPRTL_NVPTX__kmpc_restore_team_static_memory, |
| /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); |
| OMPRTL__kmpc_barrier, |
| /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32 |
| /// global_tid); |
| OMPRTL__kmpc_barrier_simple_spmd, |
| /// Call to int32_t __kmpc_warp_active_thread_mask(void); |
| OMPRTL_NVPTX__kmpc_warp_active_thread_mask, |
| /// Call to void __kmpc_syncwarp(int32_t Mask); |
| OMPRTL_NVPTX__kmpc_syncwarp, |
| }; |
| |
| /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. |
| class NVPTXActionTy final : public PrePostActionTy { |
| llvm::FunctionCallee EnterCallee = nullptr; |
| ArrayRef<llvm::Value *> EnterArgs; |
| llvm::FunctionCallee ExitCallee = nullptr; |
| ArrayRef<llvm::Value *> ExitArgs; |
| bool Conditional = false; |
| llvm::BasicBlock *ContBlock = nullptr; |
| |
| public: |
| NVPTXActionTy(llvm::FunctionCallee EnterCallee, |
| ArrayRef<llvm::Value *> EnterArgs, |
| llvm::FunctionCallee ExitCallee, |
| ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) |
| : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), |
| ExitArgs(ExitArgs), Conditional(Conditional) {} |
| void Enter(CodeGenFunction &CGF) override { |
| llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); |
| if (Conditional) { |
| llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); |
| auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); |
| ContBlock = CGF.createBasicBlock("omp_if.end"); |
| // Generate the branch (If-stmt) |
| CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); |
| CGF.EmitBlock(ThenBlock); |
| } |
| } |
| void Done(CodeGenFunction &CGF) { |
| // Emit the rest of blocks/branches |
| CGF.EmitBranch(ContBlock); |
| CGF.EmitBlock(ContBlock, true); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| CGF.EmitRuntimeCall(ExitCallee, ExitArgs); |
| } |
| }; |
| |
| /// A class to track the execution mode when codegening directives within |
| /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry |
| /// to the target region and used by the directives nested inside it, such as |
| /// 'parallel', to emit optimized code. |
| class ExecutionRuntimeModesRAII { |
| private: |
| CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode = |
| CGOpenMPRuntimeGPU::EM_Unknown; |
| CGOpenMPRuntimeGPU::ExecutionMode &ExecMode; |
| bool SavedRuntimeMode = false; |
| bool *RuntimeMode = nullptr; |
| |
| public: |
| /// Constructor for Non-SPMD mode. |
| ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode) |
| : ExecMode(ExecMode) { |
| SavedExecMode = ExecMode; |
| ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD; |
| } |
| /// Constructor for SPMD mode. |
| ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode, |
| bool &RuntimeMode, bool FullRuntimeMode) |
| : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) { |
| SavedExecMode = ExecMode; |
| SavedRuntimeMode = RuntimeMode; |
| ExecMode = CGOpenMPRuntimeGPU::EM_SPMD; |
| RuntimeMode = FullRuntimeMode; |
| } |
| ~ExecutionRuntimeModesRAII() { |
| ExecMode = SavedExecMode; |
| if (RuntimeMode) |
| *RuntimeMode = SavedRuntimeMode; |
| } |
| }; |
| |
| /// GPU Configuration: This information can be derived from CUDA registers; |
| /// however, providing compile-time constants helps generate more efficient |
| /// code. For all practical purposes this is fine because the configuration |
| /// is the same for all known NVPTX architectures. |
| enum MachineConfiguration : unsigned { |
| /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target |
| /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2, |
| /// and GV_Warp_Size_Log2_Mask. |
| |
| /// Global memory alignment for performance. |
| GlobalMemoryAlignment = 128, |
| |
| /// Maximal size of the shared memory buffer. |
| SharedMemorySize = 128, |
| }; |
| |
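| /// Return the canonical declaration referenced by a privatized item |
| /// expression, looking through array subscripts and OpenMP array sections; |
| /// e.g. for 'a[i][j]' this returns the declaration of 'a'. |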
| static const ValueDecl *getPrivateItem(const Expr *RefExpr) { |
| RefExpr = RefExpr->IgnoreParens(); |
| if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { |
| const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); |
| while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) |
| Base = TempASE->getBase()->IgnoreParenImpCasts(); |
| RefExpr = Base; |
| } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) { |
| const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); |
| while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) |
| Base = TempOASE->getBase()->IgnoreParenImpCasts(); |
| while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) |
| Base = TempASE->getBase()->IgnoreParenImpCasts(); |
| RefExpr = Base; |
| } |
| RefExpr = RefExpr->IgnoreParenImpCasts(); |
| if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) |
| return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); |
| const auto *ME = cast<MemberExpr>(RefExpr); |
| return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); |
| } |
| |
| |
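| /// Build an implicit record '_globalized_locals_ty' that provides the |
| /// globalized storage for the escaped variables. Fields are emitted in order |
| /// of decreasing alignment; declarations from EscapedDeclsForTeams become |
| /// single fields that keep their own alignment, while the remaining escaped |
| /// declarations become arrays of BufSize elements aligned to at least |
| /// GlobalMemoryAlignment. The created fields are recorded in |
| /// MappedDeclsFields. |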
| static RecordDecl *buildRecordForGlobalizedVars( |
| ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, |
| ArrayRef<const ValueDecl *> EscapedDeclsForTeams, |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
| &MappedDeclsFields, int BufSize) { |
| using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; |
| if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) |
| return nullptr; |
| SmallVector<VarsDataTy, 4> GlobalizedVars; |
| for (const ValueDecl *D : EscapedDecls) |
| GlobalizedVars.emplace_back( |
| CharUnits::fromQuantity(std::max( |
| C.getDeclAlign(D).getQuantity(), |
| static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))), |
| D); |
| for (const ValueDecl *D : EscapedDeclsForTeams) |
| GlobalizedVars.emplace_back(C.getDeclAlign(D), D); |
| llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) { |
| return L.first > R.first; |
| }); |
| |
| // Build struct _globalized_locals_ty { |
| // /* globalized vars */[WarpSize] align (max(decl_align, |
| // GlobalMemoryAlignment)) |
| // /* globalized vars */ for EscapedDeclsForTeams |
| // }; |
| RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); |
| GlobalizedRD->startDefinition(); |
| llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped( |
| EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end()); |
| for (const auto &Pair : GlobalizedVars) { |
| const ValueDecl *VD = Pair.second; |
| QualType Type = VD->getType(); |
| if (Type->isLValueReferenceType()) |
| Type = C.getPointerType(Type.getNonReferenceType()); |
| else |
| Type = Type.getNonReferenceType(); |
| SourceLocation Loc = VD->getLocation(); |
| FieldDecl *Field; |
| if (SingleEscaped.count(VD)) { |
| Field = FieldDecl::Create( |
| C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, |
| C.getTrivialTypeSourceInfo(Type, SourceLocation()), |
| /*BW=*/nullptr, /*Mutable=*/false, |
| /*InitStyle=*/ICIS_NoInit); |
| Field->setAccess(AS_public); |
| if (VD->hasAttrs()) { |
| for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), |
| E(VD->getAttrs().end()); |
| I != E; ++I) |
| Field->addAttr(*I); |
| } |
| } else { |
| llvm::APInt ArraySize(32, BufSize); |
| Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal, |
| 0); |
| Field = FieldDecl::Create( |
| C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, |
| C.getTrivialTypeSourceInfo(Type, SourceLocation()), |
| /*BW=*/nullptr, /*Mutable=*/false, |
| /*InitStyle=*/ICIS_NoInit); |
| Field->setAccess(AS_public); |
| llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(), |
| static_cast<CharUnits::QuantityType>( |
| GlobalMemoryAlignment))); |
| Field->addAttr(AlignedAttr::CreateImplicit( |
| C, /*IsAlignmentExpr=*/true, |
| IntegerLiteral::Create(C, Align, |
| C.getIntTypeForBitwidth(32, /*Signed=*/0), |
| SourceLocation()), |
| {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned)); |
| } |
| GlobalizedRD->addDecl(Field); |
| MappedDeclsFields.try_emplace(VD, Field); |
| } |
| GlobalizedRD->completeDefinition(); |
| return GlobalizedRD; |
| } |
| |
| /// Get the list of variables that can escape their declaration context. |
| class CheckVarsEscapingDeclContext final |
| : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { |
| CodeGenFunction &CGF; |
| llvm::SetVector<const ValueDecl *> EscapedDecls; |
| llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; |
| llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; |
| RecordDecl *GlobalizedRD = nullptr; |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
| bool AllEscaped = false; |
| bool IsForCombinedParallelRegion = false; |
| |
| void markAsEscaped(const ValueDecl *VD) { |
| // Do not globalize declare target variables. |
| if (!isa<VarDecl>(VD) || |
| OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) |
| return; |
| VD = cast<ValueDecl>(VD->getCanonicalDecl()); |
| // Use user-specified allocation. |
| if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) |
| return; |
| // Variables captured by value must be globalized. |
| if (auto *CSI = CGF.CapturedStmtInfo) { |
| if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) { |
| // Check if we need to capture the variable that was already captured by |
| // value in the outer region. |
| if (!IsForCombinedParallelRegion) { |
| if (!FD->hasAttrs()) |
| return; |
| const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); |
| if (!Attr) |
| return; |
| if (((Attr->getCaptureKind() != OMPC_map) && |
| !isOpenMPPrivate(Attr->getCaptureKind())) || |
| ((Attr->getCaptureKind() == OMPC_map) && |
| !FD->getType()->isAnyPointerType())) |
| return; |
| } |
| if (!FD->getType()->isReferenceType()) { |
| assert(!VD->getType()->isVariablyModifiedType() && |
| "Parameter captured by value with variably modified type"); |
| EscapedParameters.insert(VD); |
| } else if (!IsForCombinedParallelRegion) { |
| return; |
| } |
| } |
| } |
| if ((!CGF.CapturedStmtInfo || |
| (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && |
| VD->getType()->isReferenceType()) |
| // Do not globalize variables with reference type. |
| return; |
| if (VD->getType()->isVariablyModifiedType()) |
| EscapedVariableLengthDecls.insert(VD); |
| else |
| EscapedDecls.insert(VD); |
| } |
| |
| void VisitValueDecl(const ValueDecl *VD) { |
| if (VD->getType()->isLValueReferenceType()) |
| markAsEscaped(VD); |
| if (const auto *VarD = dyn_cast<VarDecl>(VD)) { |
| if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = VD->getType()->isLValueReferenceType(); |
| Visit(VarD->getInit()); |
| AllEscaped = SavedAllEscaped; |
| } |
| } |
| } |
| void VisitOpenMPCapturedStmt(const CapturedStmt *S, |
| ArrayRef<OMPClause *> Clauses, |
| bool IsCombinedParallelRegion) { |
| if (!S) |
| return; |
| for (const CapturedStmt::Capture &C : S->captures()) { |
| if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
| const ValueDecl *VD = C.getCapturedVar(); |
| bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; |
| if (IsCombinedParallelRegion) { |
| // Check if the variable is privatized in the combined construct; in that |
| // case its private copies must be shared in the inner parallel |
| // directive. |
| IsForCombinedParallelRegion = false; |
| for (const OMPClause *C : Clauses) { |
| if (!isOpenMPPrivate(C->getClauseKind()) || |
| C->getClauseKind() == OMPC_reduction || |
| C->getClauseKind() == OMPC_linear || |
| C->getClauseKind() == OMPC_private) |
| continue; |
| ArrayRef<const Expr *> Vars; |
| if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C)) |
| Vars = PC->getVarRefs(); |
| else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C)) |
| Vars = PC->getVarRefs(); |
| else |
| llvm_unreachable("Unexpected clause."); |
| for (const auto *E : Vars) { |
| const Decl *D = |
| cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl(); |
| if (D == VD->getCanonicalDecl()) { |
| IsForCombinedParallelRegion = true; |
| break; |
| } |
| } |
| if (IsForCombinedParallelRegion) |
| break; |
| } |
| } |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; |
| } |
| } |
| } |
| |
| void buildRecordForGlobalizedVars(bool IsInTTDRegion) { |
| assert(!GlobalizedRD && |
| "Record for globalized variables is built already."); |
| ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; |
| unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); |
| if (IsInTTDRegion) |
| EscapedDeclsForTeams = EscapedDecls.getArrayRef(); |
| else |
| EscapedDeclsForParallel = EscapedDecls.getArrayRef(); |
| GlobalizedRD = ::buildRecordForGlobalizedVars( |
| CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, |
| MappedDeclsFields, WarpSize); |
| } |
| |
| public: |
| CheckVarsEscapingDeclContext(CodeGenFunction &CGF, |
| ArrayRef<const ValueDecl *> TeamsReductions) |
| : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { |
| } |
| virtual ~CheckVarsEscapingDeclContext() = default; |
| void VisitDeclStmt(const DeclStmt *S) { |
| if (!S) |
| return; |
| for (const Decl *D : S->decls()) |
| if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) |
| VisitValueDecl(VD); |
| } |
| void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { |
| if (!D) |
| return; |
| if (!D->hasAssociatedStmt()) |
| return; |
| if (const auto *S = |
| dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { |
| // Do not analyze directives that do not actually require capturing, |
| // like `omp for` or `omp simd` directives. |
| llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; |
| getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); |
| if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { |
| VisitStmt(S->getCapturedStmt()); |
| return; |
| } |
| VisitOpenMPCapturedStmt( |
| S, D->clauses(), |
| CaptureRegions.back() == OMPD_parallel && |
| isOpenMPDistributeDirective(D->getDirectiveKind())); |
| } |
| } |
| void VisitCapturedStmt(const CapturedStmt *S) { |
| if (!S) |
| return; |
| for (const CapturedStmt::Capture &C : S->captures()) { |
| if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
| const ValueDecl *VD = C.getCapturedVar(); |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| } |
| } |
| } |
| void VisitLambdaExpr(const LambdaExpr *E) { |
| if (!E) |
| return; |
| for (const LambdaCapture &C : E->captures()) { |
| if (C.capturesVariable()) { |
| if (C.getCaptureKind() == LCK_ByRef) { |
| const ValueDecl *VD = C.getCapturedVar(); |
| markAsEscaped(VD); |
| if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| } |
| } |
| } |
| } |
| void VisitBlockExpr(const BlockExpr *E) { |
| if (!E) |
| return; |
| for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { |
| if (C.isByRef()) { |
| const VarDecl *VD = C.getVariable(); |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) |
| VisitValueDecl(VD); |
| } |
| } |
| } |
| void VisitCallExpr(const CallExpr *E) { |
| if (!E) |
| return; |
| for (const Expr *Arg : E->arguments()) { |
| if (!Arg) |
| continue; |
| if (Arg->isLValue()) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = true; |
| Visit(Arg); |
| AllEscaped = SavedAllEscaped; |
| } else { |
| Visit(Arg); |
| } |
| } |
| Visit(E->getCallee()); |
| } |
| void VisitDeclRefExpr(const DeclRefExpr *E) { |
| if (!E) |
| return; |
| const ValueDecl *VD = E->getDecl(); |
| if (AllEscaped) |
| markAsEscaped(VD); |
| if (isa<OMPCapturedExprDecl>(VD)) |
| VisitValueDecl(VD); |
| else if (const auto *VarD = dyn_cast<VarDecl>(VD)) |
| if (VarD->isInitCapture()) |
| VisitValueDecl(VD); |
| } |
| void VisitUnaryOperator(const UnaryOperator *E) { |
| if (!E) |
| return; |
| if (E->getOpcode() == UO_AddrOf) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = true; |
| Visit(E->getSubExpr()); |
| AllEscaped = SavedAllEscaped; |
| } else { |
| Visit(E->getSubExpr()); |
| } |
| } |
| void VisitImplicitCastExpr(const ImplicitCastExpr *E) { |
| if (!E) |
| return; |
| if (E->getCastKind() == CK_ArrayToPointerDecay) { |
| const bool SavedAllEscaped = AllEscaped; |
| AllEscaped = true; |
| Visit(E->getSubExpr()); |
| AllEscaped = SavedAllEscaped; |
| } else { |
| Visit(E->getSubExpr()); |
| } |
| } |
| void VisitExpr(const Expr *E) { |
| if (!E) |
| return; |
| bool SavedAllEscaped = AllEscaped; |
| if (!E->isLValue()) |
| AllEscaped = false; |
| for (const Stmt *Child : E->children()) |
| if (Child) |
| Visit(Child); |
| AllEscaped = SavedAllEscaped; |
| } |
| void VisitStmt(const Stmt *S) { |
| if (!S) |
| return; |
| for (const Stmt *Child : S->children()) |
| if (Child) |
| Visit(Child); |
| } |
| |
| /// Returns the record that handles all the escaped local variables and is |
| /// used instead of their original storage. |
| const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { |
| if (!GlobalizedRD) |
| buildRecordForGlobalizedVars(IsInTTDRegion); |
| return GlobalizedRD; |
| } |
| |
| /// Returns the field in the globalized record for the escaped variable. |
| const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { |
| assert(GlobalizedRD && |
| "Record for globalized variables must be generated already."); |
| auto I = MappedDeclsFields.find(VD); |
| if (I == MappedDeclsFields.end()) |
| return nullptr; |
| return I->getSecond(); |
| } |
| |
| /// Returns the list of the escaped local variables/parameters. |
| ArrayRef<const ValueDecl *> getEscapedDecls() const { |
| return EscapedDecls.getArrayRef(); |
| } |
| |
| /// Returns the set of escaped parameters, i.e. escaped local variables that |
| /// are actually parameters passed by value. |
| const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { |
| return EscapedParameters; |
| } |
| |
| /// Returns the list of the escaped variables with the variably modified |
| /// types. |
| ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { |
| return EscapedVariableLengthDecls.getArrayRef(); |
| } |
| }; |
| } // anonymous namespace |
| |
| /// Get the id of the warp in the block. |
| /// We assume that the warp size is 32, which is always the case |
| /// on the NVPTX device, to generate more efficient code. |
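| /// E.g., with a warp size of 32 (log2 == 5), thread 37 is in warp 37 >> 5 == 1. |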
| static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| unsigned LaneIDBits = |
| CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); |
| } |
| |
| /// Get the id of the current lane in the Warp. |
| /// We assume that the warp size is 32, which is always the case |
| /// on the NVPTX device, to generate more efficient code. |
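| /// E.g., with a warp size of 32, thread 37 is lane 37 & 31 == 5. |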
| static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( |
| llvm::omp::GV_Warp_Size_Log2_Mask); |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), |
| "nvptx_lane_id"); |
| } |
| |
| /// Get the value of the thread_limit clause in the teams directive. |
| /// For the 'generic' execution mode, the runtime encodes thread_limit in |
| /// the launch parameters, always starting thread_limit+warpSize threads per |
| /// CTA. The threads in the last warp are reserved for master execution. |
| /// For the 'spmd' execution mode, all threads in a CTA are part of the team. |
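| /// In other words: NumThreads - WarpSize in generic mode, and NumThreads in |
| /// SPMD mode. |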
| static llvm::Value *getThreadLimit(CodeGenFunction &CGF, |
| bool IsInSPMDExecutionMode = false) { |
| CGBuilderTy &Bld = CGF.Builder; |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| return IsInSPMDExecutionMode |
| ? RT.getGPUNumThreads(CGF) |
| : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF), |
| RT.getGPUWarpSize(CGF), "thread_limit"); |
| } |
| |
| /// Get the thread id of the OMP master thread. |
| /// The master thread id is the first thread (lane) of the last warp in the |
| /// GPU block. Warp size is assumed to be some power of 2. |
| /// Thread id is 0 indexed. |
| /// E.g.: If NumThreads is 33, master id is 32. |
| /// If NumThreads is 64, master id is 32. |
| /// If NumThreads is 1024, master id is 992. |
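| /// Computed as (NumThreads - 1) & ~(WarpSize - 1). |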
| static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { |
| CGBuilderTy &Bld = CGF.Builder; |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| llvm::Value *NumThreads = RT.getGPUNumThreads(CGF); |
| // We assume that the warp size is a power of 2. |
| llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1)); |
| |
| return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)), |
| Bld.CreateNot(Mask), "master_tid"); |
| } |
| |
| CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState( |
| CodeGenModule &CGM, SourceLocation Loc) |
| : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()), |
| Loc(Loc) { |
| createWorkerFunction(CGM); |
| } |
| |
| void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction( |
| CodeGenModule &CGM) { |
| // Create a worker function with no arguments. |
| |
| WorkerFn = llvm::Function::Create( |
| CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
| /*placeholder=*/"_worker", &CGM.getModule()); |
| CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI); |
| WorkerFn->setDoesNotRecurse(); |
| } |
| |
| CGOpenMPRuntimeGPU::ExecutionMode |
| CGOpenMPRuntimeGPU::getExecutionMode() const { |
| return CurrentExecutionMode; |
| } |
| |
| static CGOpenMPRuntimeGPU::DataSharingMode |
| getDataSharingMode(CodeGenModule &CGM) { |
| return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA |
| : CGOpenMPRuntimeGPU::Generic; |
| } |
| |
| /// Check for inner (nested) SPMD construct, if any |
| static bool hasNestedSPMDDirective(ASTContext &Ctx, |
| const OMPExecutableDirective &D) { |
| const auto *CS = D.getInnermostCapturedStmt(); |
| const auto *Body = |
| CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); |
| const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| |
| if (const auto *NestedDir = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); |
| switch (D.getDirectiveKind()) { |
| case OMPD_target: |
| if (isOpenMPParallelDirective(DKind)) |
| return true; |
| if (DKind == OMPD_teams) { |
| Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true); |
| if (!Body) |
| return false; |
| ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| if (const auto *NND = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| DKind = NND->getDirectiveKind(); |
| if (isOpenMPParallelDirective(DKind)) |
| return true; |
| } |
| } |
| return false; |
| case OMPD_target_teams: |
| return isOpenMPParallelDirective(DKind); |
| case OMPD_target_simd: |
| case OMPD_target_parallel: |
| case OMPD_target_parallel_for: |
| case OMPD_target_parallel_for_simd: |
| case OMPD_target_teams_distribute: |
| case OMPD_target_teams_distribute_simd: |
| case OMPD_target_teams_distribute_parallel_for: |
| case OMPD_target_teams_distribute_parallel_for_simd: |
| case OMPD_parallel: |
| case OMPD_for: |
| case OMPD_parallel_for: |
| case OMPD_parallel_master: |
| case OMPD_parallel_sections: |
| case OMPD_for_simd: |
| case OMPD_parallel_for_simd: |
| case OMPD_cancel: |
| case OMPD_cancellation_point: |
| case OMPD_ordered: |
| case OMPD_threadprivate: |
| case OMPD_allocate: |
| case OMPD_task: |
| case OMPD_simd: |
| case OMPD_sections: |
| case OMPD_section: |
| case OMPD_single: |
| case OMPD_master: |
| case OMPD_critical: |
| case OMPD_taskyield: |
| case OMPD_barrier: |
| case OMPD_taskwait: |
| case OMPD_taskgroup: |
| case OMPD_atomic: |
| case OMPD_flush: |
| case OMPD_depobj: |
| case OMPD_scan: |
| case OMPD_teams: |
| case OMPD_target_data: |
| case OMPD_target_exit_data: |
| case OMPD_target_enter_data: |
| case OMPD_distribute: |
| case OMPD_distribute_simd: |
| case OMPD_distribute_parallel_for: |
| case OMPD_distribute_parallel_for_simd: |
| case OMPD_teams_distribute: |
| case OMPD_teams_distribute_simd: |
| case OMPD_teams_distribute_parallel_for: |
| case OMPD_teams_distribute_parallel_for_simd: |
| case OMPD_target_update: |
| case OMPD_declare_simd: |
| case OMPD_declare_variant: |
| case OMPD_begin_declare_variant: |
| case OMPD_end_declare_variant: |
| case OMPD_declare_target: |
| case OMPD_end_declare_target: |
| case OMPD_declare_reduction: |
| case OMPD_declare_mapper: |
| case OMPD_taskloop: |
| case OMPD_taskloop_simd: |
| case OMPD_master_taskloop: |
| case OMPD_master_taskloop_simd: |
| case OMPD_parallel_master_taskloop: |
| case OMPD_parallel_master_taskloop_simd: |
| case OMPD_requires: |
| case OMPD_unknown: |
| default: |
| llvm_unreachable("Unexpected directive."); |
| } |
| } |
| |
| return false; |
| } |
| |
| static bool supportsSPMDExecutionMode(ASTContext &Ctx, |
| const OMPExecutableDirective &D) { |
| OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); |
| switch (DirectiveKind) { |
| case OMPD_target: |
| case OMPD_target_teams: |
| return hasNestedSPMDDirective(Ctx, D); |
| case OMPD_target_parallel: |
| case OMPD_target_parallel_for: |
| case OMPD_target_parallel_for_simd: |
| case OMPD_target_teams_distribute_parallel_for: |
| case OMPD_target_teams_distribute_parallel_for_simd: |
| case OMPD_target_simd: |
| case OMPD_target_teams_distribute_simd: |
| return true; |
| case OMPD_target_teams_distribute: |
| return false; |
| case OMPD_parallel: |
| case OMPD_for: |
| case OMPD_parallel_for: |
| case OMPD_parallel_master: |
| case OMPD_parallel_sections: |
| case OMPD_for_simd: |
| case OMPD_parallel_for_simd: |
| case OMPD_cancel: |
| case OMPD_cancellation_point: |
| case OMPD_ordered: |
| case OMPD_threadprivate: |
| case OMPD_allocate: |
| case OMPD_task: |
| case OMPD_simd: |
| case OMPD_sections: |
| case OMPD_section: |
| case OMPD_single: |
| case OMPD_master: |
| case OMPD_critical: |
| case OMPD_taskyield: |
| case OMPD_barrier: |
| case OMPD_taskwait: |
| case OMPD_taskgroup: |
| case OMPD_atomic: |
| case OMPD_flush: |
| case OMPD_depobj: |
| case OMPD_scan: |
| case OMPD_teams: |
| case OMPD_target_data: |
| case OMPD_target_exit_data: |
| case OMPD_target_enter_data: |
| case OMPD_distribute: |
| case OMPD_distribute_simd: |
| case OMPD_distribute_parallel_for: |
| case OMPD_distribute_parallel_for_simd: |
| case OMPD_teams_distribute: |
| case OMPD_teams_distribute_simd: |
| case OMPD_teams_distribute_parallel_for: |
| case OMPD_teams_distribute_parallel_for_simd: |
| case OMPD_target_update: |
| case OMPD_declare_simd: |
| case OMPD_declare_variant: |
| case OMPD_begin_declare_variant: |
| case OMPD_end_declare_variant: |
| case OMPD_declare_target: |
| case OMPD_end_declare_target: |
| case OMPD_declare_reduction: |
| case OMPD_declare_mapper: |
| case OMPD_taskloop: |
| case OMPD_taskloop_simd: |
| case OMPD_master_taskloop: |
| case OMPD_master_taskloop_simd: |
| case OMPD_parallel_master_taskloop: |
| case OMPD_parallel_master_taskloop_simd: |
| case OMPD_requires: |
| case OMPD_unknown: |
| default: |
| break; |
| } |
| llvm_unreachable( |
| "Unknown programming model for OpenMP directive on NVPTX target."); |
| } |
| |
| /// Check if the directive is loop-based and either has no schedule clause or |
| /// has static scheduling (and no ordered clause). |
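| /// For example, a loop with 'schedule(static)' (or no schedule clause at all) |
| /// qualifies, while 'schedule(dynamic)' or an 'ordered' clause does not. |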
| static bool hasStaticScheduling(const OMPExecutableDirective &D) { |
| assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) && |
| isOpenMPLoopDirective(D.getDirectiveKind()) && |
| "Expected loop-based directive."); |
| return !D.hasClausesOfKind<OMPOrderedClause>() && |
| (!D.hasClausesOfKind<OMPScheduleClause>() || |
| llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(), |
| [](const OMPScheduleClause *C) { |
| return C->getScheduleKind() == OMPC_SCHEDULE_static; |
| })); |
| } |
| |
| /// Check for inner (nested) lightweight runtime construct, if any |
| static bool hasNestedLightweightDirective(ASTContext &Ctx, |
| const OMPExecutableDirective &D) { |
| assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive."); |
| const auto *CS = D.getInnermostCapturedStmt(); |
| const auto *Body = |
| CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); |
| const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| |
| if (const auto *NestedDir = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); |
| switch (D.getDirectiveKind()) { |
| case OMPD_target: |
| if (isOpenMPParallelDirective(DKind) && |
| isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) && |
| hasStaticScheduling(*NestedDir)) |
| return true; |
| if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd) |
| return true; |
| if (DKind == OMPD_parallel) { |
| Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true); |
| if (!Body) |
| return false; |
| ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| if (const auto *NND = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| DKind = NND->getDirectiveKind(); |
| if (isOpenMPWorksharingDirective(DKind) && |
| isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
| return true; |
| } |
| } else if (DKind == OMPD_teams) { |
| Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true); |
| if (!Body) |
| return false; |
| ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| if (const auto *NND = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| DKind = NND->getDirectiveKind(); |
| if (isOpenMPParallelDirective(DKind) && |
| isOpenMPWorksharingDirective(DKind) && |
| isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
| return true; |
| if (DKind == OMPD_parallel) { |
| Body = NND->getInnermostCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true); |
| if (!Body) |
| return false; |
| ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| if (const auto *NND = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| DKind = NND->getDirectiveKind(); |
| if (isOpenMPWorksharingDirective(DKind) && |
| isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
| return true; |
| } |
| } |
| } |
| } |
| return false; |
| case OMPD_target_teams: |
| if (isOpenMPParallelDirective(DKind) && |
| isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) && |
| hasStaticScheduling(*NestedDir)) |
| return true; |
| if (DKind == OMPD_distribute_simd || DKind == OMPD_simd) |
| return true; |
| if (DKind == OMPD_parallel) { |
| Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true); |
| if (!Body) |
| return false; |
| ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
| if (const auto *NND = |
| dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
| DKind = NND->getDirectiveKind(); |
| if (isOpenMPWorksharingDirective(DKind) && |
| isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
| return true; |
| } |
| } |
| return false; |
| case OMPD_target_parallel: |
| if (DKind == OMPD_simd) |
| return true; |
| return isOpenMPWorksharingDirective(DKind) && |
| isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir); |
| case OMPD_target_teams_distribute: |
| case OMPD_target_simd: |
| case OMPD_target_parallel_for: |
| case OMPD_target_parallel_for_simd: |
| case OMPD_target_teams_distribute_simd: |
| case OMPD_target_teams_distribute_parallel_for: |
| case OMPD_target_teams_distribute_parallel_for_simd: |
| case OMPD_parallel: |
| case OMPD_for: |
| case OMPD_parallel_for: |
| case OMPD_parallel_master: |
| case OMPD_parallel_sections: |
| case OMPD_for_simd: |
| case OMPD_parallel_for_simd: |
| case OMPD_cancel: |
| case OMPD_cancellation_point: |
| case OMPD_ordered: |
| case OMPD_threadprivate: |
| case OMPD_allocate: |
| case OMPD_task: |
| case OMPD_simd: |
| case OMPD_sections: |
| case OMPD_section: |
| case OMPD_single: |
| case OMPD_master: |
| case OMPD_critical: |
| case OMPD_taskyield: |
| case OMPD_barrier: |
| case OMPD_taskwait: |
| case OMPD_taskgroup: |
| case OMPD_atomic: |
| case OMPD_flush: |
| case OMPD_depobj: |
| case OMPD_scan: |
| case OMPD_teams: |
| case OMPD_target_data: |
| case OMPD_target_exit_data: |
| case OMPD_target_enter_data: |
| case OMPD_distribute: |
| case OMPD_distribute_simd: |
| case OMPD_distribute_parallel_for: |
| case OMPD_distribute_parallel_for_simd: |
| case OMPD_teams_distribute: |
| case OMPD_teams_distribute_simd: |
| case OMPD_teams_distribute_parallel_for: |
| case OMPD_teams_distribute_parallel_for_simd: |
| case OMPD_target_update: |
| case OMPD_declare_simd: |
| case OMPD_declare_variant: |
| case OMPD_begin_declare_variant: |
| case OMPD_end_declare_variant: |
| case OMPD_declare_target: |
| case OMPD_end_declare_target: |
| case OMPD_declare_reduction: |
| case OMPD_declare_mapper: |
| case OMPD_taskloop: |
| case OMPD_taskloop_simd: |
| case OMPD_master_taskloop: |
| case OMPD_master_taskloop_simd: |
| case OMPD_parallel_master_taskloop: |
| case OMPD_parallel_master_taskloop_simd: |
| case OMPD_requires: |
| case OMPD_unknown: |
| default: |
| llvm_unreachable("Unexpected directive."); |
| } |
| } |
| |
| return false; |
| } |
| |
| /// Checks if the construct supports the lightweight runtime. It must be an |
| /// SPMD construct with an inner loop-based construct that uses static |
| /// scheduling. |
| static bool supportsLightweightRuntime(ASTContext &Ctx, |
| const OMPExecutableDirective &D) { |
| if (!supportsSPMDExecutionMode(Ctx, D)) |
| return false; |
| OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); |
| switch (DirectiveKind) { |
| case OMPD_target: |
| case OMPD_target_teams: |
| case OMPD_target_parallel: |
| return hasNestedLightweightDirective(Ctx, D); |
| case OMPD_target_parallel_for: |
| case OMPD_target_parallel_for_simd: |
| case OMPD_target_teams_distribute_parallel_for: |
| case OMPD_target_teams_distribute_parallel_for_simd: |
| // (Last|First)-privates must be shared in parallel region. |
| return hasStaticScheduling(D); |
| case OMPD_target_simd: |
| case OMPD_target_teams_distribute_simd: |
| return true; |
| case OMPD_target_teams_distribute: |
| return false; |
| case OMPD_parallel: |
| case OMPD_for: |
| case OMPD_parallel_for: |
| case OMPD_parallel_master: |
| case OMPD_parallel_sections: |
| case OMPD_for_simd: |
| case OMPD_parallel_for_simd: |
| case OMPD_cancel: |
| case OMPD_cancellation_point: |
| case OMPD_ordered: |
| case OMPD_threadprivate: |
| case OMPD_allocate: |
| case OMPD_task: |
| case OMPD_simd: |
| case OMPD_sections: |
| case OMPD_section: |
| case OMPD_single: |
| case OMPD_master: |
| case OMPD_critical: |
| case OMPD_taskyield: |
| case OMPD_barrier: |
| case OMPD_taskwait: |
| case OMPD_taskgroup: |
| case OMPD_atomic: |
| case OMPD_flush: |
| case OMPD_depobj: |
| case OMPD_scan: |
| case OMPD_teams: |
| case OMPD_target_data: |
| case OMPD_target_exit_data: |
| case OMPD_target_enter_data: |
| case OMPD_distribute: |
| case OMPD_distribute_simd: |
| case OMPD_distribute_parallel_for: |
| case OMPD_distribute_parallel_for_simd: |
| case OMPD_teams_distribute: |
| case OMPD_teams_distribute_simd: |
| case OMPD_teams_distribute_parallel_for: |
| case OMPD_teams_distribute_parallel_for_simd: |
| case OMPD_target_update: |
| case OMPD_declare_simd: |
| case OMPD_declare_variant: |
| case OMPD_begin_declare_variant: |
| case OMPD_end_declare_variant: |
| case OMPD_declare_target: |
| case OMPD_end_declare_target: |
| case OMPD_declare_reduction: |
| case OMPD_declare_mapper: |
| case OMPD_taskloop: |
| case OMPD_taskloop_simd: |
| case OMPD_master_taskloop: |
| case OMPD_master_taskloop_simd: |
| case OMPD_parallel_master_taskloop: |
| case OMPD_parallel_master_taskloop_simd: |
| case OMPD_requires: |
| case OMPD_unknown: |
| default: |
| break; |
| } |
| llvm_unreachable( |
| "Unknown programming model for OpenMP directive on NVPTX target."); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, |
| StringRef ParentName, |
| llvm::Function *&OutlinedFn, |
| llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, |
| const RegionCodeGenTy &CodeGen) { |
| ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); |
| EntryFunctionState EST; |
| WorkerFunctionState WST(CGM, D.getBeginLoc()); |
| Work.clear(); |
| WrapperFunctionsMap.clear(); |
| |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
| CGOpenMPRuntimeGPU::WorkerFunctionState &WST; |
| |
| public: |
| NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, |
| CGOpenMPRuntimeGPU::WorkerFunctionState &WST) |
| : EST(EST), WST(WST) {} |
| void Enter(CodeGenFunction &CGF) override { |
| auto &RT = |
| static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| RT.emitNonSPMDEntryHeader(CGF, EST, WST); |
| // Skip target region initialization. |
| RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| auto &RT = |
| static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| RT.clearLocThreadIdInsertPt(CGF); |
| RT.emitNonSPMDEntryFooter(CGF, EST); |
| } |
| } Action(EST, WST); |
| CodeGen.setAction(Action); |
| IsInTTDRegion = true; |
| // Reserve a place for the globalized memory. |
| GlobalizedRecords.emplace_back(); |
| if (!KernelStaticGlobalized) { |
| KernelStaticGlobalized = new llvm::GlobalVariable( |
| CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, |
| llvm::GlobalValue::InternalLinkage, |
| llvm::ConstantPointerNull::get(CGM.VoidPtrTy), |
| "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, |
| llvm::GlobalValue::NotThreadLocal, |
| CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); |
| } |
| emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| IsOffloadEntry, CodeGen); |
| IsInTTDRegion = false; |
| |
| // Now change the name of the worker function to correspond to this target |
| // region's entry function. |
| WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker")); |
| |
| // Create the worker function |
| emitWorkerFunction(WST); |
| } |
| |
| // Set up NVPTX threads for the master-worker OpenMP scheme. |
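| // In the generated entry code, threads with an id below the thread limit |
| // branch to the worker loop, the master thread (the first lane of the last |
| // warp) initializes the runtime and executes the target region, and all |
| // remaining threads exit. |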
| void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF, |
| EntryFunctionState &EST, |
| WorkerFunctionState &WST) { |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); |
| llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); |
| llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); |
| EST.ExitBB = CGF.createBasicBlock(".exit"); |
| |
| auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| llvm::Value *IsWorker = |
| Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF)); |
| Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); |
| |
| CGF.EmitBlock(WorkerBB); |
| emitCall(CGF, WST.Loc, WST.WorkerFn); |
| CGF.EmitBranch(EST.ExitBB); |
| |
| CGF.EmitBlock(MasterCheckBB); |
| llvm::Value *IsMaster = |
| Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF)); |
| Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); |
| |
| CGF.EmitBlock(MasterBB); |
| IsInTargetMasterThreadRegion = true; |
| // SEQUENTIAL (MASTER) REGION START |
| // First action in sequential region: |
| // Initialize the state of the OpenMP runtime library on the GPU. |
| // TODO: Optimize runtime initialization and pass in correct value. |
| llvm::Value *Args[] = {getThreadLimit(CGF), |
| Bld.getInt16(/*RequiresOMPRuntime=*/1)}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); |
| |
| // For data sharing, we need to initialize the stack. |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); |
| |
| emitGenericVarsProlog(CGF, WST.Loc); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF, |
| EntryFunctionState &EST) { |
| IsInTargetMasterThreadRegion = false; |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| emitGenericVarsEpilog(CGF); |
| |
| if (!EST.ExitBB) |
| EST.ExitBB = CGF.createBasicBlock(".exit"); |
| |
| llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); |
| CGF.EmitBranch(TerminateBB); |
| |
| CGF.EmitBlock(TerminateBB); |
| // Signal termination condition. |
| // TODO: Optimize runtime initialization and pass in correct value. |
| llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args); |
| // Barrier to terminate worker threads. |
| syncCTAThreads(CGF); |
| // Master thread jumps to exit point. |
| CGF.EmitBranch(EST.ExitBB); |
| |
| CGF.EmitBlock(EST.ExitBB); |
| EST.ExitBB = nullptr; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, |
| StringRef ParentName, |
| llvm::Function *&OutlinedFn, |
| llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, |
| const RegionCodeGenTy &CodeGen) { |
| ExecutionRuntimeModesRAII ModeRAII( |
| CurrentExecutionMode, RequiresFullRuntime, |
| CGM.getLangOpts().OpenMPCUDAForceFullRuntime || |
| !supportsLightweightRuntime(CGM.getContext(), D)); |
| EntryFunctionState EST; |
| |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| CGOpenMPRuntimeGPU &RT; |
| CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
| const OMPExecutableDirective &D; |
| |
| public: |
| NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, |
| CGOpenMPRuntimeGPU::EntryFunctionState &EST, |
| const OMPExecutableDirective &D) |
| : RT(RT), EST(EST), D(D) {} |
| void Enter(CodeGenFunction &CGF) override { |
| RT.emitSPMDEntryHeader(CGF, EST, D); |
| // Skip target region initialization. |
| RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| RT.clearLocThreadIdInsertPt(CGF); |
| RT.emitSPMDEntryFooter(CGF, EST); |
| } |
| } Action(*this, EST, D); |
| CodeGen.setAction(Action); |
| IsInTTDRegion = true; |
| // Reserve a place for the globalized memory. |
| GlobalizedRecords.emplace_back(); |
| if (!KernelStaticGlobalized) { |
| KernelStaticGlobalized = new llvm::GlobalVariable( |
| CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, |
| llvm::GlobalValue::InternalLinkage, |
| llvm::ConstantPointerNull::get(CGM.VoidPtrTy), |
| "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, |
| llvm::GlobalValue::NotThreadLocal, |
| CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); |
| } |
| emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| IsOffloadEntry, CodeGen); |
| IsInTTDRegion = false; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitSPMDEntryHeader( |
| CodeGenFunction &CGF, EntryFunctionState &EST, |
| const OMPExecutableDirective &D) { |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| // Set up the basic blocks in the entry function. |
| llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); |
| EST.ExitBB = CGF.createBasicBlock(".exit"); |
| |
| llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true), |
| /*RequiresOMPRuntime=*/ |
| Bld.getInt16(RequiresFullRuntime ? 1 : 0), |
| /*RequiresDataSharing=*/Bld.getInt16(0)}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args); |
| |
| if (RequiresFullRuntime) { |
| // For data sharing, we need to initialize the stack. |
| CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd)); |
| } |
| |
| CGF.EmitBranch(ExecuteBB); |
| |
| CGF.EmitBlock(ExecuteBB); |
| |
| IsInTargetMasterThreadRegion = true; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF, |
| EntryFunctionState &EST) { |
| IsInTargetMasterThreadRegion = false; |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| if (!EST.ExitBB) |
| EST.ExitBB = CGF.createBasicBlock(".exit"); |
| |
| llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit"); |
| CGF.EmitBranch(OMPDeInitBB); |
| |
| CGF.EmitBlock(OMPDeInitBB); |
| // Deinitialize the OMP state in the runtime; called by all active threads. |
| llvm::Value *Args[] = {/*RequiresOMPRuntime=*/ |
| CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args); |
| CGF.EmitBranch(EST.ExitBB); |
| |
| CGF.EmitBlock(EST.ExitBB); |
| EST.ExitBB = nullptr; |
| } |
| |
| // Create a unique global variable to indicate the execution mode of this target |
| // region. The execution mode is either 'generic' or 'spmd' depending on the |
| // target directive. This variable is picked up by the offload library to set |
| // up the device appropriately before kernel launch. If the execution mode is |
| // 'generic', the runtime reserves one warp for the master, otherwise, all |
| // warps participate in parallel work. |
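| // The emitted constant is 0 when Mode is true and 1 otherwise; presumably the |
| // offload runtime reads 0 as 'spmd' and 1 as 'generic'. |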
| static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, |
| bool Mode) { |
| auto *GVMode = |
| new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, |
| llvm::GlobalValue::WeakAnyLinkage, |
| llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1), |
| Twine(Name, "_exec_mode")); |
| CGM.addCompilerUsedGlobal(GVMode); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) { |
| ASTContext &Ctx = CGM.getContext(); |
| |
| CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); |
| CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {}, |
| WST.Loc, WST.Loc); |
| emitWorkerLoop(CGF, WST); |
| CGF.FinishFunction(); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF, |
| WorkerFunctionState &WST) { |
| // |
| // The workers enter this loop and wait for parallel work from the master. |
| // When the master encounters a parallel region it sets up the work + variable |
| // arguments, and wakes up the workers. The workers first check to see if |
| // they are required for the parallel region, i.e., within the # of requested |
| // parallel threads. The activated workers load the variable arguments and |
| // execute the parallel work. |
| // |
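| // In outline, the generated control flow is: |
| //   .await.work:         barrier; fetch work_fn via __kmpc_kernel_parallel; |
| //                        if work_fn == 0 goto .exit, else goto .select.workers |
| //   .select.workers:     active threads goto .execute.parallel, inactive |
| //                        threads goto .barrier.parallel |
| //   .execute.parallel:   run the matching outlined function (or an indirect |
| //                        call for an orphaned parallel region), then goto |
| //                        .terminate.parallel |
| //   .terminate.parallel: call __kmpc_kernel_end_parallel |
| //   .barrier.parallel:   barrier; goto .await.work |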
| |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); |
| llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); |
| llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); |
| llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); |
| llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); |
| llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); |
| |
| CGF.EmitBranch(AwaitBB); |
| |
| // Workers wait for work from master. |
| CGF.EmitBlock(AwaitBB); |
| // Wait for parallel work |
| syncCTAThreads(CGF); |
| |
| Address WorkFn = |
| CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); |
| Address ExecStatus = |
| CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); |
| CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); |
| CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); |
| |
| // TODO: Optimize runtime initialization and pass in correct value. |
| llvm::Value *Args[] = {WorkFn.getPointer()}; |
| llvm::Value *Ret = CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); |
| Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); |
| |
| // On termination condition (workid == 0), exit loop. |
| llvm::Value *WorkID = Bld.CreateLoad(WorkFn); |
| llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); |
| Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); |
| |
| // Activate requested workers. |
| CGF.EmitBlock(SelectWorkersBB); |
| llvm::Value *IsActive = |
| Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); |
| Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); |
| |
| // Signal start of parallel region. |
| CGF.EmitBlock(ExecuteBB); |
| // Skip initialization. |
| setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
| |
| // Process work items: outlined parallel functions. |
| for (llvm::Function *W : Work) { |
| // Try to match this outlined function. |
| llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); |
| |
| llvm::Value *WorkFnMatch = |
| Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); |
| |
| llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); |
| llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); |
| Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); |
| |
| // Execute this outlined function. |
| CGF.EmitBlock(ExecuteFNBB); |
| |
| // Insert call to work function via shared wrapper. The shared |
| // wrapper takes two arguments: |
| // - the parallelism level; |
| // - the thread ID; |
| emitCall(CGF, WST.Loc, W, |
| {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); |
| |
| // Go to end of parallel region. |
| CGF.EmitBranch(TerminateBB); |
| |
| CGF.EmitBlock(CheckNextBB); |
| } |
| // Default case: call to outlined function through pointer if the target |
| // region makes a declare target call that may contain an orphaned parallel |
| // directive. |
| auto *ParallelFnTy = |
| llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, |
| /*isVarArg=*/false); |
| llvm::Value *WorkFnCast = |
| Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo()); |
| // Insert call to work function via shared wrapper. The shared |
| // wrapper takes two arguments: |
| // - the parallelism level; |
| // - the thread ID; |
| emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast}, |
| {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); |
| // Go to end of parallel region. |
| CGF.EmitBranch(TerminateBB); |
| |
| // Signal end of parallel region. |
| CGF.EmitBlock(TerminateBB); |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel), |
| llvm::None); |
| CGF.EmitBranch(BarrierBB); |
| |
| // All active and inactive workers wait at a barrier after parallel region. |
| CGF.EmitBlock(BarrierBB); |
| // Barrier after parallel region. |
| syncCTAThreads(CGF); |
| CGF.EmitBranch(AwaitBB); |
| |
| // Exit target region. |
| CGF.EmitBlock(ExitBB); |
| // Skip initialization. |
| clearLocThreadIdInsertPt(CGF); |
| } |
| |
| /// Returns the specified OpenMP runtime function for the current OpenMP |
| /// implementation. Specialized for the NVPTX device. |
| /// \param Function OpenMP runtime function. |
| /// \return Specified function. |
| llvm::FunctionCallee |
| CGOpenMPRuntimeGPU::createNVPTXRuntimeFunction(unsigned Function) { |
| llvm::FunctionCallee RTLFn = nullptr; |
| switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { |
| case OMPRTL_NVPTX__kmpc_kernel_init: { |
| // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t |
| // RequiresOMPRuntime); |
| llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_deinit: { |
| // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); |
| llvm::Type *TypeParams[] = {CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_spmd_kernel_init: { |
| // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, |
| // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); |
| llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: { |
| // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); |
| llvm::Type *TypeParams[] = {CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { |
| // Build void __kmpc_kernel_prepare_parallel(void *outlined_function); |
| llvm::Type *TypeParams[] = {CGM.Int8PtrTy}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_parallel: { |
| // Build bool __kmpc_kernel_parallel(void **outlined_function); |
| llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy}; |
| llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); |
| auto *FnTy = |
| llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_kernel_end_parallel: { |
| // Build void __kmpc_kernel_end_parallel(); |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_serialized_parallel: { |
| // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 |
| // global_tid); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_end_serialized_parallel: { |
| // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 |
| // global_tid); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_shuffle_int32: { |
| // Build int32_t __kmpc_shuffle_int32(int32_t element, |
| // int16_t lane_offset, int16_t warp_size); |
| llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_shuffle_int64: { |
| // Build int64_t __kmpc_shuffle_int64(int64_t element, |
| // int16_t lane_offset, int16_t warp_size); |
| llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: { |
| // Build int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, |
| // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void* |
| // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t |
| // lane_id, int16_t lane_offset, int16_t shortCircuit), void |
| // (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); |
| llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, |
| CGM.Int16Ty, CGM.Int16Ty}; |
| auto *ShuffleReduceFnTy = |
| llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, |
| /*isVarArg=*/false); |
| llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; |
| auto *InterWarpCopyFnTy = |
| llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, |
| /*isVarArg=*/false); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), |
| CGM.Int32Ty, |
| CGM.Int32Ty, |
| CGM.SizeTy, |
| CGM.VoidPtrTy, |
| ShuffleReduceFnTy->getPointerTo(), |
| InterWarpCopyFnTy->getPointerTo()}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); |
| RTLFn = CGM.CreateRuntimeFunction( |
| FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { |
| // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid); |
| llvm::Type *TypeParams[] = {CGM.Int32Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); |
| RTLFn = CGM.CreateRuntimeFunction( |
| FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: { |
| // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32 |
| // global_tid, void *global_buffer, int32_t num_of_records, void* |
| // reduce_data, |
| // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t |
| // lane_offset, int16_t shortCircuit), |
| // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void |
| // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data), |
| // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx, |
| // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer, |
| // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void |
| // *buffer, int idx, void *reduce_data)); |
| llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, |
| CGM.Int16Ty, CGM.Int16Ty}; |
| auto *ShuffleReduceFnTy = |
| llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, |
| /*isVarArg=*/false); |
| llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; |
| auto *InterWarpCopyFnTy = |
| llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, |
| /*isVarArg=*/false); |
| llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy, |
| CGM.VoidPtrTy}; |
| auto *GlobalListFnTy = |
| llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams, |
| /*isVarArg=*/false); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), |
| CGM.Int32Ty, |
| CGM.VoidPtrTy, |
| CGM.Int32Ty, |
| CGM.VoidPtrTy, |
| ShuffleReduceFnTy->getPointerTo(), |
| InterWarpCopyFnTy->getPointerTo(), |
| GlobalListFnTy->getPointerTo(), |
| GlobalListFnTy->getPointerTo(), |
| GlobalListFnTy->getPointerTo(), |
| GlobalListFnTy->getPointerTo()}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); |
| RTLFn = CGM.CreateRuntimeFunction( |
| FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { |
| // Build void __kmpc_data_sharing_init_stack(); |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: { |
| // Build void __kmpc_data_sharing_init_stack_spmd(); |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| RTLFn = |
| CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: { |
| // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size, |
| // int16_t UseSharedMemory); |
| llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); |
| RTLFn = CGM.CreateRuntimeFunction( |
| FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: { |
| // Build void *__kmpc_data_sharing_push_stack(size_t size, int16_t |
| // UseSharedMemory); |
| llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); |
| RTLFn = CGM.CreateRuntimeFunction( |
| FnTy, /*Name=*/"__kmpc_data_sharing_push_stack"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: { |
| // Build void __kmpc_data_sharing_pop_stack(void *a); |
| llvm::Type *TypeParams[] = {CGM.VoidPtrTy}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, |
| /*Name=*/"__kmpc_data_sharing_pop_stack"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_begin_sharing_variables: { |
| // Build void __kmpc_begin_sharing_variables(void ***args, size_t n_args); |
| llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_end_sharing_variables: { |
| // Build void __kmpc_end_sharing_variables(); |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_get_shared_variables: { |
| // Build void __kmpc_get_shared_variables(void ***GlobalArgs); |
| llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_parallel_level: { |
| // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: { |
| // Build int8_t __kmpc_is_spmd_exec_mode(); |
| auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_get_team_static_memory: { |
| // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, |
| // const void *buf, size_t size, int16_t is_shared, const void **res); |
| llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy, |
| CGM.Int16Ty, CGM.VoidPtrPtrTy}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_restore_team_static_memory: { |
| // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, |
| // int16_t is_shared); |
| llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); |
| RTLFn = |
| CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory"); |
| break; |
| } |
| case OMPRTL__kmpc_barrier: { |
| // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = |
| CGM.CreateConvergentRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier"); |
| break; |
| } |
| case OMPRTL__kmpc_barrier_simple_spmd: { |
| // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32 |
| // global_tid); |
| llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| RTLFn = CGM.CreateConvergentRuntimeFunction( |
| FnTy, /*Name*/ "__kmpc_barrier_simple_spmd"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: { |
| // Build int32_t __kmpc_warp_active_thread_mask(void); |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false); |
| RTLFn = CGM.CreateConvergentRuntimeFunction( |
| FnTy, "__kmpc_warp_active_thread_mask"); |
| break; |
| } |
| case OMPRTL_NVPTX__kmpc_syncwarp: { |
| // Build void __kmpc_syncwarp(kmp_int32 Mask); |
| auto *FnTy = |
| llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false); |
| RTLFn = CGM.CreateConvergentRuntimeFunction(FnTy, "__kmpc_syncwarp"); |
| break; |
| } |
| } |
| return RTLFn; |
| } |
| |
| void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID, |
| llvm::Constant *Addr, |
| uint64_t Size, int32_t, |
| llvm::GlobalValue::LinkageTypes) { |
| // TODO: Add support for global variables on the device after declare target |
| // support. |
| if (!isa<llvm::Function>(Addr)) |
| return; |
| llvm::Module &M = CGM.getModule(); |
| llvm::LLVMContext &Ctx = CGM.getLLVMContext(); |
| |
| // Get "nvvm.annotations" metadata node |
| llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); |
| |
| llvm::Metadata *MDVals[] = { |
| llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"), |
| llvm::ConstantAsMetadata::get( |
| llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; |
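| // The resulting node has the form !{<kernel function>, !"kernel", i32 1}; |
| // the NVPTX backend uses it to treat the function as a kernel entry point. |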
| // Append metadata to nvvm.annotations |
| MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( |
| const OMPExecutableDirective &D, StringRef ParentName, |
| llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, |
| bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { |
| if (!IsOffloadEntry) // Nothing to do. |
| return; |
| |
| assert(!ParentName.empty() && "Invalid target region parent name!"); |
| |
| bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); |
| if (Mode) |
| emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
| CodeGen); |
| else |
| emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
| CodeGen); |
| |
| setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode); |
| } |
| |
| namespace { |
| LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); |
| /// Enum for accessing the reserved_2 field of the ident_t struct. |
| enum ModeFlagsTy : unsigned { |
| /// Bit set to 1 when in SPMD mode. |
| KMP_IDENT_SPMD_MODE = 0x01, |
| /// Bit set to 1 when a simplified runtime is used. |
| KMP_IDENT_SIMPLE_RT_MODE = 0x02, |
| LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE) |
| }; |
| |
| /// Special mode Undefined: non-SPMD mode combined with the simple runtime. |
| static const ModeFlagsTy UndefinedMode = |
| (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE; |
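| // With the masked BitmaskEnum operator~, UndefinedMode evaluates to |
| // KMP_IDENT_SIMPLE_RT_MODE alone, a combination never produced for real |
| // kernels because non-SPMD (generic) mode always requires the full runtime. |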
| } // anonymous namespace |
| |
| unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const { |
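| // With the BitmaskEnum-masked operators the combinations returned below |
| // are: SPMD + full runtime -> SPMD bit only, SPMD + simple runtime -> both |
| // bits, generic (non-SPMD) mode -> neither bit, unknown -> UndefinedMode. |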
| switch (getExecutionMode()) { |
| case EM_SPMD: |
| if (requiresFullRuntime()) |
| return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE); |
| return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE; |
| case EM_NonSPMD: |
| assert(requiresFullRuntime() && "Expected full runtime."); |
| return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE); |
| case EM_Unknown: |
| return UndefinedMode; |
| } |
| llvm_unreachable("Unknown flags are requested."); |
| } |
| |
| CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) |
| : CGOpenMPRuntime(CGM, "_", "$") { |
| if (!CGM.getLangOpts().OpenMPIsDevice) |
| llvm_unreachable("OpenMP NVPTX can only handle device code."); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, |
| ProcBindKind ProcBind, |
| SourceLocation Loc) { |
| // Do nothing in SPMD mode for the level-0 parallel region. |
| if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
| return; |
| |
| CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, |
| llvm::Value *NumThreads, |
| SourceLocation Loc) { |
| // Do nothing in SPMD mode for the level-0 parallel region. |
| if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
| return; |
| |
| CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, |
| const Expr *NumTeams, |
| const Expr *ThreadLimit, |
| SourceLocation Loc) {} |
| |
| llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction( |
| const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
| OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| bool &IsInParallelRegion; |
| bool PrevIsInParallelRegion; |
| |
| public: |
| NVPTXPrePostActionTy(bool &IsInParallelRegion) |
| : IsInParallelRegion(IsInParallelRegion) {} |
| void Enter(CodeGenFunction &CGF) override { |
| PrevIsInParallelRegion = IsInParallelRegion; |
| IsInParallelRegion = true; |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| IsInParallelRegion = PrevIsInParallelRegion; |
| } |
| } Action(IsInParallelRegion); |
| CodeGen.setAction(Action); |
| bool PrevIsInTTDRegion = IsInTTDRegion; |
| IsInTTDRegion = false; |
| bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion; |
| IsInTargetMasterThreadRegion = false; |
| auto *OutlinedFun = |
| cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( |
| D, ThreadIDVar, InnermostKind, CodeGen)); |
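| // When optimizing, make the outlined parallel body eligible for inlining |
| // into its wrapper by clearing noinline/optnone and marking it |
| // always_inline. |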
| if (CGM.getLangOpts().Optimize) { |
| OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); |
| OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone); |
| OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); |
| } |
| IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion; |
| IsInTTDRegion = PrevIsInTTDRegion; |
| if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD && |
| !IsInParallelRegion) { |
| llvm::Function *WrapperFun = |
| createParallelDataSharingWrapper(OutlinedFun, D); |
| WrapperFunctionsMap[OutlinedFun] = WrapperFun; |
| } |
| |
| return OutlinedFun; |
| } |
| |
| /// Get list of lastprivate variables from the teams distribute ... or |
| /// teams {distribute ...} directives. |
| static void |
| getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
| llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
| assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
| "expected teams directive."); |
| const OMPExecutableDirective *Dir = &D; |
| if (!isOpenMPDistributeDirective(D.getDirectiveKind())) { |
| if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild( |
| Ctx, |
| D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers( |
| /*IgnoreCaptured=*/true))) { |
| Dir = dyn_cast_or_null<OMPExecutableDirective>(S); |
| if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind())) |
| Dir = nullptr; |
| } |
| } |
| if (!Dir) |
| return; |
| for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { |
| for (const Expr *E : C->getVarRefs()) |
| Vars.push_back(getPrivateItem(E)); |
| } |
| } |
| |
| /// Get list of reduction variables from the teams ... directives. |
| static void |
| getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
| llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
| assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
| "expected teams directive."); |
| for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { |
| for (const Expr *E : C->privates()) |
| Vars.push_back(getPrivateItem(E)); |
| } |
| } |
| |
| llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( |
| const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
| OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
| SourceLocation Loc = D.getBeginLoc(); |
| |
| const RecordDecl *GlobalizedRD = nullptr; |
| llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
| unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size); |
| // Globalize team reduction variables unconditionally in all modes. |
| if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
| getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); |
| if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { |
| getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); |
| if (!LastPrivatesReductions.empty()) { |
| GlobalizedRD = ::buildRecordForGlobalizedVars( |
| CGM.getContext(), llvm::None, LastPrivatesReductions, |
| MappedDeclsFields, WarpSize); |
| } |
| } else if (!LastPrivatesReductions.empty()) { |
| assert(!TeamAndReductions.first && |
| "Previous team declaration is not expected."); |
| TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); |
| std::swap(TeamAndReductions.second, LastPrivatesReductions); |
| } |
| |
| // Emit target region as a standalone region. |
| class NVPTXPrePostActionTy : public PrePostActionTy { |
| SourceLocation &Loc; |
| const RecordDecl *GlobalizedRD; |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
| &MappedDeclsFields; |
| |
| public: |
| NVPTXPrePostActionTy( |
| SourceLocation &Loc, const RecordDecl *GlobalizedRD, |
| llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
| &MappedDeclsFields) |
| : Loc(Loc), GlobalizedRD(GlobalizedRD), |
| MappedDeclsFields(MappedDeclsFields) {} |
| void Enter(CodeGenFunction &CGF) override { |
| auto &Rt = |
| static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
| if (GlobalizedRD) { |
| auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; |
| I->getSecond().GlobalRecord = GlobalizedRD; |
| I->getSecond().MappedParams = |
| std::make_unique<CodeGenFunction::OMPMapVars>(); |
| DeclToAddrMapTy &Data = I->getSecond().LocalVarData; |
| for (const auto &Pair : MappedDeclsFields) { |
| assert(Pair.getFirst()->isCanonicalDecl() && |
| "Expected canonical declaration"); |
| Data.insert(std::make_pair(Pair.getFirst(), |
| MappedVarData(Pair.getSecond(), |
| /*IsOnePerTeam=*/true))); |
| } |
| } |
| Rt.emitGenericVarsProlog(CGF, Loc); |
| } |
| void Exit(CodeGenFunction &CGF) override { |
| static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) |
| .emitGenericVarsEpilog(CGF); |
| } |
| } Action(Loc, GlobalizedRD, MappedDeclsFields); |
| CodeGen.setAction(Action); |
| llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( |
| D, ThreadIDVar, InnermostKind, CodeGen); |
| if (CGM.getLangOpts().Optimize) { |
| OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); |
| OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone); |
| OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); |
| } |
| |
| return OutlinedFun; |
| } |
| |
| void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, |
| SourceLocation Loc, |
| bool WithSPMDCheck) { |
| if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic && |
| getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
| return; |
| |
| CGBuilderTy &Bld = CGF.Builder; |
| |
| const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
| if (I == FunctionGlobalizedDecls.end()) |
| return; |
| if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) { |
| QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); |
| QualType SecGlobalRecTy; |
| |
| // Recover pointer to this function's global record. The runtime will |
| // handle the specifics of the allocation of the memory. |
| // Use actual memory size of the record including the padding |
| // for alignment purposes. |
| unsigned Alignment = |
| CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); |
| unsigned GlobalRecordSize = |
| CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity(); |
| GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); |
| |
| llvm::PointerType *GlobalRecPtrTy = |
| CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); |
| llvm::Value *GlobalRecCastAddr; |
| llvm::Value *IsTTD = nullptr; |
| if (!IsInTTDRegion && |
| (WithSPMDCheck || |
| getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { |
| llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); |
| llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); |
| llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); |
| if (I->getSecond().SecondaryGlobalRecord.hasValue()) { |
| llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
| llvm::Value *ThreadID = getThreadID(CGF, Loc); |
| llvm::Value *PL = CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level), |
| {RTLoc, ThreadID}); |
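| // A parallel level of zero means we are still in the team-master |
| // (target/teams/distribute) context; IsTTD selects the secondary |
| // globalized record sized for that case below. |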
| IsTTD = Bld.CreateIsNull(PL); |
| } |
| llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); |
| Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); |
| // There is no need to emit line number for unconditional branch. |
| (void)ApplyDebugLocation::CreateEmpty(CGF); |
| CGF.EmitBlock(SPMDBB); |
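| // In SPMD mode nothing is pushed onto the data-sharing stack here; a null |
| // record pointer is fed into the _select_stack PHI in the exit block. |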
| Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy), |
| CharUnits::fromQuantity(Alignment)); |
| CGF.EmitBranch(ExitBB); |
| // There is no need to emit line number for unconditional branch. |
| (void)ApplyDebugLocation::CreateEmpty(CGF); |
| CGF.EmitBlock(NonSPMDBB); |
| llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize); |
| if (const RecordDecl *SecGlobalizedVarsRecord = |
| I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) { |
| SecGlobalRecTy = |
| CGM.getContext().getRecordType(SecGlobalizedVarsRecord); |
| |
| // Recover pointer to this function's global record. The runtime will |
| // handle the specifics of the allocation of the memory. |
| // Use actual memory size of the record including the padding |
| // for alignment purposes. |
| unsigned Alignment = |
| CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity(); |
| unsigned GlobalRecordSize = |
| CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity(); |
| GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); |
| Size = Bld.CreateSelect( |
| IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size); |
| } |
| // TODO: allow the use of shared memory to be controlled by the user; |
| // for now, default to global. |
| llvm::Value *GlobalRecordSizeArg[] = { |
| Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; |
| llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), |
| GlobalRecordSizeArg); |
| GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| GlobalRecValue, GlobalRecPtrTy); |
| CGF.EmitBlock(ExitBB); |
| auto *Phi = Bld.CreatePHI(GlobalRecPtrTy, |
| /*NumReservedValues=*/2, "_select_stack"); |
| Phi->addIncoming(RecPtr.getPointer(), SPMDBB); |
| Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); |
| GlobalRecCastAddr = Phi; |
| I->getSecond().GlobalRecordAddr = Phi; |
| I->getSecond().IsInSPMDModeFlag = IsSPMD; |
| } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { |
| assert(GlobalizedRecords.back().Records.size() < 2 && |
| "Expected less than 2 globalized records: one for target and one " |
| "for teams."); |
| unsigned Offset = 0; |
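| // Accumulate the aligned sizes of records already placed in the per-team |
| // static buffer so that the new record is appended after them. |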
| for (const RecordDecl *RD : GlobalizedRecords.back().Records) { |
| QualType RDTy = CGM.getContext().getRecordType(RD); |
| unsigned Alignment = |
| CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); |
| unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); |
| Offset = |
| llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); |
| } |
| unsigned Alignment = |
| CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); |
| Offset = llvm::alignTo(Offset, Alignment); |
| GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); |
| ++GlobalizedRecords.back().RegionCounter; |
| if (GlobalizedRecords.back().Records.size() == 1) { |
| assert(KernelStaticGlobalized && |
| "Kernel static pointer must be initialized already."); |
| auto *UseSharedMemory = new llvm::GlobalVariable( |
| CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true, |
| llvm::GlobalValue::InternalLinkage, nullptr, |
| "_openmp_static_kernel$is_shared"); |
| UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); |
| QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( |
| /*DestWidth=*/16, /*Signed=*/0); |
| llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( |
| Address(UseSharedMemory, |
| CGM.getContext().getTypeAlignInChars(Int16Ty)), |
| /*Volatile=*/false, Int16Ty, Loc); |
| auto *StaticGlobalized = new llvm::GlobalVariable( |
| CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, |
| llvm::GlobalValue::CommonLinkage, nullptr); |
| auto *RecSize = new llvm::GlobalVariable( |
| CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, |
| llvm::GlobalValue::InternalLinkage, nullptr, |
| "_openmp_static_kernel$size"); |
| RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); |
| llvm::Value *Ld = CGF.EmitLoadOfScalar( |
| Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false, |
| CGM.getContext().getSizeType(), Loc); |
| llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| KernelStaticGlobalized, CGM.VoidPtrPtrTy); |
| llvm::Value *GlobalRecordSizeArg[] = { |
| llvm::ConstantInt::get( |
| CGM.Int16Ty, |
| getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), |
| StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; |
| CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_get_team_static_memory), |
| GlobalRecordSizeArg); |
| GlobalizedRecords.back().Buffer = StaticGlobalized; |
| GlobalizedRecords.back().RecSize = RecSize; |
| GlobalizedRecords.back().UseSharedMemory = UseSharedMemory; |
| GlobalizedRecords.back().Loc = Loc; |
| } |
| assert(KernelStaticGlobalized && "Global address must be set already."); |
| Address FrameAddr = CGF.EmitLoadOfPointer( |
| Address(KernelStaticGlobalized, CGM.getPointerAlign()), |
| CGM.getContext() |
| .getPointerType(CGM.getContext().VoidPtrTy) |
| .castAs<PointerType>()); |
| llvm::Value *GlobalRecValue = |
| Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer(); |
| I->getSecond().GlobalRecordAddr = GlobalRecValue; |
| I->getSecond().IsInSPMDModeFlag = nullptr; |
| GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); |
| } else { |
| // TODO: allow the use of shared memory to be controlled by the user; |
| // for now, default to global. |
| bool UseSharedMemory = |
| IsInTTDRegion && GlobalRecordSize <= SharedMemorySize; |
| llvm::Value *GlobalRecordSizeArg[] = { |
| llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), |
| CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)}; |
| llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction( |
| IsInTTDRegion |
| ? OMPRTL_NVPTX__kmpc_data_sharing_push_stack |
| : OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), |
| GlobalRecordSizeArg); |
| GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| GlobalRecValue, GlobalRecPtrTy); |
| I->getSecond().GlobalRecordAddr = GlobalRecValue; |
| I->getSecond().IsInSPMDModeFlag = nullptr; |
| } |
| LValue Base = |
| CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy); |
| |
| // Emit the "global alloca" which is a GEP from the global declaration |
| // record using the pointer returned by the runtime. |
| LValue SecBase; |
| decltype(I->getSecond().LocalVarData)::const_iterator SecIt; |
| if (IsTTD) { |
| SecIt = I->getSecond().SecondaryLocalVarData->begin(); |
| llvm::PointerType *SecGlobalRecPtrTy = |
| CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo(); |
| SecBase = CGF.MakeNaturalAlignPointeeAddrLValue( |
| Bld.CreatePointerBitCastOrAddrSpaceCast( |
| I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy), |
| SecGlobalRecTy); |
| } |
| for (auto &Rec : I->getSecond().LocalVarData) { |
| bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); |
| llvm::Value *ParValue; |
| if (EscapedParam) { |
| const auto *VD = cast<VarDecl>(Rec.first); |
| LValue ParLVal = |
| CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); |
| ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); |
| } |
| LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD); |
| // Emit VarAddr based on the lane ID if required. |
| QualType VarTy; |
| if (Rec.second.IsOnePerTeam) { |
| VarTy = Rec.second.FD->getType(); |
| } else { |
| llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( |
| VarAddr.getAddress(CGF).getPointer(), |
| {Bld.getInt32(0), getNVPTXLaneID(CGF)}); |
| VarTy = |
| Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); |
| VarAddr = CGF.MakeAddrLValue( |
| Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, |
| AlignmentSource::Decl); |
| } |
| Rec.second.PrivateAddr = VarAddr.getAddress(CGF); |
| if (!IsInTTDRegion && |
| (WithSPMDCheck || |
| getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { |
| assert(I->getSecond().IsInSPMDModeFlag && |
| "Expected unknown execution mode or required SPMD check."); |
| if (IsTTD) { |
| assert(SecIt->second.IsOnePerTeam && |
| "Secondary glob data must be one per team."); |
| LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); |
| VarAddr.setAddress( |
| Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF), |
| VarAddr.getPointer(CGF)), |
| VarAddr.getAlignment())); |
| Rec.second.PrivateAddr = VarAddr.getAddress(CGF); |
| } |
| Address GlobalPtr = Rec.second.PrivateAddr; |
| Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); |
| Rec.second.PrivateAddr = Address( |
| Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag, |
| LocalAddr.getPointer(), GlobalPtr.getPointer()), |
| LocalAddr.getAlignment()); |
| } |
| if (EscapedParam) { |
| const auto *VD = cast<VarDecl>(Rec.first); |
| CGF.EmitStoreOfScalar(ParValue, VarAddr); |
| I->getSecond().MappedParams->setVarAddr(CGF, VD, |
| VarAddr.getAddress(CGF)); |
| } |
| if (IsTTD) |
| ++SecIt; |
| } |
| } |
| for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) { |
| // Recover pointer to this function's global record. The runtime will |
| // handle the specifics of the allocation of the memory. |
| // Use actual memory size of the record including the padding |
| // for alignment purposes. |
| CGBuilderTy &Bld = CGF.Builder; |
| llvm::Value *Size = CGF.getTypeSize(VD->getType()); |
| CharUnits Align = CGM.getContext().getDeclAlign(VD); |
| Size = Bld.CreateNUWAdd( |
| Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); |
| llvm::Value *AlignVal = |
| llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); |
| Size = Bld.CreateUDiv(Size, AlignVal); |
| Size = Bld.CreateNUWMul(Size, AlignVal); |
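| // The add/udiv/mul sequence above rounds the allocation size up to a |
| // multiple of the declaration's alignment. |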
| // TODO: allow the use of shared memory to be controlled by the user; |
| // for now, default to global. |
| llvm::Value *GlobalRecordSizeArg[] = { |
| Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; |
| llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), |
| GlobalRecordSizeArg); |
| llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
| GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo()); |
| LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(), |
| CGM.getContext().getDeclAlign(VD), |
| AlignmentSource::Decl); |
| I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD), |
| Base.getAddress(CGF)); |
| I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue); |
| } |
| I->getSecond().MappedParams->apply(CGF); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF, |
| bool WithSPMDCheck) { |
| if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic && |
| getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
| return; |
| |
| const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
| if (I != FunctionGlobalizedDecls.end()) { |
| I->getSecond().MappedParams->restore(CGF); |
| if (!CGF.HaveInsertPoint()) |
| return; |
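| // Pop escaped variable-length allocations in reverse order to match the |
| // LIFO discipline of the data-sharing stack. |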
| for (llvm::Value *Addr : |
| llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), |
| Addr); |
| } |
| if (I->getSecond().GlobalRecordAddr) { |
| if (!IsInTTDRegion && |
| (WithSPMDCheck || |
| getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { |
| CGBuilderTy &Bld = CGF.Builder; |
| llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); |
| llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); |
| Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); |
| // There is no need to emit line number for unconditional branch. |
| (void)ApplyDebugLocation::CreateEmpty(CGF); |
| CGF.EmitBlock(NonSPMDBB); |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), |
| CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); |
| CGF.EmitBlock(ExitBB); |
| } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { |
| assert(GlobalizedRecords.back().RegionCounter > 0 && |
| "region counter must be > 0."); |
| --GlobalizedRecords.back().RegionCounter; |
| // Emit the restore function only in the target region. |
| if (GlobalizedRecords.back().RegionCounter == 0) { |
| QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( |
| /*DestWidth=*/16, /*Signed=*/0); |
| llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( |
| Address(GlobalizedRecords.back().UseSharedMemory, |
| CGM.getContext().getTypeAlignInChars(Int16Ty)), |
| /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); |
| llvm::Value *Args[] = { |
| llvm::ConstantInt::get( |
| CGM.Int16Ty, |
| getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), |
| IsInSharedMemory}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_restore_team_static_memory), |
| Args); |
| } |
| } else { |
| CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), |
| I->getSecond().GlobalRecordAddr); |
| } |
| } |
| } |
| } |
| |
| void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, |
| const OMPExecutableDirective &D, |
| SourceLocation Loc, |
| llvm::Function *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars) { |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, |
| /*Name=*/".zero.addr"); |
| CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
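| // The outlined teams function takes the usual (global tid address, bound |
| // tid address, captured vars...) signature; the zero-initialized temporary |
| // serves as the bound thread id. |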
| llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); |
| OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitParallelCall( |
| CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { |
| if (!CGF.HaveInsertPoint()) |
| return; |
| |
| if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
| emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); |
| else |
| emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); |
| } |
| |
| void CGOpenMPRuntimeGPU::emitNonSPMDParallelCall( |
| CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, |
| ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { |
| llvm::Function *Fn = cast<llvm::Function>(OutlinedFn); |
| |
| // Give the outlined function internal linkage so it can be inlined at its |
| // call site and dropped once it is no longer referenced. |
| Fn->setLinkage(llvm::GlobalValue::InternalLinkage); |
| |
| Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, |
| /*Name=*/".zero.addr"); |
| CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
| // ThreadId for serialized parallels is 0. |
| Address ThreadIDAddr = ZeroAddr; |
| auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr]( |
| CodeGenFunction &CGF, PrePostActionTy &Action) { |
| Action.Enter(CGF); |
| |
| Address ZeroAddr = |
| CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, |
| /*Name=*/".bound.zero.addr"); |
| CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
| llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); |
| OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs); |
| }; |
| auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF, |
| PrePostActionTy &) { |
| |
| RegionCodeGenTy RCG(CodeGen); |
| llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
| llvm::Value *ThreadID = getThreadID(CGF, Loc); |
| llvm::Value *Args[] = {RTLoc, ThreadID}; |
| |
| NVPTXActionTy Action( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), |
| Args, |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), |
| Args); |
| RCG.setAction(Action); |
| RCG(CGF); |
| }; |
| |
| auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF, |
| PrePostActionTy &Action) { |
| CGBuilderTy &Bld = CGF.Builder; |
| llvm::Function *WFn = WrapperFunctionsMap[Fn]; |
| assert(WFn && "Wrapper function does not exist!"); |
| llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); |
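| // The wrapper's address doubles as the opaque work descriptor: the master |
| // publishes it via __kmpc_kernel_prepare_parallel and each worker matches |
| // it against its table of known wrappers in the worker loop. |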
| |
| // Prepare for parallel region. Indicate the outlined function. |
| llvm::Value *Args[] = {ID}; |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), |
| Args); |
| |
| // Create a private scope that will globalize the arguments |
| // passed from the outside of the target region. |
| CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); |
| |
| // There's something to share. |
| if (!CapturedVars.empty()) { |
| // Prepare for parallel region. Indicate the outlined function. |
| Address SharedArgs = |
| CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs"); |
| llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); |
| |
| llvm::Value *DataSharingArgs[] = { |
| SharedArgsPtr, |
| llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; |
| CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( |
| OMPRTL_NVPTX__kmpc_begin_sharing_variables), |
| DataSharingArgs); |
| |
| // Store variable address in a list of references to pass to workers. |
| unsigned Idx = 0; |
| ASTContext &Ctx = CGF.getContext(); |
| Address SharedArgListAddress = CGF.EmitLoadOfPointer( |
| SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy)) |
| .castAs<PointerType>()); |
| for (llvm::Value *V : CapturedVars) { |
| Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); |
| llvm::Value *PtrV; |
| if (V->getType()->isIntegerTy()) |
| PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); |
| else |
| PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); |
| CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, |
| Ctx.getPointerType(Ctx.VoidPtrTy)); |
| ++Idx; |
| } |
| } |
| |
| // Activate workers. This barrier is used by the master to signal |
| // work for the workers. |
| syncCTAThreads(CGF); |
| |
| // OpenMP [2.5, Parallel Construct, p.49] |
| // There is an implied barrier at the end of a parallel region. After the |
| // end of a parallel region, only the master thread of the team resumes |
| // execution of the enclosing task region. |
| // |
| // The master waits at this barrier until all workers are done. |
| syncCTAThreads(CGF); |
| |
| if (!CapturedVars.empty()) |
| CGF.EmitRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables)); |
| |
| // Remember for post-processing in worker loop. |
| Work.emplace_back(WFn); |
| }; |
| |
| auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen]( |
| CodeGenFunction &CGF, PrePostActionTy &Action) { |
| if (IsInParallelRegion) { |
| SeqGen(CGF, Action); |
| } else if (IsInTargetMasterThreadRegion) { |
| L0ParallelGen(CGF, Action); |
| } else { |
| // Check for master and then parallelism: |
| // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) { |
| // Serialized execution. |
| // } else { |
| // Worker call. |
| // } |
| CGBuilderTy &Bld = CGF.Builder; |
| llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); |
| llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential"); |
| llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck"); |
| llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); |
| llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( |
| createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); |
| Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB); |
| // There is no need to emit line number for unconditional branch. |
| (void)ApplyDebugLocation::CreateEmpty(CGF); |
|