diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -0,0 +1,35 @@
+//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets, derived from the generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+
+#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeAMDGCN : public CGOpenMPRuntimeGPU {
+
+public:
+  explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -0,0 +1,41 @@
+//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets, derived from the generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+}
+
+/// Get the GPU warp size.
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // Return the target-specific warp size, a compile-time constant.
+  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
+  return Bld.getInt32(WarpSize);
+}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
copy from clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
copy to clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -1,4 +1,4 @@
-//===----- CGOpenMPRuntimeNVPTX.h - Interface to OpenMP NVPTX Runtimes ----===//
+//===------ CGOpenMPRuntimeGPU.h - Interface to OpenMP GPU Runtimes ------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,22 +6,23 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This provides a class for OpenMP runtime code generation specialized to NVPTX
-// targets.
+// This provides a generalized class for OpenMP runtime code generation
+// specialized by GPU targets AMDGCN and NVPTX.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
-#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEGPU_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEGPU_H
 
 #include "CGOpenMPRuntime.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/StmtOpenMP.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 namespace clang {
 namespace CodeGen {
 
-class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
+class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
 public:
   /// Defines the execution mode.
   enum ExecutionMode {
@@ -194,9 +195,14 @@
   unsigned getDefaultLocationReserved2Flags() const override;
 
 public:
-  explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+  explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
   void clear() override;
 
+  /// Declare generalized virtual functions which need to be defined
+  /// by all target specializations of CGOpenMPRuntimeGPU, such as
+  /// AMDGCN and NVPTX.
+  virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
@@ -487,4 +493,4 @@
 } // CodeGen namespace.
 } // clang namespace.
 
-#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEGPU_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
copy from clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
copy to clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1,4 +1,4 @@
-//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
+//===------ CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes -----===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,11 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This provides a class for OpenMP runtime code generation specialized to NVPTX
-// targets.
+// This provides a generalized class for OpenMP runtime code generation
+// specialized by GPU targets AMDGCN and NVPTX.
 //
 //===----------------------------------------------------------------------===//
 
+#include "CGOpenMPRuntimeGPU.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -160,26 +161,26 @@
 /// to emit optimized code.
 class ExecutionRuntimeModesRAII {
 private:
-  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
-      CGOpenMPRuntimeNVPTX::EM_Unknown;
-  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
+  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
+      CGOpenMPRuntimeGPU::EM_Unknown;
+  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
   bool SavedRuntimeMode = false;
   bool *RuntimeMode = nullptr;
 
 public:
   /// Constructor for Non-SPMD mode.
-  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
+  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
       : ExecMode(ExecMode) {
     SavedExecMode = ExecMode;
-    ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
+    ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
   }
   /// Constructor for SPMD mode.
-  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
+  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                             bool &RuntimeMode, bool FullRuntimeMode)
       : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
     SavedExecMode = ExecMode;
     SavedRuntimeMode = RuntimeMode;
-    ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
+    ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
     RuntimeMode = FullRuntimeMode;
   }
   ~ExecutionRuntimeModesRAII() {
@@ -620,14 +621,6 @@
 };
 } // anonymous namespace
 
-/// Get the GPU warp size.
-static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
-      "nvptx_warp_size");
-}
-
 /// Get the id of the current thread on the GPU.
 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
   return CGF.EmitRuntimeCall(
@@ -669,9 +662,10 @@
 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                    bool IsInSPMDExecutionMode = false) {
   CGBuilderTy &Bld = CGF.Builder;
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   return IsInSPMDExecutionMode
              ? getNVPTXNumThreads(CGF)
-             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
+             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),
                                 "thread_limit");
 }
 
@@ -685,22 +679,22 @@
 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
   llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
-
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // We assume that the warp size is a power of 2.
-  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
+  llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
 
   return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                        Bld.CreateNot(Mask), "master_tid");
 }
 
-CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
+CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
     CodeGenModule &CGM, SourceLocation Loc)
     : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
       Loc(Loc) {
   createWorkerFunction(CGM);
 }
 
-void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
+void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
     CodeGenModule &CGM) {
   // Create a worker function with no arguments.
@@ -711,15 +705,15 @@
   WorkerFn->setDoesNotRecurse();
 }
 
-CGOpenMPRuntimeNVPTX::ExecutionMode
-CGOpenMPRuntimeNVPTX::getExecutionMode() const {
+CGOpenMPRuntimeGPU::ExecutionMode
+CGOpenMPRuntimeGPU::getExecutionMode() const {
   return CurrentExecutionMode;
 }
 
-static CGOpenMPRuntimeNVPTX::DataSharingMode
+static CGOpenMPRuntimeGPU::DataSharingMode
 getDataSharingMode(CodeGenModule &CGM) {
-  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
-                                          : CGOpenMPRuntimeNVPTX::Generic;
+  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
+                                          : CGOpenMPRuntimeGPU::Generic;
 }
 
 /// Check for inner (nested) SPMD construct, if any
@@ -1166,7 +1160,7 @@
       "Unknown programming model for OpenMP directive on NVPTX target.");
 }
 
-void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
+void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                              StringRef ParentName,
                                              llvm::Function *&OutlinedFn,
                                              llvm::Constant *&OutlinedFnID,
@@ -1180,23 +1174,23 @@
 
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
-    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
-    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
+    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
+    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;
 
   public:
-    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
-                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
+    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
+                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
         : EST(EST), WST(WST) {}
     void Enter(CodeGenFunction &CGF) override {
       auto &RT =
-          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
+          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
       RT.emitNonSPMDEntryHeader(CGF, EST, WST);
       // Skip target region initialization.
       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
     }
     void Exit(CodeGenFunction &CGF) override {
       auto &RT =
-          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
+          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
       RT.clearLocThreadIdInsertPt(CGF);
       RT.emitNonSPMDEntryFooter(CGF, EST);
     }
@@ -1227,7 +1221,7 @@
 }
 
 // Setup NVPTX threads for master-worker OpenMP scheme.
-void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                   EntryFunctionState &EST,
                                                   WorkerFunctionState &WST) {
   CGBuilderTy &Bld = CGF.Builder;
@@ -1269,7 +1263,7 @@
   emitGenericVarsProlog(CGF, WST.Loc);
 }
 
-void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                   EntryFunctionState &EST) {
   IsInTargetMasterThreadRegion = false;
   if (!CGF.HaveInsertPoint())
@@ -1298,7 +1292,7 @@
   EST.ExitBB = nullptr;
 }
 
-void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
+void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
@@ -1312,13 +1306,13 @@
 
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
-    CGOpenMPRuntimeNVPTX &RT;
-    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
+    CGOpenMPRuntimeGPU &RT;
+    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
     const OMPExecutableDirective &D;
 
   public:
-    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
-                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
+                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                          const OMPExecutableDirective &D)
         : RT(RT), EST(EST), D(D) {}
     void Enter(CodeGenFunction &CGF) override {
@@ -1349,7 +1343,7 @@
   IsInTTDRegion = false;
 }
 
-void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
+void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
     CodeGenFunction &CGF, EntryFunctionState &EST,
     const OMPExecutableDirective &D) {
   CGBuilderTy &Bld = CGF.Builder;
@@ -1378,7 +1372,7 @@
   IsInTargetMasterThreadRegion = true;
 }
 
-void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                                EntryFunctionState &EST) {
   IsInTargetMasterThreadRegion = false;
   if (!CGF.HaveInsertPoint())
@@ -1419,7 +1413,7 @@
   CGM.addCompilerUsedGlobal(GVMode);
 }
 
-void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
+void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
   ASTContext &Ctx = CGM.getContext();
 
   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
@@ -1429,7 +1423,7 @@
   CGF.FinishFunction();
 }
 
-void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
                                           WorkerFunctionState &WST) {
   //
   // The workers enter this loop and wait for parallel work from the master.
@@ -1553,7 +1547,7 @@
 /// \param Function OpenMP runtime function.
 /// \return Specified function.
 llvm::FunctionCallee
-CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
+CGOpenMPRuntimeGPU::createNVPTXRuntimeFunction(unsigned Function) {
   llvm::FunctionCallee RTLFn = nullptr;
   switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
   case OMPRTL_NVPTX__kmpc_kernel_init: {
@@ -1870,7 +1864,7 @@
   return RTLFn;
 }
 
-void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
+void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
                                               llvm::Constant *Addr,
                                               uint64_t Size, int32_t,
                                               llvm::GlobalValue::LinkageTypes) {
@@ -1892,7 +1886,7 @@
   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
 }
 
-void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
+void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
     const OMPExecutableDirective &D, StringRef ParentName,
     llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
     bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
@@ -1928,7 +1922,7 @@
       (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
 } // anonymous namespace
 
-unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
+unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
   switch (getExecutionMode()) {
   case EM_SPMD:
     if (requiresFullRuntime())
@@ -1943,38 +1937,38 @@
   llvm_unreachable("Unknown flags are requested.");
 }
 
-CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
+CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
     : CGOpenMPRuntime(CGM, "_", "$") {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
 }
 
-void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                               ProcBindKind ProcBind,
                                               SourceLocation Loc) {
   // Do nothing in case of SPMD mode and L0 parallel.
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
     return;
 
   CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
 }
 
-void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                                 llvm::Value *NumThreads,
                                                 SourceLocation Loc) {
   // Do nothing in case of SPMD mode and L0 parallel.
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
     return;
 
   CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
 }
 
-void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                               const Expr *NumTeams,
                                               const Expr *ThreadLimit,
                                               SourceLocation Loc) {}
 
-llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
+llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   // Emit target region as a standalone region.
@@ -2008,7 +2002,7 @@
   }
   IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
   IsInTTDRegion = PrevIsInTTDRegion;
-  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
+  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
       !IsInParallelRegion) {
     llvm::Function *WrapperFun =
         createParallelDataSharingWrapper(OutlinedFun, D);
@@ -2056,7 +2050,7 @@
   }
 }
 
-llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
+llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   SourceLocation Loc = D.getBeginLoc();
@@ -2065,9 +2059,9 @@
   llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
   // Globalize team reductions variable unconditionally in all modes.
-  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
+  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
     getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
+  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
     getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
     if (!LastPrivatesReductions.empty()) {
       GlobalizedRD = ::buildRecordForGlobalizedVars(
@@ -2097,7 +2091,7 @@
           MappedDeclsFields(MappedDeclsFields) {}
     void Enter(CodeGenFunction &CGF) override {
       auto &Rt =
-          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
+          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
       if (GlobalizedRD) {
         auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
         I->getSecond().GlobalRecord = GlobalizedRD;
@@ -2115,7 +2109,7 @@
       Rt.emitGenericVarsProlog(CGF, Loc);
     }
     void Exit(CodeGenFunction &CGF) override {
-      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
     }
   } Action(Loc, GlobalizedRD, MappedDeclsFields);
@@ -2131,11 +2125,11 @@
   return OutlinedFun;
 }
 
-void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                  SourceLocation Loc,
                                                  bool WithSPMDCheck) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
-      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
+      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;
 
   CGBuilderTy &Bld = CGF.Builder;
@@ -2163,7 +2157,7 @@
     llvm::Value *IsTTD = nullptr;
     if (!IsInTTDRegion &&
         (WithSPMDCheck ||
-         getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
+         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
       llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
       llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
@@ -2271,7 +2265,7 @@
       llvm::Value *GlobalRecordSizeArg[] = {
           llvm::ConstantInt::get(
               CGM.Int16Ty,
-              getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
+              getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
           StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
       CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                               OMPRTL_NVPTX__kmpc_get_team_static_memory),
@@ -2355,7 +2349,7 @@
       Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
       if (!IsInTTDRegion &&
           (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
+           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
         assert(I->getSecond().IsInSPMDModeFlag &&
                "Expected unknown execution mode or required SPMD check.");
         if (IsTTD) {
@@ -2419,10 +2413,10 @@
   I->getSecond().MappedParams->apply(CGF);
 }
 
-void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                                  bool WithSPMDCheck) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
-      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
+      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
     return;
 
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -2439,7 +2433,7 @@
     if (I->getSecond().GlobalRecordAddr) {
       if (!IsInTTDRegion &&
           (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
+           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
         CGBuilderTy &Bld = CGF.Builder;
         llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
         llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
@@ -2467,7 +2461,7 @@
         llvm::Value *Args[] = {
             llvm::ConstantInt::get(
                 CGM.Int16Ty,
-                getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
+                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
             IsInSharedMemory};
         CGF.EmitRuntimeCall(
             createNVPTXRuntimeFunction(
@@ -2483,7 +2477,7 @@
   }
 }
 
-void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                          const OMPExecutableDirective &D,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
@@ -2501,19 +2495,19 @@
   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
 }
 
-void CGOpenMPRuntimeNVPTX::emitParallelCall(
+void CGOpenMPRuntimeGPU::emitParallelCall(
     CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
   if (!CGF.HaveInsertPoint())
     return;
 
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
     emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
   else
     emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
 }
 
-void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
+void CGOpenMPRuntimeGPU::emitNonSPMDParallelCall(
     CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
   llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
@@ -2682,7 +2676,7 @@
   }
 }
 
-void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
+void CGOpenMPRuntimeGPU::emitSPMDParallelCall(
     CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
   // Just call the outlined function to execute the parallel region.
@@ -2740,7 +2734,7 @@
   }
 }
 
-void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
+void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
   // Always emit simple barriers!
   if (!CGF.HaveInsertPoint())
     return;
@@ -2755,7 +2749,7 @@
   Call->setConvergent();
 }
 
-void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                            SourceLocation Loc,
                                            OpenMPDirectiveKind Kind, bool,
                                            bool) {
@@ -2771,7 +2765,7 @@
   Call->setConvergent();
 }
 
-void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
+void CGOpenMPRuntimeGPU::emitCriticalRegion(
     CodeGenFunction &CGF, StringRef CriticalName,
     const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
     const Expr *Hint) {
@@ -2870,8 +2864,8 @@
                                  SourceLocation Loc) {
   CodeGenModule &CGM = CGF.CGM;
   CGBuilderTy &Bld = CGF.Builder;
-  CGOpenMPRuntimeNVPTX &RT =
-      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
+  CGOpenMPRuntimeGPU &RT =
+      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
 
   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
   assert(Size.getQuantity() <= 8 &&
@@ -2886,7 +2880,7 @@
       Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
   llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
   llvm::Value *WarpSize =
-      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
+      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
 
   llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
       RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
@@ -4289,7 +4283,7 @@
 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
 /// reduce across workers and compute a globally reduced value.
 ///
-void CGOpenMPRuntimeNVPTX::emitReduction(
+void CGOpenMPRuntimeGPU::emitReduction(
     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
@@ -4474,7 +4468,7 @@
 }
 
 const VarDecl *
-CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
+CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                          const VarDecl *NativeParam) const {
   if (!NativeParam->getType()->isReferenceType())
     return NativeParam;
@@ -4510,7 +4504,7 @@
 }
 
 Address
-CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
+CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                           const VarDecl *NativeParam,
                                           const VarDecl *TargetParam) const {
   assert(NativeParam != TargetParam &&
@@ -4540,7 +4534,7 @@
   return NativeParamAddr;
 }
 
-void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
+void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
     ArrayRef<llvm::Value *> Args) const {
   SmallVector<llvm::Value *, 4> TargetArgs;
@@ -4570,7 +4564,7 @@
 /// and controls the arguments which are passed to this function.
 /// The wrapper ensures that the outlined function is called
 /// with the correct arguments when data is shared.
-llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
+llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
   ASTContext &Ctx = CGM.getContext();
   const auto &CS = *D.getCapturedStmt(OMPD_parallel);
@@ -4688,9 +4682,9 @@
   return Fn;
 }
 
-void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                               const Decl *D) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
     return;
 
   assert(D && "Expected function or captured|block decl.");
@@ -4708,7 +4702,7 @@
     Body = CD->getBody();
     NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
     if (NeedToDelayGlobalization &&
-        getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
       return;
   }
   if (!Body)
@@ -4758,7 +4752,7 @@
     GlobalizationScope() = default;
 
     void Emit(CodeGenFunction &CGF, Flags flags) override {
-      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
     }
   };
@@ -4766,7 +4760,7 @@
   }
 }
 
-Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
+Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                         const VarDecl *VD) {
   if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
     const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
@@ -4821,7 +4815,7 @@
     }
   }
 
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
     return Address::invalid();
 
   VD = VD->getCanonicalDecl();
@@ -4846,16 +4840,16 @@
   return Address::invalid();
 }
 
-void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
+void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
   FunctionGlobalizedDecls.erase(CGF.CurFn);
   CGOpenMPRuntime::functionFinished(CGF);
 }
 
-void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
+void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
     CodeGenFunction &CGF, const OMPLoopDirective &S,
     OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
+  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
     ScheduleKind = OMPC_DIST_SCHEDULE_static;
     Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
@@ -4866,7 +4860,7 @@
       CGF, S, ScheduleKind, Chunk);
 }
 
-void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
+void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
     CodeGenFunction &CGF, const OMPLoopDirective &S,
     OpenMPScheduleClauseKind &ScheduleKind,
     const Expr *&ChunkExpr) const {
@@ -4878,7 +4872,7 @@
                                       SourceLocation());
 }
 
-void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
+void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
          " Expected target-based directive.");
@@ -4930,11 +4924,11 @@
   }
 }
 
-unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
+unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
   return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
 }
 
-bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
+bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                             LangAS &AS) {
   if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
     return false;
@@ -4983,7 +4977,7 @@
 
 /// Check to see if target architecture supports unified addressing which is
 /// a restriction for OpenMP requires clause "unified_shared_memory".
-void CGOpenMPRuntimeNVPTX::processRequiresDirective(
+void CGOpenMPRuntimeGPU::processRequiresDirective(
     const OMPRequiresDecl *D) {
   for (const OMPClause *Clause : D->clauselists()) {
     if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
@@ -5101,7 +5095,7 @@
   llvm_unreachable("Unexpected NVPTX target without ptx feature.");
 }
 
-void CGOpenMPRuntimeNVPTX::clear() {
+void CGOpenMPRuntimeGPU::clear() {
   if (!GlobalizedRecords.empty() &&
       !CGM.getLangOpts().OpenMPCUDATargetParallel) {
     ASTContext &C = CGM.getContext();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This provides a class for OpenMP runtime code generation specialized to NVPTX
-// targets.
+// targets, derived from the generalized CGOpenMPRuntimeGPU class.
 //
 //===----------------------------------------------------------------------===//
 
@@ -15,473 +15,18 @@
 #define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
 
 #include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/StmtOpenMP.h"
 
 namespace clang {
 namespace CodeGen {
 
-class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
-public:
-  /// Defines the execution mode.
-  enum ExecutionMode {
-    /// SPMD execution mode (all threads are worker threads).
-    EM_SPMD,
-    /// Non-SPMD execution mode (1 master thread, others are workers).
-    EM_NonSPMD,
-    /// Unknown execution mode (orphaned directive).
-    EM_Unknown,
-  };
-
-private:
-  /// Parallel outlined function work for workers to execute.
-  llvm::SmallVector<llvm::Function *, 16> Work;
-
-  struct EntryFunctionState {
-    llvm::BasicBlock *ExitBB = nullptr;
-  };
-
-  class WorkerFunctionState {
-  public:
-    llvm::Function *WorkerFn;
-    const CGFunctionInfo &CGFI;
-    SourceLocation Loc;
-
-    WorkerFunctionState(CodeGenModule &CGM, SourceLocation Loc);
-
-  private:
-    void createWorkerFunction(CodeGenModule &CGM);
-  };
-
-  ExecutionMode getExecutionMode() const;
-
-  bool requiresFullRuntime() const { return RequiresFullRuntime; }
-
-  /// Get barrier to synchronize all threads in a block.
-  void syncCTAThreads(CodeGenFunction &CGF);
-
-  /// Emit the worker function for the current target region.
-  void emitWorkerFunction(WorkerFunctionState &WST);
-
-  /// Helper for worker function. Emit body of worker loop.
-  void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
-
-  /// Helper for non-SPMD target entry function. Guide the master and
-  /// worker threads to their respective locations.
-  void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                              WorkerFunctionState &WST);
-
-  /// Signal termination of OMP execution for non-SPMD target entry
-  /// function.
-  void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
-
-  /// Helper for generic variables globalization prolog.
-  void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc,
-                             bool WithSPMDCheck = false);
-
-  /// Helper for generic variables globalization epilog.
-  void emitGenericVarsEpilog(CodeGenFunction &CGF, bool WithSPMDCheck = false);
-
-  /// Helper for SPMD mode target directive's entry function.
-  void emitSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                           const OMPExecutableDirective &D);
-
-  /// Signal termination of SPMD mode execution.
-  void emitSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
-
-  //
-  // Base class overrides.
-  //
-
-  /// Creates offloading entry for the provided entry ID \a ID,
-  /// address \a Addr, size \a Size, and flags \a Flags.
-  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
-                          uint64_t Size, int32_t Flags,
-                          llvm::GlobalValue::LinkageTypes Linkage) override;
-
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function specialized for the Single Program
-  /// Multiple Data programming model for applicable target directives on the
-  /// NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// \param CodeGen Object containing the target statements.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function for 'target' directive on the NVPTX
-  /// device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
-                                  StringRef ParentName,
-                                  llvm::Function *&OutlinedFn,
-                                  llvm::Constant *&OutlinedFnID,
-                                  bool IsOffloadEntry,
-                                  const RegionCodeGenTy &CodeGen) override;
-
-  /// Emits code for parallel or serial call of the \a OutlinedFn with
-  /// variables captured in a record which address is stored in \a
-  /// CapturedStruct.
-  /// This call is for the Non-SPMD Execution Mode.
-  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
-  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
-  /// \param CapturedVars A pointer to the record with the references to
-  /// variables used in \a OutlinedFn function.
-  /// \param IfCond Condition in the associated 'if' clause, if it was
-  /// specified, nullptr otherwise.
-  void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
-                               llvm::Value *OutlinedFn,
-                               ArrayRef<llvm::Value *> CapturedVars,
-                               const Expr *IfCond);
-
-  /// Emits code for parallel or serial call of the \a OutlinedFn with
-  /// variables captured in a record which address is stored in \a
-  /// CapturedStruct.
-  /// This call is for a parallel directive within an SPMD target directive.
-  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
-  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
-  /// \param CapturedVars A pointer to the record with the references to
-  /// variables used in \a OutlinedFn function.
-  /// \param IfCond Condition in the associated 'if' clause, if it was
-  /// specified, nullptr otherwise.
-  ///
-  void emitSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
-                            llvm::Function *OutlinedFn,
-                            ArrayRef<llvm::Value *> CapturedVars,
-                            const Expr *IfCond);
-
-protected:
-  /// Get the function name of an outlined region.
-  //  The name can be customized depending on the target.
-  //
-  StringRef getOutlinedHelperName() const override {
-    return "__omp_outlined__";
-  }
-
-  /// Check if the default location must be constant.
-  /// Constant for NVPTX for better optimization.
-  bool isDefaultLocationConstant() const override { return true; }
-
-  /// Returns additional flags that can be stored in reserved_2 field of the
-  /// default location.
-  /// For NVPTX target contains data about SPMD/Non-SPMD execution mode +
-  /// Full/Lightweight runtime mode. Used for better optimization.
-  unsigned getDefaultLocationReserved2Flags() const override;
+class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
 
 public:
   explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
-  void clear() override;
-
-  /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
-  /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
-  virtual void emitProcBindClause(CodeGenFunction &CGF,
-                                  llvm::omp::ProcBindKind ProcBind,
-                                  SourceLocation Loc) override;
-
-  /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
-  /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
-  /// clause.
-  /// \param NumThreads An integer value of threads.
-  virtual void emitNumThreadsClause(CodeGenFunction &CGF,
-                                    llvm::Value *NumThreads,
-                                    SourceLocation Loc) override;
-
-  /// This function ought to emit, in the general case, a call to
-  // the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed
-  // as these numbers are obtained through the PTX grid and block configuration.
-  /// \param NumTeams An integer expression of teams.
-  /// \param ThreadLimit An integer expression of threads.
-  void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
-                          const Expr *ThreadLimit, SourceLocation Loc) override;
-
-  /// Emits inlined function for the specified OpenMP parallel
-  //  directive.
-  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
-  /// kmp_int32 BoundID, struct context_vars*).
-  /// \param D OpenMP directive.
-  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
-  /// \param InnermostKind Kind of innermost directive (for simple directives it
-  /// is a directive itself, for combined - its innermost directive).
-  /// \param CodeGen Code generation sequence for the \a D directive.
-  llvm::Function *
-  emitParallelOutlinedFunction(const OMPExecutableDirective &D,
-                               const VarDecl *ThreadIDVar,
-                               OpenMPDirectiveKind InnermostKind,
-                               const RegionCodeGenTy &CodeGen) override;
-
-  /// Emits inlined function for the specified OpenMP teams
-  //  directive.
-  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
-  /// kmp_int32 BoundID, struct context_vars*).
-  /// \param D OpenMP directive.
-  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
-  /// \param InnermostKind Kind of innermost directive (for simple directives it
-  /// is a directive itself, for combined - its innermost directive).
-  /// \param CodeGen Code generation sequence for the \a D directive.
-  llvm::Function *
-  emitTeamsOutlinedFunction(const OMPExecutableDirective &D,
-                            const VarDecl *ThreadIDVar,
-                            OpenMPDirectiveKind InnermostKind,
-                            const RegionCodeGenTy &CodeGen) override;
-
-  /// Emits code for teams call of the \a OutlinedFn with
-  /// variables captured in a record which address is stored in \a
-  /// CapturedStruct.
-  /// \param OutlinedFn Outlined function to be run by team masters. Type of
-  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
-  /// \param CapturedVars A pointer to the record with the references to
-  /// variables used in \a OutlinedFn function.
-  ///
-  void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
-                     SourceLocation Loc, llvm::Function *OutlinedFn,
-                     ArrayRef<llvm::Value *> CapturedVars) override;
-
-  /// Emits code for parallel or serial call of the \a OutlinedFn with
-  /// variables captured in a record which address is stored in \a
-  /// CapturedStruct.
-  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
-  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
-  /// \param CapturedVars A pointer to the record with the references to
-  /// variables used in \a OutlinedFn function.
-  /// \param IfCond Condition in the associated 'if' clause, if it was
-  /// specified, nullptr otherwise.
-  void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
-                        llvm::Function *OutlinedFn,
-                        ArrayRef<llvm::Value *> CapturedVars,
-                        const Expr *IfCond) override;
-
-  /// Emit an implicit/explicit barrier for OpenMP threads.
-  /// \param Kind Directive for which this implicit barrier call must be
-  /// generated. Must be OMPD_barrier for explicit barrier generation.
-  /// \param EmitChecks true if need to emit checks for cancellation barriers.
-  /// \param ForceSimpleCall true simple barrier call must be emitted, false if
-  /// runtime class decides which one to emit (simple or with cancellation
-  /// checks).
-  ///
-  void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc,
-                       OpenMPDirectiveKind Kind, bool EmitChecks = true,
-                       bool ForceSimpleCall = false) override;
-
-  /// Emits a critical region.
-  /// \param CriticalName Name of the critical region.
-  /// \param CriticalOpGen Generator for the statement associated with the given
-  /// critical region.
-  /// \param Hint Value of the 'hint' clause (optional).
-  void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName,
-                          const RegionCodeGenTy &CriticalOpGen,
-                          SourceLocation Loc,
-                          const Expr *Hint = nullptr) override;
-
-  /// Emit a code for reduction clause.
-  ///
-  /// \param Privates List of private copies for original reduction arguments.
-  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
-  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
-  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
-  /// or 'operator binop(LHS, RHS)'.
-  /// \param Options List of options for reduction codegen:
-  ///     WithNowait true if parent directive has also nowait clause, false
-  ///     otherwise.
-  ///     SimpleReduction Emit reduction operation only. Used for omp simd
-  ///     directive on the host.
-  ///     ReductionKind The kind of reduction to perform.
-  virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
-                             ArrayRef<const Expr *> Privates,
-                             ArrayRef<const Expr *> LHSExprs,
-                             ArrayRef<const Expr *> RHSExprs,
-                             ArrayRef<const Expr *> ReductionOps,
-                             ReductionOptionsTy Options) override;
-
-  /// Returns specified OpenMP runtime function for the current OpenMP
-  /// implementation. Specialized for the NVPTX device.
-  /// \param Function OpenMP runtime function.
-  /// \return Specified function.
-  llvm::FunctionCallee createNVPTXRuntimeFunction(unsigned Function);
-
-  /// Translates the native parameter of outlined function if this is required
-  /// for target.
-  /// \param FD Field decl from captured record for the parameter.
-  /// \param NativeParam Parameter itself.
-  const VarDecl *translateParameter(const FieldDecl *FD,
-                                    const VarDecl *NativeParam) const override;
-
-  /// Gets the address of the native argument basing on the address of the
-  /// target-specific parameter.
-  /// \param NativeParam Parameter itself.
-  /// \param TargetParam Corresponding target-specific parameter.
-  Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam,
-                              const VarDecl *TargetParam) const override;
-
-  /// Emits call of the outlined function with the provided arguments,
-  /// translating these arguments to correct target-specific arguments.
-  void emitOutlinedFunctionCall(
-      CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
-      ArrayRef<llvm::Value *> Args = llvm::None) const override;
-
-  /// Emits OpenMP-specific function prolog.
-  /// Required for device constructs.
-  void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;
-
-  /// Gets the OpenMP-specific address of the local variable.
-  Address getAddressOfLocalVariable(CodeGenFunction &CGF,
-                                    const VarDecl *VD) override;
-
-  /// Target codegen is specialized based on two data-sharing modes: CUDA, in
-  /// which the local variables are actually global threadlocal, and Generic, in
-  /// which the local variables are placed in global memory if they may escape
-  /// their declaration context.
-  enum DataSharingMode {
-    /// CUDA data sharing mode.
-    CUDA,
-    /// Generic data-sharing mode.
-    Generic,
-  };
-
-  /// Cleans up references to the objects in finished function.
-  ///
-  void functionFinished(CodeGenFunction &CGF) override;
-
-  /// Choose a default value for the dist_schedule clause.
-  void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF,
-      const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind,
-      llvm::Value *&Chunk) const override;
-
-  /// Choose a default value for the schedule clause.
-  void getDefaultScheduleAndChunk(CodeGenFunction &CGF,
-      const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind,
-      const Expr *&ChunkExpr) const override;
-
-  /// Adjust some parameters for the target-based directives, like addresses of
-  /// the variables captured by reference in lambdas.
-  void adjustTargetSpecificDataForLambdas(
-      CodeGenFunction &CGF, const OMPExecutableDirective &D) const override;
-
-  /// Perform check on requires decl to ensure that target architecture
-  /// supports unified addressing
-  void processRequiresDirective(const OMPRequiresDecl *D) override;
-
-  /// Returns default address space for the constant firstprivates, __constant__
-  /// address space by default.
-  unsigned getDefaultFirstprivateAddressSpace() const override;
-
-  /// Checks if the variable has associated OMPAllocateDeclAttr attribute with
-  /// the predefined allocator and translates it into the corresponding address
-  /// space.
-  bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override;
-
-private:
-  /// Track the execution mode when codegening directives within a target
-  /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
-  /// target region and used by containing directives such as 'parallel'
-  /// to emit optimized code.
-  ExecutionMode CurrentExecutionMode = EM_Unknown;
-
-  /// Check if the full runtime is required (default - yes).
-  bool RequiresFullRuntime = true;
-
-  /// true if we're emitting the code for the target region and next parallel
-  /// region is L0 for sure.
-  bool IsInTargetMasterThreadRegion = false;
-  /// true if currently emitting code for target/teams/distribute region, false
-  /// - otherwise.
-  bool IsInTTDRegion = false;
-  /// true if we're definitely in the parallel region.
-  bool IsInParallelRegion = false;
-
-  /// Map between an outlined function and its wrapper.
-  llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
-
-  /// Emit function which wraps the outline parallel region
-  /// and controls the parameters which are passed to this function.
-  /// The wrapper ensures that the outlined function is called
-  /// with the correct arguments when data is shared.
-  llvm::Function *createParallelDataSharingWrapper(
-      llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D);
-
-  /// The data for the single globalized variable.
-  struct MappedVarData {
-    /// Corresponding field in the global record.
-    const FieldDecl *FD = nullptr;
-    /// Corresponding address.
-    Address PrivateAddr = Address::invalid();
-    /// true, if only one element is required (for lastprivates in SPMD mode),
-    /// false, if need to create based on the warp-size.
-    bool IsOnePerTeam = false;
-    MappedVarData() = delete;
-    MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false)
-        : FD(FD), IsOnePerTeam(IsOnePerTeam) {}
-  };
-  /// The map of local variables to their addresses in the global memory.
-  using DeclToAddrMapTy = llvm::MapVector<const Decl *, MappedVarData>;
-  /// Set of the parameters passed by value escaping OpenMP context.
-  using EscapedParamsTy = llvm::SmallPtrSet<const Decl *, 4>;
-  struct FunctionData {
-    DeclToAddrMapTy LocalVarData;
-    llvm::Optional<DeclToAddrMapTy> SecondaryLocalVarData = llvm::None;
-    EscapedParamsTy EscapedParameters;
-    llvm::SmallVector<const ValueDecl *, 4> EscapedVariableLengthDecls;
-    llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs;
-    const RecordDecl *GlobalRecord = nullptr;
-    llvm::Optional<const RecordDecl *> SecondaryGlobalRecord = llvm::None;
-    llvm::Value *GlobalRecordAddr = nullptr;
-    llvm::Value *IsInSPMDModeFlag = nullptr;
-    std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
-  };
-  /// Maps the function to the list of the globalized variables with their
-  /// addresses.
-  llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
-  /// List of records for the globalized variables in target/teams/distribute
-  /// contexts. Inner records are going to be joined into the single record,
-  /// while those resulting records are going to be joined into the single
-  /// union. This resulting union (one per CU) is the entry point for the static
-  /// memory management runtime functions.
-  struct GlobalPtrSizeRecsTy {
-    llvm::GlobalVariable *UseSharedMemory = nullptr;
-    llvm::GlobalVariable *RecSize = nullptr;
-    llvm::GlobalVariable *Buffer = nullptr;
-    SourceLocation Loc;
-    llvm::SmallVector<const RecordDecl *, 2> Records;
-    unsigned RegionCounter = 0;
-  };
-  llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords;
-  llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
-  /// List of the records with the list of fields for the reductions across the
-  /// teams. Used to build the intermediate buffer for the fast teams
-  /// reductions.
-  /// All the records are gathered into a union; a `union.type` is created.
-  llvm::SmallVector<const RecordDecl *, 4> TeamsReductions;
-  /// Shared pointer for the global memory in the global memory buffer used for
-  /// the given kernel.
-  llvm::GlobalVariable *KernelStaticGlobalized = nullptr;
-  /// Pair of the Non-SPMD team and all reductions variables in this team
-  /// region.
-  std::pair<const Decl *, llvm::SmallVector<const ValueDecl *, 4>>
-      TeamAndReductions;
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
 };
 
 } // CodeGen namespace.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -7,11 +7,12 @@
 //===----------------------------------------------------------------------===//
 //
 // This provides a class for OpenMP runtime code generation specialized to NVPTX
-// targets.
+// targets, derived from the generalized CGOpenMPRuntimeGPU class.
 //
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclOpenMP.h"
@@ -25,5223 +26,16 @@
 using namespace CodeGen;
 using namespace llvm::omp;
 
-namespace {
-enum OpenMPRTLFunctionNVPTX {
-  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
-  /// int16_t RequiresOMPRuntime);
-  OMPRTL_NVPTX__kmpc_kernel_init,
-  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
-  OMPRTL_NVPTX__kmpc_kernel_deinit,
-  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
-  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
-  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
-  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
-  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
-  /// Call to void __kmpc_kernel_prepare_parallel(void
-  /// *outlined_function);
-  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
-  /// Call to bool __kmpc_kernel_parallel(void **outlined_function);
-  OMPRTL_NVPTX__kmpc_kernel_parallel,
-  /// Call to void __kmpc_kernel_end_parallel();
-  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
-  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
-  /// global_tid);
-  OMPRTL_NVPTX__kmpc_serialized_parallel,
-  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
-  /// global_tid);
-  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
-  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
-  /// int16_t lane_offset, int16_t warp_size);
-  OMPRTL_NVPTX__kmpc_shuffle_int32,
-  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
-  /// int16_t lane_offset, int16_t warp_size);
-  OMPRTL_NVPTX__kmpc_shuffle_int64,
-  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
-  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
-  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
-  /// lane_offset, int16_t shortCircuit),
-  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
-  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
-  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
-  /// global_tid, void *global_buffer, int32_t num_of_records, void*
-  /// reduce_data,
-  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
-  /// lane_offset, int16_t shortCircuit),
-  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
-  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
-  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
-  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
-  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
-  /// *buffer, int idx, void *reduce_data));
-  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
-  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
-  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
-  /// Call to void __kmpc_data_sharing_init_stack();
-  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
-  /// Call to void __kmpc_data_sharing_init_stack_spmd();
-  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
-  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
-  /// int16_t UseSharedMemory);
-  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
-  /// Call to void* __kmpc_data_sharing_push_stack(size_t size, int16_t
-  /// UseSharedMemory);
-  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
-  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
-  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
-  /// Call to void __kmpc_begin_sharing_variables(void ***args,
-  /// size_t n_args);
-  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
-  /// Call to void __kmpc_end_sharing_variables();
-  OMPRTL_NVPTX__kmpc_end_sharing_variables,
-  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
-  OMPRTL_NVPTX__kmpc_get_shared_variables,
-  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
-  /// global_tid);
-  OMPRTL_NVPTX__kmpc_parallel_level,
-  /// Call to int8_t __kmpc_is_spmd_exec_mode();
-  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
-  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
-  /// const void *buf, size_t size, int16_t is_shared, const void **res);
-  OMPRTL_NVPTX__kmpc_get_team_static_memory,
-  /// Call to void __kmpc_restore_team_static_memory(int16_t
-  /// isSPMDExecutionMode, int16_t is_shared);
-  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
-  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
-  OMPRTL__kmpc_barrier,
-  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
-  /// global_tid);
-  OMPRTL__kmpc_barrier_simple_spmd,
-  /// Call to int32_t __kmpc_warp_active_thread_mask(void);
-  OMPRTL_NVPTX__kmpc_warp_active_thread_mask,
-  /// Call to void __kmpc_syncwarp(int32_t Mask);
-  OMPRTL_NVPTX__kmpc_syncwarp,
-};
-
-/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
-class NVPTXActionTy final : public PrePostActionTy { - llvm::FunctionCallee EnterCallee = nullptr; - ArrayRef EnterArgs; - llvm::FunctionCallee ExitCallee = nullptr; - ArrayRef ExitArgs; - bool Conditional = false; - llvm::BasicBlock *ContBlock = nullptr; - -public: - NVPTXActionTy(llvm::FunctionCallee EnterCallee, - ArrayRef EnterArgs, - llvm::FunctionCallee ExitCallee, - ArrayRef ExitArgs, bool Conditional = false) - : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), - ExitArgs(ExitArgs), Conditional(Conditional) {} - void Enter(CodeGenFunction &CGF) override { - llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); - if (Conditional) { - llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); - auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); - ContBlock = CGF.createBasicBlock("omp_if.end"); - // Generate the branch (If-stmt) - CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); - CGF.EmitBlock(ThenBlock); - } - } - void Done(CodeGenFunction &CGF) { - // Emit the rest of blocks/branches - CGF.EmitBranch(ContBlock); - CGF.EmitBlock(ContBlock, true); - } - void Exit(CodeGenFunction &CGF) override { - CGF.EmitRuntimeCall(ExitCallee, ExitArgs); - } -}; - -/// A class to track the execution mode when codegening directives within -/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry -/// to the target region and used by containing directives such as 'parallel' -/// to emit optimized code. -class ExecutionRuntimeModesRAII { -private: - CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode = - CGOpenMPRuntimeNVPTX::EM_Unknown; - CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode; - bool SavedRuntimeMode = false; - bool *RuntimeMode = nullptr; - -public: - /// Constructor for Non-SPMD mode. - ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode) - : ExecMode(ExecMode) { - SavedExecMode = ExecMode; - ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD; - } - /// Constructor for SPMD mode. - ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode, - bool &RuntimeMode, bool FullRuntimeMode) - : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) { - SavedExecMode = ExecMode; - SavedRuntimeMode = RuntimeMode; - ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD; - RuntimeMode = FullRuntimeMode; - } - ~ExecutionRuntimeModesRAII() { - ExecMode = SavedExecMode; - if (RuntimeMode) - *RuntimeMode = SavedRuntimeMode; - } -}; - -/// GPU Configuration: This information can be derived from cuda registers, -/// however, providing compile time constants helps generate more efficient -/// code. For all practical purposes this is fine because the configuration -/// is the same for all known NVPTX architectures. -enum MachineConfiguration : unsigned { - WarpSize = 32, - /// Number of bits required to represent a lane identifier, which is - /// computed as log_2(WarpSize). - LaneIDBits = 5, - LaneIDMask = WarpSize - 1, - - /// Global memory alignment for performance. - GlobalMemoryAlignment = 128, - - /// Maximal size of the shared memory buffer. 
-  SharedMemorySize = 128,
-};
-
-static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
-  RefExpr = RefExpr->IgnoreParens();
-  if (const auto *ASE = dyn_cast(RefExpr)) {
-    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
-    while (const auto *TempASE = dyn_cast(Base))
-      Base = TempASE->getBase()->IgnoreParenImpCasts();
-    RefExpr = Base;
-  } else if (auto *OASE = dyn_cast(RefExpr)) {
-    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
-    while (const auto *TempOASE = dyn_cast(Base))
-      Base = TempOASE->getBase()->IgnoreParenImpCasts();
-    while (const auto *TempASE = dyn_cast(Base))
-      Base = TempASE->getBase()->IgnoreParenImpCasts();
-    RefExpr = Base;
-  }
-  RefExpr = RefExpr->IgnoreParenImpCasts();
-  if (const auto *DE = dyn_cast(RefExpr))
-    return cast(DE->getDecl()->getCanonicalDecl());
-  const auto *ME = cast(RefExpr);
-  return cast(ME->getMemberDecl()->getCanonicalDecl());
-}
-
-
-static RecordDecl *buildRecordForGlobalizedVars(
-    ASTContext &C, ArrayRef EscapedDecls,
-    ArrayRef EscapedDeclsForTeams,
-    llvm::SmallDenseMap
-        &MappedDeclsFields, int BufSize) {
-  using VarsDataTy = std::pair;
-  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
-    return nullptr;
-  SmallVector GlobalizedVars;
-  for (const ValueDecl *D : EscapedDecls)
-    GlobalizedVars.emplace_back(
-        CharUnits::fromQuantity(std::max(
-            C.getDeclAlign(D).getQuantity(),
-            static_cast(GlobalMemoryAlignment))),
-        D);
-  for (const ValueDecl *D : EscapedDeclsForTeams)
-    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
-  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
-    return L.first > R.first;
-  });
-
-  // Build struct _globalized_locals_ty {
-  //   /* globalized vars */[WarpSize] align (max(decl_align,
-  //   GlobalMemoryAlignment))
-  //   /* globalized vars */ for EscapedDeclsForTeams
-  // };
-  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
-  GlobalizedRD->startDefinition();
-  llvm::SmallPtrSet SingleEscaped(
-      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
-  for (const auto &Pair : GlobalizedVars) {
-    const ValueDecl *VD = Pair.second;
-    QualType Type = VD->getType();
-    if (Type->isLValueReferenceType())
-      Type = C.getPointerType(Type.getNonReferenceType());
-    else
-      Type = Type.getNonReferenceType();
-    SourceLocation Loc = VD->getLocation();
-    FieldDecl *Field;
-    if (SingleEscaped.count(VD)) {
-      Field = FieldDecl::Create(
-          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
-          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      if (VD->hasAttrs()) {
-        for (specific_attr_iterator I(VD->getAttrs().begin()),
-             E(VD->getAttrs().end());
-             I != E; ++I)
-          Field->addAttr(*I);
-      }
-    } else {
-      llvm::APInt ArraySize(32, BufSize);
-      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
-                                    0);
-      Field = FieldDecl::Create(
-          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
-          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
-                                     static_cast(
-                                         GlobalMemoryAlignment)));
-      Field->addAttr(AlignedAttr::CreateImplicit(
-          C, /*IsAlignmentExpr=*/true,
-          IntegerLiteral::Create(C, Align,
-                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
-                                 SourceLocation()),
-          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
-    }
-    GlobalizedRD->addDecl(Field);
-
MappedDeclsFields.try_emplace(VD, Field); - } - GlobalizedRD->completeDefinition(); - return GlobalizedRD; +CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) + : CGOpenMPRuntimeGPU(CGM) { + if (!CGM.getLangOpts().OpenMPIsDevice) + llvm_unreachable("OpenMP NVPTX can only handle device code."); } -/// Get the list of variables that can escape their declaration context. -class CheckVarsEscapingDeclContext final - : public ConstStmtVisitor { - CodeGenFunction &CGF; - llvm::SetVector EscapedDecls; - llvm::SetVector EscapedVariableLengthDecls; - llvm::SmallPtrSet EscapedParameters; - RecordDecl *GlobalizedRD = nullptr; - llvm::SmallDenseMap MappedDeclsFields; - bool AllEscaped = false; - bool IsForCombinedParallelRegion = false; - - void markAsEscaped(const ValueDecl *VD) { - // Do not globalize declare target variables. - if (!isa(VD) || - OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) - return; - VD = cast(VD->getCanonicalDecl()); - // Use user-specified allocation. - if (VD->hasAttrs() && VD->hasAttr()) - return; - // Variables captured by value must be globalized. - if (auto *CSI = CGF.CapturedStmtInfo) { - if (const FieldDecl *FD = CSI->lookup(cast(VD))) { - // Check if need to capture the variable that was already captured by - // value in the outer region. - if (!IsForCombinedParallelRegion) { - if (!FD->hasAttrs()) - return; - const auto *Attr = FD->getAttr(); - if (!Attr) - return; - if (((Attr->getCaptureKind() != OMPC_map) && - !isOpenMPPrivate(Attr->getCaptureKind())) || - ((Attr->getCaptureKind() == OMPC_map) && - !FD->getType()->isAnyPointerType())) - return; - } - if (!FD->getType()->isReferenceType()) { - assert(!VD->getType()->isVariablyModifiedType() && - "Parameter captured by value with variably modified type"); - EscapedParameters.insert(VD); - } else if (!IsForCombinedParallelRegion) { - return; - } - } - } - if ((!CGF.CapturedStmtInfo || - (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && - VD->getType()->isReferenceType()) - // Do not globalize variables with reference type. - return; - if (VD->getType()->isVariablyModifiedType()) - EscapedVariableLengthDecls.insert(VD); - else - EscapedDecls.insert(VD); - } - - void VisitValueDecl(const ValueDecl *VD) { - if (VD->getType()->isLValueReferenceType()) - markAsEscaped(VD); - if (const auto *VarD = dyn_cast(VD)) { - if (!isa(VarD) && VarD->hasInit()) { - const bool SavedAllEscaped = AllEscaped; - AllEscaped = VD->getType()->isLValueReferenceType(); - Visit(VarD->getInit()); - AllEscaped = SavedAllEscaped; - } - } - } - void VisitOpenMPCapturedStmt(const CapturedStmt *S, - ArrayRef Clauses, - bool IsCombinedParallelRegion) { - if (!S) - return; - for (const CapturedStmt::Capture &C : S->captures()) { - if (C.capturesVariable() && !C.capturesVariableByCopy()) { - const ValueDecl *VD = C.getCapturedVar(); - bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; - if (IsCombinedParallelRegion) { - // Check if the variable is privatized in the combined construct and - // those private copies must be shared in the inner parallel - // directive. 
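To make the sorting and padding above concrete: for two locals double D and int I escaping a non-teams region with BufSize == WarpSize == 32, the record built by buildRecordForGlobalizedVars corresponds roughly to the struct below. The name and exact layout are illustrative, not what the compiler emits verbatim.

// Fields are sorted by decreasing alignment and widened to one slot per lane;
// each field is aligned to max(decl_align, GlobalMemoryAlignment).
struct _globalized_locals_ty_example {
  double D[32] __attribute__((aligned(128)));
  int I[32] __attribute__((aligned(128)));
};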
- IsForCombinedParallelRegion = false; - for (const OMPClause *C : Clauses) { - if (!isOpenMPPrivate(C->getClauseKind()) || - C->getClauseKind() == OMPC_reduction || - C->getClauseKind() == OMPC_linear || - C->getClauseKind() == OMPC_private) - continue; - ArrayRef Vars; - if (const auto *PC = dyn_cast(C)) - Vars = PC->getVarRefs(); - else if (const auto *PC = dyn_cast(C)) - Vars = PC->getVarRefs(); - else - llvm_unreachable("Unexpected clause."); - for (const auto *E : Vars) { - const Decl *D = - cast(E)->getDecl()->getCanonicalDecl(); - if (D == VD->getCanonicalDecl()) { - IsForCombinedParallelRegion = true; - break; - } - } - if (IsForCombinedParallelRegion) - break; - } - } - markAsEscaped(VD); - if (isa(VD)) - VisitValueDecl(VD); - IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; - } - } - } - - void buildRecordForGlobalizedVars(bool IsInTTDRegion) { - assert(!GlobalizedRD && - "Record for globalized variables is built already."); - ArrayRef EscapedDeclsForParallel, EscapedDeclsForTeams; - if (IsInTTDRegion) - EscapedDeclsForTeams = EscapedDecls.getArrayRef(); - else - EscapedDeclsForParallel = EscapedDecls.getArrayRef(); - GlobalizedRD = ::buildRecordForGlobalizedVars( - CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, - MappedDeclsFields, WarpSize); - } - -public: - CheckVarsEscapingDeclContext(CodeGenFunction &CGF, - ArrayRef TeamsReductions) - : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { - } - virtual ~CheckVarsEscapingDeclContext() = default; - void VisitDeclStmt(const DeclStmt *S) { - if (!S) - return; - for (const Decl *D : S->decls()) - if (const auto *VD = dyn_cast_or_null(D)) - VisitValueDecl(VD); - } - void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { - if (!D) - return; - if (!D->hasAssociatedStmt()) - return; - if (const auto *S = - dyn_cast_or_null(D->getAssociatedStmt())) { - // Do not analyze directives that do not actually require capturing, - // like `omp for` or `omp simd` directives. 
- llvm::SmallVector CaptureRegions; - getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); - if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { - VisitStmt(S->getCapturedStmt()); - return; - } - VisitOpenMPCapturedStmt( - S, D->clauses(), - CaptureRegions.back() == OMPD_parallel && - isOpenMPDistributeDirective(D->getDirectiveKind())); - } - } - void VisitCapturedStmt(const CapturedStmt *S) { - if (!S) - return; - for (const CapturedStmt::Capture &C : S->captures()) { - if (C.capturesVariable() && !C.capturesVariableByCopy()) { - const ValueDecl *VD = C.getCapturedVar(); - markAsEscaped(VD); - if (isa(VD)) - VisitValueDecl(VD); - } - } - } - void VisitLambdaExpr(const LambdaExpr *E) { - if (!E) - return; - for (const LambdaCapture &C : E->captures()) { - if (C.capturesVariable()) { - if (C.getCaptureKind() == LCK_ByRef) { - const ValueDecl *VD = C.getCapturedVar(); - markAsEscaped(VD); - if (E->isInitCapture(&C) || isa(VD)) - VisitValueDecl(VD); - } - } - } - } - void VisitBlockExpr(const BlockExpr *E) { - if (!E) - return; - for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { - if (C.isByRef()) { - const VarDecl *VD = C.getVariable(); - markAsEscaped(VD); - if (isa(VD) || VD->isInitCapture()) - VisitValueDecl(VD); - } - } - } - void VisitCallExpr(const CallExpr *E) { - if (!E) - return; - for (const Expr *Arg : E->arguments()) { - if (!Arg) - continue; - if (Arg->isLValue()) { - const bool SavedAllEscaped = AllEscaped; - AllEscaped = true; - Visit(Arg); - AllEscaped = SavedAllEscaped; - } else { - Visit(Arg); - } - } - Visit(E->getCallee()); - } - void VisitDeclRefExpr(const DeclRefExpr *E) { - if (!E) - return; - const ValueDecl *VD = E->getDecl(); - if (AllEscaped) - markAsEscaped(VD); - if (isa(VD)) - VisitValueDecl(VD); - else if (const auto *VarD = dyn_cast(VD)) - if (VarD->isInitCapture()) - VisitValueDecl(VD); - } - void VisitUnaryOperator(const UnaryOperator *E) { - if (!E) - return; - if (E->getOpcode() == UO_AddrOf) { - const bool SavedAllEscaped = AllEscaped; - AllEscaped = true; - Visit(E->getSubExpr()); - AllEscaped = SavedAllEscaped; - } else { - Visit(E->getSubExpr()); - } - } - void VisitImplicitCastExpr(const ImplicitCastExpr *E) { - if (!E) - return; - if (E->getCastKind() == CK_ArrayToPointerDecay) { - const bool SavedAllEscaped = AllEscaped; - AllEscaped = true; - Visit(E->getSubExpr()); - AllEscaped = SavedAllEscaped; - } else { - Visit(E->getSubExpr()); - } - } - void VisitExpr(const Expr *E) { - if (!E) - return; - bool SavedAllEscaped = AllEscaped; - if (!E->isLValue()) - AllEscaped = false; - for (const Stmt *Child : E->children()) - if (Child) - Visit(Child); - AllEscaped = SavedAllEscaped; - } - void VisitStmt(const Stmt *S) { - if (!S) - return; - for (const Stmt *Child : S->children()) - if (Child) - Visit(Child); - } - - /// Returns the record that handles all the escaped local variables and used - /// instead of their original storage. - const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { - if (!GlobalizedRD) - buildRecordForGlobalizedVars(IsInTTDRegion); - return GlobalizedRD; - } - - /// Returns the field in the globalized record for the escaped variable. 
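A hypothetical source fragment showing the kind of situation this visitor flags: a local whose address can be observed by other threads escapes and must be globalized, while a purely by-value local can stay in private storage.

void escapeExample() {
  int Escapes = 0;
  int Stays = 1;
  int *Alias = &Escapes; // VisitUnaryOperator(UO_AddrOf): `Escapes` escapes
  *Alias += Stays;       // `Stays` is only read by value and stays private
}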
- const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { - assert(GlobalizedRD && - "Record for globalized variables must be generated already."); - auto I = MappedDeclsFields.find(VD); - if (I == MappedDeclsFields.end()) - return nullptr; - return I->getSecond(); - } - - /// Returns the list of the escaped local variables/parameters. - ArrayRef getEscapedDecls() const { - return EscapedDecls.getArrayRef(); - } - - /// Checks if the escaped local variable is actually a parameter passed by - /// value. - const llvm::SmallPtrSetImpl &getEscapedParameters() const { - return EscapedParameters; - } - - /// Returns the list of the escaped variables with the variably modified - /// types. - ArrayRef getEscapedVariableLengthDecls() const { - return EscapedVariableLengthDecls.getArrayRef(); - } -}; -} // anonymous namespace - /// Get the GPU warp size. -static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) { +llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) { return CGF.EmitRuntimeCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), "nvptx_warp_size"); } - -/// Get the id of the current thread on the GPU. -static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) { - return CGF.EmitRuntimeCall( - llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), - "nvptx_tid"); -} - -/// Get the id of the warp in the block. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id"); -} - -/// Get the id of the current lane in the Warp. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask), - "nvptx_lane_id"); -} - -/// Get the maximum number of threads in a block of the GPU. -static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { - return CGF.EmitRuntimeCall( - llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), - "nvptx_num_threads"); -} - -/// Get the value of the thread_limit clause in the teams directive. -/// For the 'generic' execution mode, the runtime encodes thread_limit in -/// the launch parameters, always starting thread_limit+warpSize threads per -/// CTA. The threads in the last warp are reserved for master execution. -/// For the 'spmd' execution mode, all threads in a CTA are part of the team. -static llvm::Value *getThreadLimit(CodeGenFunction &CGF, - bool IsInSPMDExecutionMode = false) { - CGBuilderTy &Bld = CGF.Builder; - return IsInSPMDExecutionMode - ? getNVPTXNumThreads(CGF) - : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), - "thread_limit"); -} - -/// Get the thread id of the OMP master thread. -/// The master thread id is the first thread (lane) of the last warp in the -/// GPU block. Warp size is assumed to be some power of 2. -/// Thread id is 0 indexed. -/// E.g: If NumThreads is 33, master id is 32. -/// If NumThreads is 64, master id is 32. -/// If NumThreads is 1024, master id is 992. 
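The mask arithmetic that getMasterThreadID (next) implements can be checked on the host; this small standalone sketch reproduces the three example values quoted above.

#include <cassert>

unsigned masterThreadID(unsigned NumThreads, unsigned WarpSize = 32) {
  return (NumThreads - 1) & ~(WarpSize - 1); // first lane of the last warp
}

int main() {
  assert(masterThreadID(33) == 32);
  assert(masterThreadID(64) == 32);
  assert(masterThreadID(1024) == 992);
  return 0;
}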
-static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
-  CGBuilderTy &Bld = CGF.Builder;
-  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
-
-  // We assume that the warp size is a power of 2.
-  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
-
-  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
-                       Bld.CreateNot(Mask), "master_tid");
-}
-
-CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
-    CodeGenModule &CGM, SourceLocation Loc)
-    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
-      Loc(Loc) {
-  createWorkerFunction(CGM);
-}
-
-void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
-    CodeGenModule &CGM) {
-  // Create a worker function with no arguments.
-
-  WorkerFn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      /*placeholder=*/"_worker", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
-  WorkerFn->setDoesNotRecurse();
-}
-
-CGOpenMPRuntimeNVPTX::ExecutionMode
-CGOpenMPRuntimeNVPTX::getExecutionMode() const {
-  return CurrentExecutionMode;
-}
-
-static CGOpenMPRuntimeNVPTX::DataSharingMode
-getDataSharingMode(CodeGenModule &CGM) {
-  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
-                                          : CGOpenMPRuntimeNVPTX::Generic;
-}
-
-/// Check for inner (nested) SPMD construct, if any
-static bool hasNestedSPMDDirective(ASTContext &Ctx,
-                                   const OMPExecutableDirective &D) {
-  const auto *CS = D.getInnermostCapturedStmt();
-  const auto *Body =
-      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
-  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
-
-  if (const auto *NestedDir =
-          dyn_cast_or_null(ChildStmt)) {
-    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
-    switch (D.getDirectiveKind()) {
-    case OMPD_target:
-      if (isOpenMPParallelDirective(DKind))
-        return true;
-      if (DKind == OMPD_teams) {
-        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
-            /*IgnoreCaptured=*/true);
-        if (!Body)
-          return false;
-        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
-        if (const auto *NND =
-                dyn_cast_or_null(ChildStmt)) {
-          DKind = NND->getDirectiveKind();
-          if (isOpenMPParallelDirective(DKind))
-            return true;
-        }
-      }
-      return false;
-    case OMPD_target_teams:
-      return isOpenMPParallelDirective(DKind);
-    case OMPD_target_simd:
-    case OMPD_target_parallel:
-    case OMPD_target_parallel_for:
-    case OMPD_target_parallel_for_simd:
-    case OMPD_target_teams_distribute:
-    case OMPD_target_teams_distribute_simd:
-    case OMPD_target_teams_distribute_parallel_for:
-    case OMPD_target_teams_distribute_parallel_for_simd:
-    case OMPD_parallel:
-    case OMPD_for:
-    case OMPD_parallel_for:
-    case OMPD_parallel_master:
-    case OMPD_parallel_sections:
-    case OMPD_for_simd:
-    case OMPD_parallel_for_simd:
-    case OMPD_cancel:
-    case OMPD_cancellation_point:
-    case OMPD_ordered:
-    case OMPD_threadprivate:
-    case OMPD_allocate:
-    case OMPD_task:
-    case OMPD_simd:
-    case OMPD_sections:
-    case OMPD_section:
-    case OMPD_single:
-    case OMPD_master:
-    case OMPD_critical:
-    case OMPD_taskyield:
-    case OMPD_barrier:
-    case OMPD_taskwait:
-    case OMPD_taskgroup:
-    case OMPD_atomic:
-    case OMPD_flush:
-    case OMPD_depobj:
-    case OMPD_scan:
-    case OMPD_teams:
-    case OMPD_target_data:
-    case OMPD_target_exit_data:
-    case OMPD_target_enter_data:
-    case OMPD_distribute:
-    case OMPD_distribute_simd:
-    case OMPD_distribute_parallel_for:
-    case 
OMPD_distribute_parallel_for_simd: - case OMPD_teams_distribute: - case OMPD_teams_distribute_simd: - case OMPD_teams_distribute_parallel_for: - case OMPD_teams_distribute_parallel_for_simd: - case OMPD_target_update: - case OMPD_declare_simd: - case OMPD_declare_variant: - case OMPD_begin_declare_variant: - case OMPD_end_declare_variant: - case OMPD_declare_target: - case OMPD_end_declare_target: - case OMPD_declare_reduction: - case OMPD_declare_mapper: - case OMPD_taskloop: - case OMPD_taskloop_simd: - case OMPD_master_taskloop: - case OMPD_master_taskloop_simd: - case OMPD_parallel_master_taskloop: - case OMPD_parallel_master_taskloop_simd: - case OMPD_requires: - case OMPD_unknown: - default: - llvm_unreachable("Unexpected directive."); - } - } - - return false; -} - -static bool supportsSPMDExecutionMode(ASTContext &Ctx, - const OMPExecutableDirective &D) { - OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); - switch (DirectiveKind) { - case OMPD_target: - case OMPD_target_teams: - return hasNestedSPMDDirective(Ctx, D); - case OMPD_target_parallel: - case OMPD_target_parallel_for: - case OMPD_target_parallel_for_simd: - case OMPD_target_teams_distribute_parallel_for: - case OMPD_target_teams_distribute_parallel_for_simd: - case OMPD_target_simd: - case OMPD_target_teams_distribute_simd: - return true; - case OMPD_target_teams_distribute: - return false; - case OMPD_parallel: - case OMPD_for: - case OMPD_parallel_for: - case OMPD_parallel_master: - case OMPD_parallel_sections: - case OMPD_for_simd: - case OMPD_parallel_for_simd: - case OMPD_cancel: - case OMPD_cancellation_point: - case OMPD_ordered: - case OMPD_threadprivate: - case OMPD_allocate: - case OMPD_task: - case OMPD_simd: - case OMPD_sections: - case OMPD_section: - case OMPD_single: - case OMPD_master: - case OMPD_critical: - case OMPD_taskyield: - case OMPD_barrier: - case OMPD_taskwait: - case OMPD_taskgroup: - case OMPD_atomic: - case OMPD_flush: - case OMPD_depobj: - case OMPD_scan: - case OMPD_teams: - case OMPD_target_data: - case OMPD_target_exit_data: - case OMPD_target_enter_data: - case OMPD_distribute: - case OMPD_distribute_simd: - case OMPD_distribute_parallel_for: - case OMPD_distribute_parallel_for_simd: - case OMPD_teams_distribute: - case OMPD_teams_distribute_simd: - case OMPD_teams_distribute_parallel_for: - case OMPD_teams_distribute_parallel_for_simd: - case OMPD_target_update: - case OMPD_declare_simd: - case OMPD_declare_variant: - case OMPD_begin_declare_variant: - case OMPD_end_declare_variant: - case OMPD_declare_target: - case OMPD_end_declare_target: - case OMPD_declare_reduction: - case OMPD_declare_mapper: - case OMPD_taskloop: - case OMPD_taskloop_simd: - case OMPD_master_taskloop: - case OMPD_master_taskloop_simd: - case OMPD_parallel_master_taskloop: - case OMPD_parallel_master_taskloop_simd: - case OMPD_requires: - case OMPD_unknown: - default: - break; - } - llvm_unreachable( - "Unknown programming model for OpenMP directive on NVPTX target."); -} - -/// Check if the directive is loops based and has schedule clause at all or has -/// static scheduling. 
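Two hypothetical target regions illustrating the choice supportsSPMDExecutionMode makes: a parallel loop directly under the target construct runs in SPMD mode, while a target region with top-level sequential code falls back to the generic master/worker scheme.

void modeExamples(int *A, int N) {
#pragma omp target teams distribute parallel for map(tofrom : A[0 : N]) // SPMD
  for (int I = 0; I < N; ++I)
    A[I] += 1;

#pragma omp target map(tofrom : A[0 : 1]) // generic: no top-level parallelism
  { A[0] = 42; }
}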
-static bool hasStaticScheduling(const OMPExecutableDirective &D) { - assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) && - isOpenMPLoopDirective(D.getDirectiveKind()) && - "Expected loop-based directive."); - return !D.hasClausesOfKind() && - (!D.hasClausesOfKind() || - llvm::any_of(D.getClausesOfKind(), - [](const OMPScheduleClause *C) { - return C->getScheduleKind() == OMPC_SCHEDULE_static; - })); -} - -/// Check for inner (nested) lightweight runtime construct, if any -static bool hasNestedLightweightDirective(ASTContext &Ctx, - const OMPExecutableDirective &D) { - assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive."); - const auto *CS = D.getInnermostCapturedStmt(); - const auto *Body = - CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); - const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); - - if (const auto *NestedDir = - dyn_cast_or_null(ChildStmt)) { - OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); - switch (D.getDirectiveKind()) { - case OMPD_target: - if (isOpenMPParallelDirective(DKind) && - isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) && - hasStaticScheduling(*NestedDir)) - return true; - if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd) - return true; - if (DKind == OMPD_parallel) { - Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( - /*IgnoreCaptured=*/true); - if (!Body) - return false; - ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); - if (const auto *NND = - dyn_cast_or_null(ChildStmt)) { - DKind = NND->getDirectiveKind(); - if (isOpenMPWorksharingDirective(DKind) && - isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) - return true; - } - } else if (DKind == OMPD_teams) { - Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( - /*IgnoreCaptured=*/true); - if (!Body) - return false; - ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); - if (const auto *NND = - dyn_cast_or_null(ChildStmt)) { - DKind = NND->getDirectiveKind(); - if (isOpenMPParallelDirective(DKind) && - isOpenMPWorksharingDirective(DKind) && - isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) - return true; - if (DKind == OMPD_parallel) { - Body = NND->getInnermostCapturedStmt()->IgnoreContainers( - /*IgnoreCaptured=*/true); - if (!Body) - return false; - ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); - if (const auto *NND = - dyn_cast_or_null(ChildStmt)) { - DKind = NND->getDirectiveKind(); - if (isOpenMPWorksharingDirective(DKind) && - isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) - return true; - } - } - } - } - return false; - case OMPD_target_teams: - if (isOpenMPParallelDirective(DKind) && - isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) && - hasStaticScheduling(*NestedDir)) - return true; - if (DKind == OMPD_distribute_simd || DKind == OMPD_simd) - return true; - if (DKind == OMPD_parallel) { - Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( - /*IgnoreCaptured=*/true); - if (!Body) - return false; - ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); - if (const auto *NND = - dyn_cast_or_null(ChildStmt)) { - DKind = NND->getDirectiveKind(); - if (isOpenMPWorksharingDirective(DKind) && - isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) - return true; - } - } - return false; - case OMPD_target_parallel: - if (DKind == OMPD_simd) - return true; - return isOpenMPWorksharingDirective(DKind) && - isOpenMPLoopDirective(DKind) && 
hasStaticScheduling(*NestedDir); - case OMPD_target_teams_distribute: - case OMPD_target_simd: - case OMPD_target_parallel_for: - case OMPD_target_parallel_for_simd: - case OMPD_target_teams_distribute_simd: - case OMPD_target_teams_distribute_parallel_for: - case OMPD_target_teams_distribute_parallel_for_simd: - case OMPD_parallel: - case OMPD_for: - case OMPD_parallel_for: - case OMPD_parallel_master: - case OMPD_parallel_sections: - case OMPD_for_simd: - case OMPD_parallel_for_simd: - case OMPD_cancel: - case OMPD_cancellation_point: - case OMPD_ordered: - case OMPD_threadprivate: - case OMPD_allocate: - case OMPD_task: - case OMPD_simd: - case OMPD_sections: - case OMPD_section: - case OMPD_single: - case OMPD_master: - case OMPD_critical: - case OMPD_taskyield: - case OMPD_barrier: - case OMPD_taskwait: - case OMPD_taskgroup: - case OMPD_atomic: - case OMPD_flush: - case OMPD_depobj: - case OMPD_scan: - case OMPD_teams: - case OMPD_target_data: - case OMPD_target_exit_data: - case OMPD_target_enter_data: - case OMPD_distribute: - case OMPD_distribute_simd: - case OMPD_distribute_parallel_for: - case OMPD_distribute_parallel_for_simd: - case OMPD_teams_distribute: - case OMPD_teams_distribute_simd: - case OMPD_teams_distribute_parallel_for: - case OMPD_teams_distribute_parallel_for_simd: - case OMPD_target_update: - case OMPD_declare_simd: - case OMPD_declare_variant: - case OMPD_begin_declare_variant: - case OMPD_end_declare_variant: - case OMPD_declare_target: - case OMPD_end_declare_target: - case OMPD_declare_reduction: - case OMPD_declare_mapper: - case OMPD_taskloop: - case OMPD_taskloop_simd: - case OMPD_master_taskloop: - case OMPD_master_taskloop_simd: - case OMPD_parallel_master_taskloop: - case OMPD_parallel_master_taskloop_simd: - case OMPD_requires: - case OMPD_unknown: - default: - llvm_unreachable("Unexpected directive."); - } - } - - return false; -} - -/// Checks if the construct supports lightweight runtime. It must be SPMD -/// construct + inner loop-based construct with static scheduling. -static bool supportsLightweightRuntime(ASTContext &Ctx, - const OMPExecutableDirective &D) { - if (!supportsSPMDExecutionMode(Ctx, D)) - return false; - OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); - switch (DirectiveKind) { - case OMPD_target: - case OMPD_target_teams: - case OMPD_target_parallel: - return hasNestedLightweightDirective(Ctx, D); - case OMPD_target_parallel_for: - case OMPD_target_parallel_for_simd: - case OMPD_target_teams_distribute_parallel_for: - case OMPD_target_teams_distribute_parallel_for_simd: - // (Last|First)-privates must be shared in parallel region. 
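Hypothetical loops showing what hasStaticScheduling accepts: the default and static schedules keep the lightweight runtime, while a dynamic schedule (or an ordered clause) forces the full runtime.

void scheduleExamples(int *A, int N) {
#pragma omp target teams distribute parallel for map(tofrom : A[0 : N])
  for (int I = 0; I < N; ++I) // no schedule clause: lightweight runtime
    A[I] += 1;
#pragma omp target teams distribute parallel for schedule(static) map(tofrom : A[0 : N])
  for (int I = 0; I < N; ++I) // schedule(static): lightweight runtime
    A[I] += 1;
#pragma omp target teams distribute parallel for schedule(dynamic) map(tofrom : A[0 : N])
  for (int I = 0; I < N; ++I) // schedule(dynamic): full runtime required
    A[I] += 1;
}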
- return hasStaticScheduling(D); - case OMPD_target_simd: - case OMPD_target_teams_distribute_simd: - return true; - case OMPD_target_teams_distribute: - return false; - case OMPD_parallel: - case OMPD_for: - case OMPD_parallel_for: - case OMPD_parallel_master: - case OMPD_parallel_sections: - case OMPD_for_simd: - case OMPD_parallel_for_simd: - case OMPD_cancel: - case OMPD_cancellation_point: - case OMPD_ordered: - case OMPD_threadprivate: - case OMPD_allocate: - case OMPD_task: - case OMPD_simd: - case OMPD_sections: - case OMPD_section: - case OMPD_single: - case OMPD_master: - case OMPD_critical: - case OMPD_taskyield: - case OMPD_barrier: - case OMPD_taskwait: - case OMPD_taskgroup: - case OMPD_atomic: - case OMPD_flush: - case OMPD_depobj: - case OMPD_scan: - case OMPD_teams: - case OMPD_target_data: - case OMPD_target_exit_data: - case OMPD_target_enter_data: - case OMPD_distribute: - case OMPD_distribute_simd: - case OMPD_distribute_parallel_for: - case OMPD_distribute_parallel_for_simd: - case OMPD_teams_distribute: - case OMPD_teams_distribute_simd: - case OMPD_teams_distribute_parallel_for: - case OMPD_teams_distribute_parallel_for_simd: - case OMPD_target_update: - case OMPD_declare_simd: - case OMPD_declare_variant: - case OMPD_begin_declare_variant: - case OMPD_end_declare_variant: - case OMPD_declare_target: - case OMPD_end_declare_target: - case OMPD_declare_reduction: - case OMPD_declare_mapper: - case OMPD_taskloop: - case OMPD_taskloop_simd: - case OMPD_master_taskloop: - case OMPD_master_taskloop_simd: - case OMPD_parallel_master_taskloop: - case OMPD_parallel_master_taskloop_simd: - case OMPD_requires: - case OMPD_unknown: - default: - break; - } - llvm_unreachable( - "Unknown programming model for OpenMP directive on NVPTX target."); -} - -void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D, - StringRef ParentName, - llvm::Function *&OutlinedFn, - llvm::Constant *&OutlinedFnID, - bool IsOffloadEntry, - const RegionCodeGenTy &CodeGen) { - ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); - EntryFunctionState EST; - WorkerFunctionState WST(CGM, D.getBeginLoc()); - Work.clear(); - WrapperFunctionsMap.clear(); - - // Emit target region as a standalone region. - class NVPTXPrePostActionTy : public PrePostActionTy { - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; - - public: - NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) - : EST(EST), WST(WST) {} - void Enter(CodeGenFunction &CGF) override { - auto &RT = - static_cast(CGF.CGM.getOpenMPRuntime()); - RT.emitNonSPMDEntryHeader(CGF, EST, WST); - // Skip target region initialization. - RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - } - void Exit(CodeGenFunction &CGF) override { - auto &RT = - static_cast(CGF.CGM.getOpenMPRuntime()); - RT.clearLocThreadIdInsertPt(CGF); - RT.emitNonSPMDEntryFooter(CGF, EST); - } - } Action(EST, WST); - CodeGen.setAction(Action); - IsInTTDRegion = true; - // Reserve place for the globalized memory. 
- GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } - emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, - IsOffloadEntry, CodeGen); - IsInTTDRegion = false; - - // Now change the name of the worker function to correspond to this target - // region's entry function. - WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker")); - - // Create the worker function - emitWorkerFunction(WST); -} - -// Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF, - EntryFunctionState &EST, - WorkerFunctionState &WST) { - CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); - llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::Value *IsWorker = - Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); - Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); - - CGF.EmitBlock(WorkerBB); - emitCall(CGF, WST.Loc, WST.WorkerFn); - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(MasterCheckBB); - llvm::Value *IsMaster = - Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); - Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); - - CGF.EmitBlock(MasterBB); - IsInTargetMasterThreadRegion = true; - // SEQUENTIAL (MASTER) REGION START - // First action in sequential region: - // Initialize the state of the OpenMP runtime library on the GPU. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {getThreadLimit(CGF), - Bld.getInt16(/*RequiresOMPRuntime=*/1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); - - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); - - emitGenericVarsProlog(CGF, WST.Loc); -} - -void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - IsInTargetMasterThreadRegion = false; - if (!CGF.HaveInsertPoint()) - return; - - emitGenericVarsEpilog(CGF); - - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(TerminateBB); - // Signal termination condition. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args); - // Barrier to terminate worker threads. - syncCTAThreads(CGF); - // Master thread jumps to exit point. 
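A host-side sketch (illustrative names only) of the per-thread control flow that emitNonSPMDEntryHeader and emitNonSPMDEntryFooter lay down for a generic kernel: worker warps branch to the worker loop, the first lane of the last warp runs the sequential master region, and the remaining lanes of the master warp fall through to the exit block.

#include <cstdio>

void genericEntryShape(unsigned Tid, unsigned NumThreads,
                       unsigned WarpSize = 32) {
  unsigned ThreadLimit = NumThreads - WarpSize; // worker threads only
  unsigned MasterTid = (NumThreads - 1) & ~(WarpSize - 1);
  if (Tid < ThreadLimit)
    std::printf("%u: .worker\n", Tid);
  else if (Tid == MasterTid)
    std::printf("%u: .master\n", Tid);
  else
    std::printf("%u: .exit (idle lane of the master warp)\n", Tid);
}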
- CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - -void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D, - StringRef ParentName, - llvm::Function *&OutlinedFn, - llvm::Constant *&OutlinedFnID, - bool IsOffloadEntry, - const RegionCodeGenTy &CodeGen) { - ExecutionRuntimeModesRAII ModeRAII( - CurrentExecutionMode, RequiresFullRuntime, - CGM.getLangOpts().OpenMPCUDAForceFullRuntime || - !supportsLightweightRuntime(CGM.getContext(), D)); - EntryFunctionState EST; - - // Emit target region as a standalone region. - class NVPTXPrePostActionTy : public PrePostActionTy { - CGOpenMPRuntimeNVPTX &RT; - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; - const OMPExecutableDirective &D; - - public: - NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - const OMPExecutableDirective &D) - : RT(RT), EST(EST), D(D) {} - void Enter(CodeGenFunction &CGF) override { - RT.emitSPMDEntryHeader(CGF, EST, D); - // Skip target region initialization. - RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - } - void Exit(CodeGenFunction &CGF) override { - RT.clearLocThreadIdInsertPt(CGF); - RT.emitSPMDEntryFooter(CGF, EST); - } - } Action(*this, EST, D); - CodeGen.setAction(Action); - IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } - emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, - IsOffloadEntry, CodeGen); - IsInTTDRegion = false; -} - -void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader( - CodeGenFunction &CGF, EntryFunctionState &EST, - const OMPExecutableDirective &D) { - CGBuilderTy &Bld = CGF.Builder; - - // Setup BBs in entry function. - llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true), - /*RequiresOMPRuntime=*/ - Bld.getInt16(RequiresFullRuntime ? 1 : 0), - /*RequiresDataSharing=*/Bld.getInt16(0)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args); - - if (RequiresFullRuntime) { - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd)); - } - - CGF.EmitBranch(ExecuteBB); - - CGF.EmitBlock(ExecuteBB); - - IsInTargetMasterThreadRegion = true; -} - -void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - IsInTargetMasterThreadRegion = false; - if (!CGF.HaveInsertPoint()) - return; - - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit"); - CGF.EmitBranch(OMPDeInitBB); - - CGF.EmitBlock(OMPDeInitBB); - // DeInitialize the OMP state in the runtime; called by all active threads. - llvm::Value *Args[] = {/*RequiresOMPRuntime=*/ - CGF.Builder.getInt16(RequiresFullRuntime ? 
1 : 0)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args); - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - -// Create a unique global variable to indicate the execution mode of this target -// region. The execution mode is either 'generic', or 'spmd' depending on the -// target directive. This variable is picked up by the offload library to setup -// the device appropriately before kernel launch. If the execution mode is -// 'generic', the runtime reserves one warp for the master, otherwise, all -// warps participate in parallel work. -static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, - bool Mode) { - auto *GVMode = - new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, - llvm::GlobalValue::WeakAnyLinkage, - llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1), - Twine(Name, "_exec_mode")); - CGM.addCompilerUsedGlobal(GVMode); -} - -void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { - ASTContext &Ctx = CGM.getContext(); - - CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); - CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {}, - WST.Loc, WST.Loc); - emitWorkerLoop(CGF, WST); - CGF.FinishFunction(); -} - -void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, - WorkerFunctionState &WST) { - // - // The workers enter this loop and wait for parallel work from the master. - // When the master encounters a parallel region it sets up the work + variable - // arguments, and wakes up the workers. The workers first check to see if - // they are required for the parallel region, i.e., within the # of requested - // parallel threads. The activated workers load the variable arguments and - // execute the parallel work. - // - - CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); - llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); - llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); - llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - - CGF.EmitBranch(AwaitBB); - - // Workers wait for work from master. - CGF.EmitBlock(AwaitBB); - // Wait for parallel work - syncCTAThreads(CGF); - - Address WorkFn = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); - Address ExecStatus = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); - CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); - CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); - - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {WorkFn.getPointer()}; - llvm::Value *Ret = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); - Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); - - // On termination condition (workid == 0), exit loop. - llvm::Value *WorkID = Bld.CreateLoad(WorkFn); - llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); - Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); - - // Activate requested workers. 
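A compact simulation of the worker state machine emitWorkerLoop generates; AwaitWork is a hypothetical stand-in for the CTA barrier plus the __kmpc_kernel_parallel handshake that publishes the outlined work function.

#include <cstdint>

using WorkFnTy = void (*)(uint16_t ParallelLevel, uint32_t ThreadId);

void workerLoop(WorkFnTy (*AwaitWork)(bool &IsActive), uint32_t ThreadId) {
  while (true) {
    bool IsActive = false;
    WorkFnTy Work = AwaitWork(IsActive); // wait for the master to publish work
    if (!Work)
      return;                            // null work fn signals termination
    if (IsActive)
      Work(/*ParallelLevel=*/0, ThreadId); // same signature as the wrappers
    // active and inactive workers rejoin at a barrier before the next round
  }
}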
- CGF.EmitBlock(SelectWorkersBB); - llvm::Value *IsActive = - Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); - Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); - - // Signal start of parallel region. - CGF.EmitBlock(ExecuteBB); - // Skip initialization. - setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - - // Process work items: outlined parallel functions. - for (llvm::Function *W : Work) { - // Try to match this outlined function. - llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); - - llvm::Value *WorkFnMatch = - Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); - - llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); - llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); - Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); - - // Execute this outlined function. - CGF.EmitBlock(ExecuteFNBB); - - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, W, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(CheckNextBB); - } - // Default case: call to outlined function through pointer if the target - // region makes a declare target call that may contain an orphaned parallel - // directive. - auto *ParallelFnTy = - llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, - /*isVarArg=*/false); - llvm::Value *WorkFnCast = - Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo()); - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast}, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - // Signal end of parallel region. - CGF.EmitBlock(TerminateBB); - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel), - llvm::None); - CGF.EmitBranch(BarrierBB); - - // All active and inactive workers wait at a barrier after parallel region. - CGF.EmitBlock(BarrierBB); - // Barrier after parallel region. - syncCTAThreads(CGF); - CGF.EmitBranch(AwaitBB); - - // Exit target region. - CGF.EmitBlock(ExitBB); - // Skip initialization. - clearLocThreadIdInsertPt(CGF); -} - -/// Returns specified OpenMP runtime function for the current OpenMP -/// implementation. Specialized for the NVPTX device. -/// \param Function OpenMP runtime function. -/// \return Specified function. 
-llvm::FunctionCallee -CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { - llvm::FunctionCallee RTLFn = nullptr; - switch (static_cast(Function)) { - case OMPRTL_NVPTX__kmpc_kernel_init: { - // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t - // RequiresOMPRuntime); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_deinit: { - // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); - break; - } - case OMPRTL_NVPTX__kmpc_spmd_kernel_init: { - // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, - // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init"); - break; - } - case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: { - // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); - llvm::Type *TypeParams[] = {CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { - /// Build void __kmpc_kernel_prepare_parallel( - /// void *outlined_function); - llvm::Type *TypeParams[] = {CGM.Int8PtrTy}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_parallel: { - /// Build bool __kmpc_kernel_parallel(void **outlined_function); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy}; - llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); - auto *FnTy = - llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_end_parallel: { - /// Build void __kmpc_kernel_end_parallel(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_serialized_parallel: { - // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 - // global_tid); - llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_end_serialized_parallel: { - // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 - // global_tid); - llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_shuffle_int32: { - // Build int32_t __kmpc_shuffle_int32(int32_t element, - // int16_t lane_offset, int16_t warp_size); - llvm::Type 
*TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32"); - break; - } - case OMPRTL_NVPTX__kmpc_shuffle_int64: { - // Build int64_t __kmpc_shuffle_int64(int64_t element, - // int16_t lane_offset, int16_t warp_size); - llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); - break; - } - case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: { - // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, - // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void* - // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t - // lane_id, int16_t lane_offset, int16_t Algorithm Version), void - // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); - llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, - CGM.Int16Ty, CGM.Int16Ty}; - auto *ShuffleReduceFnTy = - llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, - /*isVarArg=*/false); - llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; - auto *InterWarpCopyFnTy = - llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, - /*isVarArg=*/false); - llvm::Type *TypeParams[] = {getIdentTyPointerTy(), - CGM.Int32Ty, - CGM.Int32Ty, - CGM.SizeTy, - CGM.VoidPtrTy, - ShuffleReduceFnTy->getPointerTo(), - InterWarpCopyFnTy->getPointerTo()}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2"); - break; - } - case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { - // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid); - llvm::Type *TypeParams[] = {CGM.Int32Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); - break; - } - case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: { - // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32 - // global_tid, void *global_buffer, int32_t num_of_records, void* - // reduce_data, - // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - // lane_offset, int16_t shortCircuit), - // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void - // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data), - // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx, - // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer, - // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void - // *buffer, int idx, void *reduce_data)); - llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, - CGM.Int16Ty, CGM.Int16Ty}; - auto *ShuffleReduceFnTy = - llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, - /*isVarArg=*/false); - llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; - auto *InterWarpCopyFnTy = - llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, - /*isVarArg=*/false); - llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy, - CGM.VoidPtrTy}; - auto *GlobalListFnTy = - llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams, - /*isVarArg=*/false); - llvm::Type *TypeParams[] = {getIdentTyPointerTy(), - CGM.Int32Ty, - 
CGM.VoidPtrTy, - CGM.Int32Ty, - CGM.VoidPtrTy, - ShuffleReduceFnTy->getPointerTo(), - InterWarpCopyFnTy->getPointerTo(), - GlobalListFnTy->getPointerTo(), - GlobalListFnTy->getPointerTo(), - GlobalListFnTy->getPointerTo(), - GlobalListFnTy->getPointerTo()}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2"); - break; - } - case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { - /// Build void __kmpc_data_sharing_init_stack(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack"); - break; - } - case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: { - /// Build void __kmpc_data_sharing_init_stack_spmd(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = - CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd"); - break; - } - case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: { - // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - // int16_t UseSharedMemory); - llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack"); - break; - } - case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: { - // Build void *__kmpc_data_sharing_push_stack(size_t size, int16_t - // UseSharedMemory); - llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_data_sharing_push_stack"); - break; - } - case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: { - // Build void __kmpc_data_sharing_pop_stack(void *a); - llvm::Type *TypeParams[] = {CGM.VoidPtrTy}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, - /*Name=*/"__kmpc_data_sharing_pop_stack"); - break; - } - case OMPRTL_NVPTX__kmpc_begin_sharing_variables: { - /// Build void __kmpc_begin_sharing_variables(void ***args, - /// size_t n_args); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables"); - break; - } - case OMPRTL_NVPTX__kmpc_end_sharing_variables: { - /// Build void __kmpc_end_sharing_variables(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables"); - break; - } - case OMPRTL_NVPTX__kmpc_get_shared_variables: { - /// Build void __kmpc_get_shared_variables(void ***GlobalArgs); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); - break; - } - case OMPRTL_NVPTX__kmpc_parallel_level: { - // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid); - llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level"); - break; 
- } - case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: { - // Build int8_t __kmpc_is_spmd_exec_mode(); - auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode"); - break; - } - case OMPRTL_NVPTX__kmpc_get_team_static_memory: { - // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - // const void *buf, size_t size, int16_t is_shared, const void **res); - llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy, - CGM.Int16Ty, CGM.VoidPtrPtrTy}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory"); - break; - } - case OMPRTL_NVPTX__kmpc_restore_team_static_memory: { - // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - // int16_t is_shared); - llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); - RTLFn = - CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory"); - break; - } - case OMPRTL__kmpc_barrier: { - // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); - llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = - CGM.CreateConvergentRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier"); - break; - } - case OMPRTL__kmpc_barrier_simple_spmd: { - // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32 - // global_tid); - llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateConvergentRuntimeFunction( - FnTy, /*Name*/ "__kmpc_barrier_simple_spmd"); - break; - } - case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: { - // Build int32_t __kmpc_warp_active_thread_mask(void); - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false); - RTLFn = CGM.CreateConvergentRuntimeFunction(FnTy, "__kmpc_warp_active_thread_mask"); - break; - } - case OMPRTL_NVPTX__kmpc_syncwarp: { - // Build void __kmpc_syncwarp(kmp_int32 Mask); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false); - RTLFn = CGM.CreateConvergentRuntimeFunction(FnTy, "__kmpc_syncwarp"); - break; - } - } - return RTLFn; -} - -void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, - llvm::Constant *Addr, - uint64_t Size, int32_t, - llvm::GlobalValue::LinkageTypes) { - // TODO: Add support for global variables on the device after declare target - // support. - if (!isa(Addr)) - return; - llvm::Module &M = CGM.getModule(); - llvm::LLVMContext &Ctx = CGM.getLLVMContext(); - - // Get "nvvm.annotations" metadata node - llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - - llvm::Metadata *MDVals[] = { - llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; - // Append metadata to nvvm.annotations - MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); -} - -void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( - const OMPExecutableDirective &D, StringRef ParentName, - llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, - bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { - if (!IsOffloadEntry) // Nothing to do. 
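For reference, every case in the createNVPTXRuntimeFunction switch above follows the same declare-on-first-use shape; a minimal standalone sketch of that pattern, with a made-up entry-point name rather than a real runtime symbol:

  // Lazily declare "void __kmpc_example_entry();" in the current module.
  llvm::FunctionType *FnTy =
      llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg=*/false);
  llvm::FunctionCallee Fn =
      CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_example_entry");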
-  return;
-
-  assert(!ParentName.empty() && "Invalid target region parent name!");
-
-  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
-  if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
-  else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
-
-  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
-}
-
-namespace {
-LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
-/// Enum for accessing the reserved_2 field of the ident_t struct.
-enum ModeFlagsTy : unsigned {
-  /// Bit set to 1 when in SPMD mode.
-  KMP_IDENT_SPMD_MODE = 0x01,
-  /// Bit set to 1 when a simplified runtime is used.
-  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
-  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
-};
-
-/// Special mode Undefined: the combination of non-SPMD mode and the simple
-/// runtime.
-static const ModeFlagsTy UndefinedMode =
-    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
-} // anonymous namespace
-
-unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
-  switch (getExecutionMode()) {
-  case EM_SPMD:
-    if (requiresFullRuntime())
-      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
-    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
-  case EM_NonSPMD:
-    assert(requiresFullRuntime() && "Expected full runtime.");
-    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
-  case EM_Unknown:
-    return UndefinedMode;
-  }
-  llvm_unreachable("Unknown flags are requested.");
-}
-
-CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
-    : CGOpenMPRuntime(CGM, "_", "$") {
-  if (!CGM.getLangOpts().OpenMPIsDevice)
-    llvm_unreachable("OpenMP NVPTX can only handle device code.");
-}
-
-void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
-                                              ProcBindKind ProcBind,
-                                              SourceLocation Loc) {
-  // Do nothing in case of SPMD mode and L0 parallel.
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
-    return;
-
-  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
-}
-
-void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
-                                                llvm::Value *NumThreads,
-                                                SourceLocation Loc) {
-  // Do nothing in case of SPMD mode and L0 parallel.
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
-    return;
-
-  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
-}
-
-void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
-                                              const Expr *NumTeams,
-                                              const Expr *ThreadLimit,
-                                              SourceLocation Loc) {}
-
-llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
-    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
-    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
-  // Emit target region as a standalone region.
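Stepping back to the reserved_2 encoding above: restated as a standalone helper under the assumption that only the SPMD and simple-runtime bits matter (the Unknown case is left out), the mapping is:

  constexpr unsigned encodeReserved2(bool IsSPMD, bool FullRuntime) {
    unsigned Flags = 0;
    if (IsSPMD)
      Flags |= 0x01;   // KMP_IDENT_SPMD_MODE
    if (!FullRuntime)
      Flags |= 0x02;   // KMP_IDENT_SIMPLE_RT_MODE
    return Flags;
  }
  // encodeReserved2(true, true)   == 0x01: SPMD + full runtime
  // encodeReserved2(true, false)  == 0x03: SPMD + lightweight runtime
  // encodeReserved2(false, true)  == 0x00: non-SPMD, always full runtime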
-  class NVPTXPrePostActionTy : public PrePostActionTy {
-    bool &IsInParallelRegion;
-    bool PrevIsInParallelRegion;
-
-  public:
-    NVPTXPrePostActionTy(bool &IsInParallelRegion)
-        : IsInParallelRegion(IsInParallelRegion) {}
-    void Enter(CodeGenFunction &CGF) override {
-      PrevIsInParallelRegion = IsInParallelRegion;
-      IsInParallelRegion = true;
-    }
-    void Exit(CodeGenFunction &CGF) override {
-      IsInParallelRegion = PrevIsInParallelRegion;
-    }
-  } Action(IsInParallelRegion);
-  CodeGen.setAction(Action);
-  bool PrevIsInTTDRegion = IsInTTDRegion;
-  IsInTTDRegion = false;
-  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
-  IsInTargetMasterThreadRegion = false;
-  auto *OutlinedFun =
-      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
-          D, ThreadIDVar, InnermostKind, CodeGen));
-  if (CGM.getLangOpts().Optimize) {
-    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
-    OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
-    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
-  }
-  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
-  IsInTTDRegion = PrevIsInTTDRegion;
-  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
-      !IsInParallelRegion) {
-    llvm::Function *WrapperFun =
-        createParallelDataSharingWrapper(OutlinedFun, D);
-    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
-  }
-
-  return OutlinedFun;
-}
-
-/// Get list of lastprivate variables from the teams distribute ... or
-/// teams {distribute ...} directives.
-static void
-getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
-                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
-  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
-         "expected teams directive.");
-  const OMPExecutableDirective *Dir = &D;
-  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
-    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
-            Ctx,
-            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
-                /*IgnoreCaptured=*/true))) {
-      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
-      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
-        Dir = nullptr;
-    }
-  }
-  if (!Dir)
-    return;
-  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
-    for (const Expr *E : C->getVarRefs())
-      Vars.push_back(getPrivateItem(E));
-  }
-}
-
-/// Get list of reduction variables from the teams ... directives.
-static void
-getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
-                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
-  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
-         "expected teams directive.");
-  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
-    for (const Expr *E : C->privates())
-      Vars.push_back(getPrivateItem(E));
-  }
-}
-
-llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
-    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
-    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
-  SourceLocation Loc = D.getBeginLoc();
-
-  const RecordDecl *GlobalizedRD = nullptr;
-  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
-  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
-  // Globalize team reductions variable unconditionally in all modes.
-  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
-    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
-    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
-    if (!LastPrivatesReductions.empty()) {
-      GlobalizedRD = ::buildRecordForGlobalizedVars(
-          CGM.getContext(), llvm::None, LastPrivatesReductions,
-          MappedDeclsFields, WarpSize);
-    }
-  } else if (!LastPrivatesReductions.empty()) {
-    assert(!TeamAndReductions.first &&
-           "Previous team declaration is not expected.");
-    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
-    std::swap(TeamAndReductions.second, LastPrivatesReductions);
-  }
-
-  // Emit target region as a standalone region.
-  class NVPTXPrePostActionTy : public PrePostActionTy {
-    SourceLocation &Loc;
-    const RecordDecl *GlobalizedRD;
-    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
-        &MappedDeclsFields;
-
-  public:
-    NVPTXPrePostActionTy(
-        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
-        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
-            &MappedDeclsFields)
-        : Loc(Loc), GlobalizedRD(GlobalizedRD),
-          MappedDeclsFields(MappedDeclsFields) {}
-    void Enter(CodeGenFunction &CGF) override {
-      auto &Rt =
-          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
-      if (GlobalizedRD) {
-        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
-        I->getSecond().GlobalRecord = GlobalizedRD;
-        I->getSecond().MappedParams =
-            std::make_unique<CodeGenFunction::OMPMapVars>();
-        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
-        for (const auto &Pair : MappedDeclsFields) {
-          assert(Pair.getFirst()->isCanonicalDecl() &&
-                 "Expected canonical declaration");
-          Data.insert(std::make_pair(Pair.getFirst(),
-                                     MappedVarData(Pair.getSecond(),
-                                                   /*IsOnePerTeam=*/true)));
-        }
-      }
-      Rt.emitGenericVarsProlog(CGF, Loc);
-    }
-    void Exit(CodeGenFunction &CGF) override {
-      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
-          .emitGenericVarsEpilog(CGF);
-    }
-  } Action(Loc, GlobalizedRD, MappedDeclsFields);
-  CodeGen.setAction(Action);
-  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
-      D, ThreadIDVar, InnermostKind, CodeGen);
-  if (CGM.getLangOpts().Optimize) {
-    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
-    OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
-    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
-  }
-
-  return OutlinedFun;
-}
-
-void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
-                                                 SourceLocation Loc,
-                                                 bool WithSPMDCheck) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
-      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
-    return;
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
-  if (I == FunctionGlobalizedDecls.end())
-    return;
-  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
-    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
-    QualType SecGlobalRecTy;
-
-    // Recover pointer to this function's global record. The runtime will
-    // handle the specifics of the allocation of the memory.
-    // Use actual memory size of the record including the padding
-    // for alignment purposes.
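As an aside, both outlined-function emitters above flip the same attribute set when optimizing; in isolation the transformation is just (a sketch, not a helper that exists in this patch):

  // Outlined regions default to noinline/optnone; under -O the GPU path
  // instead wants them folded into their single caller.
  static void forceInlineIntoKernel(llvm::Function *F) {
    F->removeFnAttr(llvm::Attribute::NoInline);
    F->removeFnAttr(llvm::Attribute::OptimizeNone);
    F->addFnAttr(llvm::Attribute::AlwaysInline);
  }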
- unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - - llvm::PointerType *GlobalRecPtrTy = - CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); - llvm::Value *GlobalRecCastAddr; - llvm::Value *IsTTD = nullptr; - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - if (I->getSecond().SecondaryGlobalRecord.hasValue()) { - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *PL = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level), - {RTLoc, ThreadID}); - IsTTD = Bld.CreateIsNull(PL); - } - llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(SPMDBB); - Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy), - CharUnits::fromQuantity(Alignment)); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize); - if (const RecordDecl *SecGlobalizedVarsRecord = - I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) { - SecGlobalRecTy = - CGM.getContext().getRecordType(SecGlobalizedVarsRecord); - - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - Size = Bld.CreateSelect( - IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size); - } - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. 
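The size computation above (done once for the primary record and again for the secondary one) is llvm::alignTo over the AST-level size and alignment; schematically, as a standalone sketch with illustrative names:

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  // A record of 12 chars with 8-char alignment reserves 16 chars on the
  // data-sharing stack, so successive frames stay aligned.
  uint64_t paddedRecordSize(uint64_t SizeInChars, uint64_t AlignInChars) {
    return llvm::alignTo(SizeInChars, AlignInChars);
  }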
- llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - CGF.EmitBlock(ExitBB); - auto *Phi = Bld.CreatePHI(GlobalRecPtrTy, - /*NumReservedValues=*/2, "_select_stack"); - Phi->addIncoming(RecPtr.getPointer(), SPMDBB); - Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); - GlobalRecCastAddr = Phi; - I->getSecond().GlobalRecordAddr = Phi; - I->getSecond().IsInSPMDModeFlag = IsSPMD; - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().Records.size() < 2 && - "Expected less than 2 globalized records: one for target and one " - "for teams."); - unsigned Offset = 0; - for (const RecordDecl *RD : GlobalizedRecords.back().Records) { - QualType RDTy = CGM.getContext().getRecordType(RD); - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); - unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); - Offset = - llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); - } - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - Offset = llvm::alignTo(Offset, Alignment); - GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); - ++GlobalizedRecords.back().RegionCounter; - if (GlobalizedRecords.back().Records.size() == 1) { - assert(KernelStaticGlobalized && - "Kernel static pointer must be initialized already."); - auto *UseSharedMemory = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$is_shared"); - UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, Loc); - auto *StaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, nullptr); - auto *RecSize = new llvm::GlobalVariable( - CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$size"); - RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - llvm::Value *Ld = CGF.EmitLoadOfScalar( - Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false, - CGM.getContext().getSizeType(), Loc); - llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 
1 : 0), - StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_get_team_static_memory), - GlobalRecordSizeArg); - GlobalizedRecords.back().Buffer = StaticGlobalized; - GlobalizedRecords.back().RecSize = RecSize; - GlobalizedRecords.back().UseSharedMemory = UseSharedMemory; - GlobalizedRecords.back().Loc = Loc; - } - assert(KernelStaticGlobalized && "Global address must be set already."); - Address FrameAddr = CGF.EmitLoadOfPointer( - Address(KernelStaticGlobalized, CGM.getPointerAlign()), - CGM.getContext() - .getPointerType(CGM.getContext().VoidPtrTy) - .castAs()); - llvm::Value *GlobalRecValue = - Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer(); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); - } else { - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - bool UseSharedMemory = - IsInTTDRegion && GlobalRecordSize <= SharedMemorySize; - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - IsInTTDRegion - ? OMPRTL_NVPTX__kmpc_data_sharing_push_stack - : OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - } - LValue Base = - CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy); - - // Emit the "global alloca" which is a GEP from the global declaration - // record using the pointer returned by the runtime. - LValue SecBase; - decltype(I->getSecond().LocalVarData)::const_iterator SecIt; - if (IsTTD) { - SecIt = I->getSecond().SecondaryLocalVarData->begin(); - llvm::PointerType *SecGlobalRecPtrTy = - CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo(); - SecBase = CGF.MakeNaturalAlignPointeeAddrLValue( - Bld.CreatePointerBitCastOrAddrSpaceCast( - I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy), - SecGlobalRecTy); - } - for (auto &Rec : I->getSecond().LocalVarData) { - bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); - llvm::Value *ParValue; - if (EscapedParam) { - const auto *VD = cast(Rec.first); - LValue ParLVal = - CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); - ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); - } - LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD); - // Emit VarAddr basing on lane-id if required. 
- QualType VarTy; - if (Rec.second.IsOnePerTeam) { - VarTy = Rec.second.FD->getType(); - } else { - llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( - VarAddr.getAddress(CGF).getPointer(), - {Bld.getInt32(0), getNVPTXLaneID(CGF)}); - VarTy = - Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); - VarAddr = CGF.MakeAddrLValue( - Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, - AlignmentSource::Decl); - } - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { - assert(I->getSecond().IsInSPMDModeFlag && - "Expected unknown execution mode or required SPMD check."); - if (IsTTD) { - assert(SecIt->second.IsOnePerTeam && - "Secondary glob data must be one per team."); - LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); - VarAddr.setAddress( - Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF), - VarAddr.getPointer(CGF)), - VarAddr.getAlignment())); - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - } - Address GlobalPtr = Rec.second.PrivateAddr; - Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); - Rec.second.PrivateAddr = Address( - Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag, - LocalAddr.getPointer(), GlobalPtr.getPointer()), - LocalAddr.getAlignment()); - } - if (EscapedParam) { - const auto *VD = cast(Rec.first); - CGF.EmitStoreOfScalar(ParValue, VarAddr); - I->getSecond().MappedParams->setVarAddr(CGF, VD, - VarAddr.getAddress(CGF)); - } - if (IsTTD) - ++SecIt; - } - } - for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) { - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - CGBuilderTy &Bld = CGF.Builder; - llvm::Value *Size = CGF.getTypeSize(VD->getType()); - CharUnits Align = CGM.getContext().getDeclAlign(VD); - Size = Bld.CreateNUWAdd( - Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); - llvm::Value *AlignVal = - llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); - Size = Bld.CreateUDiv(Size, AlignVal); - Size = Bld.CreateNUWMul(Size, AlignVal); - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. 
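The NUWAdd/UDiv/NUWMul triple emitted above for escaped variable-length declarations is the usual round-up-to-alignment idiom; in plain C++ terms the arithmetic is (sketch only):

  #include <cstdint>

  // (Size + Align - 1) / Align * Align; the emitted IR additionally carries
  // no-unsigned-wrap flags on the add and multiply.
  uint64_t roundUpToAlign(uint64_t Size, uint64_t Align) {
    return (Size + Align - 1) / Align * Align;
  }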
- llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo()); - LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(), - CGM.getContext().getDeclAlign(VD), - AlignmentSource::Decl); - I->getSecond().MappedParams->setVarAddr(CGF, cast(VD), - Base.getAddress(CGF)); - I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue); - } - I->getSecond().MappedParams->apply(CGF); -} - -void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, - bool WithSPMDCheck) { - if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic && - getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD) - return; - - const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); - if (I != FunctionGlobalizedDecls.end()) { - I->getSecond().MappedParams->restore(CGF); - if (!CGF.HaveInsertPoint()) - return; - for (llvm::Value *Addr : - llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), - Addr); - } - if (I->getSecond().GlobalRecordAddr) { - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), - CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); - CGF.EmitBlock(ExitBB); - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().RegionCounter > 0 && - "region counter must be > 0."); - --GlobalizedRecords.back().RegionCounter; - // Emit the restore function only in the target region. - if (GlobalizedRecords.back().RegionCounter == 0) { - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(GlobalizedRecords.back().UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); - llvm::Value *Args[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 
1 : 0), - IsInSharedMemory}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_restore_team_static_memory), - Args); - } - } else { - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), - I->getSecond().GlobalRecordAddr); - } - } - } -} - -void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, - const OMPExecutableDirective &D, - SourceLocation Loc, - llvm::Function *OutlinedFn, - ArrayRef CapturedVars) { - if (!CGF.HaveInsertPoint()) - return; - - Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, - /*Name=*/".zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); - llvm::SmallVector OutlinedFnArgs; - OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); - OutlinedFnArgs.push_back(ZeroAddr.getPointer()); - OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); -} - -void CGOpenMPRuntimeNVPTX::emitParallelCall( - CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, - ArrayRef CapturedVars, const Expr *IfCond) { - if (!CGF.HaveInsertPoint()) - return; - - if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) - emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); - else - emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); -} - -void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall( - CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, - ArrayRef CapturedVars, const Expr *IfCond) { - llvm::Function *Fn = cast(OutlinedFn); - - // Force inline this outlined function at its call site. - Fn->setLinkage(llvm::GlobalValue::InternalLinkage); - - Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, - /*Name=*/".zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); - // ThreadId for serialized parallels is 0. - Address ThreadIDAddr = ZeroAddr; - auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr]( - CodeGenFunction &CGF, PrePostActionTy &Action) { - Action.Enter(CGF); - - Address ZeroAddr = - CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, - /*Name=*/".bound.zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); - llvm::SmallVector OutlinedFnArgs; - OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); - OutlinedFnArgs.push_back(ZeroAddr.getPointer()); - OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs); - }; - auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF, - PrePostActionTy &) { - - RegionCodeGenTy RCG(CodeGen); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *Args[] = {RTLoc, ThreadID}; - - NVPTXActionTy Action( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), - Args, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), - Args); - RCG.setAction(Action); - RCG(CGF); - }; - - auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF, - PrePostActionTy &Action) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Function *WFn = WrapperFunctionsMap[Fn]; - assert(WFn && "Wrapper function does not exist!"); - llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); - - // Prepare for parallel region. Indicate the outlined function. 
- llvm::Value *Args[] = {ID}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), - Args); - - // Create a private scope that will globalize the arguments - // passed from the outside of the target region. - CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); - - // There's something to share. - if (!CapturedVars.empty()) { - // Prepare for parallel region. Indicate the outlined function. - Address SharedArgs = - CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs"); - llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); - - llvm::Value *DataSharingArgs[] = { - SharedArgsPtr, - llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_begin_sharing_variables), - DataSharingArgs); - - // Store variable address in a list of references to pass to workers. - unsigned Idx = 0; - ASTContext &Ctx = CGF.getContext(); - Address SharedArgListAddress = CGF.EmitLoadOfPointer( - SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy)) - .castAs()); - for (llvm::Value *V : CapturedVars) { - Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); - llvm::Value *PtrV; - if (V->getType()->isIntegerTy()) - PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); - else - PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); - CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, - Ctx.getPointerType(Ctx.VoidPtrTy)); - ++Idx; - } - } - - // Activate workers. This barrier is used by the master to signal - // work for the workers. - syncCTAThreads(CGF); - - // OpenMP [2.5, Parallel Construct, p.49] - // There is an implied barrier at the end of a parallel region. After the - // end of a parallel region, only the master thread of the team resumes - // execution of the enclosing task region. - // - // The master waits at this barrier until all workers are done. - syncCTAThreads(CGF); - - if (!CapturedVars.empty()) - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables)); - - // Remember for post-processing in worker loop. - Work.emplace_back(WFn); - }; - - auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen]( - CodeGenFunction &CGF, PrePostActionTy &Action) { - if (IsInParallelRegion) { - SeqGen(CGF, Action); - } else if (IsInTargetMasterThreadRegion) { - L0ParallelGen(CGF, Action); - } else { - // Check for master and then parallelism: - // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) { - // Serialized execution. - // } else { - // Worker call. - // } - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential"); - llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB); - // There is no need to emit line number for unconditional branch. 
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(ParallelCheckBB);
-      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-      llvm::Value *ThreadID = getThreadID(CGF, Loc);
-      llvm::Value *PL = CGF.EmitRuntimeCall(
-          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
-          {RTLoc, ThreadID});
-      llvm::Value *Res = Bld.CreateIsNotNull(PL);
-      Bld.CreateCondBr(Res, SeqBB, MasterBB);
-      CGF.EmitBlock(SeqBB);
-      SeqGen(CGF, Action);
-      CGF.EmitBranch(ExitBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(MasterBB);
-      L0ParallelGen(CGF, Action);
-      CGF.EmitBranch(ExitBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      // Emit the continuation block for code after the if.
-      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
-    }
-  };
-
-  if (IfCond) {
-    emitIfClause(CGF, IfCond, LNParallelGen, SeqGen);
-  } else {
-    CodeGenFunction::RunCleanupsScope Scope(CGF);
-    RegionCodeGenTy ThenRCG(LNParallelGen);
-    ThenRCG(CGF);
-  }
-}
-
-void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
-    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
-  // Just call the outlined function to execute the parallel region.
-  // OutlinedFn(&GTid, &zero, CapturedStruct);
-  //
-  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-
-  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
-                                                      /*Name=*/".zero.addr");
-  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-  // ThreadId for serialized parallels is 0.
-  Address ThreadIDAddr = ZeroAddr;
-  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, &ThreadIDAddr](
-                       CodeGenFunction &CGF, PrePostActionTy &Action) {
-    Action.Enter(CGF);
-
-    Address ZeroAddr =
-        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
-                                         /*Name=*/".bound.zero.addr");
-    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
-    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
-    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
-    emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
-  };
-  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
-                                        PrePostActionTy &) {
-
-    RegionCodeGenTy RCG(CodeGen);
-    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-    llvm::Value *ThreadID = getThreadID(CGF, Loc);
-    llvm::Value *Args[] = {RTLoc, ThreadID};
-
-    NVPTXActionTy Action(
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
-        Args,
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
-        Args);
-    RCG.setAction(Action);
-    RCG(CGF);
-  };
-
-  if (IsInTargetMasterThreadRegion) {
-    // In the worker need to use the real thread id.
-    ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
-    RegionCodeGenTy RCG(CodeGen);
-    RCG(CGF);
-  } else {
-    // If we are not in the target region, it is definitely L2 parallelism or
-    // more, because for SPMD mode we always have an L1 parallel level, so we
-    // don't need to check for orphaned directives.
-    RegionCodeGenTy RCG(SeqGen);
-    RCG(CGF);
-  }
-}
-
-void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
-  // Always emit simple barriers!
-  if (!CGF.HaveInsertPoint())
-    return;
-  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
-  // This function does not use parameters, so we can emit just default values.
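"Default values" here means a null ident_t pointer and global_tid 0, both of which the simple SPMD barrier ignores; the call is also marked convergent so optimizations cannot move it across divergent control flow. A sketch of that pattern in isolation (parameter names are illustrative):

  llvm::CallInst *emitSimpleSPMDBarrier(CodeGenFunction &CGF,
                                        llvm::FunctionCallee BarrierFn,
                                        llvm::PointerType *IdentPtrTy) {
    llvm::Value *Args[] = {llvm::ConstantPointerNull::get(IdentPtrTy),
                           CGF.Builder.getInt32(0)};
    llvm::CallInst *Call = CGF.EmitRuntimeCall(BarrierFn, Args);
    Call->setConvergent(); // keep the barrier under uniform control flow
    return Call;
  }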
- llvm::Value *Args[] = { - llvm::ConstantPointerNull::get( - cast(getIdentTyPointerTy())), - llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; - llvm::CallInst *Call = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args); - Call->setConvergent(); -} - -void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF, - SourceLocation Loc, - OpenMPDirectiveKind Kind, bool, - bool) { - // Always emit simple barriers! - if (!CGF.HaveInsertPoint()) - return; - // Build call __kmpc_cancel_barrier(loc, thread_id); - unsigned Flags = getDefaultFlagsForBarriers(Kind); - llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), - getThreadID(CGF, Loc)}; - llvm::CallInst *Call = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args); - Call->setConvergent(); -} - -void CGOpenMPRuntimeNVPTX::emitCriticalRegion( - CodeGenFunction &CGF, StringRef CriticalName, - const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, - const Expr *Hint) { - llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop"); - llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test"); - llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync"); - llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); - - // Get the mask of active threads in the warp. - llvm::Value *Mask = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask)); - // Fetch team-local id of the thread. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - - // Get the width of the team. - llvm::Value *TeamWidth = getNVPTXNumThreads(CGF); - - // Initialize the counter variable for the loop. - QualType Int32Ty = - CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); - Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); - LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); - CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, - /*isInit=*/true); - - // Block checks if loop counter exceeds upper bound. - CGF.EmitBlock(LoopBB); - llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); - llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth); - CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB); - - // Block tests which single thread should execute region, and which threads - // should go straight to synchronisation point. - CGF.EmitBlock(TestBB); - CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); - llvm::Value *CmpThreadToCounter = - CGF.Builder.CreateICmpEQ(ThreadID, CounterVal); - CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB); - - // Block emits the body of the critical region. - CGF.EmitBlock(BodyBB); - - // Output the critical statement. - CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, - Hint); - - // After the body surrounded by the critical region, the single executing - // thread will jump to the synchronisation point. - // Block waits for all threads in current team to finish then increments the - // counter variable and returns to the loop. - CGF.EmitBlock(SyncBB); - // Reconverge active threads in the warp. 
-  (void)CGF.EmitRuntimeCall(
-      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_syncwarp), Mask);
-
-  llvm::Value *IncCounterVal =
-      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
-  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
-  CGF.EmitBranch(LoopBB);
-
-  // Block that is reached when all threads in the team complete the region.
-  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
-}
-
-/// Cast value to the specified type.
-static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
-                                    QualType ValTy, QualType CastTy,
-                                    SourceLocation Loc) {
-  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
-         "Cast type must be sized.");
-  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
-         "Val type must be sized.");
-  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
-  if (ValTy == CastTy)
-    return Val;
-  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
-      CGF.getContext().getTypeSizeInChars(CastTy))
-    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
-  if (CastTy->isIntegerType() && ValTy->isIntegerType())
-    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
-                                     CastTy->hasSignedIntegerRepresentation());
-  Address CastItem = CGF.CreateMemTemp(CastTy);
-  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
-  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
-  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
-}
-
-/// This function creates calls to one of two shuffle functions to copy
-/// variables between lanes in a warp.
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
-                                                 llvm::Value *Elem,
-                                                 QualType ElemType,
-                                                 llvm::Value *Offset,
-                                                 SourceLocation Loc) {
-  CodeGenModule &CGM = CGF.CGM;
-  CGBuilderTy &Bld = CGF.Builder;
-  CGOpenMPRuntimeNVPTX &RT =
-      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
-
-  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
-  assert(Size.getQuantity() <= 8 &&
-         "Unsupported bitwidth in shuffle instruction.");
-
-  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
-                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
-                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;
-
-  // Cast all types to 32- or 64-bit values before calling shuffle routines.
-  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
-      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
-  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
-  llvm::Value *WarpSize =
-      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
-
-  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
-      RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
-
-  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
-}
-
-static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
-                            Address DestAddr, QualType ElemType,
-                            llvm::Value *Offset, SourceLocation Loc) {
-  CGBuilderTy &Bld = CGF.Builder;
-
-  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
-  // Create the loop over the big sized data.
-  // ptr = (void*)Elem;
-  // ptrEnd = (void*) Elem + 1;
-  // Step = 8;
-  // while (ptr + Step < ptrEnd)
-  //   shuffle((int64_t)*ptr);
-  // Step = 4;
-  // while (ptr + Step < ptrEnd)
-  //   shuffle((int32_t)*ptr);
-  // ...
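To restate the dispatch in createRuntimeShuffleFunction above: elements of up to 4 bytes travel through the 32-bit shuffle entry and 5..8-byte elements through the 64-bit one; anything larger never reaches it, because shuffleAndStore splits it into pieces first. As a standalone sketch:

  #include <cassert>

  const char *pickShuffleEntry(unsigned SizeInBytes) {
    assert(SizeInBytes <= 8 && "larger elements are split before shuffling");
    return SizeInBytes <= 4 ? "__kmpc_shuffle_int32" : "__kmpc_shuffle_int64";
  }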
- Address ElemPtr = DestAddr; - Address Ptr = SrcAddr; - Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast( - Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy); - for (int IntSize = 8; IntSize >= 1; IntSize /= 2) { - if (Size < CharUnits::fromQuantity(IntSize)) - continue; - QualType IntType = CGF.getContext().getIntTypeForBitwidth( - CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)), - /*Signed=*/1); - llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType); - Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo()); - ElemPtr = - Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo()); - if (Size.getQuantity() / IntSize > 1) { - llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond"); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit"); - llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock(); - CGF.EmitBlock(PreCondBB); - llvm::PHINode *PhiSrc = - Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2); - PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB); - llvm::PHINode *PhiDest = - Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2); - PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB); - Ptr = Address(PhiSrc, Ptr.getAlignment()); - ElemPtr = Address(PhiDest, ElemPtr.getAlignment()); - llvm::Value *PtrDiff = Bld.CreatePtrDiff( - PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast( - Ptr.getPointer(), CGF.VoidPtrTy)); - Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)), - ThenBB, ExitBB); - CGF.EmitBlock(ThenBB); - llvm::Value *Res = createRuntimeShuffleFunction( - CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), - IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); - Address LocalPtr = Bld.CreateConstGEP(Ptr, 1); - Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1); - PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); - PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB); - CGF.EmitBranch(PreCondBB); - CGF.EmitBlock(ExitBB); - } else { - llvm::Value *Res = createRuntimeShuffleFunction( - CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), - IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); - Ptr = Bld.CreateConstGEP(Ptr, 1); - ElemPtr = Bld.CreateConstGEP(ElemPtr, 1); - } - Size = Size % IntSize; - } -} - -namespace { -enum CopyAction : unsigned { - // RemoteLaneToThread: Copy over a Reduce list from a remote lane in - // the warp using shuffle instructions. - RemoteLaneToThread, - // ThreadCopy: Make a copy of a Reduce list on the thread's stack. - ThreadCopy, - // ThreadToScratchpad: Copy a team-reduced array to the scratchpad. - ThreadToScratchpad, - // ScratchpadToThread: Copy from a scratchpad array in global memory - // containing team-reduced data to a thread's stack. - ScratchpadToThread, -}; -} // namespace - -struct CopyOptionsTy { - llvm::Value *RemoteLaneOffset; - llvm::Value *ScratchpadIndex; - llvm::Value *ScratchpadWidth; -}; - -/// Emit instructions to copy a Reduce list, which contains partially -/// aggregated values, in the specified direction. 
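A host-side analogue of the decreasing-width walk in shuffleAndStore above: move 8-byte pieces while they fit, then 4, 2 and 1, so an odd-sized element costs at most one transfer per width (a sketch; the IR version shuffles each piece across lanes instead of copying):

  #include <cstddef>
  #include <cstring>

  void chunkedCopy(char *Dst, const char *Src, std::size_t Size) {
    for (std::size_t W = 8; W >= 1; W /= 2) {
      while (Size >= W) {
        std::memcpy(Dst, Src, W);
        Dst += W; Src += W; Size -= W;
      }
    }
  }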
-static void emitReductionListCopy( - CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, - ArrayRef Privates, Address SrcBase, Address DestBase, - CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) { - - CodeGenModule &CGM = CGF.CGM; - ASTContext &C = CGM.getContext(); - CGBuilderTy &Bld = CGF.Builder; - - llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; - llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex; - llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth; - - // Iterates, element-by-element, through the source Reduce list and - // make a copy. - unsigned Idx = 0; - unsigned Size = Privates.size(); - for (const Expr *Private : Privates) { - Address SrcElementAddr = Address::invalid(); - Address DestElementAddr = Address::invalid(); - Address DestElementPtrAddr = Address::invalid(); - // Should we shuffle in an element from a remote lane? - bool ShuffleInElement = false; - // Set to true to update the pointer in the dest Reduce list to a - // newly created element. - bool UpdateDestListPtr = false; - // Increment the src or dest pointer to the scratchpad, for each - // new element. - bool IncrScratchpadSrc = false; - bool IncrScratchpadDest = false; - - switch (Action) { - case RemoteLaneToThread: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr, - C.getPointerType(Private->getType())->castAs()); - - // Step 1.2: Create a temporary to store the element in the destination - // Reduce list. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = - CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); - ShuffleInElement = true; - UpdateDestListPtr = true; - break; - } - case ThreadCopy: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr, - C.getPointerType(Private->getType())->castAs()); - - // Step 1.2: Get the address for dest element. The destination - // element has already been created on the thread's stack. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = CGF.EmitLoadOfPointer( - DestElementPtrAddr, - C.getPointerType(Private->getType())->castAs()); - break; - } - case ThreadToScratchpad: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr, - C.getPointerType(Private->getType())->castAs()); - - // Step 1.2: Get the address for dest element: - // address = base + index * ElementSizeInChars. - llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); - llvm::Value *CurrentOffset = - Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); - llvm::Value *ScratchPadElemAbsolutePtrVal = - Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset); - ScratchPadElemAbsolutePtrVal = - Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); - DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, - C.getTypeAlignInChars(Private->getType())); - IncrScratchpadDest = true; - break; - } - case ScratchpadToThread: { - // Step 1.1: Get the address for the src element in the scratchpad. - // address = base + index * ElementSizeInChars. 
- llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); - llvm::Value *CurrentOffset = - Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); - llvm::Value *ScratchPadElemAbsolutePtrVal = - Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset); - ScratchPadElemAbsolutePtrVal = - Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); - SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, - C.getTypeAlignInChars(Private->getType())); - IncrScratchpadSrc = true; - - // Step 1.2: Create a temporary to store the element in the destination - // Reduce list. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = - CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); - UpdateDestListPtr = true; - break; - } - } - - // Regardless of src and dest of copy, we emit the load of src - // element as this is required in all directions - SrcElementAddr = Bld.CreateElementBitCast( - SrcElementAddr, CGF.ConvertTypeForMem(Private->getType())); - DestElementAddr = Bld.CreateElementBitCast(DestElementAddr, - SrcElementAddr.getElementType()); - - // Now that all active lanes have read the element in the - // Reduce list, shuffle over the value from the remote lane. - if (ShuffleInElement) { - shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(), - RemoteLaneOffset, Private->getExprLoc()); - } else { - switch (CGF.getEvaluationKind(Private->getType())) { - case TEK_Scalar: { - llvm::Value *Elem = - CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false, - Private->getType(), Private->getExprLoc()); - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false, - Private->getType()); - break; - } - case TEK_Complex: { - CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( - CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), - Private->getExprLoc()); - CGF.EmitStoreOfComplex( - Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()), - /*isInit=*/false); - break; - } - case TEK_Aggregate: - CGF.EmitAggregateCopy( - CGF.MakeAddrLValue(DestElementAddr, Private->getType()), - CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - break; - } - } - - // Step 3.1: Modify reference in dest Reduce list as needed. - // Modifying the reference in Reduce list to point to the newly - // created element. The element is live in the current function - // scope and that of functions it invokes (i.e., reduce_function). - // RemoteReduceData[i] = (void*)&RemoteElem - if (UpdateDestListPtr) { - CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast( - DestElementAddr.getPointer(), CGF.VoidPtrTy), - DestElementPtrAddr, /*Volatile=*/false, - C.VoidPtrTy); - } - - // Step 4.1: Increment SrcBase/DestBase so that it points to the starting - // address of the next element in scratchpad memory, unless we're currently - // processing the last one. Memory alignment is also taken care of here. - if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) { - llvm::Value *ScratchpadBasePtr = - IncrScratchpadDest ? 
DestBase.getPointer() : SrcBase.getPointer(); - llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); - ScratchpadBasePtr = Bld.CreateNUWAdd( - ScratchpadBasePtr, - Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars)); - - // Take care of global memory alignment for performance - ScratchpadBasePtr = Bld.CreateNUWSub( - ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); - ScratchpadBasePtr = Bld.CreateUDiv( - ScratchpadBasePtr, - llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); - ScratchpadBasePtr = Bld.CreateNUWAdd( - ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); - ScratchpadBasePtr = Bld.CreateNUWMul( - ScratchpadBasePtr, - llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); - - if (IncrScratchpadDest) - DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); - else /* IncrScratchpadSrc = true */ - SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); - } - - ++Idx; - } -} - -/// This function emits a helper that gathers Reduce lists from the first -/// lane of every active warp to lanes in the first warp. -/// -/// void inter_warp_copy_func(void* reduce_data, num_warps) -/// shared smem[warp_size]; -/// For all data entries D in reduce_data: -/// sync -/// If (I am the first lane in each warp) -/// Copy my local D to smem[warp_id] -/// sync -/// if (I am the first warp) -/// Copy smem[thread_id] to my local D -static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, - ArrayRef Privates, - QualType ReductionArrayTy, - SourceLocation Loc) { - ASTContext &C = CGM.getContext(); - llvm::Module &M = CGM.getModule(); - - // ReduceList: thread local Reduce list. - // At the stage of the computation when this function is called, partially - // aggregated values reside in the first lane of every active warp. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // NumWarps: number of warps active in the parallel region. This could - // be smaller than 32 (max warps in a CTA) for partial block reduction. - ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.getIntTypeForBitwidth(32, /* Signed */ true), - ImplicitParamDecl::Other); - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&NumWarpsArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI), - llvm::GlobalValue::InternalLinkage, - "_omp_reduction_inter_warp_copy_func", &M); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - // This array is used as a medium to transfer, one reduce element at a time, - // the data from the first lane of every warp to lanes in the first warp - // in order to perform the final step of a reduction in a parallel region - // (reduction across warps). The array is placed in NVPTX __shared__ memory - // for reduced latency, as well as to have a distinct copy for concurrently - // executing target regions. The array is declared with common linkage so - // as to be shared across compilation units. 
- StringRef TransferMediumName = - "__openmp_nvptx_data_transfer_temporary_storage"; - llvm::GlobalVariable *TransferMedium = - M.getGlobalVariable(TransferMediumName); - if (!TransferMedium) { - auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); - unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); - TransferMedium = new llvm::GlobalVariable( - M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, - llvm::Constant::getNullValue(Ty), TransferMediumName, - /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, - SharedAddressSpace); - CGM.addCompilerUsedGlobal(TransferMedium); - } - - // Get the CUDA thread id of the current OpenMP thread on the GPU. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - // nvptx_lane_id = nvptx_id % warpsize - llvm::Value *LaneID = getNVPTXLaneID(CGF); - // nvptx_warp_id = nvptx_id / warpsize - llvm::Value *WarpID = getNVPTXWarpID(CGF); - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - - unsigned Idx = 0; - for (const Expr *Private : Privates) { - // - // Warp master copies reduce element to transfer medium in __shared__ - // memory. - // - unsigned RealTySize = - C.getTypeSizeInChars(Private->getType()) - .alignTo(C.getTypeAlignInChars(Private->getType())) - .getQuantity(); - for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) { - unsigned NumIters = RealTySize / TySize; - if (NumIters == 0) - continue; - QualType CType = C.getIntTypeForBitwidth( - C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); - llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); - CharUnits Align = CharUnits::fromQuantity(TySize); - llvm::Value *Cnt = nullptr; - Address CntAddr = Address::invalid(); - llvm::BasicBlock *PrecondBB = nullptr; - llvm::BasicBlock *ExitBB = nullptr; - if (NumIters > 1) { - CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); - CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, - /*Volatile=*/false, C.IntTy); - PrecondBB = CGF.createBasicBlock("precond"); - ExitBB = CGF.createBasicBlock("exit"); - llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(PrecondBB); - Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); - llvm::Value *Cmp = - Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); - Bld.CreateCondBr(Cmp, BodyBB, ExitBB); - CGF.EmitBlock(BodyBB); - } - // kmpc_barrier. 
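The id arithmetic above (nvptx_lane_id = id % warpsize, nvptx_warp_id = id / warpsize) in scalar form, with the warp size fixed at 32 on NVPTX (illustrative sketch):

  struct WarpCoords {
    unsigned Warp; // index into the shared transfer medium
    unsigned Lane; // lane 0 is the warp master that writes its element
  };

  WarpCoords splitThreadId(unsigned Tid, unsigned WarpSize = 32) {
    // e.g. Tid 70 -> warp 2, lane 6
    return {Tid / WarpSize, Tid % WarpSize};
  }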
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, - /*EmitChecks=*/false, - /*ForceSimpleCall=*/true); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - - // if (lane_id == 0) - llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); - Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); - CGF.EmitBlock(ThenBB); - - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = ((CopyType*)(elemptrptr)) + I - Address ElemPtr = Address(ElemPtrPtr, Align); - ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType); - if (NumIters > 1) { - ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt), - ElemPtr.getAlignment()); - } - - // Get pointer to location in transfer medium. - // MediumPtr = &medium[warp_id] - llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); - Address MediumPtr(MediumPtrVal, Align); - // Casting to actual data type. - // MediumPtr = (CopyType*)MediumPtrAddr; - MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType); - - // elem = *elemptr - //*MediumPtr = elem - llvm::Value *Elem = - CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc); - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType); - - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(MergeBB); - - // kmpc_barrier. - CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, - /*EmitChecks=*/false, - /*ForceSimpleCall=*/true); - - // - // Warp 0 copies reduce element from transfer medium. - // - llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); - - Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); - llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( - AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); - - // Up to 32 threads in warp 0 are active. 
- llvm::Value *IsActiveThread = - Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); - Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); - - CGF.EmitBlock(W0ThenBB); - - // SrcMediumPtr = &medium[tid] - llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, - {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); - Address SrcMediumPtr(SrcMediumPtrVal, Align); - // SrcMediumVal = *SrcMediumPtr; - SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType); - - // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I - Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( - TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); - Address TargetElemPtr = Address(TargetElemPtrVal, Align); - TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType); - if (NumIters > 1) { - TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt), - TargetElemPtr.getAlignment()); - } - - // *TargetElemPtr = SrcMediumVal; - llvm::Value *SrcMediumValue = - CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc); - CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, - CType); - Bld.CreateBr(W0MergeBB); - - CGF.EmitBlock(W0ElseBB); - Bld.CreateBr(W0MergeBB); - - CGF.EmitBlock(W0MergeBB); - - if (NumIters > 1) { - Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); - CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); - CGF.EmitBranch(PrecondBB); - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(ExitBB); - } - RealTySize %= TySize; - } - ++Idx; - } - - CGF.FinishFunction(); - return Fn; -} - -/// Emit a helper that reduces data across two OpenMP threads (lanes) -/// in the same warp. It uses shuffle instructions to copy over data from -/// a remote lane's stack. The reduction algorithm performed is specified -/// by the fourth parameter. -/// -/// Algorithm Versions. -/// Full Warp Reduce (argument value 0): -/// This algorithm assumes that all 32 lanes are active and gathers -/// data from these 32 lanes, producing a single resultant value. -/// Contiguous Partial Warp Reduce (argument value 1): -/// This algorithm assumes that only a *contiguous* subset of lanes -/// are active. This happens for the last warp in a parallel region -/// when the user specified num_threads is not an integer multiple of -/// 32. This contiguous subset always starts with the zeroth lane. -/// Partial Warp Reduce (argument value 2): -/// This algorithm gathers data from any number of lanes at any position. -/// All reduced values are stored in the lowest possible lane. The set -/// of problems every algorithm addresses is a super set of those -/// addressable by algorithms with a lower version number. Overhead -/// increases as algorithm version increases. -/// -/// Terminology -/// Reduce element: -/// Reduce element refers to the individual data field with primitive -/// data types to be combined and reduced across threads. -/// Reduce list: -/// Reduce list refers to a collection of local, thread-private -/// reduce elements. -/// Remote Reduce list: -/// Remote Reduce list refers to a collection of remote (relative to -/// the current thread) reduce elements. -/// -/// We distinguish between three states of threads that are important to -/// the implementation of this function. 
-/// Alive threads: -/// Threads in a warp executing the SIMT instruction, as distinguished from -/// threads that are inactive due to divergent control flow. -/// Active threads: -/// The minimal set of threads that has to be alive upon entry to this -/// function. The computation is correct iff active threads are alive. -/// Some threads are alive but they are not active because they do not -/// contribute to the computation in any useful manner. Turning them off -/// may introduce control flow overheads without any tangible benefits. -/// Effective threads: -/// In order to comply with the argument requirements of the shuffle -/// function, we must keep all lanes holding data alive. But at most -/// half of them perform value aggregation; we refer to this half of -/// threads as effective. The other half is simply handing off their -/// data. -/// -/// Procedure -/// Value shuffle: -/// In this step active threads transfer data from higher lane positions -/// in the warp to lower lane positions, creating Remote Reduce list. -/// Value aggregation: -/// In this step, effective threads combine their thread local Reduce list -/// with Remote Reduce list and store the result in the thread local -/// Reduce list. -/// Value copy: -/// In this step, we deal with the assumption made by algorithm 2 -/// (i.e. contiguity assumption). When we have an odd number of lanes -/// active, say 2k+1, only k threads will be effective and therefore k -/// new values will be produced. However, the Reduce list owned by the -/// (2k+1)th thread is ignored in the value aggregation. Therefore -/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so -/// that the contiguity assumption still holds. -static llvm::Function *emitShuffleAndReduceFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) { - ASTContext &C = CGM.getContext(); - - // Thread local Reduce list used to host the values of data to be reduced. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // Current lane id; could be logical. - ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy, - ImplicitParamDecl::Other); - // Offset of the remote source lane relative to the current lane. - ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.ShortTy, ImplicitParamDecl::Other); - // Algorithm version. This is expected to be known at compile time. 
- ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.ShortTy, ImplicitParamDecl::Other); - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&LaneIDArg); - Args.push_back(&RemoteLaneOffsetArg); - Args.push_back(&AlgoVerArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - if (CGM.getLangOpts().Optimize) { - Fn->removeFnAttr(llvm::Attribute::NoInline); - Fn->removeFnAttr(llvm::Attribute::OptimizeNone); - Fn->addFnAttr(llvm::Attribute::AlwaysInline); - } - - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, SourceLocation()), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - - Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); - llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( - AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); - llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( - AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); - llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( - AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); - - // Create a local thread-private variable to host the Reduce list - // from a remote lane. - Address RemoteReduceList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); - - // This loop iterates through the list of reduce elements and copies, - // element by element, from a remote lane in the warp to RemoteReduceList, - // hosted on the thread's stack. - emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, - LocalReduceList, RemoteReduceList, - {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal, - /*ScratchpadIndex=*/nullptr, - /*ScratchpadWidth=*/nullptr}); - - // The actions to be performed on the Remote Reduce list are dependent - // on the algorithm version. - // - // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && - // LaneId % 2 == 0 && Offset > 0): - // do the reduction value aggregation - // - // The thread local variable Reduce list is mutated in place to host the - // reduced data, which is the aggregated value produced from local and - // remote lanes. - // - // Note that AlgoVer is expected to be a constant integer known at compile - // time. - // When AlgoVer==0, the first conjunction evaluates to true, making - // the entire predicate true at compile time. - // When AlgoVer==1, the second conjunction has only the second part to be - // evaluated at runtime. Other conjunctions evaluate to false - // at compile time. - // When AlgoVer==2, the third conjunction has only the second part to be - // evaluated at runtime. Other conjunctions evaluate to false - // at compile time.
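Restated as plain C++ for clarity (this merely transcribes the predicate described above in code form; it is not additional emitted code):

    // AlgoVer is a compile-time constant in practice, so two of the three
    // disjuncts always fold away.
    bool DoReduce = (AlgoVer == 0) ||
                    (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
                    (AlgoVer == 2 && (LaneId & 1) == 0 && RemoteLaneOffset > 0);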
- llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal); - - llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); - llvm::Value *CondAlgo1 = Bld.CreateAnd( - Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); - - llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)); - llvm::Value *CondAlgo2 = Bld.CreateAnd( - Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)))); - CondAlgo2 = Bld.CreateAnd( - CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); - - llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); - CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); - - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); - - CGF.EmitBlock(ThenBB); - // reduce_function(LocalReduceList, RemoteReduceList) - llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - LocalReduceList.getPointer(), CGF.VoidPtrTy); - llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - RemoteReduceList.getPointer(), CGF.VoidPtrTy); - CGM.getOpenMPRuntime().emitOutlinedFunctionCall( - CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(MergeBB); - - // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local - // Reduce list. - Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); - llvm::Value *CondCopy = Bld.CreateAnd( - Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); - - llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); - Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); - - CGF.EmitBlock(CpyThenBB); - emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, - RemoteReduceList, LocalReduceList); - Bld.CreateBr(CpyMergeBB); - - CGF.EmitBlock(CpyElseBB); - Bld.CreateBr(CpyMergeBB); - - CGF.EmitBlock(CpyMergeBB); - - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that copies all the reduction variables from -/// the team into the provided global buffer for the reduction variables. -/// -/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) -/// For all data entries D in reduce_data: -/// Copy local D to buffer.D[Idx] -static llvm::Value *emitListToGlobalCopyFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); - // ReduceList: thread local Reduce list. 
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_list_to_global_copy_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (const Expr *Private : Privates) { - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = ((CopyType*)(elemptrptr)) + I - ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); - Address ElemPtr = - Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); - const ValueDecl *VD = cast(Private)->getDecl(); - // Global = Buffer.VD[Idx]; - const FieldDecl *FD = VarFieldMap.lookup(VD); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); - GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); - switch (CGF.getEvaluationKind(Private->getType())) { - case TEK_Scalar: { - llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, - Private->getType(), Loc); - CGF.EmitStoreOfScalar(V, GlobLVal); - break; - } - case TEK_Complex: { - CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( - CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc); - CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false); - break; - } - case TEK_Aggregate: - CGF.EmitAggregateCopy(GlobLVal, - CGF.MakeAddrLValue(ElemPtr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - break; - } - ++Idx; - } - - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that reduces all the reduction variables from -/// the team into the provided global buffer for the reduction variables. -/// -/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) -/// void *GlobPtrs[]; -/// GlobPtrs[0] = (void*)&buffer.D0[Idx]; -/// ... 
-/// GlobPtrs[N] = (void*)&buffer.DN[Idx]; -/// reduce_function(GlobPtrs, reduce_data); -static llvm::Value *emitListToGlobalReduceFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap, - llvm::Function *ReduceFn) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); - // ReduceList: thread local Reduce list. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_list_to_global_reduce_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - - // 1. Build a list of reduction variables. - // void *RedList[] = {[0], ..., [-1]}; - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - // Global = Buffer.VD[Idx]; - const ValueDecl *VD = cast(*IPriv)->getDecl(); - const FieldDecl *FD = VarFieldMap.lookup(VD); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); - llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); - CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. 
- ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - // Call reduce_function(GlobalReduceList, ReduceList) - llvm::Value *GlobalReduceList = - CGF.EmitCastToVoidPtr(ReductionList.getPointer()); - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( - AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); - CGM.getOpenMPRuntime().emitOutlinedFunctionCall( - CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr}); - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that copies all the reduction variables from -/// the team into the provided global buffer for the reduction variables. -/// -/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) -/// For all data entries D in reduce_data: -/// Copy buffer.D[Idx] to local D; -static llvm::Value *emitGlobalToListCopyFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); - // ReduceList: thread local Reduce list. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_global_to_list_copy_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (const Expr *Private : Privates) { - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - 
ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = ((CopyType*)(elemptrptr)) + I - ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); - Address ElemPtr = - Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); - const ValueDecl *VD = cast(Private)->getDecl(); - // Global = Buffer.VD[Idx]; - const FieldDecl *FD = VarFieldMap.lookup(VD); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); - GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); - switch (CGF.getEvaluationKind(Private->getType())) { - case TEK_Scalar: { - llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); - CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType()); - break; - } - case TEK_Complex: { - CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc); - CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()), - /*isInit=*/false); - break; - } - case TEK_Aggregate: - CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), - GlobLVal, Private->getType(), - AggValueSlot::DoesNotOverlap); - break; - } - ++Idx; - } - - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that reduces all the reduction variables from -/// the team into the provided global buffer for the reduction variables. -/// -/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) -/// void *GlobPtrs[]; -/// GlobPtrs[0] = (void*)&buffer.D0[Idx]; -/// ... -/// GlobPtrs[N] = (void*)&buffer.DN[Idx]; -/// reduce_function(reduce_data, GlobPtrs); -static llvm::Value *emitGlobalToListReduceFunction( - CodeGenModule &CGM, ArrayRef Privates, - QualType ReductionArrayTy, SourceLocation Loc, - const RecordDecl *TeamReductionRec, - const llvm::SmallDenseMap - &VarFieldMap, - llvm::Function *ReduceFn) { - ASTContext &C = CGM.getContext(); - - // Buffer: global reduction buffer. - ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // Idx: index of the buffer. - ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); - // ReduceList: thread local Reduce list. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - FunctionArgList Args; - Args.push_back(&BufferArg); - Args.push_back(&IdxArg); - Args.push_back(&ReduceListArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_global_to_list_reduce_func", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); - QualType StaticTy = C.getRecordType(TeamReductionRec); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - LLVMReductionsBufferTy->getPointerTo()); - - // 1. 
Build a list of reduction variables. - // void *RedList[] = {[0], ..., [-1]}; - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), - /*Volatile=*/false, C.IntTy, - Loc)}; - unsigned Idx = 0; - for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - // Global = Buffer.VD[Idx]; - const ValueDecl *VD = cast(*IPriv)->getDecl(); - const FieldDecl *FD = VarFieldMap.lookup(VD); - LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = - Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); - llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); - CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. - ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - // Call reduce_function(ReduceList, GlobalReduceList) - llvm::Value *GlobalReduceList = - CGF.EmitCastToVoidPtr(ReductionList.getPointer()); - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( - AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); - CGM.getOpenMPRuntime().emitOutlinedFunctionCall( - CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList}); - CGF.FinishFunction(); - return Fn; -} - -/// -/// Design of OpenMP reductions on the GPU -/// -/// Consider a typical OpenMP program with one or more reduction -/// clauses: -/// -/// float foo; -/// double bar; -/// #pragma omp target teams distribute parallel for \ -/// reduction(+:foo) reduction(*:bar) -/// for (int i = 0; i < N; i++) { -/// foo += A[i]; bar *= B[i]; -/// } -/// -/// where 'foo' and 'bar' are reduced across all OpenMP threads in -/// all teams. In our OpenMP implementation on the NVPTX device an -/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads -/// within a team are mapped to CUDA threads within a threadblock. -/// Our goal is to efficiently aggregate values across all OpenMP -/// threads such that: -/// -/// - the compiler and runtime are logically concise, and -/// - the reduction is performed efficiently in a hierarchical -/// manner as follows: within OpenMP threads in the same warp, -/// across warps in a threadblock, and finally across teams on -/// the NVPTX device. -/// -/// Introduction to Decoupling -/// -/// We would like to decouple the compiler and the runtime so that the -/// latter is ignorant of the reduction variables (number, data types) -/// and the reduction operators. This allows a simpler interface -/// and implementation while still attaining good performance. -/// -/// Pseudocode for the aforementioned OpenMP program generated by the -/// compiler is as follows: -/// -/// 1. Create private copies of reduction variables on each OpenMP -/// thread: 'foo_private', 'bar_private' -/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned -/// to it and writes the result in 'foo_private' and 'bar_private' -/// respectively. -/// 3. 
Call the OpenMP runtime on the GPU to reduce within a team -/// and store the result on the team master: -/// -/// __kmpc_nvptx_parallel_reduce_nowait_v2(..., -/// reduceData, shuffleReduceFn, interWarpCpyFn) -/// -/// where: -/// struct ReduceData { -/// float *foo; -/// double *bar; -/// } reduceData -/// reduceData.foo = &foo_private -/// reduceData.bar = &bar_private -/// -/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two -/// auxiliary functions generated by the compiler that operate on -/// variables of type 'ReduceData'. They aid the runtime in performing -/// algorithmic steps in a data-agnostic manner. -/// -/// 'shuffleReduceFn' is a pointer to a function that reduces data -/// of type 'ReduceData' across two OpenMP threads (lanes) in the -/// same warp. It takes the following arguments as input: -/// -/// a. variable of type 'ReduceData' on the calling lane, -/// b. its lane_id, -/// c. an offset relative to the current lane_id to generate a -/// remote_lane_id. The remote lane contains the second -/// variable of type 'ReduceData' that is to be reduced. -/// d. an algorithm version parameter determining which reduction -/// algorithm to use. -/// -/// 'shuffleReduceFn' retrieves data from the remote lane using -/// efficient GPU shuffle intrinsics and reduces, using the -/// algorithm specified by the 4th parameter, the two operands -/// element-wise. The result is written to the first operand. -/// -/// Different reduction algorithms are implemented in different -/// runtime functions, all calling 'shuffleReduceFn' to perform -/// the essential reduction step. Therefore, based on the 4th -/// parameter, this function behaves slightly differently to -/// cooperate with the runtime to ensure correctness under -/// different circumstances. -/// -/// 'InterWarpCpyFn' is a pointer to a function that transfers -/// reduced variables across warps. It tunnels, through CUDA -/// shared memory, the thread-private data of type 'ReduceData' -/// from lane 0 of each warp to a lane in the first warp. -/// 4. Call the OpenMP runtime on the GPU to reduce across teams. -/// The last team writes the global reduced value to memory. -/// -/// ret = __kmpc_nvptx_teams_reduce_nowait(..., -/// reduceData, shuffleReduceFn, interWarpCpyFn, -/// scratchpadCopyFn, loadAndReduceFn) -/// -/// 'scratchpadCopyFn' is a helper that stores reduced -/// data from the team master to a scratchpad array in -/// global memory. -/// -/// 'loadAndReduceFn' is a helper that loads data from -/// the scratchpad array and reduces it with the input -/// operand. -/// -/// These compiler-generated functions hide address -/// calculation and alignment information from the runtime. -/// 5. if ret == 1: -/// The team master of the last team stores the reduced -/// result to the globals in memory. -/// foo += reduceData.foo; bar *= reduceData.bar -/// -/// -/// Warp Reduction Algorithms -/// -/// On the warp level, we have three algorithms implemented in the -/// OpenMP runtime depending on the number of active lanes: -/// -/// Full Warp Reduction -/// -/// The reduce algorithm within a warp where all lanes are active -/// is implemented in the runtime as follows: -/// -/// full_warp_reduce(void *reduce_data, -/// kmp_ShuffleReductFctPtr ShuffleReduceFn) { -/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) -/// ShuffleReduceFn(reduce_data, 0, offset, 0); -/// } -/// -/// The algorithm completes in log(2, WARPSIZE) steps.
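For a concrete instance, assume a 32-bit '+' reduction with all 32 lanes active. On NVPTX the shuffle_down above typically corresponds to CUDA's __shfl_down_sync, so the full-warp case amounts to the following sketch (illustrative only; the real loop lives in the device runtime and calls back into ShuffleReduceFn for arbitrary ReduceData):

    __device__ int full_warp_sum(int reduce_elem) {
      for (int offset = 16; offset > 0; offset /= 2)
        reduce_elem += __shfl_down_sync(0xffffffffu, reduce_elem, offset);
      return reduce_elem;                        // lane 0 holds the warp sum
    }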
-/// -/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is -/// not used; therefore we save instructions by not retrieving lane_id -/// from the corresponding special registers. The 4th parameter, which -/// represents the version of the algorithm being used, is set to 0 to -/// signify full warp reduction. -/// -/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: -/// -/// #reduce_elem refers to an element in the local lane's data structure -/// #remote_elem is retrieved from a remote lane -/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); -/// reduce_elem = reduce_elem REDUCE_OP remote_elem; -/// -/// Contiguous Partial Warp Reduction -/// -/// This reduce algorithm is used within a warp where only the first -/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the -/// number of OpenMP threads in a parallel region is not a multiple of -/// WARPSIZE. The algorithm is implemented in the runtime as follows: -/// -/// void -/// contiguous_partial_reduce(void *reduce_data, -/// kmp_ShuffleReductFctPtr ShuffleReduceFn, -/// int size, int lane_id) { -/// int curr_size; -/// int offset; -/// curr_size = size; -/// offset = curr_size/2; -/// while (offset>0) { -/// ShuffleReduceFn(reduce_data, lane_id, offset, 1); -/// curr_size = (curr_size+1)/2; -/// offset = curr_size/2; -/// } -/// } -/// -/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: -/// -/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); -/// if (lane_id < offset) -/// reduce_elem = reduce_elem REDUCE_OP remote_elem -/// else -/// reduce_elem = remote_elem -/// -/// This algorithm assumes that the data to be reduced are located in a -/// contiguous subset of lanes starting from the first. When there is -/// an odd number of active lanes, the data in the last lane is not -/// aggregated with any other lane's data but is instead copied over. -/// -/// Dispersed Partial Warp Reduction -/// -/// This algorithm is used within a warp when any discontiguous subset of -/// lanes are active. It is used to implement the reduction operation -/// across lanes in an OpenMP simd region or in a nested parallel region. -/// -/// void -/// dispersed_partial_reduce(void *reduce_data, -/// kmp_ShuffleReductFctPtr ShuffleReduceFn) { -/// int size, remote_id; -/// int logical_lane_id = number_of_active_lanes_before_me() * 2; -/// do { -/// remote_id = next_active_lane_id_right_after_me(); -/// # the above function returns 0 if no active lane -/// # is present right after the current lane. -/// size = number_of_active_lanes_in_this_warp(); -/// logical_lane_id /= 2; -/// ShuffleReduceFn(reduce_data, logical_lane_id, -/// remote_id-1-threadIdx.x, 2); -/// } while (logical_lane_id % 2 == 0 && size > 1); -/// } -/// -/// There is no assumption made about the initial state of the reduction. -/// Any number of lanes (>=1) could be active at any position. The reduction -/// result is returned in the first active lane. -/// -/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: -/// -/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); -/// if (lane_id % 2 == 0 && offset > 0) -/// reduce_elem = reduce_elem REDUCE_OP remote_elem -/// else -/// reduce_elem = remote_elem -/// -/// -/// Intra-Team Reduction -/// -/// This function, as implemented in the runtime call -/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP -/// threads in a team. It first reduces within a warp using the -/// aforementioned algorithms.
We then proceed to gather all such -/// reduced values at the first warp. -/// -/// The runtime makes use of the function 'InterWarpCpyFn', which copies -/// data from each of the "warp masters" (zeroth lane of each warp, where -/// warp-reduced data is held) to the zeroth warp. This step reduces (in -/// a mathematical sense) the problem of reduction across warp masters in -/// a block to the problem of warp reduction. -/// -/// -/// Inter-Team Reduction -/// -/// Once a team has reduced its data to a single value, it is stored in -/// a global scratchpad array. Since each team has a distinct slot, this -/// can be done without locking. -/// -/// The last team to write to the scratchpad array proceeds to reduce the -/// scratchpad array. One or more workers in the last team use the helper -/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., -/// the k'th worker reduces every k'th element. -/// -/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to -/// reduce across workers and compute a globally reduced value. -/// -void CGOpenMPRuntimeNVPTX::emitReduction( - CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, - ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, - ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { - if (!CGF.HaveInsertPoint()) - return; - - bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); -#ifndef NDEBUG - bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); -#endif - - if (Options.SimpleReduction) { - assert(!TeamsReduction && !ParallelReduction && - "Invalid reduction selection in emitReduction."); - CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, - ReductionOps, Options); - return; - } - - assert((TeamsReduction || ParallelReduction) && - "Invalid reduction selection in emitReduction."); - - // Build res = __kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList), - // RedList, shuffle_reduce_func, interwarp_copy_func); - // or - // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadId = getThreadID(CGF, Loc); - - llvm::Value *Res; - ASTContext &C = CGM.getContext(); - // 1. Build a list of reduction variables. - // void *RedList[] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; - auto Size = RHSExprs.size(); - for (const Expr *E : Privates) { - if (E->getType()->isVariablyModifiedType()) - // Reserve place for array size. - ++Size; - } - llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); - QualType ReductionArrayTy = - C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - unsigned Idx = 0; - for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - CGF.Builder.CreateStore( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy), - Elem); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size.
- ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - ReductionList.getPointer(), CGF.VoidPtrTy); - llvm::Function *ReductionFn = emitReductionFunction( - Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates, - LHSExprs, RHSExprs, ReductionOps); - llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); - llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( - CGM, Privates, ReductionArrayTy, ReductionFn, Loc); - llvm::Value *InterWarpCopyFn = - emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); - - if (ParallelReduction) { - llvm::Value *Args[] = {RTLoc, - ThreadId, - CGF.Builder.getInt32(RHSExprs.size()), - ReductionArrayTySize, - RL, - ShuffleAndReduceFn, - InterWarpCopyFn}; - - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2), - Args); - } else { - assert(TeamsReduction && "expected teams reduction."); - llvm::SmallDenseMap VarFieldMap; - llvm::SmallVector PrivatesReductions(Privates.size()); - int Cnt = 0; - for (const Expr *DRE : Privates) { - PrivatesReductions[Cnt] = cast(DRE)->getDecl(); - ++Cnt; - } - const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars( - CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap, - C.getLangOpts().OpenMPCUDAReductionBufNum); - TeamsReductions.push_back(TeamReductionRec); - if (!KernelTeamsReductionPtr) { - KernelTeamsReductionPtr = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_teams_reductions_buffer_$_$ptr"); - } - llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( - Address(KernelTeamsReductionPtr, CGM.getPointerAlign()), - /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); - llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); - llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, - ReductionFn); - llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); - llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, - ReductionFn); - - llvm::Value *Args[] = { - RTLoc, - ThreadId, - GlobalBufferPtr, - CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), - RL, - ShuffleAndReduceFn, - InterWarpCopyFn, - GlobalToBufferCpyFn, - GlobalToBufferRedFn, - BufferToGlobalCpyFn, - BufferToGlobalRedFn}; - - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2), - Args); - } - - // 5. Build if (res == 1) - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done"); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then"); - llvm::Value *Cond = CGF.Builder.CreateICmpEQ( - Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1)); - CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB); - - // 6. 
Build then branch: where we have reduced values in the master - // thread in each team. - // __kmpc_end_reduce{_nowait}(); - // break; - CGF.EmitBlock(ThenBB); - - // Add emission of __kmpc_end_reduce{_nowait}(); - auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps, - this](CodeGenFunction &CGF, PrePostActionTy &Action) { - auto IPriv = Privates.begin(); - auto ILHS = LHSExprs.begin(); - auto IRHS = RHSExprs.begin(); - for (const Expr *E : ReductionOps) { - emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), - cast(*IRHS)); - ++IPriv; - ++ILHS; - ++IRHS; - } - }; - llvm::Value *EndArgs[] = {ThreadId}; - RegionCodeGenTy RCG(CodeGen); - NVPTXActionTy Action( - nullptr, llvm::None, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), - EndArgs); - RCG.setAction(Action); - RCG(CGF); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(ExitBB, /*IsFinished=*/true); -} - -const VarDecl * -CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD, - const VarDecl *NativeParam) const { - if (!NativeParam->getType()->isReferenceType()) - return NativeParam; - QualType ArgType = NativeParam->getType(); - QualifierCollector QC; - const Type *NonQualTy = QC.strip(ArgType); - QualType PointeeTy = cast(NonQualTy)->getPointeeType(); - if (const auto *Attr = FD->getAttr()) { - if (Attr->getCaptureKind() == OMPC_map) { - PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, - LangAS::opencl_global); - } else if (Attr->getCaptureKind() == OMPC_firstprivate && - PointeeTy.isConstant(CGM.getContext())) { - PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, - LangAS::opencl_generic); - } - } - ArgType = CGM.getContext().getPointerType(PointeeTy); - QC.addRestrict(); - enum { NVPTX_local_addr = 5 }; - QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); - ArgType = QC.apply(CGM.getContext(), ArgType); - if (isa(NativeParam)) - return ImplicitParamDecl::Create( - CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), - NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other); - return ParmVarDecl::Create( - CGM.getContext(), - const_cast(NativeParam->getDeclContext()), - NativeParam->getBeginLoc(), NativeParam->getLocation(), - NativeParam->getIdentifier(), ArgType, - /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); -} - -Address -CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF, - const VarDecl *NativeParam, - const VarDecl *TargetParam) const { - assert(NativeParam != TargetParam && - NativeParam->getType()->isReferenceType() && - "Native arg must not be the same as target arg."); - Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); - QualType NativeParamType = NativeParam->getType(); - QualifierCollector QC; - const Type *NonQualTy = QC.strip(NativeParamType); - QualType NativePointeeTy = cast(NonQualTy)->getPointeeType(); - unsigned NativePointeeAddrSpace = - CGF.getContext().getTargetAddressSpace(NativePointeeTy); - QualType TargetTy = TargetParam->getType(); - llvm::Value *TargetAddr = CGF.EmitLoadOfScalar( - LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation()); - // First cast to generic. - TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo( - /*AddrSpace=*/0)); - // Cast from generic to native address space. 
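// (Illustrative aside, assuming an i32 pointee and NVPTX's local address
// space 5; the pair of casts around this comment roughly corresponds to
//   %generic = addrspacecast i32 addrspace(N)* %target to i32*
//   %native  = addrspacecast i32* %generic to i32 addrspace(5)*
// Routing through the generic address space gives a well-defined meeting
// point between the two concrete address spaces.)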
- TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo( - NativePointeeAddrSpace)); - Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType); - CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false, - NativeParamType); - return NativeParamAddr; -} - -void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall( - CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, - ArrayRef Args) const { - SmallVector TargetArgs; - TargetArgs.reserve(Args.size()); - auto *FnType = OutlinedFn.getFunctionType(); - for (unsigned I = 0, E = Args.size(); I < E; ++I) { - if (FnType->isVarArg() && FnType->getNumParams() <= I) { - TargetArgs.append(std::next(Args.begin(), I), Args.end()); - break; - } - llvm::Type *TargetType = FnType->getParamType(I); - llvm::Value *NativeArg = Args[I]; - if (!TargetType->isPointerTy()) { - TargetArgs.emplace_back(NativeArg); - continue; - } - llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - NativeArg, - NativeArg->getType()->getPointerElementType()->getPointerTo()); - TargetArgs.emplace_back( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType)); - } - CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); -} - -/// Emit function which wraps the outline parallel region -/// and controls the arguments which are passed to this function. -/// The wrapper ensures that the outlined function is called -/// with the correct arguments when data is shared. -llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( - llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { - ASTContext &Ctx = CGM.getContext(); - const auto &CS = *D.getCapturedStmt(OMPD_parallel); - - // Create a function that takes as argument the source thread. - FunctionArgList WrapperArgs; - QualType Int16QTy = - Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); - QualType Int32QTy = - Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); - ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), - /*Id=*/nullptr, Int16QTy, - ImplicitParamDecl::Other); - ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), - /*Id=*/nullptr, Int32QTy, - ImplicitParamDecl::Other); - WrapperArgs.emplace_back(&ParallelLevelArg); - WrapperArgs.emplace_back(&WrapperArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); - - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setLinkage(llvm::GlobalValue::InternalLinkage); - Fn->setDoesNotRecurse(); - - CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); - CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, - D.getBeginLoc(), D.getBeginLoc()); - - const auto *RD = CS.getCapturedRecordDecl(); - auto CurField = RD->field_begin(); - - Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, - /*Name=*/".zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); - // Get the array of arguments. - SmallVector Args; - - Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer()); - Args.emplace_back(ZeroAddr.getPointer()); - - CGBuilderTy &Bld = CGF.Builder; - auto CI = CS.capture_begin(); - - // Use global memory for data sharing. 
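In rough pseudo-C++ (names invented for exposition; the loop-bound and capture unpacking is shown in the emission code that follows), the wrapper built here behaves like:

    void outlined_fn_wrapper(uint16_t ParallelLevel, uint32_t Tid) {
      uint32_t Zero = 0;
      void **SharedArgs;
      __kmpc_get_shared_variables(&SharedArgs);  // shared frame from runtime
      // Forward the thread id, a zeroed bound-tid slot, and each shared
      // capture to the real outlined function:
      outlined_fn(&Tid, &Zero, SharedArgs[0] /* , ... */);
    }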
- // Handle passing of global args to workers. - Address GlobalArgs = - CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); - llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); - llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables), - DataSharingArgs); - - // Retrieve the shared variables from the list of references returned - // by the runtime. Pass the variables to the outlined function. - Address SharedArgListAddress = Address::invalid(); - if (CS.capture_size() > 0 || - isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { - SharedArgListAddress = CGF.EmitLoadOfPointer( - GlobalArgs, CGF.getContext() - .getPointerType(CGF.getContext().getPointerType( - CGF.getContext().VoidPtrTy)) - .castAs()); - } - unsigned Idx = 0; - if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { - Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); - Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo()); - llvm::Value *LB = CGF.EmitLoadOfScalar( - TypedAddress, - /*Volatile=*/false, - CGF.getContext().getPointerType(CGF.getContext().getSizeType()), - cast(D).getLowerBoundVariable()->getExprLoc()); - Args.emplace_back(LB); - ++Idx; - Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); - TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo()); - llvm::Value *UB = CGF.EmitLoadOfScalar( - TypedAddress, - /*Volatile=*/false, - CGF.getContext().getPointerType(CGF.getContext().getSizeType()), - cast(D).getUpperBoundVariable()->getExprLoc()); - Args.emplace_back(UB); - ++Idx; - } - if (CS.capture_size() > 0) { - ASTContext &CGFContext = CGF.getContext(); - for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { - QualType ElemTy = CurField->getType(); - Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); - Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy))); - llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, - /*Volatile=*/false, - CGFContext.getPointerType(ElemTy), - CI->getLocation()); - if (CI->capturesVariableByCopy() && - !CI->getCapturedVar()->getType()->isAnyPointerType()) { - Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), - CI->getLocation()); - } - Args.emplace_back(Arg); - } - } - - emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); - CGF.FinishFunction(); - return Fn; -} - -void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, - const Decl *D) { - if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic) - return; - - assert(D && "Expected function or captured|block decl."); - assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && - "Function is registered already."); - assert((!TeamAndReductions.first || TeamAndReductions.first == D) && - "Team is set but not processed."); - const Stmt *Body = nullptr; - bool NeedToDelayGlobalization = false; - if (const auto *FD = dyn_cast(D)) { - Body = FD->getBody(); - } else if (const auto *BD = dyn_cast(D)) { - Body = BD->getBody(); - } else if (const auto *CD = dyn_cast(D)) { - Body = CD->getBody(); - NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; - if (NeedToDelayGlobalization && - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) - return; - } - if (!Body) - return; - CheckVarsEscapingDeclContext VarChecker(CGF, 
TeamAndReductions.second); - VarChecker.Visit(Body); - const RecordDecl *GlobalizedVarsRecord = - VarChecker.getGlobalizedRecord(IsInTTDRegion); - TeamAndReductions.first = nullptr; - TeamAndReductions.second.clear(); - ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = - VarChecker.getEscapedVariableLengthDecls(); - if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty()) - return; - auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; - I->getSecond().MappedParams = - std::make_unique<CodeGenFunction::OMPMapVars>(); - I->getSecond().GlobalRecord = GlobalizedVarsRecord; - I->getSecond().EscapedParameters.insert( - VarChecker.getEscapedParameters().begin(), - VarChecker.getEscapedParameters().end()); - I->getSecond().EscapedVariableLengthDecls.append( - EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end()); - DeclToAddrMapTy &Data = I->getSecond().LocalVarData; - for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { - assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); - } - if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { - CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); - VarChecker.Visit(Body); - I->getSecond().SecondaryGlobalRecord = - VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); - I->getSecond().SecondaryLocalVarData.emplace(); - DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); - for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { - assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert( - std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true))); - } - } - if (!NeedToDelayGlobalization) { - emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); - struct GlobalizationScope final : EHScopeStack::Cleanup { - GlobalizationScope() = default; - - void Emit(CodeGenFunction &CGF, Flags flags) override { - static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) - .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true); - } - }; - CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); - } -} - -Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, - const VarDecl *VD) { - if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) { - const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); - switch (A->getAllocatorType()) { - // Use the default allocator here as by default local vars are - // threadlocal. - case OMPAllocateDeclAttr::OMPNullMemAlloc: - case OMPAllocateDeclAttr::OMPDefaultMemAlloc: - case OMPAllocateDeclAttr::OMPThreadMemAlloc: - case OMPAllocateDeclAttr::OMPHighBWMemAlloc: - case OMPAllocateDeclAttr::OMPLowLatMemAlloc: - // Follow the user decision - use default allocation. - return Address::invalid(); - case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: - // TODO: implement support for user-defined allocators.
-Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
-                                                        const VarDecl *VD) {
-  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
-    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
-    switch (A->getAllocatorType()) {
-    // Use the default allocator here as by default local vars are
-    // threadlocal.
-    case OMPAllocateDeclAttr::OMPNullMemAlloc:
-    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
-    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
-    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
-    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-      // Follow the user decision - use default allocation.
-      return Address::invalid();
-    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
-      // TODO: implement support for user-defined allocators.
-      return Address::invalid();
-    case OMPAllocateDeclAttr::OMPConstMemAlloc: {
-      llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), VarTy, /*isConstant=*/false,
-          llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(VarTy), VD->getName(),
-          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
-          CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant));
-      CharUnits Align = CGM.getContext().getDeclAlign(VD);
-      GV->setAlignment(Align.getAsAlign());
-      return Address(GV, Align);
-    }
-    case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
-      llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), VarTy, /*isConstant=*/false,
-          llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(VarTy), VD->getName(),
-          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
-          CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
-      CharUnits Align = CGM.getContext().getDeclAlign(VD);
-      GV->setAlignment(Align.getAsAlign());
-      return Address(GV, Align);
-    }
-    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
-    case OMPAllocateDeclAttr::OMPCGroupMemAlloc: {
-      llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), VarTy, /*isConstant=*/false,
-          llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(VarTy), VD->getName());
-      CharUnits Align = CGM.getContext().getDeclAlign(VD);
-      GV->setAlignment(Align.getAsAlign());
-      return Address(GV, Align);
-    }
-    }
-  }
-
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
-    return Address::invalid();
-
-  VD = VD->getCanonicalDecl();
-  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
-  if (I == FunctionGlobalizedDecls.end())
-    return Address::invalid();
-  auto VDI = I->getSecond().LocalVarData.find(VD);
-  if (VDI != I->getSecond().LocalVarData.end())
-    return VDI->second.PrivateAddr;
-  if (VD->hasAttrs()) {
-    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
-         E(VD->attr_end());
-         IT != E; ++IT) {
-      auto VDI = I->getSecond().LocalVarData.find(
-          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
-              ->getCanonicalDecl());
-      if (VDI != I->getSecond().LocalVarData.end())
-        return VDI->second.PrivateAddr;
-    }
-  }
-
-  return Address::invalid();
-}
-
-void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
-  FunctionGlobalizedDecls.erase(CGF.CurFn);
-  CGOpenMPRuntime::functionFinished(CGF);
-}
-
-void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
-    CodeGenFunction &CGF, const OMPLoopDirective &S,
-    OpenMPDistScheduleClauseKind &ScheduleKind,
-    llvm::Value *&Chunk) const {
-  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
-    ScheduleKind = OMPC_DIST_SCHEDULE_static;
-    Chunk = CGF.EmitScalarConversion(
-        getNVPTXNumThreads(CGF),
-        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
-        S.getIterationVariable()->getType(), S.getBeginLoc());
-    return;
-  }
-  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
-}
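As a hedged illustration of the allocator-to-address-space mapping in getAddressOfLocalVariable above (variable names invented): a local annotated with the constant-memory or pteam allocator is given a hidden internal global in the matching CUDA address space instead of a stack slot, while the remaining predefined allocators deliberately fall through to default allocation.

  int v;
  #pragma omp allocate(v) allocator(omp_const_mem_alloc)  // -> cuda_constant
  int w;
  #pragma omp allocate(w) allocator(omp_pteam_mem_alloc)  // -> cuda_shared

Likewise, in SPMD mode getDefaultDistScheduleAndChunk pins the implicit distribute schedule to dist_schedule(static, <threads-per-team>), so each team's chunk maps exactly one iteration per thread per round.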
-void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
-    CodeGenFunction &CGF, const OMPLoopDirective &S,
-    OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const {
-  ScheduleKind = OMPC_SCHEDULE_static;
-  // Chunk size is 1 in this case.
-  llvm::APInt ChunkSize(32, 1);
-  ChunkExpr = IntegerLiteral::Create(
-      CGF.getContext(), ChunkSize,
-      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
-      SourceLocation());
-}
-
-void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
-    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
-  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
-         "Expected target-based directive.");
-  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
-  for (const CapturedStmt::Capture &C : CS->captures()) {
-    // Capture variables captured by reference in lambdas for target-based
-    // directives.
-    if (!C.capturesVariable())
-      continue;
-    const VarDecl *VD = C.getCapturedVar();
-    const auto *RD = VD->getType()
-                         .getCanonicalType()
-                         .getNonReferenceType()
-                         ->getAsCXXRecordDecl();
-    if (!RD || !RD->isLambda())
-      continue;
-    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
-    LValue VDLVal;
-    if (VD->getType().getCanonicalType()->isReferenceType())
-      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
-    else
-      VDLVal = CGF.MakeAddrLValue(
-          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
-    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
-    FieldDecl *ThisCapture = nullptr;
-    RD->getCaptureFields(Captures, ThisCapture);
-    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
-      LValue ThisLVal =
-          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
-      llvm::Value *CXXThis = CGF.LoadCXXThis();
-      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
-    }
-    for (const LambdaCapture &LC : RD->captures()) {
-      if (LC.getCaptureKind() != LCK_ByRef)
-        continue;
-      const VarDecl *VD = LC.getCapturedVar();
-      if (!CS->capturesVariable(VD))
-        continue;
-      auto It = Captures.find(VD);
-      assert(It != Captures.end() && "Found lambda capture without field.");
-      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
-      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
-      if (VD->getType().getCanonicalType()->isReferenceType())
-        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
-                                               VD->getType().getCanonicalType())
-                     .getAddress(CGF);
-      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
-    }
-  }
-}
-
-unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
-  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
-}
-
-bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
-                                                            LangAS &AS) {
-  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
-    return false;
-  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
-  switch (A->getAllocatorType()) {
-  case OMPAllocateDeclAttr::OMPNullMemAlloc:
-  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
-    // Not supported, fall back to the default mem space.
-  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
-  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
-  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
-  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
-  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
-    AS = LangAS::Default;
-    return true;
-  case OMPAllocateDeclAttr::OMPConstMemAlloc:
-    AS = LangAS::cuda_constant;
-    return true;
-  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
-    AS = LangAS::cuda_shared;
-    return true;
-  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
-    llvm_unreachable("Expected predefined allocator for the variables with the "
-                     "static storage.");
-  }
-  return false;
-}
-
-// Get the current CudaArch and ignore any unknown values.
-static CudaArch getCudaArch(CodeGenModule &CGM) {
-  if (!CGM.getTarget().hasFeature("ptx"))
-    return CudaArch::UNKNOWN;
-  llvm::StringMap<bool> Features;
-  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
-                                 CGM.getTarget().getTargetOpts().CPU,
-                                 CGM.getTarget().getTargetOpts().Features);
-  for (const auto &Feature : Features) {
-    if (Feature.getValue()) {
-      CudaArch Arch = StringToCudaArch(Feature.getKey());
-      if (Arch != CudaArch::UNKNOWN)
-        return Arch;
-    }
-  }
-  return CudaArch::UNKNOWN;
-}
-
-/// Check whether the target architecture supports unified addressing, which
-/// is a restriction for the OpenMP requires clause "unified_shared_memory".
-void CGOpenMPRuntimeNVPTX::processRequiresDirective(const OMPRequiresDecl *D) {
-  for (const OMPClause *Clause : D->clauselists()) {
-    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
-      CudaArch Arch = getCudaArch(CGM);
-      switch (Arch) {
-      case CudaArch::SM_20:
-      case CudaArch::SM_21:
-      case CudaArch::SM_30:
-      case CudaArch::SM_32:
-      case CudaArch::SM_35:
-      case CudaArch::SM_37:
-      case CudaArch::SM_50:
-      case CudaArch::SM_52:
-      case CudaArch::SM_53:
-      case CudaArch::SM_60:
-      case CudaArch::SM_61:
-      case CudaArch::SM_62: {
-        SmallString<256> Buffer;
-        llvm::raw_svector_ostream Out(Buffer);
-        Out << "Target architecture " << CudaArchToString(Arch)
-            << " does not support unified addressing";
-        CGM.Error(Clause->getBeginLoc(), Out.str());
-        return;
-      }
-      case CudaArch::SM_70:
-      case CudaArch::SM_72:
-      case CudaArch::SM_75:
-      case CudaArch::SM_80:
-      case CudaArch::GFX600:
-      case CudaArch::GFX601:
-      case CudaArch::GFX700:
-      case CudaArch::GFX701:
-      case CudaArch::GFX702:
-      case CudaArch::GFX703:
-      case CudaArch::GFX704:
-      case CudaArch::GFX801:
-      case CudaArch::GFX802:
-      case CudaArch::GFX803:
-      case CudaArch::GFX810:
-      case CudaArch::GFX900:
-      case CudaArch::GFX902:
-      case CudaArch::GFX904:
-      case CudaArch::GFX906:
-      case CudaArch::GFX908:
-      case CudaArch::GFX909:
-      case CudaArch::GFX1010:
-      case CudaArch::GFX1011:
-      case CudaArch::GFX1012:
-      case CudaArch::GFX1030:
-      case CudaArch::UNKNOWN:
-        break;
-      case CudaArch::LAST:
-        llvm_unreachable("Unexpected Cuda arch.");
-      }
-    }
-  }
-  CGOpenMPRuntime::processRequiresDirective(D);
-}
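Concretely, the check above fires on a requires declaration such as the following minimal illustration: any translation unit compiled for an sm_62-or-older NVPTX target is rejected, while sm_70 and newer, and the listed GFX targets, pass through.

  #pragma omp requires unified_shared_memory

Because the result depends only on the architecture, the diagnostic is emitted once, at the location of the requires declaration itself.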
-/// Get the number of SMs and the number of blocks per SM.
-static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
-  std::pair<unsigned, unsigned> Data;
-  if (CGM.getLangOpts().OpenMPCUDANumSMs)
-    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
-  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
-    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
-  if (Data.first && Data.second)
-    return Data;
-  switch (getCudaArch(CGM)) {
-  case CudaArch::SM_20:
-  case CudaArch::SM_21:
-  case CudaArch::SM_30:
-  case CudaArch::SM_32:
-  case CudaArch::SM_35:
-  case CudaArch::SM_37:
-  case CudaArch::SM_50:
-  case CudaArch::SM_52:
-  case CudaArch::SM_53:
-    return {16, 16};
-  case CudaArch::SM_60:
-  case CudaArch::SM_61:
-  case CudaArch::SM_62:
-    return {56, 32};
-  case CudaArch::SM_70:
-  case CudaArch::SM_72:
-  case CudaArch::SM_75:
-  case CudaArch::SM_80:
-    return {84, 32};
-  case CudaArch::GFX600:
-  case CudaArch::GFX601:
-  case CudaArch::GFX700:
-  case CudaArch::GFX701:
-  case CudaArch::GFX702:
-  case CudaArch::GFX703:
-  case CudaArch::GFX704:
-  case CudaArch::GFX801:
-  case CudaArch::GFX802:
-  case CudaArch::GFX803:
-  case CudaArch::GFX810:
-  case CudaArch::GFX900:
-  case CudaArch::GFX902:
-  case CudaArch::GFX904:
-  case CudaArch::GFX906:
-  case CudaArch::GFX908:
-  case CudaArch::GFX909:
-  case CudaArch::GFX1010:
-  case CudaArch::GFX1011:
-  case CudaArch::GFX1012:
-  case CudaArch::GFX1030:
-  case CudaArch::UNKNOWN:
-    break;
-  case CudaArch::LAST:
-    llvm_unreachable("Unexpected Cuda arch.");
-  }
-  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
-}
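For a sense of scale, a hedged back-of-the-envelope (the sm_70 target comes from the table above; the 128-byte union size is purely an assumed example): clear() below materializes the fallback global buffer as a [NumSMs][BlocksPerSM] array of the per-kernel union type, i.e. 84 x 32 = 2688 slots, or roughly 336 KiB at 128 bytes per slot. Records small enough to fit within SharedMemorySize are placed in the shared-memory buffer instead and never hit this global array.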
-void CGOpenMPRuntimeNVPTX::clear() {
-  if (!GlobalizedRecords.empty() &&
-      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
-    ASTContext &C = CGM.getContext();
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
-    RecordDecl *StaticRD = C.buildImplicitRecord(
-        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    StaticRD->startDefinition();
-    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
-        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    SharedStaticRD->startDefinition();
-    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
-      if (Records.Records.empty())
-        continue;
-      unsigned Size = 0;
-      unsigned RecAlignment = 0;
-      for (const RecordDecl *RD : Records.Records) {
-        QualType RDTy = C.getRecordType(RD);
-        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
-        RecAlignment = std::max(RecAlignment, Alignment);
-        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
-        Size =
-            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
-      }
-      Size = llvm::alignTo(Size, RecAlignment);
-      llvm::APInt ArySize(/*numBits=*/64, Size);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      const bool UseSharedMemory = Size <= SharedMemorySize;
-      auto *Field =
-          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
-                            SourceLocation(), SourceLocation(), nullptr, SubTy,
-                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-                            /*BW=*/nullptr, /*Mutable=*/false,
-                            /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      if (UseSharedMemory) {
-        SharedStaticRD->addDecl(Field);
-        SharedRecs.push_back(&Records);
-      } else {
-        StaticRD->addDecl(Field);
-        GlobalRecs.push_back(&Records);
-      }
-      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
-      Records.UseSharedMemory->setInitializer(
-          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
-    }
-    // Allocate a SharedMemorySize buffer for the shared memory.
-    // FIXME: nvlink does not handle weak linkage correctly (objects with
-    // different sizes are reported as erroneous).
-    // Restore this code as soon as nvlink is fixed.
-    if (!SharedStaticRD->field_empty()) {
-      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      auto *Field = FieldDecl::Create(
-          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
-          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      SharedStaticRD->addDecl(Field);
-    }
-    SharedStaticRD->completeDefinition();
-    if (!SharedStaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(SharedStaticRD);
-      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
-          llvm::Constant::getNullValue(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-    StaticRD->completeDefinition();
-    if (!StaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(StaticRD);
-      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
-      llvm::APInt Size1(32, SMsBlockPerSM.second);
-      QualType Arr1Ty =
-          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::APInt Size2(32, SMsBlockPerSM.first);
-      QualType Arr2Ty =
-          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (objects with
-      // different sizes are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-  }
-  if (!TeamsReductions.empty()) {
-    ASTContext &C = CGM.getContext();
-    RecordDecl *StaticRD = C.buildImplicitRecord(
-        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
-    StaticRD->startDefinition();
-    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
-      QualType RecTy = C.getRecordType(TeamReductionRec);
-      auto *Field = FieldDecl::Create(
-          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
-          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      StaticRD->addDecl(Field);
-    }
-    StaticRD->completeDefinition();
-    QualType StaticTy = C.getRecordType(StaticRD);
-    llvm::Type *LLVMReductionsBufferTy =
-        CGM.getTypes().ConvertTypeForMem(StaticTy);
-    // FIXME: nvlink does not handle weak linkage correctly (objects with
-    // different sizes are reported as erroneous).
-    // Restore CommonLinkage as soon as nvlink is fixed.
-    auto *GV = new llvm::GlobalVariable(
-        CGM.getModule(), LLVMReductionsBufferTy,
-        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
-        "_openmp_teams_reductions_buffer_$_");
-    KernelTeamsReductionPtr->setInitializer(
-        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
-                                                             CGM.VoidPtrTy));
-  }
-  CGOpenMPRuntime::clear();
-}
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -71,6 +71,8 @@
   CGObjCRuntime.cpp
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
+  CGOpenMPRuntimeAMDGCN.cpp
+  CGOpenMPRuntimeGPU.cpp
   CGOpenMPRuntimeNVPTX.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
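To round out the new virtual hook introduced on CGOpenMPRuntimeGPU, here is a hedged sketch of what the matching NVPTX specialization could look like; it is not part of this excerpt and assumes CGOpenMPRuntimeNVPTX is re-parented onto CGOpenMPRuntimeGPU in the follow-up change. NVPTX can read the warp size from the %WARP_SZ PTX special register through an existing intrinsic instead of folding it to a compile-time constant:

  llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
    CGBuilderTy &Bld = CGF.Builder;
    // Read the warp size from the %WARP_SZ PTX special register.
    llvm::Function *F = llvm::Intrinsic::getDeclaration(
        &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize);
    return Bld.CreateCall(F, llvm::None, "nvptx_warp_size");
  }

Either way, callers see only the virtual getGPUWarpSize interface, so the shared reduction and data-sharing codegen stays target-agnostic.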