diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -618,9 +618,13 @@ CaptureRegions.push_back(OMPD_teams); break; case OMPD_target: + CaptureRegions.push_back(OMPD_task); + CaptureRegions.push_back(OMPD_target); + break; case OMPD_target_simd: CaptureRegions.push_back(OMPD_task); CaptureRegions.push_back(OMPD_target); + CaptureRegions.push_back(OMPD_simd); break; case OMPD_teams_distribute_parallel_for: case OMPD_teams_distribute_parallel_for_simd: diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -967,6 +967,18 @@ const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen); + /// Emits outlined function for the specified OpenMP simd directive + /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID, + /// kmp_int32 BoundID, struct context_vars*). + /// \param D OpenMP directive. + /// \param ThreadIDVar Variable for thread id in the current OpenMP region. + /// \param InnermostKind Kind of innermost directive (for simple directives it + /// is a directive itself, for combined - its innermost directive). + /// \param CodeGen Code generation sequence for the \a D directive. + virtual llvm::Function *emitSIMDOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen); + /// Emits outlined function for the specified OpenMP teams directive /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID, /// kmp_int32 BoundID, struct context_vars*). @@ -1019,6 +1031,19 @@ ArrayRef CapturedVars, const Expr *IfCond); + /// Emits code for simd call of the \p Outlined Fn with variables captured in + /// a record whose address is stored in \p CapturedVars. 
+ /// \param OutlinedFn Outlined function to be run in parallel threads. Type of + /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). + /// \param CapturedVars A pointer to the record with the references to + /// variables used in \a OutlinedFn function. + /// \param IfCond Condition in the associated 'if' clause, if it was + /// specified, nullptr otherwise. + virtual void emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc, + llvm::Function *OutlinedFn, + ArrayRef CapturedVars, + const Expr *IfCond); + /// Emits a critical region. /// \param CriticalName Name of the critical region. /// \param CriticalOpGen Generator for the statement associated with the given diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1295,6 +1295,14 @@ CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(), CodeGen); } +llvm::Function *CGOpenMPRuntime::emitSIMDOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + const CapturedStmt *CS = D.getCapturedStmt(OMPD_simd); + return emitParallelOrTeamsOutlinedFunction( + CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(), CodeGen); +} + llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { @@ -2143,6 +2151,11 @@ } } +void CGOpenMPRuntime::emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc, + llvm::Function *OutlinedFn, + ArrayRef CapturedVars, + const Expr *IfCond) {} + // If we're inside an (outlined) parallel region, use the region info's // thread-ID variable (it is passed in a first argument of the outlined function // as "kmp_int32 *gtid"). 
Otherwise, if we're not inside parallel region, but in diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -262,6 +262,19 @@ ArrayRef CapturedVars, const Expr *IfCond) override; + /// Emits code for simd call of the \p Outlined Fn with variables captured in + /// a record whose address is stored in \p CapturedVars. + /// \param OutlinedFn Outlined function to be run in parallel threads. Type of + /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). + /// \param CapturedVars A pointer to the record with the references to + /// variables used in \a OutlinedFn function. + /// \param IfCond Condition in the associated 'if' clause, if it was + /// specified, nullptr otherwise. + void emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc, + llvm::Function *OutlinedFn, + ArrayRef CapturedVars, + const Expr *IfCond) override; + /// Emit an implicit/explicit barrier for OpenMP threads. /// \param Kind Directive for which this implicit barrier call must be /// generated. Must be OMPD_barrier for explicit barrier generation. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1587,6 +1587,64 @@ RCG(CGF); } +void CGOpenMPRuntimeGPU::emitSIMDCall(CodeGenFunction &CGF, SourceLocation Loc, + llvm::Function *OutlinedFn, + ArrayRef CapturedVars, + const Expr *IfCond) { + auto &&SIMDGen = [this, Loc, OutlinedFn, CapturedVars, + IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) { + CGBuilderTy &Bld = CGF.Builder; + + Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca( + llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()), + "captured_vars_addrs"); + // There's something to share. + if (!CapturedVars.empty()) { + // Prepare for parallel region. 
Indicate the outlined function. + ASTContext &Ctx = CGF.getContext(); + unsigned Idx = 0; + for (llvm::Value *V : CapturedVars) { + Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx); + llvm::Value *PtrV; + if (V->getType()->isIntegerTy()) + PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); + else + PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); + CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, + Ctx.getPointerType(Ctx.VoidPtrTy)); + ++Idx; + } + } + + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *IfCondVal = nullptr; + if (IfCond) + IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty, + /* isSigned */ false); + else + IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1); + + llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy); + + llvm::Value *Args[] = { + RTLoc, + getThreadID(CGF, Loc), + IfCondVal, + llvm::ConstantInt::get(CGF.Int32Ty, -1), + llvm::ConstantInt::get(CGF.Int32Ty, -1), + FnPtr, + llvm::ConstantPointerNull::get(CGM.Int8PtrTy), + Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(), + CGF.VoidPtrPtrTy), + llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_simd_51), + Args); + }; + RegionCodeGenTy RCG(SIMDGen); + RCG(CGF); +} + void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { // Always emit simple barriers! 
if (!CGF.HaveInsertPoint()) diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2487,6 +2487,48 @@ } } +static void emitOMPTargetSimdRegion(CodeGenFunction &CGF, + const OMPLoopDirective &S, + PrePostActionTy &Action) { + assert(isOpenMPSimdDirective(S.getDirectiveKind()) && + "Expected simd directive"); + + Action.Enter(CGF); + + OMPLoopScope PreInitScope(CGF, S); + if (isOpenMPDistributeDirective(S.getDirectiveKind()) || + isOpenMPWorksharingDirective(S.getDirectiveKind()) || + isOpenMPTaskLoopDirective(S.getDirectiveKind())) { + (void)EmitOMPHelperVar(CGF, cast(S.getLowerBoundVariable())); + (void)EmitOMPHelperVar(CGF, cast(S.getUpperBoundVariable())); + } + + bool CondConstant; + if (CGF.ConstantFoldsToSimpleInteger(S.getPreCond(), CondConstant)) { + // If the condition is evaluated to false, we don't have to move on. + if (!CondConstant) + return; + } + + const CapturedStmt *CS = S.getCapturedStmt(OMPD_simd); + + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + // CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds, + // emitDispatchForLoopBounds); + }; + + llvm::Function *OutlinedFn = + CGF.CGM.getOpenMPRuntime().emitSIMDOutlinedFunction( + S, *CS->getCapturedDecl()->param_begin(), OMPD_simd, CodeGen); + + llvm::SmallVector CapturedVars; + CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars); + + CGF.CGM.getOpenMPRuntime().emitSIMDCall(CGF, S.getBeginLoc(), OutlinedFn, + CapturedVars, S.getPreCond()); +} + static void emitOMPSimdRegion(CodeGenFunction &CGF, const OMPLoopDirective &S, PrePostActionTy &Action) { Action.Enter(CGF); @@ -2585,17 +2627,32 @@ void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) { ParentLoopDirectiveForScanRegion ScanRegion(*this, S); OMPFirstScanLoop = true; - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { - 
emitOMPSimdRegion(CGF, S, Action); - }; - { - auto LPCRegion = - CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S); - OMPLexicalScope Scope(*this, S, OMPD_unknown); - CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen); + if (CGM.getLangOpts().OpenMPIsDevice) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + emitOMPTargetSimdRegion(CGF, S, Action); + }; + { + auto LPCRegion = + CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S); + OMPLexicalScope Scope(*this, S, OMPD_unknown); + // FIXME: For device code gen, we have to outline the function. + CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen); + } + // Check for outer lastprivate conditional update. + checkForLastprivateConditionalUpdate(*this, S); + } else { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + emitOMPSimdRegion(CGF, S, Action); + }; + { + auto LPCRegion = + CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S); + OMPLexicalScope Scope(*this, S, OMPD_unknown); + CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen); + } + // Check for outer lastprivate conditional update. + checkForLastprivateConditionalUpdate(*this, S); } - // Check for outer lastprivate conditional update. - checkForLastprivateConditionalUpdate(*this, S); } void CodeGenFunction::EmitOMPTileDirective(const OMPTileDirective &S) { @@ -3094,7 +3151,7 @@ CodeGenModule &CGM, StringRef ParentName, const OMPTargetSimdDirective &S) { // Emit SPMD target parallel for region as a standalone region. 
auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { - emitOMPSimdRegion(CGF, S, Action); + emitOMPTargetSimdRegion(CGF, S, Action); }; llvm::Function *Fn; llvm::Constant *Addr; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -3977,7 +3977,38 @@ ParamsTeamsOrParallel, /*OpenMPCaptureLevel=*/2); break; } - case OMPD_target: + case OMPD_target: { + QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst(); + QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict(); + QualType KmpInt32PtrTy = + Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); + QualType Args[] = {VoidPtrTy}; + FunctionProtoType::ExtProtoInfo EPI; + EPI.Variadic = true; + QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); + Sema::CapturedParamNameType Params[] = { + std::make_pair(".global_tid.", KmpInt32Ty), + std::make_pair(".part_id.", KmpInt32PtrTy), + std::make_pair(".privates.", VoidPtrTy), + std::make_pair( + ".copy_fn.", + Context.getPointerType(CopyFnType).withConst().withRestrict()), + std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), + std::make_pair(StringRef(), QualType()) // __context with shared vars + }; + ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, + Params, /*OpenMPCaptureLevel=*/0); + // Mark this captured region as inlined, because we don't use outlined + // function directly. 
+ getCurCapturedRegion()->TheCapturedDecl->addAttr( + AlwaysInlineAttr::CreateImplicit( + Context, {}, AttributeCommonInfo::AS_Keyword, + AlwaysInlineAttr::Keyword_forceinline)); + ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, + std::make_pair(StringRef(), QualType()), + /*OpenMPCaptureLevel=*/1); + break; + } case OMPD_target_simd: { QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst(); QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict(); @@ -4008,6 +4039,15 @@ ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, std::make_pair(StringRef(), QualType()), /*OpenMPCaptureLevel=*/1); + Sema::CapturedParamNameType ParamsSIMD[] = { + std::make_pair(".global_tid.", KmpInt32PtrTy), + std::make_pair(".bound_tid.", KmpInt32PtrTy), + std::make_pair(StringRef(), QualType()) // __context with shared vars + }; + // Start a captured region for 'teams' or 'parallel'. Both regions have + // the same implicit parameters. + ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, + ParamsSIMD, /*OpenMPCaptureLevel=*/2); break; } case OMPD_atomic: diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -423,6 +423,11 @@ __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, ) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32) +/// OpenMP target SIMD functions +__OMP_RTL(__kmpc_simd_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32, + VoidPtr, VoidPtr, VoidPtrPtr, SizeTy) +__OMP_RTL(__kmpc_kernel_simd, false, Int1, VoidPtrPtr) +__OMP_RTL(__kmpc_kernel_end_simd, false, Void, ) __OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16) __OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr, Int32, Int32, SizeTy, 
VoidPtr, ShuffleReducePtr, InterWarpCopyPtr) diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -281,6 +281,12 @@ /// TODO void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t); +/// TODO +bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn); + +/// TODO +void __kmpc_kernel_end_simd(); + /// TODO void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h --- a/openmp/libomptarget/DeviceRTL/include/Mapping.h +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -25,7 +25,7 @@ #pragma omp end declare target /// Initialize the mapping machinery. -void init(bool IsSPMD); +void init(int8_t Mode); /// Return true if the kernel is executed in SPMD mode. bool isSPMDMode(); @@ -33,6 +33,9 @@ /// Return true if the kernel is executed in generic mode. bool isGenericMode(); +/// Return true if the kernel is executed in SIMD mode. +bool isSIMDMode(); + /// Return true if the executing thread is the main thread in generic mode. bool isMainThreadInGenericMode(); @@ -55,6 +58,12 @@ /// Return the thread Id in the block, in [0, getBlockSize()). uint32_t getThreadIdInBlock(); +/// Return the logic thread Id, which depends on how we map an OpenMP thread to +/// the target device. In non-SIMD mode, we map an OpenMP thread to a device +/// thread. In SIMD mode, we map an OpenMP thread to a warp, and each thread in +/// the warp is a SIMD lane. +uint32_t getLogicThreadId(); + /// Return the warp id in the block. uint32_t getWarpId(); @@ -79,6 +88,19 @@ /// Return the number of processing elements on the device. uint32_t getNumberOfProcessorElements(); +namespace utils { +/// Return true if \p Mode indicates SPMD mode. 
+inline bool isSPMDMode(int8_t Mode) { return Mode & OMP_TGT_EXEC_MODE_SPMD; } + +/// Return true if \p Mode indicates generic mode. +inline bool isGenericMode(int8_t Mode) { + return Mode & OMP_TGT_EXEC_MODE_GENERIC; +} + +/// Return true if \p Mode indicates SIMD mode. +inline bool isSIMDMode(int8_t Mode) { return Mode & OMP_TGT_EXEC_MODE_SIMD; } +} // namespace utils + } // namespace mapping } // namespace _OMP diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -24,7 +24,7 @@ inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; /// Initialize the state machinery. Must be called by all threads. -void init(bool IsSPMD); +void init(int8_t Mode); /// TODO enum ValueKind { @@ -37,6 +37,10 @@ VK_RunSchedChunk, VK_ParallelRegionFn, VK_ParallelTeamSize, + // SIMD + VK_SIMDLevel, + VK_SIMDRegionFn, + VK_SIMDLaneWidth, }; /// TODO @@ -145,10 +149,20 @@ inline state::PtrValue ParallelRegionFn; +/// TODO +inline state::Value SIMDLaneWidth; + +/// TODO +inline state::PtrValue SIMDRegionFn; + void runAndCheckState(void(Func(void))); void assumeInitialState(bool IsSPMD); +/// Propagate the thread state from the leader in the warp to the rest of SIMD +/// workers. This function should only be called in SIMD mode. +void propagateThreadState(unsigned SIMDLen); + } // namespace state namespace icv { @@ -171,6 +185,9 @@ /// TODO inline state::Value RunSched; +/// TODO +inline state::Value SIMDLevel; + } // namespace icv namespace memory { diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h --- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -19,7 +19,7 @@ namespace synchronize { /// Initialize the synchronization machinery. Must be called by all threads. 
-void init(bool IsSPMD); +void init(int8_t Mode); /// Synchronize all threads in a warp identified by \p Mask. void warp(LaneMaskTy Mask); diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h --- a/openmp/libomptarget/DeviceRTL/include/Types.h +++ b/openmp/libomptarget/DeviceRTL/include/Types.h @@ -150,6 +150,8 @@ using ParallelRegionFnTy = void *; +using SIMDRegionFnTy = void *; + using CriticalNameTy = int32_t[8]; struct omp_lock_t { @@ -181,6 +183,7 @@ enum OMPTgtExecModeFlags : int8_t { OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, OMP_TGT_EXEC_MODE_SPMD = 1 << 1, + OMP_TGT_EXEC_MODE_SIMD = 1 << 2, }; #define __PRAGMA(STR) _Pragma(#STR) diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -21,17 +21,17 @@ #pragma omp declare target -static void inititializeRuntime(bool IsSPMD) { +static void inititializeRuntime(int8_t Mode) { // Order is important here. - synchronize::init(IsSPMD); - mapping::init(IsSPMD); - state::init(IsSPMD); + synchronize::init(Mode); + mapping::init(Mode); + state::init(Mode); } /// Simple generic state machine for worker threads. static void genericStateMachine(IdentTy *Ident) { - uint32_t TId = mapping::getThreadIdInBlock(); + uint32_t TId = mapping::getLogicThreadId(); do { ParallelRegionFnTy WorkFn = 0; @@ -58,6 +58,31 @@ } while (true); } +namespace { +void runSIMDStateMachine(IdentTy *Ident) { + uint32_t LaneId = mapping::getThreadIdInWarp(); + do { + SIMDRegionFnTy WorkFn = nullptr; + + // Wait for the signal that we have a new work function. + synchronize::warp(mapping::activemask()); + + // Retrieve the work function from the runtime. 
+ bool IsActive = __kmpc_kernel_simd(&WorkFn); + + if (!WorkFn) + return; + + if (IsActive) { + ((void (*)(uint32_t, uint32_t))WorkFn)(0, LaneId); + __kmpc_kernel_end_simd(); + } + + synchronize::warp(mapping::activemask()); + } while (true); +} +} // namespace + extern "C" { /// Initialization @@ -66,17 +91,20 @@ /// int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode, bool UseGenericStateMachine, bool) { - const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; - if (IsSPMD) { - inititializeRuntime(/* IsSPMD */ true); - synchronize::threads(); - } else { - inititializeRuntime(/* IsSPMD */ false); - // No need to wait since only the main threads will execute user - // code and workers will run into a barrier right away. + inititializeRuntime(Mode); + + // For all SIMD workers, start the simd state machine. + if (mapping::utils::isSIMDMode(Mode)) { + uint32_t LaneId = mapping::getThreadIdInWarp(); + if (LaneId) { + runSIMDStateMachine(Ident); + return LaneId; + } } + const bool IsSPMD = mapping::utils::isSPMDMode(Mode); if (IsSPMD) { + synchronize::threads(); state::assumeInitialState(IsSPMD); return -1; } @@ -98,7 +126,8 @@ /// \param Ident Source location identification, can be NULL. 
/// void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) { - const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; + const bool IsSPMD = mapping::utils::isSPMDMode(Mode); + state::assumeInitialState(IsSPMD); if (IsSPMD) return; diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -178,7 +178,7 @@ bool mapping::isLeaderInWarp() { __kmpc_impl_lanemask_t Active = mapping::activemask(); __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); - return utils::popc(Active & LaneMaskLT) == 0; + return ::_OMP::utils::popc(Active & LaneMaskLT) == 0; } LaneMaskTy mapping::activemask() { return impl::activemask(); } @@ -191,6 +191,13 @@ uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); } +uint32_t mapping::getLogicThreadId() { + if (mapping::isSIMDMode()) + return mapping::getWarpId(); + + return mapping::getThreadIdInBlock(); +} + uint32_t mapping::getBlockSize() { return impl::getBlockSize(); } uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } @@ -214,16 +221,20 @@ /// Execution mode /// ///{ -static int SHARED(IsSPMDMode); +static int8_t SHARED(ExecutionMode); -void mapping::init(bool IsSPMD) { +void mapping::init(int8_t Mode) { if (!mapping::getThreadIdInBlock()) - IsSPMDMode = IsSPMD; + ExecutionMode = Mode; } -bool mapping::isSPMDMode() { return IsSPMDMode; } +bool mapping::isSPMDMode() { return mapping::utils::isSPMDMode(ExecutionMode); } -bool mapping::isGenericMode() { return !isSPMDMode(); } +bool mapping::isGenericMode() { + return mapping::utils::isGenericMode(ExecutionMode); +} + +bool mapping::isSIMDMode() { return mapping::utils::isSIMDMode(ExecutionMode); } ///} extern "C" { diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ 
b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -49,20 +49,43 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { uint32_t NThreadsICV = NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; - uint32_t NumThreads = mapping::getBlockSize(); + + const bool IsSIMDMode = mapping::isSIMDMode(); + + uint32_t NumThreads = + IsSIMDMode ? mapping::getNumberOfWarpsInBlock() : mapping::getBlockSize(); if (NThreadsICV != 0 && NThreadsICV < NumThreads) NumThreads = NThreadsICV; // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP. - if (NumThreads < mapping::getWarpSize()) - NumThreads = 1; - else - NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); + // We don't need this for SIMD mode because an OpenMP thread is mapped to a + // warp on the device and it can be any number. + if (!IsSIMDMode) { + if (NumThreads < mapping::getWarpSize()) + NumThreads = 1; + else + NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); + } return NumThreads; } +uint32_t determineSIMDLen(int32_t SIMDLen, int32_t SafeLen) { + ASSERT(mapping::isSIMDMode()); + + // TODO: This is probably not right if the schedule is different. + if (SafeLen > 0 && (SIMDLen <= 0 || SafeLen < SIMDLen)) + SIMDLen = SafeLen; + + // An OpenMP thread is currently mapped to a warp in SIMD mode, so a simdlen + // that is non-positive (e.g. -1, presumably "no clause") or too large is capped. + if (SIMDLen <= 0 || SIMDLen > (int32_t)mapping::getWarpSize()) + SIMDLen = mapping::getWarpSize(); + + return SIMDLen; +} + // Invoke an outlined parallel function unwrapping arguments (up to 32). 
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn, void **args, int64_t nargs) { @@ -78,11 +101,57 @@ extern "C" { +void __kmpc_simd_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t safelen, + int32_t simdlen, void *fn, void *wrapper_fn, void **args, + int64_t nargs) { + // Handle non-SIMD case first, which can be: + // - if clause is evaluated to false + // - simdlen or safelen is set to 1 + // - it is already in simd region + const uint32_t LogicThreadId = mapping::getLogicThreadId(); + if (OMP_UNLIKELY(!if_expr || simdlen == 1 || safelen == 1 || + icv::SIMDLevel)) { + invokeMicrotask(LogicThreadId, 0, fn, args, nargs); + return; + } + + // Only the leader of each warp can execute the following code. + ASSERT(mapping::isLeaderInWarp()); + + const uint32_t SIMDLen = determineSIMDLen(simdlen, safelen); + + if (LogicThreadId == 0) + state::SIMDLaneWidth = SIMDLen; + + // Propagates the thread state to all SIMD workers from the leader. + state::propagateThreadState(SIMDLen); + + // Synchronize all threads (leaders). + synchronize::threads(); + + { + state::ValueRAII SIMDRegionFnRAII(state::SIMDRegionFn, wrapper_fn, + (void *)nullptr, true); + state::ValueRAII SIMDLevelRAII(icv::SIMDLevel, 1u, 0u, true); + + // Signal SIMD workers + synchronize::warp(mapping::activemask()); + + // TODO: Leader in warp also has to execute the SIMD region. + // What we need: + // - A work-sharing function that can take both thread id and lane id into + // consideration. + + // Synchronize after execution of the SIMD region. + synchronize::warp(mapping::activemask()); + } +} + void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t num_threads, int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs) { - uint32_t TId = mapping::getThreadIdInBlock(); + uint32_t TId = mapping::getLogicThreadId(); // Handle the serialized case first, same for SPMD/non-SPMD. 
if (OMP_UNLIKELY(!if_expr || icv::Level)) { __kmpc_serialized_parallel(ident, TId); @@ -157,7 +226,7 @@ return false; // Set to true for workers participating in the parallel region. - uint32_t TId = mapping::getThreadIdInBlock(); + uint32_t TId = mapping::getLogicThreadId(); bool ThreadIsActive = TId < state::ParallelTeamSize; return ThreadIsActive; } @@ -171,6 +240,24 @@ ASSERT(!mapping::isSPMDMode()); } +__attribute__((noinline)) bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn) { + // Work function and arguments for L1 SIMD region. + *WorkFn = state::SIMDRegionFn; + + // If this is the termination signal from the master, quit early. + if (!*WorkFn) + return false; + + // Set to true for workers participating in the parallel region. + uint32_t LaneId = mapping::getThreadIdInWarp(); + bool LaneActive = LaneId < state::SIMDLaneWidth; + return LaneActive; +} + +__attribute__((noinline)) void __kmpc_kernel_end_simd() { + // TODO: Some clean-up of SIMD execution +} + void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) { state::enterDataEnvironment(); ++icv::Level; diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -203,7 +203,7 @@ struct TeamStateTy { /// TODO: provide a proper init function. - void init(bool IsSPMD); + void init(int Mode); bool operator==(const TeamStateTy &) const; @@ -224,8 +224,13 @@ TeamStateTy SHARED(TeamState); -void TeamStateTy::init(bool IsSPMD) { - ICVState.NThreadsVar = mapping::getBlockSize(); +void TeamStateTy::init(int Mode) { + // In SIMD mode, we map an OpenMP thread to a warp. 
+ if (mapping::utils::isSIMDMode(Mode)) + ICVState.NThreadsVar = mapping::getNumberOfWarpsInBlock(); + else + ICVState.NThreadsVar = mapping::getBlockSize(); + ICVState.LevelVar = 0; ICVState.ActiveLevelVar = 0; ICVState.MaxActiveLevelsVar = 1; @@ -357,7 +362,8 @@ __builtin_unreachable(); } -void state::init(bool IsSPMD) { +void state::init(int8_t Mode) { + const bool IsSPMD = mapping::utils::isSPMDMode(Mode); SharedMemorySmartStack.init(IsSPMD); if (!mapping::getThreadIdInBlock()) - TeamState.init(IsSPMD); + TeamState.init(Mode); @@ -404,6 +410,15 @@ ASSERT(mapping::isSPMDMode() == IsSPMD); } +void state::propagateThreadState(unsigned SIMDLen) { + ASSERT(mapping::isSIMDMode()); + ASSERT(mapping::isLeaderInWarp()); + + const uint32_t TId = mapping::getThreadIdInBlock(); + for (unsigned I = 1; I < SIMDLen; ++I) + ThreadStates[I + TId] = ThreadStates[TId]; +} + extern "C" { void omp_set_dynamic(int V) {} @@ -434,7 +449,7 @@ } int omp_get_ancestor_thread_num(int Level) { - return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); + return returnValIfLevelIsActive(Level, mapping::getLogicThreadId(), 0); } int omp_get_thread_num(void) { diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -214,8 +214,8 @@ } // namespace impl -void synchronize::init(bool IsSPMD) { - if (!IsSPMD) +void synchronize::init(int8_t Mode) { + if (!mapping::utils::isSPMDMode(Mode) || mapping::utils::isSIMDMode(Mode)) impl::namedBarrierInit(); } diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp --- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp @@ -210,7 +210,7 @@ static void dispatch_init(IdentTy *loc, int32_t threadId, kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, DynamicScheduleTracker *DST) { - 
int tid = mapping::getThreadIdInBlock(); + int tid = mapping::getLogicThreadId(); T tnum = omp_get_num_threads(); T tripCount = ub - lb + 1; // +1 because ub is inclusive ASSERT0(LT_FUSSY, threadId < tnum, diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -1084,20 +1084,31 @@ KernelTy *KernelInfo = reinterpret_cast(TgtEntryPtr); const bool IsSPMDGenericMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; + (KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; const bool IsSPMDMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD; + !IsSPMDGenericMode && + (KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_SPMD); const bool IsGenericMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; + !IsSPMDGenericMode && + (KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_GENERIC); + const bool IsSIMDMode = + KernelInfo->ExecutionMode & llvm::omp::OMP_TGT_EXEC_MODE_SIMD; int CudaThreadsPerBlock; if (ThreadLimit > 0) { - DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); - CudaThreadsPerBlock = ThreadLimit; - // Add master warp if necessary - if (IsGenericMode) { - DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); - CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; + if (IsSIMDMode) { + DP("Setting CUDA threads per block to requested %d\n", + ThreadLimit * DeviceData[DeviceId].WarpSize); + CudaThreadsPerBlock = ThreadLimit * DeviceData[DeviceId].WarpSize; + } else { + DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); + CudaThreadsPerBlock = ThreadLimit; + // Add master warp if necessary + if (IsGenericMode) { + DP("Adding master warp: +%d threads\n", + DeviceData[DeviceId].WarpSize); + CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; + } } } else { 
DP("Setting CUDA threads per block to default %d\n",