diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -13,16 +13,104 @@
 #define OMPTARGET_STATE_H
 
 #include "Debug.h"
+#include "Mapping.h"
 #include "Types.h"
+#include "Utils.h"
 
 #pragma omp begin declare target device_type(nohost)
 
 namespace _OMP {
 
+namespace memory {
+
+/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
+///
+/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
+void *allocShared(uint64_t Size, const char *Reason);
+
+/// Free \p Ptr, allocated via allocShared, for \p Reason.
+///
+/// Note: See the restrictions on __kmpc_free_shared for proper usage.
+void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
+
+/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
+void *allocGlobal(uint64_t Size, const char *Reason);
+
+/// Return a pointer to the dynamic shared memory buffer.
+void *getDynamicBuffer();
+
+/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
+void freeGlobal(void *Ptr, const char *Reason);
+
+} // namespace memory
+
 namespace state {
 
 inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
 
+struct ICVStateTy {
+  uint32_t NThreadsVar;
+  uint32_t LevelVar;
+  uint32_t ActiveLevelVar;
+  uint32_t MaxActiveLevelsVar;
+  uint32_t RunSchedVar;
+  uint32_t RunSchedChunkVar;
+
+  bool operator==(const ICVStateTy &Other) const;
+
+  void assertEqual(const ICVStateTy &Other) const;
+};
+
+struct TeamStateTy {
+  void init(bool IsSPMD);
+
+  bool operator==(const TeamStateTy &) const;
+
+  void assertEqual(TeamStateTy &Other) const;
+
+  /// ICVs
+  ///
+  /// Preallocated storage for ICV values that are used if the threads have not
+  /// set a custom default. The latter is supported but unlikely and slow(er).
+  ///
+  ///{
+  ICVStateTy ICVState;
+  ///}
+
+  uint32_t ParallelTeamSize;
+  ParallelRegionFnTy ParallelRegionFnVar;
+};
+
+extern TeamStateTy TeamState;
+#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
+
+struct ThreadStateTy {
+
+  /// ICVs have preallocated storage in the TeamStateTy which is used if a
+  /// thread has not set a custom value. The latter is supported but unlikely.
+  /// When it happens we will allocate dynamic memory to hold the values of all
+  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
+  /// ICV struct to hold them all. This is slower than alternatives but allows
+  /// users to pay only for what they use.
+  ///
+  state::ICVStateTy ICVState;
+
+  ThreadStateTy *PreviousThreadState;
+
+  void init() {
+    ICVState = TeamState.ICVState;
+    PreviousThreadState = nullptr;
+  }
+
+  void init(ThreadStateTy *PreviousTS) {
+    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
+    PreviousThreadState = PreviousTS;
+  }
+};
+
+extern ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
+#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+
 /// Initialize the state machinery. Must be called by all threads.
 void init(bool IsSPMD);
 
@@ -54,8 +142,85 @@
 /// TODO
 void resetStateForThread(uint32_t TId);
 
-uint32_t &lookup32(ValueKind VK, bool IsReadonly, IdentTy *Ident);
-void *&lookupPtr(ValueKind VK, bool IsReadonly);
+/// Return a modifiable reference to the ICV \p Var. The team-wide value is
+/// used if thread states are disabled or the team-wide LevelVar is 0.
+/// Otherwise the calling thread gets a private ICV copy, allocated in global
+/// memory on first use and initialized from the team state.
+inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
+                                       IdentTy *Ident) {
+  if (OMP_LIKELY(!config::mayUseThreadStates() ||
+                 TeamState.ICVState.LevelVar == 0))
+    return TeamState.ICVState.*Var;
+  uint32_t TId = mapping::getThreadIdInBlock();
+  if (OMP_UNLIKELY(!ThreadStates[TId])) {
+    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
+        sizeof(ThreadStateTy), "ICV modification outside data environment"));
+    ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
+    ThreadStates[TId]->init();
+  }
+  return ThreadStates[TId]->ICVState.*Var;
+}
+
+/// Return a reference to the ICV \p Var for reading: the calling thread's
+/// private copy if it has one, the team-wide value otherwise.
+inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var) {
+  auto TId = mapping::getThreadIdInBlock();
+  if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
+    return ThreadStates[TId]->ICVState.*Var;
+  return TeamState.ICVState.*Var;
+}
+
+/// Return a reference to the 32-bit state value identified by \p Kind. ICVs
+/// are resolved via lookupImpl if \p IsReadonly is set and via
+/// lookupForModify32Impl otherwise; the parallel team size is always taken
+/// from the team state. Passing any other \p Kind is undefined behavior.
+__attribute__((always_inline, flatten)) inline uint32_t &
+lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
+  switch (Kind) {
+  case state::VK_NThreads:
+    if (IsReadonly)
+      return lookupImpl(&ICVStateTy::NThreadsVar);
+    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
+  case state::VK_Level:
+    if (IsReadonly)
+      return lookupImpl(&ICVStateTy::LevelVar);
+    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
+  case state::VK_ActiveLevel:
+    if (IsReadonly)
+      return lookupImpl(&ICVStateTy::ActiveLevelVar);
+    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
+  case state::VK_MaxActiveLevels:
+    if (IsReadonly)
+      return lookupImpl(&ICVStateTy::MaxActiveLevelsVar);
+    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
+  case state::VK_RunSched:
+    if (IsReadonly)
+      return lookupImpl(&ICVStateTy::RunSchedVar);
+    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
+  case state::VK_RunSchedChunk:
+    if (IsReadonly)
+      return lookupImpl(&ICVStateTy::RunSchedChunkVar);
+    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
+  case state::VK_ParallelTeamSize:
+    return TeamState.ParallelTeamSize;
+  default:
+    break;
+  }
+  __builtin_unreachable();
+}
+
+/// Return a reference to the pointer state value identified by \p Kind. Only
+/// VK_ParallelRegionFn is handled; any other \p Kind is undefined behavior.
+__attribute__((always_inline, flatten)) inline void *&
+lookupPtr(ValueKind Kind, bool IsReadonly) {
+  switch (Kind) {
+  case state::VK_ParallelRegionFn:
+    return TeamState.ParallelRegionFnVar;
+  default:
+    break;
+  }
+  __builtin_unreachable();
+}
 
 /// A class without actual state used to provide a nice interface to lookup and
 /// update ICV values we can declare in global scope.
@@ -181,29 +346,6 @@
 
 } // namespace icv
 
-namespace memory {
-
-/// Alloca \p Size bytes in shared memory, if possible, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
-void *allocShared(uint64_t Size, const char *Reason);
-
-/// Free \p Ptr, alloated via allocShared, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_free_shared for proper usage.
-void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
-
-/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
-void *allocGlobal(uint64_t Size, const char *Reason);
-
-/// Return a pointer to the dynamic shared memory buffer.
-void *getDynamicBuffer();
-
-/// Free \p Ptr, alloated via allocGlobal, for \p Reason.
-void freeGlobal(void *Ptr, const char *Reason);
-
-} // namespace memory
-
 } // namespace _OMP
 
 #pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -12,10 +12,8 @@
 #include "Configuration.h"
 #include "Debug.h"
 #include "Interface.h"
-#include "Mapping.h"
 #include "Synchronization.h"
 #include "Types.h"
-#include "Utils.h"
 
 using namespace _OMP;
 
@@ -180,22 +178,7 @@
 
 ///}
 
-namespace {
-
-struct ICVStateTy {
-  uint32_t NThreadsVar;
-  uint32_t LevelVar;
-  uint32_t ActiveLevelVar;
-  uint32_t MaxActiveLevelsVar;
-  uint32_t RunSchedVar;
-  uint32_t RunSchedChunkVar;
-
-  bool operator==(const ICVStateTy &Other) const;
-
-  void assertEqual(const ICVStateTy &Other) const;
-};
-
-bool ICVStateTy::operator==(const ICVStateTy &Other) const {
+bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
   return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
          (ActiveLevelVar == Other.ActiveLevelVar) &
          (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
@@ -203,7 +186,7 @@
          (RunSchedChunkVar == Other.RunSchedChunkVar);
 }
 
-void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
+void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
   ASSERT(NThreadsVar == Other.NThreadsVar);
   ASSERT(LevelVar == Other.LevelVar);
   ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
@@ -212,30 +195,7 @@
   ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
 }
 
-struct TeamStateTy {
-  /// TODO: provide a proper init function.
-  void init(bool IsSPMD);
-
-  bool operator==(const TeamStateTy &) const;
-
-  void assertEqual(TeamStateTy &Other) const;
-
-  /// ICVs
-  ///
-  /// Preallocated storage for ICV values that are used if the threads have not
-  /// set a custom default. The latter is supported but unlikely and slow(er).
-  ///
-  ///{
-  ICVStateTy ICVState;
-  ///}
-
-  uint32_t ParallelTeamSize;
-  ParallelRegionFnTy ParallelRegionFnVar;
-};
-
-TeamStateTy SHARED(TeamState);
-
-void TeamStateTy::init(bool IsSPMD) {
+void state::TeamStateTy::init(bool IsSPMD) {
   ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
   ICVState.LevelVar = 0;
   ICVState.ActiveLevelVar = 0;
@@ -246,65 +206,28 @@
   ParallelRegionFnVar = nullptr;
 }
 
-bool TeamStateTy::operator==(const TeamStateTy &Other) const {
+bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
   return (ICVState == Other.ICVState) &
          (ParallelTeamSize == Other.ParallelTeamSize);
 }
 
-void TeamStateTy::assertEqual(TeamStateTy &Other) const {
+void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
   ICVState.assertEqual(Other.ICVState);
   ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
 }
 
-struct ThreadStateTy {
-
-  /// ICVs have preallocated storage in the TeamStateTy which is used if a
-  /// thread has not set a custom value. The latter is supported but unlikely.
-  /// When it happens we will allocate dynamic memory to hold the values of all
-  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
-  /// ICV struct to hold them all. This is slower than alternatives but allows
-  /// users to pay only for what they use.
-  ///
-  ICVStateTy ICVState;
-
-  ThreadStateTy *PreviousThreadState;
-
-  void init() {
-    ICVState = TeamState.ICVState;
-    PreviousThreadState = nullptr;
-  }
-
-  void init(ThreadStateTy *PreviousTS) {
-    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
-    PreviousThreadState = PreviousTS;
-  }
-};
+/// Definitions of the team-wide state that State.h declares `extern` in
+/// namespace _OMP::state. They are defined under their qualified names,
+/// outside the anonymous namespace, so the inline lookup helpers in the
+/// header operate on these very objects.
+state::TeamStateTy SHARED(_OMP::state::TeamState);
 
 __attribute__((loader_uninitialized))
-ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
-#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+state::ThreadStateTy *_OMP::state::ThreadStates[mapping::MaxThreadsPerTeam];
+#pragma omp allocate(_OMP::state::ThreadStates) allocator(omp_pteam_mem_alloc)
 
-uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
-  if (OMP_LIKELY(!config::mayUseThreadStates() ||
-                 TeamState.ICVState.LevelVar == 0))
-    return TeamState.ICVState.*Var;
-  uint32_t TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(!ThreadStates[TId])) {
-    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
-        sizeof(ThreadStateTy), "ICV modification outside data environment"));
-    ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
-    ThreadStates[TId]->init();
-  }
-  return ThreadStates[TId]->ICVState.*Var;
-}
-
-template <typename IntTy> IntTy &lookupImpl(IntTy ICVStateTy::*Var) {
-  IntTy TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
-    return ThreadStates[TId]->ICVState.*Var;
-  return TeamState.ICVState.*Var;
-}
+namespace {
 
 int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                              int OutOfBoundsVal = -1) {
   if (Level == 0)
@@ -320,50 +243,6 @@
 
 } // namespace
 
-uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
-  switch (Kind) {
-  case state::VK_NThreads:
-    if (IsReadonly)
-      return lookupImpl(&ICVStateTy::NThreadsVar);
-    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
-  case state::VK_Level:
-    if (IsReadonly)
-      return lookupImpl(&ICVStateTy::LevelVar);
-    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
-  case state::VK_ActiveLevel:
-    if (IsReadonly)
-      return lookupImpl(&ICVStateTy::ActiveLevelVar);
-    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
-  case state::VK_MaxActiveLevels:
-    if (IsReadonly)
-      return lookupImpl(&ICVStateTy::MaxActiveLevelsVar);
-    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
-  case state::VK_RunSched:
-    if (IsReadonly)
-      return lookupImpl(&ICVStateTy::RunSchedVar);
-    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
-  case state::VK_RunSchedChunk:
-    if (IsReadonly)
-      return lookupImpl(&ICVStateTy::RunSchedChunkVar);
-    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
-  case state::VK_ParallelTeamSize:
-    return TeamState.ParallelTeamSize;
-  default:
-    break;
-  }
-  __builtin_unreachable();
-}
-
-void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
-  switch (Kind) {
-  case state::VK_ParallelRegionFn:
-    return TeamState.ParallelRegionFnVar;
-  default:
-    break;
-  }
-  __builtin_unreachable();
-}
-
 void state::init(bool IsSPMD) {
   SharedMemorySmartStack.init(IsSPMD);
   if (mapping::isInitialThreadInLevel0(IsSPMD)) {