diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -109,7 +109,7 @@ } }; -extern ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; +extern ThreadStateTy **ThreadStates; #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) /// Initialize the state machinery. Must be called by all threads. diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h --- a/openmp/libomptarget/DeviceRTL/include/Types.h +++ b/openmp/libomptarget/DeviceRTL/include/Types.h @@ -33,6 +33,9 @@ using int64_t = long; using uint64_t = unsigned long; using size_t = decltype(sizeof(char)); +// TODO: Properly implement this +using intptr_t = int64_t; +using uintptr_t = uint64_t; static_assert(sizeof(int8_t) == 1, "type size mismatch"); static_assert(sizeof(uint8_t) == 1, "type size mismatch"); diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -12,6 +12,7 @@ #include "Configuration.h" #include "Debug.h" #include "Interface.h" +#include "Mapping.h" #include "Synchronization.h" #include "Types.h" #include "Utils.h" @@ -221,10 +222,7 @@ } state::TeamStateTy SHARED(_OMP::state::TeamState); - -__attribute__((loader_uninitialized)) -state::ThreadStateTy *_OMP::state::ThreadStates[mapping::MaxThreadsPerTeam]; -#pragma omp allocate(_OMP::state::ThreadStates) allocator(omp_pteam_mem_alloc) +state::ThreadStateTy **SHARED(_OMP::state::ThreadStates); namespace { @@ -248,18 +246,32 @@ if (mapping::isInitialThreadInLevel0(IsSPMD)) { TeamState.init(IsSPMD); DebugEntryRAII::init(); + ThreadStates = nullptr; } - - ThreadStates[mapping::getThreadIdInBlock()] = nullptr; } void state::enterDataEnvironment(IdentTy *Ident) { 
ASSERT(config::mayUseThreadStates() && "Thread state modified while explicitly disabled!"); + if (!config::mayUseThreadStates()) + return; unsigned TId = mapping::getThreadIdInBlock(); ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy))); + uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates); + if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { + uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getBlockSize(); + void *ThreadStatesPtr = + memory::allocShared(Bytes, "Thread state array allocation"); + if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), + reinterpret_cast<uintptr_t>(ThreadStatesPtr), + atomic::seq_cst, atomic::seq_cst)) + memory::freeShared(ThreadStatesPtr, Bytes, + "Thread state array allocated multiple times"); + ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst) && + "Expected valid thread states bit!"); + } NewThreadState->init(ThreadStates[TId]); TeamState.HasThreadState = true; ThreadStates[TId] = NewThreadState; @@ -274,6 +286,8 @@ } void state::resetStateForThread(uint32_t TId) { + if (!config::mayUseThreadStates()) + return; if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) return; @@ -295,7 +309,6 @@ TeamStateTy InitialTeamState; InitialTeamState.init(IsSPMD); InitialTeamState.assertEqual(TeamState); - ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]); ASSERT(mapping::isSPMDMode() == IsSPMD); }