Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -370,11 +370,7 @@ } INLINE void* data_sharing_push_stack_common(size_t PushSize) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode with uninitialized runtime."); - return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize); - } + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); // Only warp active master threads manage the stack. bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0; @@ -480,11 +476,7 @@ // reclaim all outstanding global memory slots since it is // likely we have reached the end of the kernel. EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode with uninitialized runtime."); - return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart); - } + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); __threadfence_block(); @@ -544,3 +536,44 @@ EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); } + +// This function is used to init static memory manager. This manager is used to +// manage statically allocated global memory. This memory is allocated by the +// compiler and used to correctly implement globalization of the variables in +// target, teams and distribute regions. +EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size, + int16_t is_shared, + const void **frame) { + if (is_shared) { + *frame = buf; + return; + } + if (isSPMDMode()) { + if (GetThreadIdInBlock() == 0) { + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + } + __syncthreads(); + return; + } + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(), + "Must be called only in the target master thread."); + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + __threadfence(); +} + +EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) { + if (is_shared) + return; + if (isSPMDMode()) { + __syncthreads(); + if (GetThreadIdInBlock() == 0) { + omptarget_nvptx_simpleMemoryManager.Release(); + } + return; + } + __threadfence(); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(), + "Must be called only in the target master thread."); + omptarget_nvptx_simpleMemoryManager.Release(); +} + Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -514,4 +514,10 @@ // SPMD execution mode interrogation function. EXTERN int8_t __kmpc_is_spmd_exec_mode(); + +EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size, + int16_t is_shared, const void **res); + +EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared); + #endif Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -31,6 +31,11 @@ OMP_STATE_COUNT> omptarget_nvptx_device_simpleState[MAX_SM]; +__device__ omptarget_nvptx_SimpleMemoryManager + omptarget_nvptx_simpleMemoryManager; +__device__ __shared__ uint32_t usedMemIdx; +__device__ __shared__ uint32_t usedSlotIdx; + // Pointer to this team's OpenMP state object __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; @@ -38,8 +43,6 @@ __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext *omptarget_nvptx_simpleThreadPrivateContext; -__device__ __shared__ void *omptarget_nvptx_simpleGlobalData; - //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -344,8 +344,6 @@ INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } INLINE void InitThreadPrivateContext(int tid); - INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; } - INLINE uint64_t GetSourceQueue() { return SourceQueue; } private: // team context for this team @@ -368,8 +366,6 @@ // state for dispatch with dyn/guided OR static (never use both at a time) int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; int64_t stride[MAX_THREADS_PER_TEAM]; - // Queue to which this object must be returned. - uint64_t SourceQueue; }; /// Device envrionment data @@ -377,6 +373,22 @@ int32_t debug_level; }; +/// Memory manager for statically allocated memory. +class omptarget_nvptx_SimpleMemoryManager { +private: + __align__(128) struct MemDataTy { + volatile unsigned keys[OMP_STATE_COUNT]; + } MemData[MAX_SM]; + + INLINE uint32_t hash(unsigned key) const { + return key & (OMP_STATE_COUNT - 1); + } + +public: + INLINE void Release(); + INLINE const void *Acquire(const void *buf, size_t size); +}; + class omptarget_nvptx_SimpleThreadPrivateContext { uint16_t par_level[MAX_THREADS_PER_TEAM]; @@ -386,8 +398,6 @@ "Expected SPMD + uninitialized runtime modes."); par_level[GetThreadIdInBlock()] = 0; } - static INLINE void *Allocate(size_t DataSize); - static INLINE void Deallocate(void *Ptr); INLINE void IncParLevel() { ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), "Expected SPMD + uninitialized runtime modes."); @@ -424,6 +434,10 @@ // global data tables //////////////////////////////////////////////////////////////////////////////// +extern __device__ omptarget_nvptx_SimpleMemoryManager + omptarget_nvptx_simpleMemoryManager; +extern __device__ __shared__ uint32_t usedMemIdx; +extern __device__ __shared__ uint32_t usedSlotIdx; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -25,18 +25,10 @@ omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT> omptarget_nvptx_device_simpleState[MAX_SM]; -extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData; - //////////////////////////////////////////////////////////////////////////////// // init entry points //////////////////////////////////////////////////////////////////////////////// -INLINE unsigned nsmid() { - unsigned n; - asm("mov.u32 %0, %%nsmid;" : "=r"(n)); - return n; -} - INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); @@ -64,11 +56,9 @@ // Get a state object from the queue. int slot = smid() % MAX_SM; + usedSlotIdx = slot; omptarget_nvptx_threadPrivateContext = omptarget_nvptx_device_State[slot].Dequeue(); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot); -#endif // init thread private int threadId = GetLogicalThreadIdInBlock(); @@ -94,11 +84,7 @@ ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Generic always requires initialized runtime."); // Enqueue omp state object for use by another team. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue(); -#else - int slot = smid() % MAX_SM; -#endif + int slot = usedSlotIdx; omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); // Done with work. Kill the workers. @@ -114,12 +100,9 @@ setExecutionParameters(Spmd, RuntimeUninitialized); if (GetThreadIdInBlock() == 0) { int slot = smid() % MAX_SM; + usedSlotIdx = slot; omptarget_nvptx_simpleThreadPrivateContext = omptarget_nvptx_device_simpleState[slot].Dequeue(); - // Reuse the memory allocated for the full runtime as the preallocated - // global memory buffer for the lightweight runtime. - omptarget_nvptx_simpleGlobalData = - omptarget_nvptx_device_State[slot].Dequeue(); } __syncthreads(); omptarget_nvptx_simpleThreadPrivateContext->Init(); @@ -136,6 +119,7 @@ if (threadId == 0) { // Get a state object from the queue. int slot = smid() % MAX_SM; + usedSlotIdx = slot; omptarget_nvptx_threadPrivateContext = omptarget_nvptx_device_State[slot].Dequeue(); @@ -186,19 +170,15 @@ if (isRuntimeUninitialized()) { if (threadId == 0) { // Enqueue omp state object for use by another team. - int slot = smid() % MAX_SM; + int slot = usedSlotIdx; omptarget_nvptx_device_simpleState[slot].Enqueue( omptarget_nvptx_simpleThreadPrivateContext); - // Enqueue global memory back. - omptarget_nvptx_device_State[slot].Enqueue( - reinterpret_cast( - omptarget_nvptx_simpleGlobalData)); } return; } if (threadId == 0) { // Enqueue omp state object for use by another team. - int slot = smid() % MAX_SM; + int slot = usedSlotIdx; omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); } Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -204,34 +204,29 @@ } //////////////////////////////////////////////////////////////////////////////// -// Lightweight runtime functions. +// Memory management runtime functions. //////////////////////////////////////////////////////////////////////////////// -// Shared memory buffer for globalization support. -static __align__(16) __device__ __shared__ char - omptarget_static_buffer[DS_Shared_Memory_Size]; -static __device__ __shared__ void *omptarget_spmd_allocated; - -extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData; - -INLINE void * -omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) { - if (DataSize <= DS_Shared_Memory_Size) - return ::omptarget_static_buffer; - if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext)) - return ::omptarget_nvptx_simpleGlobalData; - if (threadIdx.x == 0) - omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc"); - __syncthreads(); - return omptarget_spmd_allocated; -} - -INLINE void -omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) { - if (Ptr != ::omptarget_static_buffer && - Ptr != ::omptarget_nvptx_simpleGlobalData) { - __syncthreads(); - if (threadIdx.x == 0) - SafeFree(Ptr, "SPMD teams dealloc"); +INLINE void omptarget_nvptx_SimpleMemoryManager::Release() { + ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, + "SlotIdx is too big or uninitialized."); + ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT, + "MemIdx is too big or uninitialized."); + MemDataTy &MD = MemData[usedSlotIdx]; + atomicExch((unsigned *)&MD.keys[usedMemIdx], 0); +} + +INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, + size_t size) { + ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, + "SlotIdx is too big or uninitialized."); + const unsigned sm = usedSlotIdx; + MemDataTy &MD = MemData[sm]; + unsigned i = hash(GetBlockIdInKernel()); + while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) { + i = hash(i + 1); } + usedSlotIdx = sm; + usedMemIdx = i; + return static_cast(buf) + (sm * OMP_STATE_COUNT + i) * size; }