Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -378,6 +378,12 @@ // as long as the size requested fits the pre-allocated size. EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, int16_t UseSharedMemory) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode with uninitialized runtime."); + return omptarget_nvptx_simpleThreadPrivateContext->Allocate(DataSize); + } + // Frame pointer must be visible to all workers in the same warp. unsigned WID = getWarpId(); void *&FrameP = DataSharingState.FramePtr[WID]; @@ -456,6 +462,12 @@ // reclaim all outstanding global memory slots since it is // likely we have reached the end of the kernel. EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode with uninitialized runtime."); + return omptarget_nvptx_simpleThreadPrivateContext->Deallocate(FrameStart); + } + if (IsWarpMasterActiveThread()) { unsigned WID = getWarpId(); Index: libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -38,6 +38,8 @@ __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext *omptarget_nvptx_simpleThreadPrivateContext; +__device__ __shared__ void *omptarget_nvptx_simpleGlobalData; + //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -113,6 +113,8 @@ DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, // The maximum number of warps in use DS_Max_Warp_Number = 32, + // The size of the preallocated shared memory buffer per team + DS_Shared_Memory_Size = 128, }; // Data structure to keep in shared memory that traces the current slot, stack, @@ -386,12 +388,15 @@ class omptarget_nvptx_SimpleThreadPrivateContext { uint16_t par_level[MAX_THREADS_PER_TEAM]; + public: INLINE void Init() { ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), "Expected SPMD + uninitialized runtime modes."); par_level[GetThreadIdInBlock()] = 0; } + INLINE void *Allocate(size_t DataSize); + INLINE void Deallocate(void *Ptr); INLINE void IncParLevel() { ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), "Expected SPMD + uninitialized runtime modes."); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -25,13 +25,23 @@ omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT> omptarget_nvptx_device_simpleState[MAX_SM]; +extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData; + //////////////////////////////////////////////////////////////////////////////// // init entry points //////////////////////////////////////////////////////////////////////////////// +INLINE unsigned nsmid() { + unsigned n; + asm("mov.u32 %0, %%nsmid;" : "=r"(n)); + return n; +} + INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); + ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, + "Expected number of SMs is less than reported."); return id; } @@ -108,6 +118,10 @@ int slot = smid() % MAX_SM; omptarget_nvptx_simpleThreadPrivateContext = omptarget_nvptx_device_simpleState[slot].Dequeue(); + // Reuse the memory allocated for the full runtime as the preallocated + // global memory buffer for the lightweight runtime. + omptarget_nvptx_simpleGlobalData = + omptarget_nvptx_device_State[slot].Dequeue(); } __syncthreads(); omptarget_nvptx_simpleThreadPrivateContext->Init(); @@ -177,6 +191,10 @@ int slot = smid() % MAX_SM; omptarget_nvptx_device_simpleState[slot].Enqueue( omptarget_nvptx_simpleThreadPrivateContext); + // Enqueue global memory back. + omptarget_nvptx_device_State[slot].Enqueue( + reinterpret_cast( + omptarget_nvptx_simpleGlobalData)); } return; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -202,3 +202,35 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() { return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); } + +//////////////////////////////////////////////////////////////////////////////// +// Lightweight runtime functions. +//////////////////////////////////////////////////////////////////////////////// + +// Shared memory buffer for globalization support. +static __align__(16) __device__ __shared__ char + omptarget_static_buffer[DS_Shared_Memory_Size]; +static __device__ __shared__ void *omptarget_spmd_allocated; + +extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData; + +INLINE void * +omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) { + if (DataSize <= DS_Shared_Memory_Size) + return ::omptarget_static_buffer; + if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext)) + return ::omptarget_nvptx_simpleGlobalData; + if (threadIdx.x == 0) + omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc"); + __syncthreads(); + return omptarget_spmd_allocated; +} + +INLINE void omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) { + if (Ptr != ::omptarget_static_buffer && + Ptr != ::omptarget_nvptx_simpleGlobalData) { + __syncthreads(); + if (threadIdx.x == 0) + SafeFree(Ptr, "SPMD teams dealloc"); + } +} Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -34,7 +34,10 @@ // Maximum number of omp state objects per SM allocated statically in global // memory. -#if __CUDA_ARCH__ >= 600 +#if __CUDA_ARCH__ >= 700 +#define OMP_STATE_COUNT 32 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 #define OMP_STATE_COUNT 32 #define MAX_SM 56 #else