Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -378,6 +378,12 @@ // as long as the size requested fits the pre-allocated size. EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, int16_t UseSharedMemory) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode with uninitialized runtime."); + return omptarget_nvptx_simpleThreadPrivateContext->Allocate(DataSize); + } + // Frame pointer must be visible to all workers in the same warp. unsigned WID = getWarpId(); void *&FrameP = DataSharingState.FramePtr[WID]; @@ -456,6 +462,12 @@ // reclaim all outstanding global memory slots since it is // likely we have reached the end of the kernel. EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode with uninitialized runtime."); + return omptarget_nvptx_simpleThreadPrivateContext->Deallocate(FrameStart); + } + if (IsWarpMasterActiveThread()) { unsigned WID = getWarpId(); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -113,6 +113,10 @@ DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, // The maximum number of warps in use DS_Max_Warp_Number = 32, + // The size of the preallocated shared memory buffer per team + DS_Shared_Memory_Size = 128, + // The size of the preallocated global memory buffer per team + DS_Global_Memory_Size = 4096, }; // Data structure to keep in shared memory that traces the current slot, stack, @@ -386,12 +390,15 @@ class omptarget_nvptx_SimpleThreadPrivateContext { uint16_t par_level[MAX_THREADS_PER_TEAM]; + char global_buffer[DS_Global_Memory_Size]; public: INLINE void Init() { ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), "Expected SPMD + uninitialized runtime modes."); par_level[GetThreadIdInBlock()] = 0; } + INLINE void *Allocate(size_t DataSize); + INLINE void Deallocate(void *Ptr); INLINE void IncParLevel() { ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), "Expected SPMD + uninitialized runtime modes."); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -29,9 +29,17 @@ // init entry points //////////////////////////////////////////////////////////////////////////////// +INLINE unsigned nsmid() { + unsigned n; + asm("mov.u32 %0, %%nsmid;" : "=r"(n)); + return n; +} + INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); + ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, + "Expectednumber of SMs is less than reported."); return id; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -202,3 +202,31 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() { return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); } + +//////////////////////////////////////////////////////////////////////////////// +// Lightweight runtime functions. +//////////////////////////////////////////////////////////////////////////////// + +// Shared memory buffer for globalization support. +static __align__(16) __device__ __shared__ char + omptarget_static_buffer[DS_Shared_Memory_Size]; +static __device__ __shared__ void *omptarget_spmd_allocated; +INLINE void * +omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) { + if (DataSize <= DS_Shared_Memory_Size) + return ::omptarget_static_buffer; + if (DataSize <= DS_Global_Memory_Size) + return global_buffer; + if (threadIdx.x == 0) + omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc"); + __syncthreads(); + return omptarget_spmd_allocated; +} + +INLINE void omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) { + if (Ptr != ::omptarget_static_buffer && Ptr != global_buffer) { + __syncthreads(); + if (threadIdx.x == 0) + SafeFree(Ptr, "SPMD teams dealloc"); + } +} Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -34,7 +34,10 @@ // Maximum number of omp state objects per SM allocated statically in global // memory. -#if __CUDA_ARCH__ >= 600 +#if __CUDA_ARCH__ >= 700 +#define OMP_STATE_COUNT 32 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 #define OMP_STATE_COUNT 32 #define MAX_SM 56 #else