Skip to content

Commit 022bf16

Browse files
committedSep 21, 2018
[OPENMP][NVPTX] Add support for lastprivates/reductions handling in SPMD constructs with lightweight runtime.
Summary: We need the support for per-team shared variables to support codegen for lastprivates/reductions. Patch adds this support by using shared memory if the total size of the reductions/lastprivates is <= 128 bytes, then pre-allocated buffer in global memory if size is <= 4K bytes,or uses malloc/free, otherwise. Reviewers: gtbercea, kkwli0, grokos Subscribers: guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D51875 llvm-svn: 342737
1 parent 8171bd8 commit 022bf16

File tree

6 files changed

+74
-1
lines changed

6 files changed

+74
-1
lines changed
 

Diff for: ‎openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

+12
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,12 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
378378
// as long as the size requested fits the pre-allocated size.
379379
EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
380380
int16_t UseSharedMemory) {
381+
if (isRuntimeUninitialized()) {
382+
ASSERT0(LT_FUSSY, isSPMDMode(),
383+
"Expected SPMD mode with uninitialized runtime.");
384+
return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
385+
}
386+
381387
// Frame pointer must be visible to all workers in the same warp.
382388
unsigned WID = getWarpId();
383389
void *&FrameP = DataSharingState.FramePtr[WID];
@@ -456,6 +462,12 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
456462
// reclaim all outstanding global memory slots since it is
457463
// likely we have reached the end of the kernel.
458464
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
465+
if (isRuntimeUninitialized()) {
466+
ASSERT0(LT_FUSSY, isSPMDMode(),
467+
"Expected SPMD mode with uninitialized runtime.");
468+
return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
469+
}
470+
459471
if (IsWarpMasterActiveThread()) {
460472
unsigned WID = getWarpId();
461473

Diff for: ‎openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu

+2
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ __device__ __shared__
3838
__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
3939
*omptarget_nvptx_simpleThreadPrivateContext;
4040

41+
__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
42+
4143
////////////////////////////////////////////////////////////////////////////////
4244
// The team master sets the outlined parallel function in this variable to
4345
// communicate with the workers. Since it is in shared memory, there is one

Diff for: ‎openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu

+18
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,23 @@ extern __device__ omptarget_nvptx_Queue<
2525
omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
2626
omptarget_nvptx_device_simpleState[MAX_SM];
2727

28+
extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
29+
2830
////////////////////////////////////////////////////////////////////////////////
2931
// init entry points
3032
////////////////////////////////////////////////////////////////////////////////
3133

34+
INLINE unsigned nsmid() {
35+
unsigned n;
36+
asm("mov.u32 %0, %%nsmid;" : "=r"(n));
37+
return n;
38+
}
39+
3240
INLINE unsigned smid() {
3341
unsigned id;
3442
asm("mov.u32 %0, %%smid;" : "=r"(id));
43+
ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
44+
"Expected number of SMs is less than reported.");
3545
return id;
3646
}
3747

@@ -108,6 +118,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
108118
int slot = smid() % MAX_SM;
109119
omptarget_nvptx_simpleThreadPrivateContext =
110120
omptarget_nvptx_device_simpleState[slot].Dequeue();
121+
// Reuse the memory allocated for the full runtime as the preallocated
122+
// global memory buffer for the lightweight runtime.
123+
omptarget_nvptx_simpleGlobalData =
124+
omptarget_nvptx_device_State[slot].Dequeue();
111125
}
112126
__syncthreads();
113127
omptarget_nvptx_simpleThreadPrivateContext->Init();
@@ -177,6 +191,10 @@ EXTERN void __kmpc_spmd_kernel_deinit() {
177191
int slot = smid() % MAX_SM;
178192
omptarget_nvptx_device_simpleState[slot].Enqueue(
179193
omptarget_nvptx_simpleThreadPrivateContext);
194+
// Enqueue global memory back.
195+
omptarget_nvptx_device_State[slot].Enqueue(
196+
reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
197+
omptarget_nvptx_simpleGlobalData));
180198
}
181199
return;
182200
}

Diff for: ‎openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

+5
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ enum DATA_SHARING_SIZES {
113113
DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
114114
// The maximum number of warps in use
115115
DS_Max_Warp_Number = 32,
116+
// The size of the preallocated shared memory buffer per team
117+
DS_Shared_Memory_Size = 128,
116118
};
117119

118120
// Data structure to keep in shared memory that traces the current slot, stack,
@@ -386,12 +388,15 @@ struct omptarget_device_environmentTy {
386388

387389
class omptarget_nvptx_SimpleThreadPrivateContext {
388390
uint16_t par_level[MAX_THREADS_PER_TEAM];
391+
389392
public:
390393
INLINE void Init() {
391394
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
392395
"Expected SPMD + uninitialized runtime modes.");
393396
par_level[GetThreadIdInBlock()] = 0;
394397
}
398+
static INLINE void *Allocate(size_t DataSize);
399+
static INLINE void Deallocate(void *Ptr);
395400
INLINE void IncParLevel() {
396401
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
397402
"Expected SPMD + uninitialized runtime modes.");

Diff for: ‎openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h

+33
Original file line numberDiff line numberDiff line change
@@ -202,3 +202,36 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
202202
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
203203
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
204204
}
205+
206+
////////////////////////////////////////////////////////////////////////////////
207+
// Lightweight runtime functions.
208+
////////////////////////////////////////////////////////////////////////////////
209+
210+
// Shared memory buffer for globalization support.
211+
static __align__(16) __device__ __shared__ char
212+
omptarget_static_buffer[DS_Shared_Memory_Size];
213+
static __device__ __shared__ void *omptarget_spmd_allocated;
214+
215+
extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
216+
217+
INLINE void *
218+
omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
219+
if (DataSize <= DS_Shared_Memory_Size)
220+
return ::omptarget_static_buffer;
221+
if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
222+
return ::omptarget_nvptx_simpleGlobalData;
223+
if (threadIdx.x == 0)
224+
omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
225+
__syncthreads();
226+
return omptarget_spmd_allocated;
227+
}
228+
229+
INLINE void
230+
omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
231+
if (Ptr != ::omptarget_static_buffer &&
232+
Ptr != ::omptarget_nvptx_simpleGlobalData) {
233+
__syncthreads();
234+
if (threadIdx.x == 0)
235+
SafeFree(Ptr, "SPMD teams dealloc");
236+
}
237+
}

Diff for: ‎openmp/libomptarget/deviceRTLs/nvptx/src/option.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@
3434

3535
// Maximum number of omp state objects per SM allocated statically in global
3636
// memory.
37-
#if __CUDA_ARCH__ >= 600
37+
#if __CUDA_ARCH__ >= 700
38+
#define OMP_STATE_COUNT 32
39+
#define MAX_SM 84
40+
#elif __CUDA_ARCH__ >= 600
3841
#define OMP_STATE_COUNT 32
3942
#define MAX_SM 56
4043
#else

0 commit comments

Comments
 (0)
Please sign in to comment.