Skip to content

Commit 463e9f3

Browse files
committedNov 2, 2018
[OPENMP][NVPTX]Fixed/improved support for globalization in team contexts.
Summary: Current globalization scheme works correctly only for SPMD+lightweight runtime mode and does not work for full runtime. Patch improves support for the globalization scheme + reduces global memory consumption in lightweight runtime mode. Patch adds runtime functions to work with the statically allocated global memory. It allows to improve performance and memory consumption. This global memory must be allocated by the compiler. Reviewers: grokos, kkwli0, gtbercea, caomhin Subscribers: guansong, jfb, openmp-commits Differential Revision: https://reviews.llvm.org/D53943 llvm-svn: 345976
1 parent f070544 commit 463e9f3

File tree

6 files changed

+102
-71
lines changed

6 files changed

+102
-71
lines changed
 

‎openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

+43-10
Original file line numberDiff line numberDiff line change
@@ -370,11 +370,7 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
370370
}
371371

372372
INLINE void* data_sharing_push_stack_common(size_t PushSize) {
373-
if (isRuntimeUninitialized()) {
374-
ASSERT0(LT_FUSSY, isSPMDMode(),
375-
"Expected SPMD mode with uninitialized runtime.");
376-
return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
377-
}
373+
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
378374

379375
// Only warp active master threads manage the stack.
380376
bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
@@ -480,11 +476,7 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
480476
// reclaim all outstanding global memory slots since it is
481477
// likely we have reached the end of the kernel.
482478
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
483-
if (isRuntimeUninitialized()) {
484-
ASSERT0(LT_FUSSY, isSPMDMode(),
485-
"Expected SPMD mode with uninitialized runtime.");
486-
return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
487-
}
479+
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
488480

489481
__threadfence_block();
490482

@@ -544,3 +536,44 @@ EXTERN void __kmpc_end_sharing_variables() {
544536
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
545537
*GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
546538
}
539+
540+
// This function is used to init static memory manager. This manager is used to
541+
// manage statically allocated global memory. This memory is allocated by the
542+
// compiler and used to correctly implement globalization of the variables in
543+
// target, teams and distribute regions.
544+
EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
545+
int16_t is_shared,
546+
const void **frame) {
547+
if (is_shared) {
548+
*frame = buf;
549+
return;
550+
}
551+
if (isSPMDMode()) {
552+
if (GetThreadIdInBlock() == 0) {
553+
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
554+
}
555+
__syncthreads();
556+
return;
557+
}
558+
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
559+
"Must be called only in the target master thread.");
560+
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
561+
__threadfence();
562+
}
563+
564+
EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) {
565+
if (is_shared)
566+
return;
567+
if (isSPMDMode()) {
568+
__syncthreads();
569+
if (GetThreadIdInBlock() == 0) {
570+
omptarget_nvptx_simpleMemoryManager.Release();
571+
}
572+
return;
573+
}
574+
__threadfence();
575+
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
576+
"Must be called only in the target master thread.");
577+
omptarget_nvptx_simpleMemoryManager.Release();
578+
}
579+

‎openmp/libomptarget/deviceRTLs/nvptx/src/interface.h

+6
Original file line numberDiff line numberDiff line change
@@ -514,4 +514,10 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
514514

515515
// SPMD execution mode interrogation function.
516516
EXTERN int8_t __kmpc_is_spmd_exec_mode();
517+
518+
EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
519+
int16_t is_shared, const void **res);
520+
521+
EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared);
522+
517523
#endif

‎openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu

+5-2
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,18 @@ __device__ omptarget_nvptx_Queue<omptarget_nvptx_SimpleThreadPrivateContext,
3131
OMP_STATE_COUNT>
3232
omptarget_nvptx_device_simpleState[MAX_SM];
3333

34+
__device__ omptarget_nvptx_SimpleMemoryManager
35+
omptarget_nvptx_simpleMemoryManager;
36+
__device__ __shared__ uint32_t usedMemIdx;
37+
__device__ __shared__ uint32_t usedSlotIdx;
38+
3439
// Pointer to this team's OpenMP state object
3540
__device__ __shared__
3641
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
3742

3843
__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
3944
*omptarget_nvptx_simpleThreadPrivateContext;
4045

41-
__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
42-
4346
////////////////////////////////////////////////////////////////////////////////
4447
// The team master sets the outlined parallel function in this variable to
4548
// communicate with the workers. Since it is in shared memory, there is one

‎openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu

+6-26
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,10 @@ extern __device__ omptarget_nvptx_Queue<
2525
omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
2626
omptarget_nvptx_device_simpleState[MAX_SM];
2727

28-
extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
29-
3028
////////////////////////////////////////////////////////////////////////////////
3129
// init entry points
3230
////////////////////////////////////////////////////////////////////////////////
3331

34-
INLINE unsigned nsmid() {
35-
unsigned n;
36-
asm("mov.u32 %0, %%nsmid;" : "=r"(n));
37-
return n;
38-
}
39-
4032
INLINE unsigned smid() {
4133
unsigned id;
4234
asm("mov.u32 %0, %%smid;" : "=r"(id));
@@ -64,11 +56,9 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
6456

6557
// Get a state object from the queue.
6658
int slot = smid() % MAX_SM;
59+
usedSlotIdx = slot;
6760
omptarget_nvptx_threadPrivateContext =
6861
omptarget_nvptx_device_State[slot].Dequeue();
69-
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
70-
omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot);
71-
#endif
7262

7363
// init thread private
7464
int threadId = GetLogicalThreadIdInBlock();
@@ -94,11 +84,7 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
9484
ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
9585
"Generic always requires initialized runtime.");
9686
// Enqueue omp state object for use by another team.
97-
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
98-
int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
99-
#else
100-
int slot = smid() % MAX_SM;
101-
#endif
87+
int slot = usedSlotIdx;
10288
omptarget_nvptx_device_State[slot].Enqueue(
10389
omptarget_nvptx_threadPrivateContext);
10490
// Done with work. Kill the workers.
@@ -114,12 +100,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
114100
setExecutionParameters(Spmd, RuntimeUninitialized);
115101
if (GetThreadIdInBlock() == 0) {
116102
int slot = smid() % MAX_SM;
103+
usedSlotIdx = slot;
117104
omptarget_nvptx_simpleThreadPrivateContext =
118105
omptarget_nvptx_device_simpleState[slot].Dequeue();
119-
// Reuse the memory allocated for the full runtime as the preallocated
120-
// global memory buffer for the lightweight runtime.
121-
omptarget_nvptx_simpleGlobalData =
122-
omptarget_nvptx_device_State[slot].Dequeue();
123106
}
124107
__syncthreads();
125108
omptarget_nvptx_simpleThreadPrivateContext->Init();
@@ -136,6 +119,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
136119
if (threadId == 0) {
137120
// Get a state object from the queue.
138121
int slot = smid() % MAX_SM;
122+
usedSlotIdx = slot;
139123
omptarget_nvptx_threadPrivateContext =
140124
omptarget_nvptx_device_State[slot].Dequeue();
141125

@@ -186,19 +170,15 @@ EXTERN void __kmpc_spmd_kernel_deinit() {
186170
if (isRuntimeUninitialized()) {
187171
if (threadId == 0) {
188172
// Enqueue omp state object for use by another team.
189-
int slot = smid() % MAX_SM;
173+
int slot = usedSlotIdx;
190174
omptarget_nvptx_device_simpleState[slot].Enqueue(
191175
omptarget_nvptx_simpleThreadPrivateContext);
192-
// Enqueue global memory back.
193-
omptarget_nvptx_device_State[slot].Enqueue(
194-
reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
195-
omptarget_nvptx_simpleGlobalData));
196176
}
197177
return;
198178
}
199179
if (threadId == 0) {
200180
// Enqueue omp state object for use by another team.
201-
int slot = smid() % MAX_SM;
181+
int slot = usedSlotIdx;
202182
omptarget_nvptx_device_State[slot].Enqueue(
203183
omptarget_nvptx_threadPrivateContext);
204184
}

‎openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

+20-6
Original file line numberDiff line numberDiff line change
@@ -344,8 +344,6 @@ class omptarget_nvptx_ThreadPrivateContext {
344344
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
345345

346346
INLINE void InitThreadPrivateContext(int tid);
347-
INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
348-
INLINE uint64_t GetSourceQueue() { return SourceQueue; }
349347

350348
private:
351349
// team context for this team
@@ -368,15 +366,29 @@ class omptarget_nvptx_ThreadPrivateContext {
368366
// state for dispatch with dyn/guided OR static (never use both at a time)
369367
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
370368
int64_t stride[MAX_THREADS_PER_TEAM];
371-
// Queue to which this object must be returned.
372-
uint64_t SourceQueue;
373369
};
374370

375371
/// Device envrionment data
376372
struct omptarget_device_environmentTy {
377373
int32_t debug_level;
378374
};
379375

376+
/// Memory manager for statically allocated memory.
377+
class omptarget_nvptx_SimpleMemoryManager {
378+
private:
379+
__align__(128) struct MemDataTy {
380+
volatile unsigned keys[OMP_STATE_COUNT];
381+
} MemData[MAX_SM];
382+
383+
INLINE uint32_t hash(unsigned key) const {
384+
return key & (OMP_STATE_COUNT - 1);
385+
}
386+
387+
public:
388+
INLINE void Release();
389+
INLINE const void *Acquire(const void *buf, size_t size);
390+
};
391+
380392
class omptarget_nvptx_SimpleThreadPrivateContext {
381393
uint16_t par_level[MAX_THREADS_PER_TEAM];
382394

@@ -386,8 +398,6 @@ class omptarget_nvptx_SimpleThreadPrivateContext {
386398
"Expected SPMD + uninitialized runtime modes.");
387399
par_level[GetThreadIdInBlock()] = 0;
388400
}
389-
static INLINE void *Allocate(size_t DataSize);
390-
static INLINE void Deallocate(void *Ptr);
391401
INLINE void IncParLevel() {
392402
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
393403
"Expected SPMD + uninitialized runtime modes.");
@@ -424,6 +434,10 @@ extern __device__ omptarget_device_environmentTy omptarget_device_environment;
424434
// global data tables
425435
////////////////////////////////////////////////////////////////////////////////
426436

437+
extern __device__ omptarget_nvptx_SimpleMemoryManager
438+
omptarget_nvptx_simpleMemoryManager;
439+
extern __device__ __shared__ uint32_t usedMemIdx;
440+
extern __device__ __shared__ uint32_t usedSlotIdx;
427441
extern __device__ __shared__
428442
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
429443
extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext

‎openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h

+22-27
Original file line numberDiff line numberDiff line change
@@ -204,34 +204,29 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
204204
}
205205

206206
////////////////////////////////////////////////////////////////////////////////
207-
// Lightweight runtime functions.
207+
// Memory management runtime functions.
208208
////////////////////////////////////////////////////////////////////////////////
209209

210-
// Shared memory buffer for globalization support.
211-
static __align__(16) __device__ __shared__ char
212-
omptarget_static_buffer[DS_Shared_Memory_Size];
213-
static __device__ __shared__ void *omptarget_spmd_allocated;
214-
215-
extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
216-
217-
INLINE void *
218-
omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
219-
if (DataSize <= DS_Shared_Memory_Size)
220-
return ::omptarget_static_buffer;
221-
if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
222-
return ::omptarget_nvptx_simpleGlobalData;
223-
if (threadIdx.x == 0)
224-
omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
225-
__syncthreads();
226-
return omptarget_spmd_allocated;
227-
}
228-
229-
INLINE void
230-
omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
231-
if (Ptr != ::omptarget_static_buffer &&
232-
Ptr != ::omptarget_nvptx_simpleGlobalData) {
233-
__syncthreads();
234-
if (threadIdx.x == 0)
235-
SafeFree(Ptr, "SPMD teams dealloc");
210+
INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
211+
ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
212+
"SlotIdx is too big or uninitialized.");
213+
ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
214+
"MemIdx is too big or uninitialized.");
215+
MemDataTy &MD = MemData[usedSlotIdx];
216+
atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
217+
}
218+
219+
INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
220+
size_t size) {
221+
ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
222+
"SlotIdx is too big or uninitialized.");
223+
const unsigned sm = usedSlotIdx;
224+
MemDataTy &MD = MemData[sm];
225+
unsigned i = hash(GetBlockIdInKernel());
226+
while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
227+
i = hash(i + 1);
236228
}
229+
usedSlotIdx = sm;
230+
usedMemIdx = i;
231+
return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
237232
}

0 commit comments

Comments
 (0)
Please sign in to comment.