Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -129,7 +129,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); @@ -283,7 +283,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; SlotP = *SavedSharedSlot; StackP = *SavedSharedStack; @@ -321,7 +321,7 @@ DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); - void *P = DataSharingState.FramePtr[SourceWID]; + void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); return P; } @@ -369,47 +369,31 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. 
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); - return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize); + return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize); } + // Only warp active master threads manage the stack. + bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0; + // Add worst-case padding to DataSize so that future stack allocations are // correctly aligned. const size_t Alignment = 8; - if (DataSize % Alignment != 0) { - DataSize += (Alignment - DataSize % Alignment); - } + PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; // Frame pointer must be visible to all workers in the same warp. unsigned WID = getWarpId(); - void *&FrameP = DataSharingState.FramePtr[WID]; + void *volatile &FrameP = DataSharingState.FramePtr[WID]; - // Only warp active master threads manage the stack. - if (getThreadId() % WARPSIZE == 0) { + if (IsWarpMaster) { // SlotP will point to either the shared memory slot or an existing // global memory slot. __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize; - // Check if we have room for the data in the current slot. 
const uintptr_t StartAddress = (uintptr_t)StackP; const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; @@ -453,12 +437,39 @@ // Reset stack pointer to the requested address. StackP = (void *)RequestedEndAddress; } + } else { + while (!FrameP); } - __threadfence_block(); + return FrameP; +} + +EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + return data_sharing_push_stack_common(DataSize); +} + +// Called at the time of the kernel initialization. This is used to initialize +// the list of references to shared variables and to pre-allocate global storage +// for holding the globalized variables. +// +// By default the globalized variables are stored in global memory. If the +// UseSharedMemory is set to true, the runtime will attempt to use shared memory +// as long as the size requested fits the pre-allocated size. +EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + // Compute the total memory footprint of the requested data. + // The master thread requires a stack only for itself. A worker + // thread (which at this point is a warp master) will require + // space for the variables of each thread in the warp, + // i.e. one DataSize chunk per warp lane. + // TODO: change WARPSIZE to the number of active threads in the warp. + size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ? + DataSize : WARPSIZE * DataSize; // Compute the start address of the frame of each thread in the warp. 
- uintptr_t FrameStartAddress = (uintptr_t)FrameP; + uintptr_t FrameStartAddress = + (uintptr_t) data_sharing_push_stack_common(PushSize); FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); return (void *)FrameStartAddress; } @@ -475,6 +486,8 @@ return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart); } + __threadfence_block(); + if (getThreadId() % WARPSIZE == 0) { unsigned WID = getWarpId(); @@ -501,8 +514,6 @@ SlotP->Next = 0; } } - - __threadfence_block(); } // Begin a data sharing context. Maintain a list of references to shared Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, + int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -123,7 +123,7 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; // Additional worker slot type which is initialized with the default worker slot Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ 
libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -40,8 +40,6 @@ INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); - ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, - "Expected number of SMs is less than reported."); return id; } @@ -156,7 +154,6 @@ // omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); newTaskDescr->ThreadLimit() = ThreadLimit; Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -188,7 +188,6 @@ { void *ptr = malloc(size); PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; }