Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -369,15 +369,8 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t DataSize, + size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); @@ -387,9 +380,7 @@ // Add worst-case padding to DataSize so that future stack allocations are // correctly aligned. const size_t Alignment = 8; - if (DataSize % Alignment != 0) { - DataSize += (Alignment - DataSize % Alignment); - } + PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; // Frame pointer must be visible to all workers in the same warp. unsigned WID = getWarpId(); @@ -402,14 +393,6 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = IsMasterThread() ? 
DataSize : WARPSIZE * DataSize; - // Check if we have room for the data in the current slot. const uintptr_t StartAddress = (uintptr_t)StackP; const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; @@ -457,8 +440,35 @@ __threadfence_block(); + return FrameP; +} + +EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + size_t PushSize = DataSize; + return data_sharing_push_stack_common(DataSize, PushSize); +} + +// Called at the time of the kernel initialization. This is used to initialize +// the list of references to shared variables and to pre-allocate global storage +// for holding the globalized variables. +// +// By default the globalized variables are stored in global memory. If the +// UseSharedMemory is set to true, the runtime will attempt to use shared memory +// as long as the size requested fits the pre-allocated size. +EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + // Compute the total memory footprint of the requested data. + // The master thread requires a stack only for itself. A worker + // thread (which at this point is a warp master) will require + // space for the variables of each thread in the warp, + // i.e. one DataSize chunk per warp lane. + // TODO: change WARPSIZE to the number of active threads in the warp. + size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize; + // Compute the start address of the frame of each thread in the warp. 
- uintptr_t FrameStartAddress = (uintptr_t)FrameP; + uintptr_t FrameStartAddress = + (uintptr_t) data_sharing_push_stack_common(DataSize, PushSize); FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); return (void *)FrameStartAddress; } @@ -475,6 +485,8 @@ return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart); } + __threadfence_block(); + if (getThreadId() % WARPSIZE == 0) { unsigned WID = getWarpId(); @@ -501,8 +513,6 @@ SlotP->Next = 0; } } - - __threadfence_block(); } // Begin a data sharing context. Maintain a list of references to shared Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, + int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);