diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -207,7 +207,8 @@ int8_t __kmpc_is_spmd_exec_mode(); int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, - bool UseGenericStateMachine, bool); + bool UseGenericStateMachine, + bool RequiresFullRuntime, uint32_t StackSize); void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool); diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -21,10 +21,8 @@ namespace state { -inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; - /// Initialize the state machinery. Must be called by all threads. -void init(bool IsSPMD); +void init(bool IsSPMD, uint32_t StackSize); /// TODO enum ValueKind { diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -21,11 +21,11 @@ #pragma omp declare target -static void inititializeRuntime(bool IsSPMD) { +static void inititializeRuntime(bool IsSPMD, uint32_t StackSize) { // Order is important here. synchronize::init(IsSPMD); mapping::init(IsSPMD); - state::init(IsSPMD); + state::init(IsSPMD, StackSize); } /// Simple generic state machine for worker threads. @@ -65,12 +65,13 @@ /// \param Ident Source location identification, can be NULL. 
/// int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, - bool UseGenericStateMachine, bool) { + bool UseGenericStateMachine, + bool RequiresFullRuntime, uint32_t StackSize) { if (IsSPMD) { - inititializeRuntime(/* IsSPMD */ true); + inititializeRuntime(/* IsSPMD */ true, StackSize); synchronize::threads(); } else { - inititializeRuntime(/* IsSPMD */ false); + inititializeRuntime(/* IsSPMD */ false, StackSize); // No need to wait since only the main threads will execute user // code and workers will run into a barrier right away. } diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -75,7 +75,7 @@ /// struct SharedMemorySmartStackTy { /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); + void init(bool IsSPMD, uint32_t StackSize); /// Allocate \p Bytes on the stack for the encountering thread. Each thread /// can call this function. @@ -89,7 +89,7 @@ /// Compute the size of the storage space reserved for a thread. uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements(); - return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), + return utils::align_down((SharedScratchpadSize / NumLanesInBlock), Alignment); } @@ -99,20 +99,25 @@ return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; } + /// The size of the shared scratchpad, initialized by a single thread. + uint32_t SharedScratchpadSize; + /// The actual storage, shared among all warps. 
- unsigned char Data[state::SharedScratchpadSize] - __attribute__((aligned(Alignment))); - unsigned char Usage[mapping::MaxThreadsPerTeam] - __attribute__((aligned(Alignment))); + unsigned char *Data __attribute__((aligned(Alignment))); + unsigned char *Usage __attribute__((aligned(Alignment))); }; -static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, - "Shared scratchpad of this size not supported yet."); - /// The allocation of a single shared memory scratchpad. static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); -void SharedMemorySmartStackTy::init(bool IsSPMD) { +void SharedMemorySmartStackTy::init(bool IsSPMD, uint32_t StackSize) { + if (mapping::getThreadIdInBlock() == 0) { + SharedScratchpadSize = StackSize; + Data = static_cast<unsigned char *>(memory::getDynamicBuffer()) + + utils::align_up(config::getDynamicMemorySize(), Alignment); + Usage = Data + utils::align_up(StackSize, Alignment); + } + synchronize::threads(); Usage[mapping::getThreadIdInBlock()] = 0; } @@ -141,7 +146,7 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { uint64_t AlignedBytes = utils::align_up(Bytes, Alignment); - if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) { + if (Ptr >= &Data[0] && Ptr < &Data[SharedScratchpadSize]) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; return; } @@ -357,8 +362,9 @@ __builtin_unreachable(); } -void state::init(bool IsSPMD) { - SharedMemorySmartStack.init(IsSPMD); +void state::init(bool IsSPMD, uint32_t StackSize) { + if (StackSize > 0) + SharedMemorySmartStack.init(IsSPMD, StackSize); if (!mapping::getThreadIdInBlock()) TeamState.init(IsSPMD); diff --git a/openmp/libomptarget/deviceRTLs/common/include/target.h b/openmp/libomptarget/deviceRTLs/common/include/target.h --- a/openmp/libomptarget/deviceRTLs/common/include/target.h +++ b/openmp/libomptarget/deviceRTLs/common/include/target.h @@ -74,7 +74,7 @@ /// int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, bool 
UseGenericStateMachine, - bool RequiresFullRuntime); + bool RequiresFullRuntime, uint32_t); /// De-Initialization /// diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -204,7 +204,7 @@ EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, bool UseGenericStateMachine, - bool RequiresFullRuntime) { + bool RequiresFullRuntime, uint32_t) { int TId = __kmpc_get_hardware_thread_id_in_block(); if (IsSPMD) __kmpc_spmd_kernel_init(RequiresFullRuntime); diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -418,7 +418,7 @@ // non standard EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, bool UseGenericStateMachine, - bool RequiresFullRuntime); + bool RequiresFullRuntime, uint32_t); EXTERN void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, bool RequiresFullRuntime); EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -92,8 +92,11 @@ /// Maximal number of threads per block for this kernel. int MaxThreadsPerBlock = 0; - KernelTy(CUfunction _Func, int8_t _ExecutionMode) - : Func(_Func), ExecutionMode(_ExecutionMode) {} + /// Amount of dynamic shared memory to allocate for the data sharing stack. 
+ uint32_t StackSize = 0; + + KernelTy(CUfunction _Func, int8_t _ExecutionMode, uint32_t _StackSize) + : Func(_Func), ExecutionMode(_ExecutionMode), StackSize(_StackSize) {} }; /// Device environment data @@ -902,7 +905,33 @@ ExecModeName); } - KernelsList.emplace_back(Func, ExecModeVal); + // Find the amount of dynamic shared memory to allocate for the data + // sharing stack in the new device RTL. + uint32_t StackSizeVal = 0; + std::string StackSizeNameStr(E->name); + StackSizeNameStr += "_stack_size"; + const char *StackSizeName = StackSizeNameStr.c_str(); + + CUdeviceptr StackSizePtr; + Err = cuModuleGetGlobal(&StackSizePtr, &CUSize, Module, StackSizeName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(uint32_t)) { + DP("Loading global stack_size '%s' - size mismatch (%zd != %zd)\n", + StackSizeName, CUSize, sizeof(uint32_t)); + return nullptr; + } + + Err = cuMemcpyDtoH(&StackSizeVal, StackSizePtr, CUSize); + if (Err != CUDA_SUCCESS) { + REPORT("Error when copying data from device to host. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", + DPxPTR(&StackSizeVal), DPxPTR(StackSizePtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } + } + + KernelsList.emplace_back(Func, ExecModeVal, StackSizeVal); __tgt_offload_entry Entry = *E; Entry.addr = &KernelsList.back(); @@ -1197,10 +1226,24 @@ : "SPMD")); CUstream Stream = getStream(DeviceId, AsyncInfo); + + // Round up memory sizes to allow for proper pointer alignment. + auto RoundUp = [](uint32_t V, uint32_t Align) { + return ((V + uint32_t(Align) - 1) / uint32_t(Align)) * uint32_t(Align); + }; + + // Amount of shared memory to allocate, must contain enough for the data + // sharing stack. 
+ uint32_t DynamicSharedMemory = RoundUp(DynamicMemorySize, 8); + if (KernelInfo->StackSize > 0) + DynamicSharedMemory += + RoundUp(KernelInfo->StackSize, 8) + + RoundUp(CudaThreadsPerBlock, DeviceData[DeviceId].WarpSize); + Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, /* gridDimZ */ 1, CudaThreadsPerBlock, /* blockDimY */ 1, /* blockDimZ */ 1, - DynamicMemorySize, Stream, &Args[0], nullptr); + DynamicSharedMemory, Stream, &Args[0], nullptr); if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) return OFFLOAD_FAIL;