diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -31,6 +31,9 @@
 /// Return the user chosen debug level.
 uint32_t getDebugLevel();
 
+/// Return the amount of dynamic shared memory that was allocated at launch.
+uint64_t getDynamicMemSize();
+
 bool isDebugMode(DebugLevel Level);
 
 } // namespace config
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -174,6 +174,10 @@
 /// allocated by __kmpc_alloc_shared by the same thread.
 void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
 
+/// Get a pointer to the memory buffer containing dynamically allocated shared
+/// memory configured at launch.
+void *__kmpc_get_dynamic_shared();
+
 /// Allocate sufficient space for \p NumArgs sequential `void*` and store the
 /// allocation address in \p GlobalArgs.
 ///
diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -188,6 +188,9 @@
 /// Alloca \p Size bytes in global memory, if possible, for \p Reason.
 void *allocGlobal(uint64_t Size, const char *Reason);
 
+/// Return a pointer to the dynamic shared memory buffer.
+void *getDynamicBuffer();
+
 /// Free \p Ptr, allocated via allocGlobal, for \p Reason.
 void freeGlobal(void *Ptr, const char *Reason);
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -21,6 +21,7 @@
   uint32_t DebugLevel;
   uint32_t NumDevices;
   uint32_t DeviceNum;
+  uint64_t DynamicMemSize;
 };
 
 #pragma omp declare target
@@ -43,6 +44,10 @@
   return omptarget_device_environment.DeviceNum;
 }
 
+uint64_t config::getDynamicMemSize() {
+  return omptarget_device_environment.DynamicMemSize;
+}
+
 bool config::isDebugMode(config::DebugLevel Level) {
   return config::getDebugLevel() > Level;
 }
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -25,6 +25,13 @@
 ///
 ///{
 
+/// Add worst-case padding so that future allocations are properly aligned.
+constexpr const uint32_t Alignment = 8;
+
+/// External symbol to access dynamic shared memory.
+extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
+#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+
 namespace {
 
 /// Fallback implementations are missing to trigger a link time error.
@@ -57,9 +64,6 @@
 #pragma omp end declare variant
 ///}
 
-/// Add worst-case padding so that future allocations are properly aligned.
-constexpr const uint32_t Alignment = 8;
-
 /// A "smart" stack in shared memory.
 ///
 /// The stack exposes a malloc/free interface but works like a stack internally.
@@ -147,6 +151,8 @@
 
 } // namespace
 
+void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
+
 void *memory::allocShared(uint64_t Bytes, const char *Reason) {
   return SharedMemorySmartStack.push(Bytes);
 }
@@ -497,6 +503,10 @@
   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 }
 
+__attribute__((noinline)) void *__kmpc_get_dynamic_shared() {
+  return memory::getDynamicBuffer();
+}
+
 /// Allocate storage in shared memory to communicate arguments from the main
 /// thread to the workers in generic mode. If we exceed
 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -103,6 +103,7 @@
   int32_t debug_level;
   uint32_t num_devices;
   uint32_t device_num;
+  uint64_t dynamic_shared_size;
 };
 
 namespace {
@@ -346,6 +347,8 @@
   int EnvTeamThreadLimit;
   // OpenMP requires flags
   int64_t RequiresFlags;
+  // Amount of dynamic shared memory to use at launch.
+  uint64_t DynamicMemorySize;
 
   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
   static constexpr const int HardThreadLimit = 1024;
@@ -499,7 +502,8 @@
 
   DeviceRTLTy()
       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
+        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
+        DynamicMemorySize(0) {
 
     DP("Start initializing CUDA\n");
 
@@ -540,6 +544,11 @@
       EnvNumTeams = std::stoi(EnvStr);
       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
     }
+    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY")) {
+      // LIBOMPTARGET_SHARED_MEMORY has been set
+      DynamicMemorySize = std::stoi(EnvStr);
+      DP("Parsed LIBOMPTARGET_SHARED_MEMORY=%" PRIu64 "\n", DynamicMemorySize);
+    }
 
     StreamManager =
         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
@@ -903,7 +912,7 @@
     {
       omptarget_device_environmentTy DeviceEnv{
           0, static_cast<uint32_t>(NumberOfDevices),
-          static_cast<uint32_t>(DeviceId)};
+          static_cast<uint32_t>(DeviceId), DynamicMemorySize};
 
 #ifdef OMPTARGET_DEBUG
       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
@@ -1189,7 +1198,7 @@
     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                          /* gridDimZ */ 1, CudaThreadsPerBlock,
                          /* blockDimY */ 1, /* blockDimZ */ 1,
-                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+                         DynamicMemorySize, Stream, &Args[0], nullptr);
 
     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
       return OFFLOAD_FAIL;
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
@@ -0,0 +1,31 @@
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
+// RUN: env LIBOMPTARGET_SHARED_MEMORY=4 \
+// RUN:   %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+void *get_dynamic_shared() { return NULL; }
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+extern void *__kmpc_get_dynamic_shared();
+void *get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
+#pragma omp end declare variant
+
+int main() {
+  int x;
+#pragma omp target parallel map(from : x)
+  {
+    int *buf = get_dynamic_shared();
+#pragma omp barrier
+    if (omp_get_thread_num() == 0)
+      *buf = 1;
+#pragma omp barrier
+    if (omp_get_thread_num() == 1)
+      x = *buf;
+  }
+
+  // CHECK: PASS
+  if (x == 1)
printf("PASS\n"); +}