diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -30,6 +30,8 @@ * ``LIBOMPTARGET_PROFILE=`` * ``LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=`` * ``LIBOMPTARGET_INFO=`` + * ``LIBOMPTARGET_HEAP_SIZE=`` + * ``LIBOMPTARGET_STACK_SIZE=`` LIBOMPTARGET_DEBUG """""""""""""""""" @@ -321,6 +323,21 @@ return sum; } +LIBOMPTARGET_STACK_SIZE +""""""""""""""""""""""" + +This environment variable sets the stack size in bytes for the CUDA plugin. This +can be used to increase or decrease the default amount of memory reserved for +each thread's stack. + +LIBOMPTARGET_HEAP_SIZE +"""""""""""""""""""""" + +This environment variable sets the size in bytes of the device heap used by +``malloc`` and ``free`` for the CUDA plugin. This is necessary for applications +whose device-side allocations, made either explicitly by the user or implicitly +through globalization, exceed the default heap size. + .. toctree:: :hidden: :maxdepth: 1 diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -642,11 +642,34 @@ DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; } + size_t StackLimit; + size_t HeapLimit; + if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { + StackLimit = std::stol(EnvStr); + if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) + return OFFLOAD_FAIL; + } else { + if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) + return OFFLOAD_FAIL; + } + if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { + HeapLimit = std::stol(EnvStr); + if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) + return OFFLOAD_FAIL; + } else { + if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) + return OFFLOAD_FAIL; + } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Device supports up to %d CUDA blocks 
and %d threads with a " "warp size of %d\n", DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Device heap size is %d Bytes, device stack size is %d Bytes per " + "thread\n", + (int)HeapLimit, (int)StackLimit); // Set default number of teams if (EnvNumTeams > 0) {