diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -648,7 +648,7 @@ EventPool[DeviceId] = std::make_unique( EventAllocatorTy(DeviceData[DeviceId].Context), NumInitialEvents); - // Query attributes to determine number of threads/block and blocks/grid. + // Set the limit for the number of teams. int MaxGridDimX; Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device); @@ -660,8 +660,14 @@ DP("Using %d CUDA blocks per grid\n", MaxGridDimX); DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; } + // Adjust the number to the env OMP_TEAM_LIMIT. + if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { + DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", + EnvTeamLimit); + DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; + } - // We are only exploiting threads along the x axis. + // Set the limit for the number of threads. int MaxBlockDimX; Err = cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); @@ -672,20 +678,41 @@ } else { DP("Using %d CUDA threads per block\n", MaxBlockDimX); DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; + } + // Adjust the number to the env OMP_TEAMS_THREAD_LIMIT. + if (EnvTeamThreadLimit > 0 && + DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { + DP("Max CUDA threads per block %d exceeds the thread limit %d set by " + "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", + DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); + DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; + } - if (EnvTeamThreadLimit > 0 && - DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { - DP("Max CUDA threads per block %d exceeds the thread limit %d set by " - "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", - DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; - } - if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { - DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " - "capping at the hard limit\n", - DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; - } + // Set default number of teams used when user doesn't set in construct. + if (EnvNumTeams > 0) { + DP("Default number of teams set according to environment %d\n", + EnvNumTeams); + DeviceData[DeviceId].NumTeams = EnvNumTeams; + } else { + DP("Default number of teams set according to library's default %d\n", + DeviceRTLTy::DefaultNumTeams); + DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; + } + if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { + DP("Default number of teams exceeds device limit, capping at %d\n", + DeviceData[DeviceId].BlocksPerGrid); + DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; + } + + // Set default number of threads used when user doesn't set in construct. + DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; + DP("Default number of threads set according to library's default %d\n", + DeviceRTLTy::DefaultNumThreads); + if (DeviceData[DeviceId].NumThreads > + DeviceData[DeviceId].ThreadsPerBlock) { + DP("Default number of threads exceeds device limit, capping at %d\n", + DeviceData[DeviceId].ThreadsPerBlock); + DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; } // Get and set warp size @@ -700,13 +727,6 @@ DeviceData[DeviceId].WarpSize = WarpSize; } - // Adjust teams to the env variables - if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { - DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", - EnvTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; - } - size_t StackLimit; size_t HeapLimit; if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { @@ -736,34 +756,6 @@ "thread\n", (int)HeapLimit, (int)StackLimit); - // Set default number of teams - if (EnvNumTeams > 0) { - DP("Default number of teams set according to environment %d\n", - EnvNumTeams); - DeviceData[DeviceId].NumTeams = EnvNumTeams; - } else { - DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; - DP("Default number of teams set according to library's default %d\n", - DeviceRTLTy::DefaultNumTeams); - } - - if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceData[DeviceId].BlocksPerGrid); - DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; - } - - // Set default number of threads - DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; - DP("Default number of threads set according to library's default %d\n", - DeviceRTLTy::DefaultNumThreads); - if (DeviceData[DeviceId].NumThreads > - DeviceData[DeviceId].ThreadsPerBlock) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; - } - return OFFLOAD_SUCCESS; } @@ -1137,13 +1129,11 @@ DeviceData[DeviceId].NumThreads); CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; } - if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { DP("Threads per block capped at device limit %d\n", DeviceData[DeviceId].ThreadsPerBlock); CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; } - if (!KernelInfo->MaxThreadsPerBlock) { Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, @@ -1151,7 +1141,6 @@ if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) return OFFLOAD_FAIL; } - if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { DP("Threads per block capped at kernel limit %d\n", KernelInfo->MaxThreadsPerBlock); @@ -1205,7 +1194,6 @@ DP("Using requested number of teams %d\n", TeamNum); CudaBlocksPerGrid = TeamNum; } - if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) { DP("Capping number of teams to team limit %d\n", DeviceData[DeviceId].BlocksPerGrid);