diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -399,6 +399,7 @@
   // OpenMP Environment properties
   int EnvNumTeams;
   int EnvTeamLimit;
+  int EnvTeamThreadLimit;
   int EnvMaxTeamsDefault;
 
   // OpenMP Requires Flags
@@ -631,6 +632,13 @@
     } else {
       EnvMaxTeamsDefault = -1;
     }
+    envStr = getenv("OMP_TEAMS_THREAD_LIMIT");
+    if (envStr) {
+      EnvTeamThreadLimit = std::stoi(envStr);
+      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
+    } else {
+      EnvTeamThreadLimit = -1;
+    }
 
     // Default state.
     RequiresFlags = OMP_REQ_UNDEFINED;
@@ -940,2 +948,10 @@
   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
+  // Cap the default with OMP_TEAMS_THREAD_LIMIT. This has to follow the
+  // assignment above; a cap applied earlier would simply be overwritten.
+  if (DeviceInfo.EnvTeamThreadLimit > 0 &&
+      (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
+                           DeviceInfo.EnvTeamThreadLimit))) {
+    DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
+       DeviceInfo.EnvTeamThreadLimit);
+  }
   DP("Default number of threads set according to library's default %d\n",
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -281,6 +281,7 @@
   // OpenMP environment properties
   int EnvNumTeams;
   int EnvTeamLimit;
+  int EnvTeamThreadLimit;
   // OpenMP requires flags
   int64_t RequiresFlags;
 
@@ -436,7 +437,7 @@
 
   DeviceRTLTy()
       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        RequiresFlags(OMP_REQ_UNDEFINED) {
+        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
 
     DP("Start initializing CUDA\n");
 
@@ -467,6 +468,11 @@
       EnvTeamLimit = std::stoi(EnvStr);
       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
     }
+    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
+      // OMP_TEAMS_THREAD_LIMIT has been set
+      EnvTeamThreadLimit = std::stoi(EnvStr);
+      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
+    }
     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
       // OMP_NUM_TEAMS has been set
       EnvNumTeams = std::stoi(EnvStr);
@@ -596,6 +602,15 @@
       DP("Error getting max block dimension, use default value %d\n",
          DeviceRTLTy::DefaultNumThreads);
       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
+    } else if (EnvTeamThreadLimit > 0 &&
+               EnvTeamThreadLimit < DeviceRTLTy::HardThreadLimit &&
+               MaxBlockDimX > EnvTeamThreadLimit) {
+      // A positive OMP_TEAMS_THREAD_LIMIT below the hard limit takes
+      // precedence; the unset value (-1) and non-positive values are ignored.
+      DP("Max CUDA threads per block %d exceeds the hard thread limit %d "
+         "set by OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
+         MaxBlockDimX, EnvTeamThreadLimit);
+      DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
     } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;