diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -86,7 +86,13 @@ GV_Max_Warp_Number, /// The slot size that should be reserved for a working warp. /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) - GV_Warp_Size_Log2_MaskL + GV_Warp_Size_Log2_MaskL, + /// Total number of vector registers per CU or SM + GV_Vector_Register_Count, + /// Total number of scalar registers per CU or SM + GV_Scalar_Register_Count, + /// Total shared memory size per CU or SM in bytes + GV_Shared_Memory_Size }; /// For AMDGPU GPUs @@ -104,7 +110,10 @@ 1024, // GV_Max_WG_Size, 256, // GV_Defaut_WG_Size 1024 / 64, // GV_Max_WG_Size / GV_WarpSize - 63 // GV_Warp_Size_Log2_MaskL + 63, // GV_Warp_Size_Log2_MaskL + 64 * 1024, // GV_Vector_Register_Count + 4 * 800, // GV_Scalar_Register_Count + 64 * 1024 // GV_Shared_Memory_Size }; /// For Nvidia GPUs @@ -122,7 +131,10 @@ 1024, // GV_Max_WG_Size 128, // GV_Defaut_WG_Size 1024 / 32, // GV_Max_WG_Size / GV_WarpSize - 31 // GV_Warp_Size_Log2_MaskL + 31, // GV_Warp_Size_Log2_MaskL + 64 * 1024, // GV_Vector_Register_Count + 0, // GV_Scalar_Register_Count (not applicable) + 32 * 1024 // GV_Shared_Memory_Size (configurable) }; } // namespace omp diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -71,6 +71,15 @@ } } +// Number of SIMDs in a CU +static const uint32_t NumSIMDsPerCU = 4; +// Heuristic parameters used for kernel launch parameters +// Default number of waves per team is chosen equal to the number of SIMDs +static const uint32_t DefaultNumWavesPerTeam = NumSIMDsPerCU; +// Default number of teams per CU is chosen for scheduling flexibility within a +// SIMD +static const uint32_t DefaultNumTeamsPerCU = 4; + 
int print_kernel_trace; #ifdef OMPTARGET_DEBUG @@ -435,14 +444,22 @@ static const unsigned HardTeamLimit = (1 << 16) - 1; // 64K needed to fit in uint16 static const int DefaultNumTeams = 128; - static const int Max_Teams = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; static const int Warp_Size = llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; + static const int Max_Teams = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; static const int Max_WG_Size = llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size]; static const int Default_WG_Size = llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; + static const int Max_Warp_Number = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Warp_Number]; + static const int Vector_Register_Count = llvm::omp::AMDGPUGpuGridValues + [llvm::omp::GVIDX::GV_Vector_Register_Count]; + static const int Scalar_Register_Count = llvm::omp::AMDGPUGpuGridValues + [llvm::omp::GVIDX::GV_Scalar_Register_Count]; + static const int LDS_Size = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Shared_Memory_Size]; using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *, size_t size, hsa_agent_t); @@ -1718,6 +1735,83 @@ return OFFLOAD_SUCCESS; } +/* + Determine total number of teams in a kernel and the number of + threads in each team in order to maximize occupancy. + + Here is the overall idea: We aim to schedule a certain number of + wavefronts, expressed as the product of the number of teams and the + number of wavefronts per team. Each CU has 4 SIMDs. To account for + each SIMD, we start with at least 4 wavefronts in a team. If the + total number of teams per CU is constrained by LDS usage, we try to + increase the number of wavefronts per team. If the total number of + wavefronts per SIMD is constrained by VGPR or SGPR usage, we reduce + the number of teams while keeping the number of wavefronts unchanged. 
+*/
+void adjustTeamsAndThreadsBasedOnResources(int *NumTeams, int *NumThreads,
+                                           uint32_t lds_usage,
+                                           uint32_t sgpr_count,
+                                           uint32_t vgpr_count,
+                                           int32_t device_id) {
+  // Start from the default number of waves per team (DefaultNumWavesPerTeam)
+  uint32_t num_waves_per_team = DefaultNumWavesPerTeam;
+
+  // lds_usage is reported per workgroup (i.e. team): count the teams that fit
+  // in LDS, capped at the default. A zero usage divides by 1 instead of by 0.
+  uint32_t num_teams_per_cu =
+      std::min(RTLDeviceInfoTy::LDS_Size / (lds_usage ? lds_usage : 1),
+               DefaultNumTeamsPerCU);
+
+  // Compute the maximum number of waves per SIMD based on VGPR and SGPR usage
+  uint32_t vgprs_avail_per_simd =
+      RTLDeviceInfoTy::Vector_Register_Count / NumSIMDsPerCU;
+  // vgpr_count is per workitem (i.e. thread), so scale it to a whole wave
+  uint32_t vgpr_usage_per_wave = vgpr_count * RTLDeviceInfoTy::Warp_Size;
+  uint32_t vgpr_constrained_max_waves_per_simd =
+      vgprs_avail_per_simd / (vgpr_usage_per_wave ? vgpr_usage_per_wave : 1);
+
+  uint32_t sgprs_avail_per_simd =
+      RTLDeviceInfoTy::Scalar_Register_Count / NumSIMDsPerCU;
+  // sgpr_count is per wavefront, so no scaling by the warp size is needed
+  uint32_t sgpr_constrained_max_waves_per_simd =
+      sgprs_avail_per_simd / (sgpr_count ? sgpr_count : 1);
+
+  uint32_t max_waves_per_simd =
+      std::min(std::min(vgpr_constrained_max_waves_per_simd,
+                        sgpr_constrained_max_waves_per_simd),
+               DefaultNumTeamsPerCU);
+
+  uint32_t default_occupancy_factor =
+      DefaultNumTeamsPerCU * DefaultNumWavesPerTeam;
+  // Occupancy factor (teams per CU times waves per team) allowed by LDS usage
+  uint32_t lds_constrained_occupancy_factor =
+      num_teams_per_cu * DefaultNumWavesPerTeam;
+  // Occupancy factor allowed by the tighter of the VGPR and SGPR constraints
+  uint32_t gpr_constrained_occupancy_factor =
+      max_waves_per_simd * DefaultNumWavesPerTeam;
+
+  // First, we examine whether LDS is limiting the number of teams,
+  // regardless of any limits imposed by GPR usage
+  if (lds_constrained_occupancy_factor < default_occupancy_factor) {
+    // LDS caps num_teams_per_cu, so raising it cannot help. Instead,
+    // try to increase the number of waves per team, subject to the GPR
+    // constraint and to the Max_Warp_Number limit
+    num_waves_per_team = std::min(gpr_constrained_occupancy_factor /
+                                      (num_teams_per_cu ? num_teams_per_cu : 1),
+                                  (uint32_t)RTLDeviceInfoTy::Max_Warp_Number);
+  } else if (gpr_constrained_occupancy_factor < default_occupancy_factor) {
+    // Not much can be done: this kernel will run with reduced
+    // occupancy. Lower the number of teams per CU to reflect that.
+    num_teams_per_cu = std::min(num_teams_per_cu, max_waves_per_simd);
+  }
+
+  // Total number of teams in the kernel; 0 when num_teams_per_cu is 0
+  *NumTeams = num_teams_per_cu * DeviceInfo.ComputeUnits[device_id];
+
+  // Total number of threads in each team: waves per team times warp size
+  *NumThreads = num_waves_per_team * RTLDeviceInfoTy::Warp_Size;
+}
+
 // Determine launch values for threadsPerGroup and num_groups.
// Outputs: treadsPerGroup, num_groups // Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode, @@ -1726,8 +1820,21 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, int ExecutionMode, int EnvTeamLimit, int EnvNumTeams, int num_teams, int thread_limit, uint64_t loop_tripcount, + uint32_t lds_usage, uint32_t sgpr_count, uint32_t vgpr_count, int32_t device_id) { + // If the user did not specify number of teams or threads, adjust + // them based on resources + int NumTeamsBasedOnResources = 0; + int NumThreadsBasedOnResources = 0; + if (EnvTeamLimit <= 0 && EnvNumTeams <= 0 && + DeviceInfo.EnvMaxTeamsDefault <= 0 && num_teams <= 0 && + thread_limit <= 0) { + adjustTeamsAndThreadsBasedOnResources( + &NumTeamsBasedOnResources, &NumThreadsBasedOnResources, lds_usage, + sgpr_count, vgpr_count, device_id); + } + int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0 ? DeviceInfo.EnvMaxTeamsDefault : DeviceInfo.NumTeams[device_id]; @@ -1738,6 +1845,7 @@ fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams); fprintf(stderr, "Max_Teams: %d\n", Max_Teams); + fprintf(stderr, "NumTeamsBasedOnResources: %d\n", NumTeamsBasedOnResources); fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n", RTLDeviceInfoTy::Warp_Size); fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n", @@ -1746,8 +1854,20 @@ RTLDeviceInfoTy::Default_WG_Size); fprintf(stderr, "thread_limit: %d\n", thread_limit); fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + fprintf(stderr, "NumThreadsBasedOnResources: %d\n", + NumThreadsBasedOnResources); fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize); } + + if (NumTeamsBasedOnResources > 0 && + NumTeamsBasedOnResources <= DeviceInfo.HardTeamLimit && + NumThreadsBasedOnResources > 0) { + Max_Teams = NumTeamsBasedOnResources; + threadsPerGroup = NumThreadsBasedOnResources; + DP("Modifying Max_Teams based on resources: %d\n", Max_Teams); + DP("Modifying threadsPerGroup based on resources: %d\n", threadsPerGroup); 
+ } + // check for thread_limit() clause if (thread_limit > 0) { threadsPerGroup = thread_limit; @@ -1947,6 +2067,7 @@ num_teams, // From run_region arg thread_limit, // From run_region arg loop_tripcount, // From run_region arg + group_segment_size, sgpr_count, vgpr_count, KernelInfo->device_id); if (print_kernel_trace >= LAUNCH) {