Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -884,7 +884,7 @@ /// \returns Maximum number of waves per execution unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(); + return AMDGPU::IsaInfo::getMaxWavesPerEU(this); } /// \returns Number of waves per work group supported by the subtarget and Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -591,25 +591,13 @@ } unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { - if (VGPRs <= 24) - return 10; - if (VGPRs <= 28) - return 9; - if (VGPRs <= 32) - return 8; - if (VGPRs <= 36) - return 7; - if (VGPRs <= 40) - return 6; - if (VGPRs <= 48) - return 5; - if (VGPRs <= 64) - return 4; - if (VGPRs <= 84) - return 3; - if (VGPRs <= 128) - return 2; - return 1; + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Granule = getVGPRAllocGranule(); + if (VGPRs < Granule) + return MaxWaves; + unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule; + // Keep the old floor of one wave: the division is 0 if RoundedRegs > total. + return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves); } unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -94,7 +94,7 @@ /// \returns Maximum number of waves per execution unit for given subtarget \p /// STI without any kind of limitation. 
-unsigned getMaxWavesPerEU(); +unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI); /// \returns Maximum number of waves per execution unit for given subtarget \p /// STI and limited by given \p FlatWorkGroupSize. Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -241,7 +241,7 @@ } unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) { - return getMaxWavesPerEU() * getEUsPerCU(STI); + return getMaxWavesPerEU(STI) * getEUsPerCU(STI); } unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI, @@ -253,9 +253,11 @@ return 1; } -unsigned getMaxWavesPerEU() { +unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) { // FIXME: Need to take scratch memory into account. - return 10; + if (!isGFX10(*STI)) + return 10; + return 20; } unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI, @@ -317,7 +319,7 @@ if (Version.Major >= 10) return 0; - if (WavesPerEU >= getMaxWavesPerEU()) + if (WavesPerEU >= getMaxWavesPerEU(STI)) return 0; unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1); @@ -394,17 +396,19 @@ } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { - return 256; + if (!isGFX10(*STI)) + return 256; + return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 
1024 : 512; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { - return getTotalNumVGPRs(STI); + return 256; } unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU()) + if (WavesPerEU >= getMaxWavesPerEU(STI)) return 0; unsigned MinNumVGPRs = alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1), Index: test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -74,7 +74,10 @@ ; CHECK: .name: num_spilled_vgprs ; CHECK: .symbol: num_spilled_vgprs.kd -; CHECK: .vgpr_spill_count: 14 +; GFX700: .vgpr_spill_count: 14 +; GFX803: .vgpr_spill_count: 14 +; GFX900: .vgpr_spill_count: 14 +; GFX1010: .vgpr_spill_count: 0 define amdgpu_kernel void @num_spilled_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var Index: test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- test/CodeGen/AMDGPU/idot8s.ll +++ test/CodeGen/AMDGPU/idot8s.ll @@ -2356,9 +2356,9 @@ ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s6 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v22, v11 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v20, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s8 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v23, v23, v10 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v21, 12, s1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v27, 12, s1 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s5 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s9 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s7 @@ -2368,8 +2368,8 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v12, v16, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v17, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v15, v19, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v21, v2 -; GFX10-DL-NEXT: 
v_and_b32_e32 v14, v20, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v14, v31, v2 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v23, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD Index: test/CodeGen/AMDGPU/nsa-reassign.ll =================================================================== --- test/CodeGen/AMDGPU/nsa-reassign.ll +++ test/CodeGen/AMDGPU/nsa-reassign.ll @@ -21,8 +21,8 @@ } ; GCN-LABEL: {{^}}sample_contig_nsa_10vgprs: -; GCN-DAG: image_sample_c_l v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}], -; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], +; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9:]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9:]+}}], define amdgpu_ps <2 x float> @sample_contig_nsa_10vgprs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) #0 { main_body: %zcompare.1 = fadd float %zcompare, 1.0 Index: test/CodeGen/AMDGPU/regbank-reassign.mir =================================================================== --- test/CodeGen/AMDGPU/regbank-reassign.mir +++ test/CodeGen/AMDGPU/regbank-reassign.mir @@ -217,6 +217,12 @@ - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } + - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } + - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } + - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } + - { id: 13, class: vreg_128, 
preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } + - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } + - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } body: | bb.0: %0 = IMPLICIT_DEF @@ -228,6 +234,12 @@ %7 = IMPLICIT_DEF %8 = IMPLICIT_DEF %9 = IMPLICIT_DEF + %10 = IMPLICIT_DEF + %11 = IMPLICIT_DEF + %12 = IMPLICIT_DEF + %13 = IMPLICIT_DEF + %14 = IMPLICIT_DEF + %15 = IMPLICIT_DEF %2 = V_AND_B32_e32 %1, %0, implicit $exec GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec Index: test/CodeGen/AMDGPU/wave32.ll =================================================================== --- test/CodeGen/AMDGPU/wave32.ll +++ test/CodeGen/AMDGPU/wave32.ll @@ -920,7 +920,7 @@ ; GCN-LABEL: {{^}}test_vgprblocks_w64_attr: ; Test that the wave size can be overridden in function attributes and that the block size is correct as a result -; GFX10DEFWAVE: ; VGPRBlocks: 2 +; GFX10DEFWAVE: ; VGPRBlocks: 11 define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 { main_body: