diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -695,6 +695,12 @@ "Does not need SW waitstates" >; +def FeatureGFX11FullVGPRs : SubtargetFeature<"gfx11-full-vgprs", + "HasGFX11FullVGPRs", + "true", + "GFX11 with 50% more physical VGPRs and 50% larger allocation granule than GFX10" +>; + class SubtargetFeatureNSAMaxSize : SubtargetFeature < "nsa-max-size-"#Value, "NSAMaxSize", @@ -1297,11 +1303,12 @@ def FeatureISAVersion11_0_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, - [FeatureUserSGPRInit16Bug])>; + [FeatureGFX11FullVGPRs, + FeatureUserSGPRInit16Bug])>; def FeatureISAVersion11_0_1 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, - [])>; + [FeatureGFX11FullVGPRs])>; def FeatureISAVersion11_0_2 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -192,6 +192,7 @@ bool HasFlatSegmentOffsetBug = false; bool HasImageStoreD16Bug = false; bool HasImageGather4D16Bug = false; + bool HasGFX11FullVGPRs = false; bool HasVOPDInsts = false; // Dummy feature to use for assembler in tablegen. @@ -1071,6 +1072,8 @@ /// target. bool hasNullExportTarget() const { return !GFX11Insts; } + bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; } + bool hasVOPDInsts() const { return HasVOPDInsts; } bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -844,6 +844,9 @@ *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); + if (STI->getFeatureBits().test(FeatureGFX11FullVGPRs)) + return IsWave32 ? 24 : 12; + if (hasGFX10_3Insts(*STI)) return IsWave32 ? 16 : 8; @@ -867,7 +870,10 @@ return 512; if (!isGFX10Plus(*STI)) return 256; - return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512; + bool IsWave32 = STI->getFeatureBits().test(FeatureWavefrontSize32); + if (STI->getFeatureBits().test(FeatureGFX11FullVGPRs)) + return IsWave32 ? 1536 : 768; + return IsWave32 ? 1024 : 512; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -397,54 +397,57 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_dual_cndmask_b32 v17, v13, v15 :: v_dual_cndmask_b32 v16, v12, v14 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v12, v14, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v13, v15, s0 ; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v17, v5 :: v_dual_cndmask_b32 v0, v16, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v18, v4, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v19, v5, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v3 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -104,7 +104,10 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: s_clause 0xa +; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128 +; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144 +; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160 ; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1] ; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:16 ; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:32 @@ -113,42 +116,45 @@ ; GFX11-NEXT: global_load_b128 v[52:55], v64, s[0:1] offset:80 ; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:112 -; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GFX11-NEXT: s_clause 0x6 -; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:128 -; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:160 -; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:176 -; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:192 -; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:208 -; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:224 -; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:240 -; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176 +; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:192 +; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:208 +; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:224 +; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:240 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:128 +; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:128 ; GFX11-NEXT: global_store_b128 v64, v[4:7], s[2:3] offset:144 -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:160 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:176 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:192 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160 +; GFX11-NEXT: s_waitcnt vmcnt(12) ; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(11) ; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(10) ; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: global_store_b128 v64, v[44:47], s[2:3] offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: global_store_b128 v64, v[48:51], s[2:3] offset:64 +; GFX11-NEXT: s_waitcnt vmcnt(7) ; GFX11-NEXT: global_store_b128 v64, v[52:55], s[2:3] offset:80 +; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96 +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:192 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:208 +; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:224 +; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:240 +; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:240 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -942,6 +942,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_mov_b32 s14, 0 ; GFX11-NEXT: s_mov_b32 s15, 0x40200000 +; GFX11-NEXT: s_mov_b64 s[0:1], 1.0 ; GFX11-NEXT: s_mov_b32 s13, 0x401c0000 ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s11, 0x40180000 @@ -952,38 +953,34 @@ ; GFX11-NEXT: s_mov_b32 s5, 0x40080000 ; GFX11-NEXT: s_mov_b32 s4, s14 ; GFX11-NEXT: s_mov_b64 s[2:3], 2.0 -; GFX11-NEXT: s_mov_b64 s[0:1], 1.0 ; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -1172,29 +1169,29 @@ ; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6 ; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 7, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo ; GFX11-NEXT: global_store_b128 v[0:1], v[1:4], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[5:8], off dlc @@ -1584,36 +1581,33 @@ ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -5181,33 +5175,35 @@ ; GFX11-LABEL: dyn_insertelement_v7f64_v_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v16 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s1 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v14 :: v_dual_cndmask_b32 v3, v3, v15 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v14 :: v_dual_cndmask_b32 v5, v5, v15 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v11 ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_cndmask_b32 v7, v7, v15 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: v_readfirstlane_b32 s6, v6 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7 ; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v14 :: v_dual_cndmask_b32 v9, v9, v15 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 ; GFX11-NEXT: v_readfirstlane_b32 s8, v8 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_cndmask_b32 v11, v11, v15 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 -; GFX11-NEXT: v_readfirstlane_b32 s10, v10 -; GFX11-NEXT: v_readfirstlane_b32 s11, v11 -; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v14 :: v_dual_cndmask_b32 v13, v13, v15 -; GFX11-NEXT: v_readfirstlane_b32 s12, v12 -; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <7 x double> %vec, double %val, i32 %idx @@ -5629,26 +5625,27 @@ ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_s: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s2, 4 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 -; GFX11-NEXT: v_readfirstlane_b32 s8, v8 -; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 -; GFX11-NEXT: v_readfirstlane_b32 s2, v2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11 ; GFX11-NEXT: v_readfirstlane_b32 s6, v6 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s9, v9 ; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <5 x double> %vec, double %val, i32 %idx @@ -5717,26 +5714,27 @@ ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v12 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 -; GFX11-NEXT: v_readfirstlane_b32 s8, v8 -; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11 ; GFX11-NEXT: v_readfirstlane_b32 s6, v6 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s9, v9 ; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <5 x double> %vec, double %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -557,8 +557,8 @@ ; GFX10CU-WAVE64: NumVgprs: 128 ; GFX11WGP-WAVE32: NumVgprs: 256 ; GFX11WGP-WAVE64: NumVgprs: 256 -; GFX11CU-WAVE32: NumVgprs: 128 -; GFX11CU-WAVE64: NumVgprs: 128 +; GFX11CU-WAVE32: NumVgprs: 192 +; GFX11CU-WAVE64: NumVgprs: 192 define amdgpu_kernel void @f512() #512 { call void @foo() call void @use256vgprs() @@ -574,10 +574,10 @@ ; GFX10WGP-WAVE64: NumVgprs: 128 ; GFX10CU-WAVE32: NumVgprs: 64 ; GFX10CU-WAVE64: NumVgprs: 64 -; GFX11WGP-WAVE32: NumVgprs: 128 -; GFX11WGP-WAVE64: NumVgprs: 128 -; GFX11CU-WAVE32: NumVgprs: 64 -; GFX11CU-WAVE64: NumVgprs: 64 +; GFX11WGP-WAVE32: NumVgprs: 192 +; GFX11WGP-WAVE64: NumVgprs: 192 +; GFX11CU-WAVE32: NumVgprs: 96 +; GFX11CU-WAVE64: NumVgprs: 96 define amdgpu_kernel void @f1024() #1024 { call void @foo() call void @use256vgprs() diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -3,11 +3,18 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1101 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1101 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1102 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s ; GCN-LABEL: {{^}}max_occupancy: ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @max_occupancy() { ret void } @@ -16,6 +23,8 @@ ; GFX9: ; Occupancy: 3 ; GFX10W64: ; Occupancy: 3 ; GFX10W32: ; Occupancy: 4 +; GFX1100W64: ; Occupancy: 3 +; GFX1100W32: ; Occupancy: 5 define amdgpu_kernel void @limited_occupancy_3() #0 { ret void } @@ -24,6 +33,7 @@ ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 18 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @limited_occupancy_18() #1 { ret void } @@ -32,6 +42,7 @@ ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 18 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @limited_occupancy_19() #2 { ret void } @@ -40,6 +51,7 @@ ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_24_vgprs() { call void asm sideeffect "", "~{v23}" () ret void @@ -50,6 +62,7 @@ ; GFX1010W64: ; Occupancy: 18 ; GFX1010W32: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_28_vgprs() { call void asm sideeffect "", "~{v27}" () ret void @@ -60,6 +73,7 @@ ; GFX10W64: ; Occupancy: 16 ; GFX1010W32: ; Occupancy: 20 ; GFX1030W32: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_32_vgprs() { call void asm sideeffect "", "~{v31}" () ret void @@ -71,6 +85,7 @@ ; GFX1010W32: ; Occupancy: 20 ; GFX1030W64: ; Occupancy: 12 ; GFX1030W32: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_36_vgprs() { call void asm sideeffect "", "~{v35}" () ret void @@ -81,6 +96,7 @@ ; GFX10W64: ; Occupancy: 12 ; GFX1010W32: ; Occupancy: 20 ; GFX1030W32: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_40_vgprs() { call void asm sideeffect "", "~{v39}" () ret void @@ -92,6 +108,7 @@ ; GFX1010W32: ; Occupancy: 20 ; GFX1030W64: ; Occupancy: 10 ; GFX1030W32: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_44_vgprs() { call void asm sideeffect "", "~{v43}" () ret void @@ -102,6 +119,7 @@ ; GFX10W64: ; Occupancy: 10 ; GFX1010W32: ; Occupancy: 20 ; GFX1030W32: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_48_vgprs() { call void asm sideeffect "", "~{v47}" () ret void @@ -112,6 +130,8 @@ ; GFX10W64: ; Occupancy: 9 ; GFX1010W32: ; Occupancy: 18 ; GFX1030W32: ; Occupancy: 16 +; GFX1100W64: ; Occupancy: 12 +; GFX1100W32: ; Occupancy: 16 define amdgpu_kernel void @used_56_vgprs() { call void asm sideeffect "", "~{v55}" () ret void @@ -121,6 +141,8 @@ ; GFX9: ; Occupancy: 4 ; GFX10W64: ; Occupancy: 8 ; GFX10W32: ; Occupancy: 16 +; GFX1100W64: ; Occupancy: 10 +; GFX1100W32: ; Occupancy: 16 define amdgpu_kernel void @used_64_vgprs() { call void asm sideeffect "", "~{v63}" () ret void @@ -131,6 +153,8 @@ ; GFX10W64: ; Occupancy: 7 ; GFX1010W32: ; Occupancy: 14 ; GFX1030W32: ; Occupancy: 12 +; GFX1100W64: ; Occupancy: 10 +; GFX1100W32: ; Occupancy: 16 define amdgpu_kernel void @used_72_vgprs() { call void asm sideeffect "", "~{v71}" () ret void @@ -140,6 +164,8 @@ ; GFX9: ; Occupancy: 3 ; GFX10W64: ; Occupancy: 6 ; GFX10W32: ; Occupancy: 12 +; GFX1100W64: ; Occupancy: 9 +; GFX1100W32: ; Occupancy: 16 define amdgpu_kernel void @used_80_vgprs() { call void asm sideeffect "", "~{v79}" () ret void @@ -151,6 +177,8 @@ ; GFX1010W32: ; Occupancy: 11 ; GFX1030W64: ; Occupancy: 5 ; GFX1030W32: ; Occupancy: 10 +; GFX1100W64: ; Occupancy: 9 +; GFX1100W32: ; Occupancy: 16 define amdgpu_kernel void @used_84_vgprs() { call void asm sideeffect "", "~{v83}" () ret void @@ -161,6 +189,8 @@ ; GFX10W64: ; Occupancy: 5 ; GFX1010W32: ; Occupancy: 11 ; GFX1030W32: ; Occupancy: 10 +; GFX1100W64: ; Occupancy: 8 +; GFX1100W32: ; Occupancy: 16 define amdgpu_kernel void @used_88_vgprs() { call void asm sideeffect "", "~{v87}" () ret void @@ -170,6 +200,8 @@ ; GFX9: ; Occupancy: 2 ; GFX10W64: ; Occupancy: 5 ; GFX10W32: ; Occupancy: 10 +; GFX1100W64: ; Occupancy: 8 +; GFX1100W32: ; Occupancy: 16 define amdgpu_kernel void @used_96_vgprs() { call void asm sideeffect "", "~{v95}" () ret void @@ -180,6 +212,8 @@ ; GFX1010W64: ; Occupancy: 5 ; GFX1030W64: ; Occupancy: 4 ; GFX10W32: ; Occupancy: 9 +; GFX1100W64: ; Occupancy: 7 +; GFX1100W32: ; Occupancy: 12 define amdgpu_kernel void @used_100_vgprs() { call void asm sideeffect "", "~{v99}" () ret void @@ -189,6 +223,8 @@ ; GFX9: ; Occupancy: 2 ; GFX10W64: ; Occupancy: 4 ; GFX10W32: ; Occupancy: 9 +; GFX1100W64: ; Occupancy: 6 +; GFX1100W32: ; Occupancy: 12 define amdgpu_kernel void @used_112_vgprs() { call void asm sideeffect "", "~{v111}" () ret void @@ -198,6 +234,8 @@ ; GFX9: ; Occupancy: 2 ; GFX10W64: ; Occupancy: 4 ; GFX10W32: ; Occupancy: 8 +; GFX1100W64: ; Occupancy: 5 +; GFX1100W32: ; Occupancy: 10 define amdgpu_kernel void @used_128_vgprs() { call void asm sideeffect "", "~{v127}" () ret void @@ -207,6 +245,8 @@ ; GFX9: ; Occupancy: 1 ; GFX10W64: ; Occupancy: 3 ; GFX10W32: ; Occupancy: 7 +; GFX1100W64: ; Occupancy: 5 +; GFX1100W32: ; Occupancy: 10 define amdgpu_kernel void @used_144_vgprs() { call void asm sideeffect "", "~{v143}" () ret void @@ -217,6 +257,8 @@ ; GFX10W64: ; Occupancy: 3 ; GFX1010W32: ; Occupancy: 6 ; GFX1030W32: ; Occupancy: 5 +; GFX1100W64: ; Occupancy: 4 +; GFX1100W32: ; Occupancy: 9 define amdgpu_kernel void @used_168_vgprs() { call void asm sideeffect "", "~{v167}" () ret void @@ -227,6 +269,8 @@ ; GFX10W64: ; Occupancy: 2 ; GFX1010W32: ; Occupancy: 5 ; GFX1030W32: ; Occupancy: 4 +; GFX1100W64: ; Occupancy: 3 +; GFX1100W32: ; Occupancy: 7 define amdgpu_kernel void @used_200_vgprs() { call void asm sideeffect "", "~{v199}" () ret void @@ -236,6 +280,8 @@ ; GFX9: ; Occupancy: 1 ; GFX10W64: ; Occupancy: 2 ; GFX10W32: ; Occupancy: 4 +; GFX1100W64: ; Occupancy: 2 +; GFX1100W32: ; Occupancy: 5 define amdgpu_kernel void @used_256_vgprs() { call void asm sideeffect "", "~{v255}" () ret void @@ -245,6 +291,7 @@ ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_80_sgprs() { call void asm sideeffect "", "~{s79}" () ret void @@ -254,6 +301,7 @@ ; GFX9: ; Occupancy: 9 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_88_sgprs() { call void asm sideeffect "", "~{s87}" () ret void @@ -263,6 +311,7 @@ ; GFX9: ; Occupancy: 8 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_100_sgprs() { call void asm sideeffect "", "~{s99}" () ret void @@ -272,6 +321,7 @@ ; GFX9: ; Occupancy: 7 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @used_101_sgprs() { call void asm sideeffect "", "~{s100}" () ret void @@ -281,6 +331,7 @@ ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 @lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4 define amdgpu_kernel void @used_lds_6552() { %p = bitcast [6552 x i8] addrspace(3)* @lds6552 to i8 addrspace(3)* @@ -292,6 +343,7 @@ ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4 define amdgpu_kernel void @used_lds_6556() { %p = bitcast [6556 x i8] addrspace(3)* @lds6556 to i8 addrspace(3)* @@ -303,6 +355,7 @@ ; GFX9: ; Occupancy: 10 ; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 +; GFX1100: ; Occupancy: 16 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4 define amdgpu_kernel void @used_lds_13112() { %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)* @@ -314,6 +367,8 @@ ; GFX9: ; Occupancy: 7{{$}} ; GFX10W64: ; Occupancy: 7{{$}} ; GFX10W32: ; Occupancy: 14{{$}} +; GFX1100W64: ; Occupancy: 7{{$}} +; GFX1100W32: ; Occupancy: 14{{$}} @lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4 define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* @@ -326,6 +381,8 @@ ; GFX10W64: ; Occupancy: 14{{$}} ; GFX1010W32: ; Occupancy: 20{{$}} ; GFX1030W32: ; Occupancy: 16{{$}} +; GFX1100W64: ; Occupancy: 14{{$}} +; GFX1100W32: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* store volatile i8 1, i8 addrspace(3)* %p @@ -337,6 +394,8 @@ ; GFX10W64: ; Occupancy: 14{{$}} ; GFX1010W32: ; Occupancy: 20{{$}} ; GFX1030W32: ; Occupancy: 16{{$}} +; GFX1100W64: ; Occupancy: 14{{$}} +; GFX1100W32: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* store volatile i8 1, i8 addrspace(3)* %p @@ -347,6 +406,7 @@ ; GFX9: ; Occupancy: 10{{$}} ; GFX1010: ; Occupancy: 20{{$}} ; GFX1030: ; Occupancy: 16{{$}} +; GFX1100: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* store volatile i8 1, i8 addrspace(3)* %p @@ -357,6 +417,7 @@ ; GFX9: ; Occupancy: 10{{$}} ; GFX1010: ; Occupancy: 20{{$}} ; GFX1030: ; Occupancy: 16{{$}} +; GFX1100: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* store volatile i8 1, i8 addrspace(3)* %p @@ -367,6 +428,7 @@ ; GFX9: ; Occupancy: 10{{$}} ; GFX1010: ; Occupancy: 20{{$}} ; GFX1030: ; Occupancy: 16{{$}} +; GFX1100: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* store volatile i8 1, i8 addrspace(3)* %p @@ -377,6 +439,7 @@ ; GFX9: ; Occupancy: 10{{$}} ; GFX1010: ; Occupancy: 20{{$}} ; GFX1030: ; Occupancy: 16{{$}} +; GFX1100: ; Occupancy: 16{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* store volatile i8 1, i8 addrspace(3)* %p @@ -386,6 +449,7 @@ ; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32: ; GFX9: ; Occupancy: 7{{$}} ; GFX10: ; Occupancy: 7{{$}} +; GFX1100: ; Occupancy: 7{{$}} define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 { %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)* store volatile i8 1, i8 addrspace(3)* %p