diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -542,6 +542,14 @@ GlobalFlags |= StateStrictWQM; LowerToMovInstrs.push_back(&MI); continue; + } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || + Opcode == AMDGPU::LDS_DIRECT_LOAD) { + // Mark these STRICTWQM, but only for the instruction, not its operands. + // This avoids unnecessarily marking M0 as requiring WQM. + InstrInfo &II = Instructions[&MI]; + II.Needs |= StateStrictWQM; + GlobalFlags |= StateStrictWQM; + continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { III.Disabled = StateStrict; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -4,16 +4,19 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GCN-LABEL: v_interp_f32: ; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15 +; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: lds_param_load v0, attr0.y -; GCN-NEXT: lds_param_load v1, attr1.x ; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 -; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7 -; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done +; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done ; GCN-NEXT: 
s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0) @@ -29,22 +32,25 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GCN-LABEL: v_interp_f32_many: ; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15 +; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 +; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: lds_param_load v0, attr0.x -; GCN-NEXT: lds_param_load v1, attr1.x -; GCN-NEXT: lds_param_load v2, attr2.x -; GCN-NEXT: lds_param_load v3, attr3.x ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 -; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7 -; GCN-NEXT: exp mrt0 v0, v1, v2, v3 done +; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done ; GCN-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) @@ -68,20 +74,23 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 ; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: lds_param_load v2, attr0.x -; GCN-NEXT: lds_param_load v3, attr1.x -; GCN-NEXT: lds_param_load v4, attr2.x -; GCN-NEXT: lds_param_load v5, attr3.x +; GCN-NEXT: s_mov_b32 s0, 
exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo +; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15 +; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 +; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15 +; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 -; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 -; GCN-NEXT: exp mrt0 v2, v3, v4, v0 done +; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done ; GCN-NEXT: s_endpgm main_body: %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1 @@ -107,14 +116,17 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GCN-LABEL: v_interp_f16: ; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: lds_param_load v0, attr0.x +; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 -; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 +; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: 
v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GCN-NEXT: v_add_f16_e32 v0, v3, v0 ; GCN-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -4,16 +4,19 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GCN-LABEL: v_interp_f32: ; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15 +; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: lds_param_load v0, attr0.y -; GCN-NEXT: lds_param_load v1, attr1.x ; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 -; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7 -; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done +; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done ; GCN-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0) @@ -29,22 +32,25 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GCN-LABEL: v_interp_f32_many: ; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15 +; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 +; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 
+; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: lds_param_load v0, attr0.x -; GCN-NEXT: lds_param_load v1, attr1.x -; GCN-NEXT: lds_param_load v2, attr2.x -; GCN-NEXT: lds_param_load v3, attr3.x ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 -; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7 -; GCN-NEXT: exp mrt0 v0, v1, v2, v3 done +; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done ; GCN-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) @@ -68,20 +74,23 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 ; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: lds_param_load v2, attr0.x -; GCN-NEXT: lds_param_load v3, attr1.x -; GCN-NEXT: lds_param_load v4, attr2.x -; GCN-NEXT: lds_param_load v5, attr3.x +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo +; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15 +; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 +; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15 +; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 -; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 
wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 -; GCN-NEXT: exp mrt0 v2, v3, v4, v0 done +; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done ; GCN-NEXT: s_endpgm main_body: %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1 @@ -107,14 +116,17 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GCN-LABEL: v_interp_f16: ; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: lds_param_load v0, attr0.x +; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 -; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 +; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GCN-NEXT: v_add_f16_e32 v0, v3, v0 ; GCN-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s 
+ +; Test that s_wqm is executed before lds.param.load. +define amdgpu_ps <3 x float> @test_param_load(i32 inreg %attr, <3 x float> %to_add) { +; CHECK-LABEL: test_param_load: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 m0, s0 +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: s_wqm_b32 exec_lo, exec_lo +; CHECK-NEXT: lds_param_load v3, attr0.x wait_vdst:15 +; CHECK-NEXT: lds_param_load v4, attr0.y wait_vdst:15 +; CHECK-NEXT: lds_param_load v5, attr0.z wait_vdst:15 +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: s_waitcnt expcnt(2) +; CHECK-NEXT: v_add_f32_e32 v0, v3, v0 +; CHECK-NEXT: s_waitcnt expcnt(1) +; CHECK-NEXT: v_add_f32_e32 v1, v4, v1 +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: v_add_f32_e32 v2, v5, v2 +; CHECK-NEXT: ; return to shader part epilog +main_body: + %a = call float @llvm.amdgcn.lds.param.load(i32 immarg 0, i32 immarg 0, i32 %attr) #1 + %b = call float @llvm.amdgcn.lds.param.load(i32 immarg 1, i32 immarg 0, i32 %attr) #1 + %c = call float @llvm.amdgcn.lds.param.load(i32 immarg 2, i32 immarg 0, i32 %attr) #1 + %tmp_0 = insertelement <3 x float> undef, float %a, i32 0 + %tmp_1 = insertelement <3 x float> %tmp_0, float %b, i32 1 + %tmp_2 = insertelement <3 x float> %tmp_1, float %c, i32 2 + %res = fadd <3 x float> %tmp_2, %to_add + ret <3 x float> %res +} + +; Test that s_wqm is executed before lds.direct.load. 
+define amdgpu_ps <3 x float> @test_direct_load(i32 inreg %arg_0, i32 inreg %arg_1, i32 inreg %arg_2, <3 x float> %to_add) { +; CHECK-LABEL: test_direct_load: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 m0, s0 +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: s_wqm_b32 exec_lo, exec_lo +; CHECK-NEXT: lds_direct_load v3 wait_vdst:15 +; CHECK-NEXT: s_mov_b32 m0, s1 +; CHECK-NEXT: lds_direct_load v4 wait_vdst:15 +; CHECK-NEXT: s_mov_b32 m0, s2 +; CHECK-NEXT: lds_direct_load v5 wait_vdst:15 +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: s_waitcnt expcnt(2) +; CHECK-NEXT: v_add_f32_e32 v0, v3, v0 +; CHECK-NEXT: s_waitcnt expcnt(1) +; CHECK-NEXT: v_add_f32_e32 v1, v4, v1 +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: v_add_f32_e32 v2, v5, v2 +; CHECK-NEXT: ; return to shader part epilog +main_body: + %a = call float @llvm.amdgcn.lds.direct.load(i32 %arg_0) #1 + %b = call float @llvm.amdgcn.lds.direct.load(i32 %arg_1) #1 + %c = call float @llvm.amdgcn.lds.direct.load(i32 %arg_2) #1 + %tmp_0 = insertelement <3 x float> undef, float %a, i32 0 + %tmp_1 = insertelement <3 x float> %tmp_0, float %b, i32 1 + %tmp_2 = insertelement <3 x float> %tmp_1, float %c, i32 2 + %res = fadd <3 x float> %tmp_2, %to_add + ret <3 x float> %res +} + +attributes #1 = { nounwind readnone speculatable willreturn } +declare float @llvm.amdgcn.lds.param.load(i32 immarg, i32 immarg, i32) #1 +declare float @llvm.amdgcn.lds.direct.load(i32) #1