diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -487,6 +487,8 @@
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   SmallVector<MachineInstr *, 12> SetInactiveInstrs;
   SmallVector<MachineInstr *, 12> SoftWQMInstrs;
+  bool HasImplicitDerivatives =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -506,6 +508,11 @@
         // If LOD is not supported WQM is not needed.
         if (!ST->hasExtendedImageInsts())
          continue;
+        // Only generate implicit WQM if implicit derivatives are required.
+        // This avoids inserting unintended WQM if a shader type without
+        // implicit derivatives uses an image sampling instruction.
+        if (!HasImplicitDerivatives)
+          continue;
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -391,27 +391,25 @@
 ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT: s_mov_b32 s19, 0xe00000
 ; GCN-NEXT: s_add_u32 s16, s16, s3
 ; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mov_b64 s[12:13], exec
-; GCN-NEXT: s_wqm_b64 exec, exec
 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x24
-; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
 ; GCN-NEXT: s_brev_b32 s0, 1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s12
 ; GCN-NEXT: s_mov_b32 s3, 0
 ; GCN-NEXT: s_mov_b32 s1, s0
 ; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NEXT: s_and_b64 exec, exec, s[12:13]
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
+; GCN-NEXT: s_nop 0
 ; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_add_f32_e32 v0, v2, v0
@@ -424,25 +422,22 @@
 ; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0
 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, exec_lo
-; GCN-SCRATCH-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GCN-SCRATCH-NEXT: s_clause 0x1
 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
 ; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
+; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
 ; GCN-SCRATCH-NEXT: ;;#ASMSTART
 ; GCN-SCRATCH-NEXT: ;;#ASMEND
+; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT: s_and_b32 exec_lo, exec_lo, s9
 ; GCN-SCRATCH-NEXT: s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT: s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -9,19 +9,22 @@
 ; GFX9-LABEL: non_preserved_vgpr_tuple8:
 ; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

 ; GFX9: v_mov_b32_e32 v36, v16
 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
 ; GFX9-NEXT: v_mov_b32_e32 v33, v13
 ; GFX9-NEXT: v_mov_b32_e32 v32, v12
+
+; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX9: ;;#ASMSTART
 ; GFX9-NEXT: ;;#ASMEND
 ; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -39,10 +42,6 @@
 ;
 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

 ; GFX10: v_mov_b32_e32 v36, v16
 ; GFX10-NEXT: v_mov_b32_e32 v35, v15
@@ -50,10 +49,16 @@
 ; GFX10-NEXT: v_mov_b32_e32 v33, v13
 ; GFX10-NEXT: v_mov_b32_e32 v32, v12

+; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX10: ;;#ASMSTART
 ; GFX10-NEXT: ;;#ASMEND

 ; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_addk_i32 s32, 0x400
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
@@ -100,6 +105,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v41, v12

 ; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -133,12 +139,9 @@
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v41, v16
+; GFX10-NEXT: v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v42, v15
-; GFX10-NEXT: v_mov_b32_e32 v43, v14
-; GFX10-NEXT: v_mov_b32_e32 v44, v13
-; GFX10-NEXT: v_mov_b32_e32 v45, v12
+; GFX10-NEXT: v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -1,5 +1,48 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-wqm -o -  %s | FileCheck %s

+--- |
+  define amdgpu_ps void @test_strict_wwm_scc() {
+    ret void
+  }
+  define amdgpu_ps void @test_strict_wwm_scc2() {
+    ret void
+  }
+  define amdgpu_ps void @no_cfg() {
+    ret void
+  }
+  define amdgpu_ps void @copy_exec() {
+    ret void
+  }
+  define amdgpu_ps void @scc_always_live() {
+    ret void
+  }
+  define amdgpu_ps void @test_wwm_set_inactive_propagation() {
+    ret void
+  }
+  define amdgpu_ps void @test_wqm_lr_phi() {
+    ret void
+  }
+  define amdgpu_cs void @no_wqm_in_cs() {
+    ret void
+  }
+  define amdgpu_es void @no_wqm_in_es() {
+    ret void
+  }
+  define amdgpu_gs void @no_wqm_in_gs() {
+    ret void
+  }
+  define amdgpu_hs void @no_wqm_in_hs() {
+    ret void
+  }
+  define amdgpu_ls void @no_wqm_in_ls() {
+    ret void
+  }
+  define amdgpu_vs void @no_wqm_in_vs() {
+    ret void
+  }
+...
+---
+
 ---
 # Check for awareness that s_or_saveexec_b64 clobbers SCC
 #
@@ -298,3 +341,105 @@
   $vgpr1 = COPY %4.sub1:vreg_128
   SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
 ...
+
+---
+#CHECK-LABEL: name: no_wqm_in_cs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_cs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_es
+#CHECK-NOT: S_WQM
+name: no_wqm_in_es
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_gs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_gs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_hs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_hs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_ls
+#CHECK-NOT: S_WQM
+name: no_wqm_in_ls
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_vs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_vs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
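
Note (illustrative, not part of the patch): the case the new HasImplicitDerivatives check targets can be sketched in IR as below. The function and value names are made up; the intrinsic is the standard llvm.amdgcn.image.sample.2d overload. A compute shader takes no implicit derivatives, so with this change si-wqm no longer inserts an s_wqm exec transition around the sample; the same body under the amdgpu_ps calling convention still gets the implicit WQM.

define amdgpu_cs void @sample_in_cs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
  ; Image sample in a compute shader: no implicit derivatives are required,
  ; so SIWholeQuadMode now skips marking this instruction as needing WQM.
  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  store <4 x float> %v, <4 x float> addrspace(1)* undef
  ret void
}

declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)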