diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -489,6 +489,7 @@ SmallVector SoftWQMInstrs; bool HasImplicitDerivatives = MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + bool HasDemotes = false; // We need to visit the basic blocks in reverse post-order so that we visit // defs before uses, in particular so that we don't accidentally mark an @@ -573,6 +574,8 @@ Opcode == AMDGPU::SI_DEMOTE_I1) { KillInstrs.push_back(&MI); BBI.NeedsLowering = true; + if (Opcode == AMDGPU::SI_DEMOTE_I1) + HasDemotes = true; } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -601,6 +604,12 @@ } } + // Demotes may be used to intentionally introduce new helper lanes. + // Enable WQM to facilitate this effect if there are operations which + // would change behaviour when run in WQM, i.e. SOFT_WQM instructions. + if (HasDemotes && !SoftWQMInstrs.empty()) + GlobalFlags |= StateWQM; + // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is // ever used anywhere in the function. This implements the corresponding // semantics of @llvm.amdgcn.set.inactive. diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -287,6 +287,7 @@ ; CHECK-LABEL: test_demote_1: ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: s_mov_b64 s[2:3], exec +; CHECK-NEXT: s_wqm_b64 exec, exec ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -297,10 +298,12 @@ ; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; CHECK-NEXT: s_cbranch_scc0 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %main_body -; CHECK-NEXT: s_and_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_wqm_b64 s[0:1], s[2:3] +; CHECK-NEXT: s_and_b64 exec, exec, s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v0, v0, v1 ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: s_and_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_branch .LBB8_3 ; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: s_mov_b64 exec, 0