Summary (descriptive preamble; text before the first "diff --git" header is
ignored by patch tools):

  SIWholeQuadMode.cpp: record in HasDemotes whether an SI_DEMOTE_I1 was
  collected into KillInstrs, and OR StateWQM into GlobalFlags when a demote
  was seen and SoftWQMInstrs is non-empty.

  llvm.amdgcn.softwqm.ll: add test_demote_1, which expects
  "s_wqm_b64 exec, exec" followed by two buffer loads and the add, and
  replace the @llvm.amdgcn.kill declaration with @llvm.amdgcn.wqm.demote.

  NOTE(review): context-line indentation and the SmallVector template
  arguments were reconstructed after whitespace/markup loss; confirm against
  the target tree before applying.

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -489,6 +489,7 @@
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+  bool HasDemotes = false;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -573,6 +574,8 @@
                  Opcode == AMDGPU::SI_DEMOTE_I1) {
         KillInstrs.push_back(&MI);
         BBI.NeedsLowering = true;
+        if (Opcode == AMDGPU::SI_DEMOTE_I1)
+          HasDemotes = true;
       } else if (WQMOutputs) {
         // The function is in machine SSA form, which means that physical
         // VGPRs correspond to shader inputs and outputs. Inputs are
@@ -601,6 +604,12 @@
     }
   }
 
+  // Demotes may be used to intentionally introduce new helper lanes.
+  // Enable WQM to facilitate this effect if there are operations which
+  // would change behaviour when run in WQM, i.e. SOFT_WQM instructions.
+  if (HasDemotes && !SoftWQMInstrs.empty())
+    GlobalFlags |= StateWQM;
+
   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
   // ever used anywhere in the function. This implements the corresponding
   // semantics of @llvm.amdgcn.set.inactive.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -200,12 +200,30 @@
   ret float %r
 }
 
+; Check that WQM is triggered for softwqm with demote.
+;
+;CHECK-LABEL: {{^}}test_demote_1:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_demote_1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %c1 = fcmp oge float %src0, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %c1)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
+  ret float %out.0
+}
+
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
-declare void @llvm.amdgcn.kill(i1) #1
+declare void @llvm.amdgcn.wqm.demote(i1) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare float @llvm.amdgcn.softwqm.f32(float) #3
 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3