diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -489,6 +489,7 @@
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+  bool HasDemotes = false;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -573,6 +574,8 @@
                    Opcode == AMDGPU::SI_DEMOTE_I1) {
           KillInstrs.push_back(&MI);
           BBI.NeedsLowering = true;
+          if (Opcode == AMDGPU::SI_DEMOTE_I1)
+            HasDemotes = true;
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
           // VGPRs correspond to shader inputs and outputs. Inputs are
@@ -601,6 +604,12 @@
     }
   }
 
+  // Demotes may be used to intentionally introduce new helper lanes.
+  // Enable WQM to facilitate this effect if there are operations which
+  // would change behaviour when run in WQM, i.e. SOFT_WQM instructions.
+  if (HasDemotes && !SoftWQMInstrs.empty())
+    GlobalFlags |= StateWQM;
+
   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
   // ever used anywhere in the function. This implements the corresponding
   // semantics of @llvm.amdgcn.set.inactive.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -200,12 +200,30 @@
   ret float %r
 }
 
+; Check that WQM is triggered for softwqm with demote.
+;
+;CHECK-LABEL: {{^}}test_demote_1:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_demote_1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %c1 = fcmp oge float %src0, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %c1)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
+  ret float %out.0
+}
+
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
-declare void @llvm.amdgcn.kill(i1) #1
+declare void @llvm.amdgcn.wqm.demote(i1) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare float @llvm.amdgcn.softwqm.f32(float) #3
 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3