Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1431,6 +1431,13 @@
   [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// Copies the source value to the destination value, such that the source
+// is computed as if the entire program were executed in WQM if any other
+// program code executes in WQM.
+def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 // Return true if at least one thread within the pixel quad passes true into
 // the function.
 def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -617,6 +617,7 @@
         continue;
       case AMDGPU::COPY:
       case AMDGPU::WQM:
+      case AMDGPU::SOFT_WQM:
       case AMDGPU::WWM: {
         // If the destination register is a physical register there isn't really
         // much we can do to fix this.
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5952,6 +5952,11 @@
     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
                    0);
   }
+  case Intrinsic::amdgcn_softwqm: {
+    SDValue Src = Op.getOperand(1);
+    return SDValue(DAG.getMachineNode(AMDGPU::SOFT_WQM, DL, Src.getValueType(), Src),
+                   0);
+  }
   case Intrinsic::amdgcn_wwm: {
     SDValue Src = Op.getOperand(1);
     return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3631,6 +3631,7 @@
   case AMDGPU::PHI: return AMDGPU::PHI;
   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   case AMDGPU::WQM: return AMDGPU::WQM;
+  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
   case AMDGPU::WWM: return AMDGPU::WWM;
   case AMDGPU::S_MOV_B32: {
     const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -5506,6 +5507,7 @@
     switch (UseMI.getOpcode()) {
     case AMDGPU::COPY:
     case AMDGPU::WQM:
+    case AMDGPU::SOFT_WQM:
     case AMDGPU::WWM:
     case AMDGPU::REG_SEQUENCE:
     case AMDGPU::PHI:
@@ -5623,6 +5625,7 @@
   case AMDGPU::REG_SEQUENCE:
   case AMDGPU::INSERT_SUBREG:
   case AMDGPU::WQM:
+  case AMDGPU::SOFT_WQM:
   case AMDGPU::WWM: {
     const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
     if (RI.hasAGPRs(SrcRC)) {
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -111,6 +111,10 @@
 // WQM pass processes it.
 def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 
+// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wcm it is
+// turned into a copy by WQM pass, but does not seed WQM requirements.
+def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+
 // Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
 // that the @earlyclobber is respected. The @earlyclobber is to make sure that
 // the instruction that defines $src0 (which is run in WWM) doesn't
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -312,6 +312,7 @@
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
+  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -340,6 +341,10 @@
         // correct, so we need it to be in WQM.
         Flags = StateWQM;
         LowerToCopyInstrs.push_back(&MI);
+      } else if (Opcode == AMDGPU::SOFT_WQM) {
+        LowerToCopyInstrs.push_back(&MI);
+        SoftWQMInstrs.push_back(&MI);
+        continue;
       } else if (Opcode == AMDGPU::WWM) {
         // The WWM intrinsic doesn't make the same guarantee, and plus it needs
         // to be executed in WQM or Exact so that its copy doesn't clobber
@@ -407,9 +412,12 @@
   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
   // ever used anywhere in the function. This implements the corresponding
   // semantics of @llvm.amdgcn.set.inactive.
+  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
   if (GlobalFlags & StateWQM) {
     for (MachineInstr *MI : SetInactiveInstrs)
       markInstruction(*MI, StateWQM, Worklist);
+    for (MachineInstr *MI : SoftWQMInstrs)
+      markInstruction(*MI, StateWQM, Worklist);
   }
 
   return GlobalFlags;
@@ -885,7 +893,7 @@
   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   if (!(GlobalFlags & StateWQM)) {
     lowerLiveMaskQueries(Exec);
-    if (!(GlobalFlags & StateWWM))
+    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
       return !LiveMaskQueries.empty();
   } else {
     // Store a copy of the original live mask when required
Index: test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -0,0 +1,188 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
+
+; Check that WQM is not triggered by the softwqm intrinsic alone.
+;
+;CHECK-LABEL: {{^}}test1:
+;CHECK-NOT: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that the softwqm intrinsic works correctly for integers.
+;
+;CHECK-LABEL: {{^}}test2:
+;CHECK-NOT: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = bitcast float %out to i32
+  %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0)
+  %out.2 = bitcast i32 %out.1 to float
+  ret float %out.2
+}
+
+; Make sure the transition from WQM to Exact to softwqm does not trigger WQM.
+;
+;CHECK-LABEL: {{^}}test_softwqm1:
+;CHECK-NOT: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: buffer_store_dword
+;CHECK-NOT; s_wqm_b64 exec, exec
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %temp = fadd float %src0, %src1
+  call void @llvm.amdgcn.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %out = fadd float %temp, %temp
+  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
+  ret float %out.0
+}
+
+; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
+;
+;CHECK-LABEL: {{^}}test_softwqm2:
+;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: buffer_store_dword
+;CHECK; s_wqm_b64 exec, exec
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %temp = fadd float %src0, %src1
+  %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
+  call void @llvm.amdgcn.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %out = fadd float %temp, %temp
+  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
+  ret float %out.0
+}
+
+; Make sure the transition from Exact to WWM then softwqm does not trigger WQM.
+;
+;CHECK-LABEL: {{^}}test_wwm1:
+;CHECK: buffer_load_dword
+;CHECK: buffer_store_dword
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NOT: s_wqm_b64
+define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %temp = fadd float %src0, %src1
+  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
+  %out = fadd float %temp.0, %temp.0
+  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that softwqm on one case of branch does not trigger WQM for shader.
+;
+;CHECK-LABEL: {{^}}test_control_flow_0:
+;CHECK-NEXT: ; %main_body
+;CHECK-NOT: s_wqm_b64 exec, exec
+;CHECK: %ELSE
+;CHECK: store
+;CHECK: %IF
+;CHECK: buffer_load
+;CHECK: buffer_load
+define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
+main_body:
+  %cmp = icmp eq i32 %z, 0
+  br i1 %cmp, label %IF, label %ELSE
+
+IF:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
+  br label %END
+
+ELSE:
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
+  br label %END
+
+END:
+  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
+  ret float %r
+}
+
+; Check that softwqm on one case of branch is treated as WQM in WQM shader.
+;
+;CHECK-LABEL: {{^}}test_control_flow_1:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %ELSE
+;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: store
+;CHECK: s_mov_b64 exec, [[SAVED]]
+;CHECK: %IF
+;CHECK-NOT: s_and_saveexec_b64
+;CHECK-NOT: s_and_b64 exec
+;CHECK: buffer_load
+;CHECK: buffer_load
+define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
+main_body:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %data.sample = extractelement <4 x float> %dtex, i32 0
+
+  %cmp = icmp eq i32 %z, 0
+  br i1 %cmp, label %IF, label %ELSE
+
+IF:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
+  br label %END
+
+ELSE:
+  call void @llvm.amdgcn.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
+  br label %END
+
+END:
+  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
+  ret float %r
+}
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+declare void @llvm.amdgcn.kill(i1) #1
+declare float @llvm.amdgcn.wqm.f32(float) #3
+declare float @llvm.amdgcn.softwqm.f32(float) #3
+declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
+declare float @llvm.amdgcn.wwm.f32(float) #3
+
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind readnone }