Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3717,15 +3717,6 @@
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_CBRANCH_SCC0:
-    case AMDGPU::S_CBRANCH_SCC1:
-      // Clear unused bits of vcc
-      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
-              AMDGPU::VCC)
-          .addReg(AMDGPU::EXEC)
-          .addReg(AMDGPU::VCC);
-      break;
-
     case AMDGPU::S_BFE_U64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -46,6 +46,9 @@
 /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block
 /// label1:
 /// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits
+///
+/// This pass also spots an S_CBRANCH_VCCZ/NZ arising from uniform (scalar)
+/// control flow and ensures that the vcc it uses has no bits set for inactive lanes.
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -95,6 +98,8 @@
 
   void combineMasks(MachineInstr &MI);
 
+  void clearVccInactiveLanes(MachineInstr &MI);
+
 public:
   static char ID;
 
@@ -446,6 +451,106 @@
   MRI->getUniqueVRegDef(Reg)->eraseFromParent();
 }
 
+// MI is an S_CBRANCH_VCCZ/NZ arising from uniform (scalar) control flow. Ensure
+// that the vcc it uses does not have bits set in inactive lanes.
+void SILowerControlFlow::clearVccInactiveLanes(MachineInstr &MI) {
+  // Use a worklist to scan a tree of v_cmp, s_and, s_or and copy. Anything
+  // else we find means that we cannot guarantee that vcc has all bits clear
+  // for inactive lanes, so we need to insert an S_AND_B64 to ensure that.
+  MachineBasicBlock &MBB = *MI.getParent();
+  std::vector<MachineOperand *> WorkList;
+  WorkList.push_back(&MI.getOperand(1));
+  assert(WorkList.back()->isReg()
+         && WorkList.back()->getReg() == AMDGPU::VCC);
+  bool InactiveBitsClear = true;
+  while (!WorkList.empty() && InactiveBitsClear) {
+    MachineOperand &WorkOpnd = *WorkList.back();
+    WorkList.pop_back();
+    if (!WorkOpnd.isReg()) {
+      InactiveBitsClear = false;
+      break;
+    }
+    MachineInstr *DefMI = nullptr;
+    MachineOperand *DefOpnd = nullptr;
+    if (WorkOpnd.getReg() == AMDGPU::VCC) {
+      // The operand is vcc. Scan back a little way to find the
+      // instruction that sets it.
+      MachineInstr *ScanMI = WorkOpnd.getParent();
+      for (unsigned ScanLimit = 3;
+           !DefOpnd && ScanMI != &MBB.front() && ScanLimit; --ScanLimit) {
+        ScanMI = ScanMI->getPrevNode();
+        for (unsigned OpndIdx = 0, NumOpnds = ScanMI->getNumOperands();
+             OpndIdx != NumOpnds; ++OpndIdx) {
+          auto &Opnd = ScanMI->getOperand(OpndIdx);
+          if (Opnd.isReg()
+              && Opnd.isDef() && Opnd.getReg() == AMDGPU::VCC) {
+            // We have found the def of vcc.
+            DefMI = ScanMI;
+            DefOpnd = &Opnd;
+            break;
+          }
+        }
+      }
+    } else {
+      // Otherwise, find the single def of this operand.
+      auto WorkReg = WorkOpnd.getReg();
+      if (MRI->hasOneDef(WorkReg)) {
+        DefMI = &*MRI->def_instr_begin(WorkReg);
+        for (unsigned OpndIdx = 0; ; ++OpndIdx) {
+          assert(OpndIdx < DefMI->getNumOperands());
+          auto &Opnd = DefMI->getOperand(OpndIdx);
+          if (Opnd.isReg() && Opnd.isDef() && Opnd.getReg() == WorkReg) {
+            DefOpnd = &Opnd;
+            break;
+          }
+        }
+      }
+    }
+    if (!DefOpnd) {
+      // We did not find the def.
+      InactiveBitsClear = false;
+      break;
+    }
+    // Inspect the def.
+    if (DefMI->isCompare()) {
+      // A compare instruction is known to leave bits for inactive lanes
+      // clear.
+      continue;
+    }
+    if (DefMI->isCopy()) {
+      // Copy instruction: add its input to the worklist.
+      WorkList.push_back(&DefMI->getOperand(1));
+      continue;
+    }
+    switch (DefMI->getOpcode()) {
+    case AMDGPU::V_CNDMASK_B32_e32:
+    case AMDGPU::V_CNDMASK_B32_e64:
+      // A later pass inserts a v_cmp after this, so we can treat it as
+      // a v_cmp.
+      continue;
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64:
+      // And/or: add its two inputs to the worklist.
+      WorkList.push_back(&DefMI->getOperand(1));
+      WorkList.push_back(&DefMI->getOperand(2));
+      break;
+    default:
+      // Any other op means that bits for inactive lanes might be set.
+      InactiveBitsClear = false;
+      break;
+    }
+  }
+  if (!InactiveBitsClear) {
+    // Add an S_AND_B64 to clear bits for inactive lanes.
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_AND_B64),
+            AMDGPU::VCC)
+        .addReg(AMDGPU::EXEC)
+        .addReg(AMDGPU::VCC);
+  }
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
@@ -503,6 +608,15 @@
       Last = I;
       continue;
 
+    case AMDGPU::S_CBRANCH_VCCNZ:
+    case AMDGPU::S_CBRANCH_VCCZ:
+      // This is an S_CBRANCH_VCCZ/NZ from uniform control flow. Ensure that
+      // the vcc it uses does not have bits set in inactive lanes.
+      clearVccInactiveLanes(MI);
+      // Do not replay newly inserted code.
+      Last = I;
+      continue;
+
     default:
       Last = I;
       continue;
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -662,14 +662,14 @@
 ; GCN-LABEL: {{^}}broken_phi_bb:
 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
-; GCN: s_branch [[BB2:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccz [[BB2:BB[0-9]+_[0-9]+]]
 
 ; GCN: {{^BB[0-9]+_[0-9]+}}:
 ; GCN: s_mov_b64 exec,
 
 ; IDXMODE: s_set_gpr_idx_off
 
-; GCN: [[BB2]]:
 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
+; GCN: [[BB2]]:
 ; GCN: buffer_load_dword
 
 ; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -26,10 +26,9 @@
 ; GCN: s_mov_b64 [[INITMASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
 ; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
+; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; %bb.2: ; %bb4
 ; GCN: buffer_load_dword
Index: test/CodeGen/AMDGPU/nested-loop-conditions.ll
===================================================================
--- test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -64,6 +64,7 @@
 
 ; FIXME: Should fold to unconditional branch?
 ; GCN: s_mov_b64 vcc, -1
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: ; implicit-def
 ; GCN: s_cbranch_vccz
 
@@ -178,24 +179,20 @@
 
 ; GCN-LABEL: {{^}}nested_loop_conditions:
 ; GCN: v_cmp_lt_i32_e32 vcc, 8, v
-; GCN: s_and_b64 vcc, exec, vcc
 ; GCN: s_cbranch_vccnz [[BB31:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[BB14:BB[0-9]+_[0-9]+]]: ; %bb14
 ; GCN: v_cmp_ne_u32_e32 vcc, 1, v
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccnz [[BB31]]
 
 ; GCN: [[BB18:BB[0-9]+_[0-9]+]]: ; %bb18
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_lt_i32_e32 vcc, 8, v
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccnz [[BB18]]
 
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_gt_i32_e32 vcc, 9
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccnz [[BB14]]
 
 ; GCN: [[BB31]]:
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -434,7 +434,6 @@
 ; {{^}}sopc_vopc_legalize_bug:
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
-; GCN: s_and_b64 vcc, exec, vcc
 ; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN-NOHSA: buffer_store_dword [[ONE]]
Index: test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx800 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; This checks for a bug where uniform control flow can result in multiple
+; v_cmp results being combined together with s_and_b64, s_or_b64 and s_xor_b64,
+; using the resulting mask in s_cbranch_vccnz, without ensuring that the
+; resulting mask has bits clear for inactive lanes.
+; The problematic case is s_xor_b64, as, unlike the other ops, it can actually
+; set bits for inactive lanes.
+;
+; The check for an s_xor_b64 is just to check that this test tests what it is
+; supposed to test. If the s_xor_b64 disappears due to some other change, it
+; does not necessarily mean that the bug has reappeared.
+;
+; The check for "s_and_b64 vcc, exec, vcc" checks that the bug is fixed.
+
+; CHECK: {{^}}main:
+; CHECK: s_xor_b64
+; CHECK: s_and_b64 vcc, exec, vcc
+
+define amdgpu_cs void @main(i32 inreg %arg) {
+.entry:
+  %tmp44 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tmp16 = load volatile float, float addrspace(1)* undef
+  %tmp22 = load volatile float, float addrspace(1)* undef
+  %tmp25 = load volatile float, float addrspace(1)* undef
+  %tmp31 = fcmp olt float %tmp16, 0x3FA99999A0000000
+  br i1 %tmp31, label %bb, label %.exit.thread
+
+bb:                                               ; preds = %.entry
+  %tmp42 = fcmp olt float %tmp25, 0x3FA99999A0000000
+  br i1 %tmp42, label %bb43, label %.exit.thread
+
+bb43:
+  %tmp46 = fcmp olt <2 x float> %tmp44, <float 0x3FA99999A0000000, float 0x3FA99999A0000000>
+  %tmp47 = extractelement <2 x i1> %tmp46, i32 0
+  %tmp48 = extractelement <2 x i1> %tmp46, i32 1
+  %tmp49 = and i1 %tmp47, %tmp48
+  br i1 %tmp49, label %bb50, label %.exit.thread
+
+bb50:
+  %tmp53 = fcmp olt float %tmp22, 0x3FA99999A0000000
+  br i1 %tmp53, label %.exit3.i, label %.exit.thread
+
+.exit3.i:
+  store volatile i32 0, i32 addrspace(1)* undef
+  br label %.exit.thread
+
+.exit.thread:
+  ret void
+}
+
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- test/CodeGen/AMDGPU/skip-if-dead.ll
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -218,7 +218,6 @@
 ; CHECK-NEXT: ; %bb.3:
 ; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
 ; CHECK: v_cmp_eq_u32_e32 vcc, 0, [[LOAD]]
-; CHECK-NEXT: s_and_b64 vcc, exec, vcc
 ; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]]
 
 ; CHECK-NEXT: {{^}}[[EXIT]]:
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -118,7 +118,6 @@
 ; Using a floating-point value in an integer compare will cause the compare to
 ; be selected for the SALU and then later moved to the VALU.
 ; GCN: v_cmp_ne_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
-; GCN: s_and_b64 vcc, exec, [[COND]]
 ; GCN: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
@@ -143,7 +142,6 @@
 ; Using a floating-point value in an integer compare will cause the compare to
 ; be selected for the SALU and then later moved to the VALU.
 ; GCN: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
-; GCN: s_and_b64 vcc, exec, [[COND]]
 ; GCN: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
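For reference, a minimal standalone C++ sketch of the lane-mask reasoning behind the fix (not part of the patch; the exec and compare mask values are made up for illustration): a v_cmp result has its bits clear for inactive lanes, s_and_b64/s_or_b64 of two such masks preserve that property, but s_xor_b64 with an all-ones operand (a negated uniform condition) sets the inactive-lane bits, which is why clearVccInactiveLanes falls back to inserting "s_and_b64 vcc, exec, vcc" before the branch.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Exec = 0x00000000FFFFFFFFull; // only the low 32 lanes are active

  // A v_cmp result never has bits set for lanes that are inactive in exec.
  const uint64_t CmpA = 0x0000000012345678ull & Exec;
  const uint64_t CmpB = 0x00000000F0F0F0F0ull & Exec;

  // s_and_b64 / s_or_b64 of two such masks keep inactive-lane bits clear.
  assert(((CmpA & CmpB) & ~Exec) == 0);
  assert(((CmpA | CmpB) & ~Exec) == 0);

  // s_xor_b64 with -1 (a negated uniform condition) sets every inactive-lane
  // bit, so a bare s_cbranch_vccnz could be taken based on inactive lanes alone.
  uint64_t Vcc = CmpA ^ ~0ull;
  assert((Vcc & ~Exec) != 0);

  // The fallback the pass inserts: "s_and_b64 vcc, exec, vcc".
  Vcc &= Exec;
  assert((Vcc & ~Exec) == 0);
  return 0;
}

This is also why the xor case is deliberately absent from the switch in clearVccInactiveLanes: anything other than compares, copies, v_cndmask and s_and/s_or falls through to the default case and forces the s_and_b64 with exec to be inserted.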