Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -166,6 +166,7 @@
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performBrcondCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
   bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -680,6 +680,7 @@
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::BRCOND);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -8634,6 +8635,41 @@
   return SDValue(CSrc, 0);
 }
 
+SDValue SITargetLowering::performBrcondCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  // Fold brcond (setcc (zext i1 x), 0, ne), label -> brcond x, label.
+  // The same holds for zext(x) == 1, sext(x) != 0 and sext(x) == -1,
+  // all of which are equivalent to testing x itself.
+  SDValue CC = N->getOperand(1);
+  if (CC.getOpcode() != ISD::SETCC || !isa<ConstantSDNode>(CC.getOperand(1)))
+    return SDValue();
+
+  SDValue RHS = CC.getOperand(1);
+  ISD::CondCode Code = cast<CondCodeSDNode>(CC.getOperand(2))->get();
+  switch (CC.getOperand(0).getOpcode()) {
+  case ISD::ZERO_EXTEND:
+    if (!(isNullConstant(RHS) && Code == ISD::SETNE) &&
+        !(isOneConstant(RHS) && Code == ISD::SETEQ))
+      return SDValue();
+    break;
+  case ISD::SIGN_EXTEND:
+    if (!(isNullConstant(RHS) && Code == ISD::SETNE) &&
+        !(isAllOnesConstant(RHS) && Code == ISD::SETEQ))
+      return SDValue();
+    break;
+  default:
+    return SDValue();
+  }
+
+  // Only fold if the value being extended is the original i1 condition.
+  SDValue OrigCC = CC.getOperand(0).getOperand(0);
+  if (OrigCC.getValueType() != MVT::i1)
+    return SDValue();
+
+  return DCI.DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, N->getOperand(0),
+                         OrigCC, N->getOperand(2));
+}
+
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
@@ -8760,6 +8796,8 @@
     return performExtractVectorEltCombine(N, DCI);
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
+  case ISD::BRCOND:
+    return performBrcondCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
Index: test/CodeGen/AMDGPU/dag-combine-brcond.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/dag-combine-brcond.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN: BB0_1:
+; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]], s{{[0-9]+}}, 0
+; GCN: BB0_2:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_cmp
+; GCN: s_and_b64 vcc, exec, [[CC]]
+; GCN: s_cbranch_vccz BB0_4
+define amdgpu_kernel void @test(float addrspace(1)* %arg1) {
+bb:
+  %tmp36 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 undef
+  %tmp37 = getelementptr inbounds float, float addrspace(1)* %tmp36, i64 undef
+  br label %bb41
+
+bb40:                                             ; preds = %bb111
+  ret void
+
+bb41:                                             ; preds = %bb111, %bb
+  %tmp42 = phi i32 [ 0, %bb ], [ %tmp112, %bb111 ]
+  %tmp106 = icmp eq i32 %tmp42, 0
+  br label %bb114
+
+bb111:                                            ; preds = %bb150
+  %tmp112 = add nuw nsw i32 %tmp42, 1
+  %tmp113 = icmp eq i32 %tmp112, 32
+  br i1 %tmp113, label %bb40, label %bb41
+
+bb114:                                            ; preds = %bb150, %bb41
+  %tmp115 = phi i32 [ 0, %bb41 ], [ %tmp326, %bb150 ]
+  %tmp116 = shl i32 %tmp115, 5
+  %tmp122 = zext i32 %tmp116 to i64
+  %tmp123 = getelementptr inbounds float, float addrspace(1)* %tmp37, i64 %tmp122
+  br i1 %tmp106, label %bb125, label %bb132
+
+bb125:                                            ; preds = %bb114
+  %tmp128 = getelementptr inbounds float, float addrspace(1)* %tmp123, i64 16
+  %tmp129 = bitcast float addrspace(1)* %tmp128 to <4 x float> addrspace(1)*
+  br label %bb150
+
+bb132:                                            ; preds = %bb114
+  %tmp140 = getelementptr inbounds float, float addrspace(1)* %tmp123, i64 16
+  %tmp141 = bitcast float addrspace(1)* %tmp140 to <4 x float> addrspace(1)*
+  br label %bb150
+
+bb150:                                            ; preds = %bb132, %bb125
+  %tmp152 = phi <4 x float> addrspace(1)* [ %tmp129, %bb125 ], [ %tmp141, %bb132 ]
+  store <4 x float> zeroinitializer, <4 x float> addrspace(1)* %tmp152, align 16
+  %tmp326 = add nuw nsw i32 %tmp115, 2
+  %tmp327 = icmp eq i32 %tmp326, 32
+  br i1 %tmp327, label %bb111, label %bb114
+}
Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
===================================================================
--- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -367,7 +367,7 @@
 ; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
 
 ; GCN: {{^}}[[FLOW]]:
-; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
+; GCN: s_cbranch_execnz [[FLOW1:BB[0-9]+]]
 
 ; GCN: s_or_b64 exec, exec
 ; GCN: v_mov_b32_e32 v0, 2.0
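
Note on local verification (not part of the patch): assuming an LLVM build with the
AMDGPU target and this change applied, the new test can be run through llvm-lit, or by
hand with the same invocation its RUN line uses:

  llc -march=amdgcn -verify-machineinstrs < test/CodeGen/AMDGPU/dag-combine-brcond.ll \
    | FileCheck -check-prefix=GCN test/CodeGen/AMDGPU/dag-combine-brcond.ll

The CHECK lines assert that the compare result produced by v_cmp in the loop header is
branched on directly via s_and_b64/s_cbranch_vccz, with no v_cndmask_b32/v_cmp pair
rematerializing the i1 condition in the inner block.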