Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -440,6 +440,7 @@
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
+  setHasMultipleConditionRegisters(true);
 
   // SI at least has hardware support for floating point exceptions, but no way
   // of using or handling them is implemented. They are also optional in OpenCL
Index: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -80,6 +80,11 @@
   void emitLoop(MachineInstr &MI);
   void emitEndCf(MachineInstr &MI);
 
+  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                        SmallVectorImpl<MachineOperand> &Src) const;
+
+  void combineMasks(MachineInstr &MI);
+
 public:
   static char ID;
 
@@ -336,6 +341,62 @@
   LIS->handleMove(*NewMI);
 }
 
+// Returns replacement operands for a logical operation: either a single result
+// for exec, or two operands if the source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                                          SmallVectorImpl<MachineOperand> &Src) const {
+  MachineOperand &Op = MI.getOperand(OpNo);
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    Src.push_back(Op);
+    return;
+  }
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getParent() != MI.getParent() ||
+      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+    return;
+
+  // Make sure we do not modify exec between def and use.
+  // A copy with an implicitly defined exec inserted earlier is an exception; it
+  // does not really modify exec.
+  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+      return;
+
+  for (const auto &SrcOp : Def->explicit_operands())
+    if (SrcOp.isUse() && (!SrcOp.isReg() ||
+        TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+        SrcOp.getReg() == AMDGPU::EXEC))
+      Src.push_back(SrcOp);
+}
+
+// Search for and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
+// One of the operands is the exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+  assert(MI.getNumExplicitOperands() == 3);
+  SmallVector<MachineOperand, 4> Ops;
+  unsigned OpToReplace = 1;
+  findMaskOperands(MI, 1, Ops);
+  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+  findMaskOperands(MI, 2, Ops);
+  if (Ops.size() != 3) return;
+
+  unsigned UniqueOpndIdx;
+  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else return;
+
+  unsigned Reg = MI.getOperand(OpToReplace).getReg();
+  MI.RemoveOperand(OpToReplace);
+  MI.addOperand(Ops[UniqueOpndIdx]);
+  if (MRI->use_empty(Reg))
+    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
@@ -351,9 +412,9 @@
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock::iterator I, Next;
+    MachineBasicBlock::iterator I, Next, Last;
 
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
@@ -386,9 +447,20 @@
         emitEndCf(MI);
         break;
 
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_OR_B64:
+        // Clean up bit manipulations on the exec mask
+        combineMasks(MI);
+        Last = I;
+        continue;
+
       default:
-        break;
+        Last = I;
+        continue;
       }
+
+      // Replay newly inserted code to combine masks
+      Next = (Last == MBB.end()) ? MBB.begin() : Last;
     }
   }
Index: llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -100,12 +100,12 @@
       const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
       const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
 
+      DebugLoc DL = MI.getDebugLoc();
+      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
       if (DstRC == &AMDGPU::VReg_1RegClass &&
          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
        I1Defs.push_back(Dst.getReg());
-        DebugLoc DL = MI.getDebugLoc();
-        MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
          if (DefInst->getOperand(1).isImm()) {
            I1Defs.push_back(Dst.getReg());
@@ -129,10 +129,26 @@
        MI.eraseFromParent();
      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                 SrcRC == &AMDGPU::VReg_1RegClass) {
-        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_U32_e64))
-          .addOperand(Dst)
-          .addOperand(Src)
-          .addImm(0);
+        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
+            DefInst->getOperand(1).getImm() == 0 &&
+            DefInst->getOperand(2).getImm() != 0 &&
+            DefInst->getOperand(3).isReg() &&
+            TargetRegisterInfo::isVirtualRegister(
+              DefInst->getOperand(3).getReg()) &&
+            TRI->getCommonSubClass(
+              MRI.getRegClass(DefInst->getOperand(3).getReg()),
+              &AMDGPU::SGPR_64RegClass)) {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
+            .addOperand(Dst)
+            .addReg(AMDGPU::EXEC)
+            .addOperand(DefInst->getOperand(3));
+        } else {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
+            .addOperand(Dst)
+            .addOperand(Src)
+            .addImm(0);
+        }
        MI.eraseFromParent();
      }
    }
Index: llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -493,9 +493,9 @@
 ; GCN: s_setpc_b64
 
 ; GCN: [[LONG_BR_DEST0]]
-; GCN: s_cmp_eq_u32
+; GCN: v_cmp_ne_u32_e32
 ; GCN-NEXT: ; implicit-def
-; GCN-NEXT: s_cbranch_scc0
+; GCN-NEXT: s_cbranch_vccz
 ; GCN: s_setpc_b64
 
 ; GCN: s_endpgm
Index: llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that an invariant compare is hoisted out of the loop.
+; At the same time, the condition shall not be serialized into a VGPR and deserialized later
+; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.
+
+; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
+; CHECK: BB0_1:
+; CHECK-NOT: v_cmp
+; CHECK-NOT: v_cndmask
+; CHECK: s_and_saveexec_b64 s[{{[0-9]+:[0-9]+}}], [[COND]]
+; CHECK: BB0_2:
+
+define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp5 = icmp ult i32 %tmp, %arg3
+  br label %bb1
+
+bb1:                                              ; preds = %bb3, %bb
+  %tmp7 = phi i32 [ %arg4, %bb ], [ %tmp16, %bb3 ]
+  %tmp8 = phi float [ 0.000000e+00, %bb ], [ %tmp15, %bb3 ]
+  br i1 %tmp5, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  %tmp10 = zext i32 %tmp7 to i64
+  %tmp11 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %tmp10
+  %tmp12 = load float, float addrspace(1)* %tmp11, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb1
+  %tmp14 = phi float [ %tmp12, %bb2 ], [ 0.000000e+00, %bb1 ]
+  %tmp15 = fadd float %tmp8, %tmp14
+  %tmp16 = add i32 %tmp7, -1
+  %tmp17 = icmp eq i32 %tmp16, 0
+  br i1 %tmp17, label %bb4, label %bb1
+
+bb4:                                              ; preds = %bb3
+  store float %tmp15, float addrspace(1)* %arg, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
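
The exec mask cleanup introduced in SILowerControlFlow::combineMasks above relies on the
absorption identities x & (x & y) == x & y and x | (x | y) == x | y over 64-bit masks,
which is what lets one S_AND_B64/S_OR_B64 be folded into the other and the now-unused
intermediate register be erased. A minimal standalone C++ sketch of those identities
follows (illustrative only, not part of the patch; the mask values are made up):

    // Sanity check of the identities behind combineMasks:
    //   S_AND_B64 x, (S_AND_B64 x, y)  ==>  S_AND_B64 x, y
    //   S_OR_B64  x, (S_OR_B64  x, y)  ==>  S_OR_B64  x, y
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Exec = 0x0000FFFF0000FFFFull; // hypothetical exec mask
      const uint64_t Cond = 0x123456789ABCDEF0ull; // hypothetical condition mask
      assert((Exec & (Exec & Cond)) == (Exec & Cond)); // AND case
      assert((Exec | (Exec | Cond)) == (Exec | Cond)); // OR case
      return 0;
    }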