diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -52,6 +53,7 @@
           "Number of select groups not converted due to unpredictability");
 STATISTIC(NumSelectConvertedLoop,
           "Number of select groups converted due to loop-level analysis");
+STATISTIC(NumSelectsConverted, "Number of selects converted");
 
 static cl::opt<unsigned> ColdOperandThreshold(
     "cold-operand-threshold",
@@ -132,7 +134,8 @@
                                   SelectGroups &ProfSIGroups);
   bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
   bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
-  void getOneUseSlice(Instruction *I, SmallVector<Instruction *, 2> &Slice);
+  void getOneUseSlice(Instruction *I, std::stack<Instruction *> &Slice,
+                      bool ForSinking = false);
   bool isSelectHighlyPredictable(const SelectInst *SI);
   bool checkLoopHeuristics(CostInfo LoopDepth[2]);
   bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
@@ -248,8 +251,195 @@
   }
 }
 
+/// If \p isTrue is true, return the true value of \p SI, otherwise return the
+/// false value of \p SI. If the true/false value of \p SI is defined by any
+/// select instructions in \p Selects, look through the defining select
+/// instruction until the true/false value is not defined in \p Selects.
+static Value *
+getTrueOrFalseValue(SelectInst *SI, bool isTrue,
+                    const SmallPtrSet<const Instruction *, 2> &Selects) {
+  Value *V = nullptr;
+  for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
+       DefSI = dyn_cast<SelectInst>(V)) {
+    assert(DefSI->getCondition() == SI->getCondition() &&
+           "The condition of DefSI does not match with SI");
+    V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+  }
+  assert(V && "Failed to get select true/false value");
+  return V;
+}
+
 void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
-  llvm_unreachable("Unimplemented");
+  for (SelectGroup &ASI : ProfSIGroups) {
+    // The code transformation here is a modified version of the sinking
+    // transformation in CodeGenPrepare::optimizeSelectInst with a more
+    // aggressive strategy of which instructions to sink.
+    //
+    // TODO: eliminate the redundancy by removing
+    // CodeGenPrepare::optimizeSelectInst and optimizing selects here for all
+    // cases (with and without profile information).
+
+    // Transform a sequence like this:
+    //    start:
+    //       %cmp = cmp uge i32 %a, %b
+    //       %sel = select i1 %cmp, i32 %c, i32 %d
+    //
+    // Into:
+    //    start:
+    //       %cmp = cmp uge i32 %a, %b
+    //       %cmp.frozen = freeze %cmp
+    //       br i1 %cmp.frozen, label %select.true, label %select.false
+    //    select.true:
+    //       br label %select.end
+    //    select.false:
+    //       br label %select.end
+    //    select.end:
+    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
+    //
+    // %cmp should be frozen, otherwise it may introduce undefined behavior.
+    // In addition, we may sink instructions that produce %c or %d into the
+    // destination(s) of the new branch.
+    // If the true or false blocks do not contain a sunken instruction, that
+    // block and its branch may be optimized away. In that case, one side of
+    // the first branch will point directly to select.end, and the
+    // corresponding PHI predecessor block will be the start block.
+
+    // Find all the instructions that can be soundly sunk to the true/false
+    // blocks.
+    // These are instructions that are computed solely for producing the
+    // operands of the select instructions in the group and can be sunk
+    // without breaking the semantics of the LLVM IR (e.g., cannot sink
+    // instructions with side effects).
+    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
+    unsigned long maxTrueSliceLen = 0, maxFalseSliceLen = 0;
+    for (SelectInst *SI : ASI) {
+      // For each select, compute the sinkable dependence chains of the true
+      // and false operands.
+      if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
+        std::stack<Instruction *> TrueSlice;
+        getOneUseSlice(TI, TrueSlice, true);
+        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
+        TrueSlices.push_back(TrueSlice);
+      }
+      if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
+        std::stack<Instruction *> FalseSlice;
+        getOneUseSlice(FI, FalseSlice, true);
+        maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
+        FalseSlices.push_back(FalseSlice);
+      }
+    }
+    // In the case of multiple select instructions in the same group, the
+    // order of non-dependent instructions (instructions of different
+    // dependence slices) in the true/false blocks appears to affect
+    // performance. Interleaving the slices seems to experimentally be the
+    // optimal approach. This interleaved scheduling allows for more ILP
+    // (with the natural downside of slightly increased register pressure)
+    // compared to a simple ordering of one whole chain after another. One
+    // would expect that this ordering would not matter since the backend
+    // scheduler should take care of it, but apparently the scheduler fails
+    // to deliver optimal ILP with a naive ordering here.
+    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
+    for (unsigned long IS = 0; IS < maxTrueSliceLen; ++IS) {
+      for (auto &S : TrueSlices) {
+        if (!S.empty()) {
+          TrueSlicesInterleaved.push_back(S.top());
+          S.pop();
+        }
+      }
+    }
+    for (unsigned long IS = 0; IS < maxFalseSliceLen; ++IS) {
+      for (auto &S : FalseSlices) {
+        if (!S.empty()) {
+          FalseSlicesInterleaved.push_back(S.top());
+          S.pop();
+        }
+      }
+    }
+
+    // We split the block containing the select(s) into two blocks.
+    SelectInst *SI = ASI.front();
+    SelectInst *LastSI = ASI.back();
+    BasicBlock *StartBlock = SI->getParent();
+    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
+    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
+    // Delete the unconditional branch that was just created by the split.
+    StartBlock->getTerminator()->eraseFromParent();
+
+    // These are the new basic blocks for the conditional branch.
+    // At least one will become an actual new basic block.
+    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
+    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
+    if (!TrueSlicesInterleaved.empty()) {
+      TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
+                                     EndBlock->getParent(), EndBlock);
+      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
+      TrueBranch->setDebugLoc(LastSI->getDebugLoc());
+      for (Instruction *TrueInst : TrueSlicesInterleaved)
+        TrueInst->moveBefore(TrueBranch);
+    }
+    if (!FalseSlicesInterleaved.empty()) {
+      FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
+                                      EndBlock->getParent(), EndBlock);
+      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+      FalseBranch->setDebugLoc(LastSI->getDebugLoc());
+      for (Instruction *FalseInst : FalseSlicesInterleaved)
+        FalseInst->moveBefore(FalseBranch);
+    }
+    // If there was nothing to sink, then arbitrarily choose the 'false' side
+    // for a new input value to the PHI.
+    if (TrueBlock == FalseBlock) {
+      assert(TrueBlock == nullptr &&
+             "Unexpected basic block transform while optimizing select");
+
+      FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
+                                      EndBlock->getParent(), EndBlock);
+      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+      FalseBranch->setDebugLoc(SI->getDebugLoc());
+    }
+
+    // Insert the real conditional branch based on the original condition.
+    // If we did not create a new block for one of the 'true' or 'false' paths
+    // of the condition, it means that side of the branch goes to the end
+    // block directly and the path originates from the start block from the
+    // point of view of the new PHI.
+    BasicBlock *TT, *FT;
+    if (TrueBlock == nullptr) {
+      TT = EndBlock;
+      FT = FalseBlock;
+      TrueBlock = StartBlock;
+    } else if (FalseBlock == nullptr) {
+      TT = TrueBlock;
+      FT = EndBlock;
+      FalseBlock = StartBlock;
+    } else {
+      TT = TrueBlock;
+      FT = FalseBlock;
+    }
+    IRBuilder<> IB(SI);
+    auto *CondFr =
+        IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
+    IB.CreateCondBr(CondFr, TT, FT, SI);
+
+    SmallPtrSet<const Instruction *, 2> INS;
+    INS.insert(ASI.begin(), ASI.end());
+    // Use a reverse iterator because a later select may use the value of an
+    // earlier select, and we need to propagate the value through that earlier
+    // select to get the PHI operand.
+    for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
+      SelectInst *SI = *It;
+      // The select itself is replaced with a PHI Node.
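+      // For example, a select "%sel = select i1 %cmp, i32 %c, i32 %d" whose
+      // false-side slice was not sunk would become, roughly (illustrative
+      // sketch only, block names as created above):
+      //   %sel = phi i32 [ %c, %select.true.sink ], [ %d, %StartBlock ]
+      // getTrueOrFalseValue looks through any other selects of the group
+      // that feed the incoming values.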
+      PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
+      PN->takeName(SI);
+      PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
+      PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
+      PN->setDebugLoc(SI->getDebugLoc());
+
+      SI->replaceAllUsesWith(PN);
+      SI->eraseFromParent();
+      INS.erase(SI);
+      ++NumSelectsConverted;
+    }
+  }
 }
 
 void SelectOptimize::collectSelectGroups(BasicBlock &BB,
@@ -394,18 +584,21 @@
     else
       ColdI = dyn_cast<Instruction>(SI->getFalseValue());
     if (ColdI) {
-      SmallVector<Instruction *, 2> ColdSlice;
+      std::stack<Instruction *> ColdSlice;
       getOneUseSlice(ColdI, ColdSlice);
-      for (auto *ColdII : ColdSlice)
-        if (isExpensiveInst(ColdII))
+      while (!ColdSlice.empty()) {
+        if (isExpensiveInst(ColdSlice.top()))
           return true;
+        ColdSlice.pop();
+      }
     }
   }
   return false;
 }
 
 void SelectOptimize::getOneUseSlice(Instruction *I,
-                                    SmallVector<Instruction *, 2> &Slice) {
+                                    std::stack<Instruction *> &Slice,
+                                    bool ForSinking) {
   std::queue<Instruction *> Worklist;
   Worklist.push(I);
   SmallPtrSet<Instruction *, 2> Visited;
@@ -421,9 +614,16 @@
     if (!II->hasOneUse())
       continue;
 
-    // Avoid considering instructions in outer or independent loops, or non-loop
-    // instructions (instructions with potentially much less frequency than the
-    // operand of the select).
+    // Cannot soundly sink instructions with side effects.
+    // Terminator instructions cannot be sunk.
+    // Avoid sinking other select instructions (should be handled separately).
+    if (ForSinking &&
+        (II->isTerminator() || II->mayHaveSideEffects() || isa<SelectInst>(II)))
+      continue;
+
+    // Avoid considering/sinking instructions from outer or independent loops,
+    // or non-loop instructions (instructions with potentially much less
+    // frequency than the operand of the select).
     const Loop *LII = LI->getLoopFor(II->getParent());
     if (L && L != LII && (!LII || !L->contains(LII)))
       continue;
@@ -431,13 +631,15 @@
     // Avoid phi nodes that are not part of a loop-carried dependence.
     // Could source from a cold basic block.
    if (isa<PHINode>(II)) {
+      if (ForSinking)
+        continue;
      if (!LII)
        continue;
      if (LII->getHeader() != II->getParent())
        continue;
    }
 
-    Slice.push_back(II);
+    Slice.push(II);
    for (unsigned k = 0; k < II->getNumOperands(); ++k)
      if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
        Worklist.push(OpI);
diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -disable-select-optimize=false -x86-cmov-converter=false -disable-cgp-select2branch=true < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 1:
+;; highly-biased selects assumed to be highly predictable, converted to branches
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; If a select is obviously predictable, turn it into a branch.
+define i32 @weighted_select1(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: weighted_select1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %select.false
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:  .LBB0_2: # %select.end
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+; If a select is obviously predictable (reversed profile weights),
+; turn it into a branch.
+define i32 @weighted_select2(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: weighted_select2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    je .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %select.end
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB1_1: # %select.false
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+  ret i32 %sel
+}
+
+; Not obviously predictable select.
+define i32 @weighted_select3(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: weighted_select3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17
+  ret i32 %sel
+}
+
+; Unpredictable select should not form a branch.
+define i32 @unpred_select(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: unpred_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !unpredictable !20
+  ret i32 %sel
+}
+
+; Predictable select in a function with the optsize attribute should not form
+; a branch.
+define i32 @weighted_select_optsize(i32 %a, i32 %b, i1 %cmp) optsize {
+; CHECK-LABEL: weighted_select_optsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+define i32 @weighted_select_pgso(i32 %a, i32 %b, i1 %cmp) !prof !14 {
+; CHECK-LABEL: weighted_select_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+; If two selects in a row are predictable, turn them into branches.
+define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 {
+; CHECK-LABEL: weighted_selects:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    jne .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %select.false
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:  .LBB6_2: # %select.end
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    jne .LBB6_4
+; CHECK-NEXT:  # %bb.3: # %select.false2
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:  .LBB6_4: # %select.end1
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %a, 0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  %cmp1 = icmp ne i32 %sel, 0
+  %sel1 = select i1 %cmp1, i32 %b, i32 %a, !prof !15
+  ret i32 %sel1
+}
+
+; If the select group is predictable, turn it into a branch.
+define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 {
+; CHECK-LABEL: weighted_select_group:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB7_1
+; CHECK-NEXT:  # %bb.2: # %select.false
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addl %edi, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB7_1:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    retq
+  %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15
+  %add = add i32 %sel1, %sel2
+  ret i32 %add
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 2:
+;; look for expensive instructions in the one-use slice of the cold path
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; A select with a cold one-use load value operand should form a branch and
+; sink the load.
+define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    jne .LBB8_1
+; CHECK-NEXT:  # %bb.2: # %select.end
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB8_1: # %select.true.sink
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %a, align 8
+  %sel = select i1 %cmp, i32 %load, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Expensive hot value operand and cheap cold value operand.
+define i32 @expensive_val_operand2(i32* nocapture %a, i32 %x, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel (%rdi), %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %a, align 8
+  %sel = select i1 %cmp, i32 %x, i32 %load, !prof !17
+  ret i32 %sel
+}
+
+; A cold value operand with a load in its one-use dependence slice should
+; result in a branch with the dependence slice sunk.
+define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB10_1
+; CHECK-NEXT:  # %bb.2: # %select.end
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB10_1: # %select.true.sink
+; CHECK-NEXT:    addl (%rdi), %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %a, align 8
+  %x = add i32 %load, %b
+  %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Multiple uses of the load value operand.
+define i32 @expensive_val_operand4(i32 %a, i32* nocapture %b, i32 %x, i1 %cmp) {
+; CHECK-LABEL: expensive_val_operand4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    cmovel %eax, %edx
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    retq
+  %load = load i32, i32* %b, align 4
+  %sel = select i1 %cmp, i32 %x, i32 %load
+  %add = add i32 %sel, %load
+  ret i32 %add
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test loop heuristic: loop-level critical-path analysis
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Use of a cmov in this test would put a load and an fsub on the critical path.
+;; Loop-level analysis should decide to form a branch.
+;;
+;; double cmov_on_critical_path(int n, double x, double *a) {
+;;   for (int i = 0; i < n; i++) {
+;;     double r = a[i];
+;;     if (x > r)
+;;       // 50% of iterations
+;;       x -= r;
+;;   }
+;;   return x;
+;; }
+define double @cmov_on_critical_path(i32 %n, double %x, double* nocapture %a) {
+; CHECK-LABEL: cmov_on_critical_path:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jle .LBB12_5
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    jmp .LBB12_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB12_4: # %select.end
+; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    addq $8, %rsi
+; CHECK-NEXT:    decq %rax
+; CHECK-NEXT:    je .LBB12_5
+; CHECK-NEXT:  .LBB12_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    ucomisd %xmm1, %xmm0
+; CHECK-NEXT:    jbe .LBB12_4
+; CHECK-NEXT:  # %bb.3: # %select.true.sink
+; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    jmp .LBB12_4
+; CHECK-NEXT:  .LBB12_5: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %entry
+  ret double %x
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x1 = phi double [ %x2, %for.body ], [ %x, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  %r = load double, double* %arrayidx, align 8
+  %sub = fsub double %x1, %r
+  %cmp2 = fcmp ogt double %x1, %r
+  %x2 = select i1 %cmp2, double %sub, double %x1, !prof !18
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:                                         ; preds = %for.body
+  ret double %x2
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
+!15 = !{!"branch_weights", i32 1, i32 100}
+!16 = !{!"branch_weights", i32 100, i32 1}
+!17 = !{!"branch_weights", i32 1, i32 99}
+!18 = !{!"branch_weights", i32 50, i32 50}
+!19 = !{!"function_entry_count", i64 100}
+!20 = !{}
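+
+; Illustrative sketch (not autogenerated; value/block names are assumed):
+; for @expensive_val_operand3, SelectOptimize is expected to sink the one-use
+; slice {%load, %x} into the true side and produce IR roughly of the form:
+;
+;   %sel.frozen = freeze i1 %cmp
+;   br i1 %sel.frozen, label %select.true.sink, label %select.end
+; select.true.sink:
+;   %load = load i32, i32* %a, align 8
+;   %x = add i32 %load, %b
+;   br label %select.end
+; select.end:
+;   %sel = phi i32 [ %x, %select.true.sink ], [ %y, %0 ]
+;   ret i32 %sel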