diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
@@ -49,12 +50,28 @@
           "Number of select groups converted due to high-predictability");
 STATISTIC(NumSelectUnPred,
           "Number of select groups not converted due to unpredictability");
+STATISTIC(NumSelectConvertedLoop,
+          "Number of select groups converted due to loop-level analysis");
 
 static cl::opt<unsigned> ColdOperandThreshold(
     "cold-operand-threshold",
     cl::desc("Maximum frequency of path for an operand to be considered cold."),
     cl::init(20), cl::Hidden);
 
+static cl::opt<unsigned>
+    LoopGradientGain("loop-gradient-gain",
+                     cl::desc("Gradient gain threshold (%)."), cl::init(25),
+                     cl::Hidden);
+
+static cl::opt<unsigned>
+    GainCycleThreshold("select-converter-loop-threshold",
+                       cl::desc("Minimum gain per loop (in cycles) threshold."),
+                       cl::init(4), cl::Hidden);
+
+static cl::opt<unsigned> MispredictDefaultRate(
+    "mispredict-default-rate", cl::Hidden, cl::init(25),
+    cl::desc("Default mispredict rate (initialized to 25%)."));
+
 namespace {
 
 class SelectOptimize : public FunctionPass {
@@ -68,6 +85,10 @@
   std::unique_ptr<BranchProbabilityInfo> BPI;
   ProfileSummaryInfo *PSI;
   OptimizationRemarkEmitter *ORE;
+  TargetSchedModel TSchedModel;
+
+  // Scaling up factor to avoid precision issues with integer division.
+  const unsigned ScalingUpFactor = 1000;
 
 public:
   static char ID;
@@ -93,6 +114,13 @@
   using SelectGroup = SmallVector<SelectInst *, 2>;
   using SelectGroups = SmallVector<SelectGroup, 2>;
 
+  struct CostInfo {
+    /// Predicated cost (with selects as conditional moves).
+    uint64_t PredCost;
+    /// Non-predicated cost (with selects converted to branches).
+    uint64_t NonPredCost;
+  };
+
   bool optimizeSelects(Function &F);
   void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
   void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
@@ -100,10 +128,21 @@
   void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
   void findProfitableSIGroupsBase(SelectGroups &SIGroups,
                                   SelectGroups &ProfSIGroups);
+  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
+                                        SelectGroups &ProfSIGroups);
   bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
   bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
-  void getOneUseChain(Instruction *I, SmallVector<Instruction *, 2> &Chain);
-  bool isSelectHighlyPredictable(SelectInst *SI);
+  void getOneUseSlice(Instruction *I, SmallVector<Instruction *, 2> &Slice);
+  bool isSelectHighlyPredictable(const SelectInst *SI);
+  bool checkLoopHeuristics(CostInfo LoopDepth[2]);
+  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
+                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
+                        CostInfo LoopCost[2]);
+  SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
+  Optional<uint64_t> computeInstCost(const Instruction *I);
+  uint64_t getMispredictionCost(const SelectInst *SI, const uint64_t CondCost);
+  uint64_t predictedPathCost(uint64_t TrueCost, uint64_t FalseCost,
+                             const SelectInst *SI);
   bool isExpensiveInst(Instruction *I);
   bool isSelectKindSupported(SelectInst *SI);
 };
@@ -148,6 +187,7 @@
   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+  TSchedModel.init(TSI);
 
   // When optimizing for size, selects are preferable over branches.
   if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
@@ -190,7 +230,22 @@
 
 void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
                                                SelectGroups &ProfSIGroups) {
-  llvm_unreachable("Unimplemented");
+  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
+  // Need to check size on each iteration as we accumulate child loops.
+  for (unsigned long i = 0; i < Loops.size(); ++i)
+    for (Loop *ChildL : Loops[i]->getSubLoops())
+      Loops.push_back(ChildL);
+
+  for (Loop *L : Loops) {
+    if (!L->isInnermost())
+      continue;
+
+    SelectGroups SIGroups;
+    for (BasicBlock *BB : L->getBlocks())
+      collectSelectGroups(*BB, SIGroups);
+
+    findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
+  }
 }
 
 void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
@@ -234,6 +289,54 @@
   }
 }
 
+void SelectOptimize::findProfitableSIGroupsInnerLoops(
+    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
+  NumSelectOptAnalyzed += SIGroups.size();
+  // For each select group in an inner-most loop,
+  // a branch is more preferable than a select/conditional-move if:
+  // i) conversion to branches satisfies loop-level heuristics including
+  // reducing the loop's critical path by some threshold; and
+  // ii) the cost of the select group is cheaper (in terms of latency) compared
+  // to the select version.
+
+  DenseMap<const Instruction *, CostInfo> InstCostMap;
+  CostInfo LoopCost[2] = {{0, 0}, {0, 0}};
+  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
+      !checkLoopHeuristics(LoopCost)) {
+    OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
+                                     L->getHeader()->getFirstNonPHI());
+    ORmissL << "Did not convert any select in the loop to branch due to "
+               "loop-level heuristics.";
+    ORE->emit(ORmissL);
+    return;
+  }
+
+  for (SelectGroup &ASI : SIGroups) {
+    // Assuming infinite resources, the cost of a group of instructions is the
+    // cost of the most expensive instruction of the group.
+    uint64_t SelectCost = 0, BranchCost = 0;
+    for (SelectInst *SI : ASI) {
+      SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
+      BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
+    }
+    if (BranchCost < SelectCost) {
+      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
+      OR << "Profitable to convert to branch (loop analysis). BranchCost="
+         << std::to_string(BranchCost)
+         << ", SelectCost=" << std::to_string(SelectCost) << ". ";
+      ORE->emit(OR);
+      ++NumSelectConvertedLoop;
+      ProfSIGroups.push_back(ASI);
+    } else {
+      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+      ORmiss << "Select is more profitable (loop analysis). BranchCost="
+             << std::to_string(BranchCost)
+             << ", SelectCost=" << std::to_string(SelectCost) << ". ";
+      ORE->emit(ORmiss);
+    }
+  }
+}
+
 bool SelectOptimize::isConvertToBranchProfitableBase(
     const SmallVector<SelectInst *, 2> &ASI) {
   SelectInst *SI = ASI.front();
@@ -257,7 +360,7 @@
   }
 
   // Look for expensive operands in the cold operand's (if any) dependence
-  // chain of any of the selects in the group.
+  // slice of any of the selects in the group.
   if (hasExpensiveColdOperand(ASI)) {
     ++NumSelectConvertedExpColdOperand;
     OR << "Converted to branch because of expensive cold operand.";
@@ -282,7 +385,7 @@
   }
   if (!ColdOperand)
     return false;
-  // Check if the cold path's dependence chain has an expensive operation for
+  // Check if the cold path's dependence slice has an expensive operation for
   // any of the selects of the group.
   for (SelectInst *SI : ASI) {
     Instruction *ColdI = nullptr;
@@ -290,10 +393,10 @@
       ColdI = dyn_cast<Instruction>(SI->getTrueValue());
     else
       ColdI = dyn_cast<Instruction>(SI->getFalseValue());
-    if (ColdI && ColdI->hasOneUse()) {
-      SmallVector<Instruction *, 2> ColdChain;
-      getOneUseChain(ColdI, ColdChain);
-      for (auto *ColdII : ColdChain)
+    if (ColdI) {
+      SmallVector<Instruction *, 2> ColdSlice;
+      getOneUseSlice(ColdI, ColdSlice);
+      for (auto *ColdII : ColdSlice)
         if (isExpensiveInst(ColdII))
           return true;
     }
@@ -301,8 +404,8 @@
   return false;
 }
 
-void SelectOptimize::getOneUseChain(Instruction *I,
-                                    SmallVector<Instruction *, 2> &Chain) {
+void SelectOptimize::getOneUseSlice(Instruction *I,
+                                    SmallVector<Instruction *, 2> &Slice) {
   std::queue<Instruction *> Worklist;
   Worklist.push(I);
   SmallPtrSet<Instruction *, 2> Visited;
@@ -315,7 +418,7 @@
       continue;
     Visited.insert(II);
 
-    if (I != II && !II->hasOneUse())
+    if (!II->hasOneUse())
       continue;
 
     // Avoid considering instructions in outer or independent loops, or non-loop
@@ -334,14 +437,14 @@
       continue;
     }
 
-    Chain.push_back(II);
+    Slice.push_back(II);
     for (unsigned k = 0; k < II->getNumOperands(); ++k)
       if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
        Worklist.push(OpI);
  }
 }
 
-bool SelectOptimize::isSelectHighlyPredictable(SelectInst *SI) {
+bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
   uint64_t TrueWeight, FalseWeight;
   if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
     uint64_t Max = std::max(TrueWeight, FalseWeight);
@@ -355,6 +458,160 @@
   return false;
 }
 
+bool SelectOptimize::checkLoopHeuristics(CostInfo LoopCost[2]) {
+  // Loop-level checks to determine if a non-predicated version (with branches)
+  // of the loop is more profitable than its predicated version.
+  uint64_t Diff[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
+                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};
+
+  // Profitably converting to branches need to reduce the loop's critical path
+  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
+  // relative gain of 12.5%).
+  if (Diff[1] < GainCycleThreshold * ScalingUpFactor &&
+      Diff[1] * 8 < LoopCost[1].PredCost)
+    return false;
+
+  // If the loop's critical path involves loop-carried dependences, the gradient
+  // of the gain needs to be at least LoopGradientGain% (defaults to 25%).
+  if (Diff[1] > Diff[0] &&
+      (Diff[1] - Diff[0]) * 100 <
+          (LoopCost[1].PredCost - LoopCost[0].PredCost) * LoopGradientGain)
+    return false;
+
+  // Non-predicated version of the loop is more profitable than its
+  // predicated version.
+  return true;
+}
+
+bool SelectOptimize::computeLoopCosts(
+    const Loop *L, const SelectGroups &SIGroups,
+    DenseMap<const Instruction *, CostInfo> &InstCostMap,
+    CostInfo LoopCost[2]) {
+  const auto &SIset = getSIset(SIGroups);
+  // Compute instruction and loop-critical-path costs across two iterations for
+  // both predicated and non-predicated version.
+  const unsigned Iterations = 2;
+  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
+    // Cost of the loop's critical path.
+    CostInfo &MaxCost = LoopCost[Iter];
+    for (BasicBlock *BB : L->getBlocks()) {
+      for (const Instruction &I : *BB) {
+        if (I.isDebugOrPseudoInst())
+          continue;
+        // Compute the predicated and non-predicated cost of the instruction.
+        uint64_t IPredCost = 0, INonPredCost = 0;
+
+        // Assume infinite resources that allow to fully exploit the available
+        // instruction-level parallelism.
+        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
+        for (const Use &U : I.operands()) {
+          auto UI = dyn_cast<Instruction>(U.get());
+          if (!UI)
+            continue;
+          if (InstCostMap.count(UI)) {
+            IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
+            INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
+          }
+        }
+        auto ILatency = computeInstCost(&I);
+        if (!ILatency.hasValue())
+          return false;
+        IPredCost += ScalingUpFactor * ILatency.getValue();
+        INonPredCost += ScalingUpFactor * ILatency.getValue();
+
+        // For a select that can be converted to branch,
+        // compute its cost as a branch (non-predicated cost).
+        //
+        // BranchCost = PredictedPathCost + MispredictCost
+        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
+        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
+        if (SIset.contains(&I)) {
+          auto SI = dyn_cast<SelectInst>(&I);
+
+          uint64_t TrueOpCost = 0, FalseOpCost = 0;
+          if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
+            if (InstCostMap.count(TI))
+              TrueOpCost = InstCostMap[TI].NonPredCost;
+          if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
+            if (InstCostMap.count(FI))
+              FalseOpCost = InstCostMap[FI].NonPredCost;
+          uint64_t PredictedPathCost =
+              predictedPathCost(TrueOpCost, FalseOpCost, SI);
+
+          uint64_t CondCost = 0;
+          if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
+            if (InstCostMap.count(CI))
+              CondCost = InstCostMap[CI].NonPredCost;
+          uint64_t MispredictCost = getMispredictionCost(SI, CondCost);
+
+          INonPredCost = PredictedPathCost + MispredictCost;
+        }
+
+        InstCostMap[&I] = {IPredCost, INonPredCost};
+        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
+        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
+      }
+    }
+  }
+  return true;
+}
+
+SmallPtrSet<const Instruction *, 2>
+SelectOptimize::getSIset(const SelectGroups &SIGroups) {
+  SmallPtrSet<const Instruction *, 2> SIset;
+  for (const SelectGroup &ASI : SIGroups)
+    for (const SelectInst *SI : ASI)
+      SIset.insert(SI);
+  return SIset;
+}
+
+Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
+  InstructionCost ICost =
+      TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+  if (auto OC = ICost.getValue())
+    return Optional<uint64_t>(OC.getValue());
+  return Optional<uint64_t>(None);
+}
+
+uint64_t SelectOptimize::getMispredictionCost(const SelectInst *SI,
+                                              const uint64_t CondCost) {
+  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+
+  // Account for the default misprediction rate when using a branch
+  // (conservatively set to 25% by default).
+  uint64_t MispredictRate = MispredictDefaultRate;
+  // If the select condition is obviously predictable, then the misprediction
+  // rate is zero.
+  if (isSelectHighlyPredictable(SI))
+    MispredictRate = 0;
+
+  // CondCost is included to account for cases where the computation of the
+  // condition is part of a long dependence chain (potentially loop-carried)
+  // that would delay detection of a misprediction and increase its cost.
+  uint64_t MispredictCost = divideCeil(
+      std::max(ScalingUpFactor * MispredictPenalty, CondCost) * MispredictRate,
+      100);
+  return MispredictCost;
+}
+
+// Returns the cost of a branch when the prediction is correct.
+// TrueCost * TrueProbability + FalseCost * FalseProbability.
+uint64_t SelectOptimize::predictedPathCost(uint64_t TrueCost,
+                                           uint64_t FalseCost,
+                                           const SelectInst *SI) {
+  uint64_t TrueWeight, FalseWeight;
+  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+    uint64_t SumWeight = TrueWeight + FalseWeight;
+    if (SumWeight != 0)
+      return divideCeil(TrueCost * TrueWeight + FalseCost * FalseWeight,
+                        SumWeight);
+  }
+  // Without branch weight metadata, we assume 75% for the one path and 25% for
+  // the other, and pick the result with the biggest cost.
+  return std::max(divideCeil(TrueCost * 3 + FalseCost, 4),
+                  divideCeil(FalseCost * 3 + TrueCost, 4));
+}
+
 bool SelectOptimize::isExpensiveInst(Instruction *I) {
   return TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency) >=
          TargetTransformInfo::TCC_Expensive;