diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h b/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
--- a/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ScalarEvolutionNormalization.h"
 #include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/ValueHandle.h"
 
@@ -171,16 +172,21 @@
       ChainedPhis.clear();
     }
 
-    /// Return true for expressions that may incur non-trivial cost to evaluate
-    /// at runtime.
+    /// Return true for expressions that can't be evaluate at runtime
+    /// within given \b Budged.
     ///
     /// At is an optional parameter which specifies point in code where user is
     /// going to expand this expression. Sometimes this knowledge can lead to a
     /// more accurate cost estimation.
-    bool isHighCostExpansion(const SCEV *Expr, Loop *L,
+    bool isHighCostExpansion(const SCEV *Expr, Loop *L, unsigned Threshold,
+                             const TargetTransformInfo *TTI,
                              const Instruction *At = nullptr) {
       SmallPtrSet<const SCEV *, 8> Processed;
-      return isHighCostExpansionHelper(Expr, L, At, Processed);
+      int BudgetRemaining = Threshold * TargetTransformInfo::TCC_Basic;
+      if (isHighCostExpansionHelper(Expr, L, At, BudgetRemaining, TTI,
+                                    Processed))
+        return true;
+      return BudgetRemaining < 0;
     }
 
     /// This method returns the canonical induction variable of the specified
@@ -322,8 +328,11 @@
     LLVMContext &getContext() const { return SE.getContext(); }
 
     /// Recursive helper function for isHighCostExpansion.
+    /// Returns true to indicate that cost computation failed, and the cost is
+    /// likely high.
     bool isHighCostExpansionHelper(const SCEV *S, Loop *L,
-                                   const Instruction *At,
+                                   const Instruction *At, int &BudgetRemaining,
+                                   const TargetTransformInfo *TTI,
                                    SmallPtrSetImpl<const SCEV *> &Processed);
 
     /// Insert the specified binary operator, doing a small amount of work to
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -376,11 +376,15 @@
 /// If the final value of any expressions that are recurrent in the loop can
 /// be computed, substitute the exit values from the loop into any instructions
 /// outside of the loop that use the final values of the current expressions.
+/// If ReplaceExitValue is OnlyCheapRepl, the replacement is only done if the
+/// cost of replacement is within the budget ReplaceExitValueBudget.
 /// Return the number of loop exit values that have been replaced, and the
 /// corresponding phi node will be added to DeadInsts.
 int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
-                          ScalarEvolution *SE, SCEVExpander &Rewriter,
-                          DominatorTree *DT, ReplaceExitVal ReplaceExitValue,
+                          ScalarEvolution *SE, const TargetTransformInfo *TTI,
+                          SCEVExpander &Rewriter, DominatorTree *DT,
+                          ReplaceExitVal ReplaceExitValue,
+                          unsigned ReplaceExitValueBudget,
                           SmallVector<WeakTrackingVH, 16> &DeadInsts);
 
 /// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h b/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h
--- a/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyIndVar.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TRANSFORMS_UTILS_SIMPLIFYINDVAR_H
 #define LLVM_TRANSFORMS_UTILS_SIMPLIFYINDVAR_H
 
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/ValueHandle.h"
 
 namespace llvm {
@@ -46,13 +47,15 @@
 /// simplifyUsersOfIV - Simplify instructions that use this induction variable
 /// by using ScalarEvolution to analyze the IV's recurrence.
 bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
-                       LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead,
+                       LoopInfo *LI, const TargetTransformInfo *TTI,
+                       SmallVectorImpl<WeakTrackingVH> &Dead,
                        SCEVExpander &Rewriter, IVVisitor *V = nullptr);
 
 /// SimplifyLoopIVs - Simplify users of induction variables within this
 /// loop. This does not actually change or add IVs.
 bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
-                     LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead);
+                     LoopInfo *LI, const TargetTransformInfo *TTI,
+                     SmallVectorImpl<WeakTrackingVH> &Dead);
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -80,16 +80,17 @@
 
 LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                             ScalarEvolution *SE, DominatorTree *DT,
-                            AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
-                            bool PreserveLCSSA, Loop **RemainderLoop = nullptr);
+                            AssumptionCache *AC,
+                            const llvm::TargetTransformInfo *TTI,
+                            OptimizationRemarkEmitter *ORE, bool PreserveLCSSA,
+                            Loop **RemainderLoop = nullptr);
 
-bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
-                                bool AllowExpensiveTripCount,
-                                bool UseEpilogRemainder, bool UnrollRemainder,
-                                bool ForgetAllSCEV, LoopInfo *LI,
-                                ScalarEvolution *SE, DominatorTree *DT,
-                                AssumptionCache *AC, bool PreserveLCSSA,
-                                Loop **ResultLoop = nullptr);
+bool UnrollRuntimeLoopRemainder(
+    Loop *L, unsigned Count, bool AllowExpensiveTripCount,
+    bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
+    LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    const TargetTransformInfo *TTI, bool PreserveLCSSA,
+    Loop **ResultLoop = nullptr);
 
 void computePeelCount(Loop *L, unsigned LoopSize,
                       TargetTransformInfo::UnrollingPreferences &UP,
@@ -104,6 +105,7 @@
                                   unsigned TripMultiple, bool UnrollRemainder,
                                   LoopInfo *LI, ScalarEvolution *SE,
                                   DominatorTree *DT, AssumptionCache *AC,
+                                  const TargetTransformInfo *TTI,
                                   OptimizationRemarkEmitter *ORE,
                                   Loop **EpilogueLoop = nullptr);
 
@@ -121,7 +123,8 @@
 
 void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
                              ScalarEvolution *SE, DominatorTree *DT,
-                             AssumptionCache *AC);
+                             AssumptionCache *AC,
+                             const TargetTransformInfo *TTI);
 
 MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
 
diff --git a/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
--- a/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -2129,84 +2129,142 @@
 }
 
 bool SCEVExpander::isHighCostExpansionHelper(
-    const SCEV *S, Loop *L, const Instruction *At,
-    SmallPtrSetImpl<const SCEV *> &Processed) {
+    const SCEV *S, Loop *L, const Instruction *At, int &BudgetRemaining,
+    const TargetTransformInfo *TTI, SmallPtrSetImpl<const SCEV *> &Processed) {
+  if (BudgetRemaining < 0)
+    return true; // Already run out of budget, give up.
 
-  // If we can find an existing value for this scev available at the point "At"
-  // then consider the expression cheap.
+  // Was the cost of expansion of this expression already accounted for?
+  if (!Processed.insert(S).second)
+    return false; // We have already accounted for this expression.
+
+  // Can we find an existing value for this scev available at the point "At"?
   if (At && getRelatedExistingExpansion(S, At, L))
-    return false;
+    return false; // Consider the expression to be free.
 
-  // Zero/One operand expressions
   switch (S->getSCEVType()) {
   case scUnknown:
   case scConstant:
-    return false;
-  case scTruncate:
-    return isHighCostExpansionHelper(cast<SCEVTruncateExpr>(S)->getOperand(),
-                                     L, At, Processed);
-  case scZeroExtend:
-    return isHighCostExpansionHelper(cast<SCEVZeroExtendExpr>(S)->getOperand(),
-                                     L, At, Processed);
-  case scSignExtend:
-    return isHighCostExpansionHelper(cast<SCEVSignExtendExpr>(S)->getOperand(),
-                                     L, At, Processed);
+    return false; // Assume to be zero-cost.
   }
 
-  if (!Processed.insert(S).second)
-    return false;
+  // The rest of the logic is recursive!
+  if (!TTI)
+    return true; // No cost model - give up.
+
+  if (auto *CastExpr = dyn_cast<SCEVCastExpr>(S)) {
+    unsigned Opcode;
+    switch (S->getSCEVType()) {
+    case scTruncate:
+      Opcode = Instruction::Trunc;
+      break;
+    case scZeroExtend:
+      Opcode = Instruction::ZExt;
+      break;
+    case scSignExtend:
+      Opcode = Instruction::SExt;
+      break;
+    default:
+      llvm_unreachable("There are no other cast types.");
+    }
+    const SCEV *Op = CastExpr->getOperand();
+    BudgetRemaining -=
+        TTI->getOperationCost(Opcode, S->getType(), Op->getType());
+    return isHighCostExpansionHelper(Op, L, At, BudgetRemaining, TTI,
+                                     Processed);
+  }
 
   if (auto *UDivExpr = dyn_cast<SCEVUDivExpr>(S)) {
-    // If the divisor is a power of two and the SCEV type fits in a native
-    // integer (and the LHS not expensive), consider the division cheap
-    // irrespective of whether it occurs in the user code since it can be
-    // lowered into a right shift.
-    if (auto *SC = dyn_cast<SCEVConstant>(UDivExpr->getRHS()))
+    // If the divisor is a power of two and the LHS not expensive,
+    // consider the division as logical right-shift.
+    if (auto *SC = dyn_cast<SCEVConstant>(UDivExpr->getRHS())) {
       if (SC->getAPInt().isPowerOf2()) {
-        if (isHighCostExpansionHelper(UDivExpr->getLHS(), L, At, Processed))
+        if (isHighCostExpansionHelper(UDivExpr->getLHS(), L, At,
+                                      BudgetRemaining, TTI, Processed) ||
+            isHighCostExpansionHelper(UDivExpr->getRHS(), L, At,
+                                      BudgetRemaining, TTI, Processed))
           return true;
-        const DataLayout &DL =
-            L->getHeader()->getParent()->getParent()->getDataLayout();
-        unsigned Width = cast<IntegerType>(UDivExpr->getType())->getBitWidth();
-        return DL.isIllegalInteger(Width);
+        BudgetRemaining -=
+            TTI->getOperationCost(Instruction::LShr, S->getType());
+        return BudgetRemaining < 0;
       }
+    }
 
     // UDivExpr is very likely a UDiv that ScalarEvolution's HowFarToZero or
     // HowManyLessThans produced to compute a precise expression, rather than a
     // UDiv from the user's code. If we can't find a UDiv in the code with some
-    // simple searching, assume the former consider UDivExpr expensive to
-    // compute.
+    // simple searching, we need to account for it's cost.
     BasicBlock *ExitingBB = L->getExitingBlock();
-    if (!ExitingBB)
-      return true;
+    if (At || ExitingBB) {
+      if (!At)
+        At = &ExitingBB->back();
 
-    // At the beginning of this function we already tried to find existing value
-    // for plain 'S'. Now try to lookup 'S + 1' since it is common pattern
-    // involving division. This is just a simple search heuristic.
-    if (!At)
-      At = &ExitingBB->back();
-    if (!getRelatedExistingExpansion(
-            SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), At, L))
+      // At the beginning of this function we already tried to find existing
+      // value for plain 'S'. Now try to lookup 'S + 1' since it is common
+      // pattern involving division. This is just a simple search heuristic.
+      if (getRelatedExistingExpansion(
+              SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), At, L))
+        return false;
+    }
+
+    // Need to count the cost of this UDiv.
+    if (isHighCostExpansionHelper(UDivExpr->getLHS(), L, At, BudgetRemaining,
+                                  TTI, Processed) ||
+        isHighCostExpansionHelper(UDivExpr->getRHS(), L, At, BudgetRemaining,
+                                  TTI, Processed))
       return true;
+    BudgetRemaining -= TTI->getOperationCost(Instruction::UDiv, S->getType());
+    return BudgetRemaining < 0;
   }
 
-  // HowManyLessThans uses a Max expression whenever the loop is not guarded by
-  // the exit condition.
-  if (isa<SCEVMinMaxExpr>(S))
-    return true;
-
   // Recurse past nary expressions, which commonly occur in the
   // BackedgeTakenCount. They may already exist in program code, and if not,
   // they are not too expensive rematerialize.
   if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(S)) {
-    for (auto *Op : NAry->operands())
-      if (isHighCostExpansionHelper(Op, L, At, Processed))
+    Type *OpType = NAry->getType();
+
+    int PairCost;
+    int CostIncreasePerStep = 0;
+    switch (S->getSCEVType()) {
+    case scAddExpr:
+      PairCost = TTI->getOperationCost(Instruction::Add, OpType);
+      break;
+    case scMulExpr:
+      PairCost = TTI->getOperationCost(Instruction::Mul, OpType);
+      break;
+    case scSMaxExpr:
+    case scUMaxExpr:
+    case scSMinExpr:
+    case scUMinExpr:
+      PairCost = TTI->getOperationCost(Instruction::ICmp, OpType) +
+                 TTI->getOperationCost(Instruction::Select, OpType);
+      break;
+    case scAddRecExpr:
+      PairCost = TTI->getOperationCost(Instruction::Mul, OpType) +
+                 TTI->getOperationCost(Instruction::Add, OpType);
+      CostIncreasePerStep = TTI->getOperationCost(Instruction::Mul, OpType);
+      break;
+    default:
+      llvm_unreachable("There are no other Nary expressions.");
+    }
+
+    assert(NAry->getNumOperands() > 1 &&
+           "Nary expr should have more than 1 operand.");
+    for (auto *Op : NAry->operands()) {
+      if (isHighCostExpansionHelper(Op, L, At, BudgetRemaining, TTI, Processed))
         return true;
+      if (Op == *NAry->op_begin())
+        continue;
+      BudgetRemaining -= PairCost;
+      if (BudgetRemaining < 0)
+        return true;
+      PairCost += CostIncreasePerStep;
+    }
+
+    return BudgetRemaining < 0;
   }
 
-  // If we haven't recognized an expensive SCEV pattern, assume it's an
-  // expression produced by program code.
-  return false;
+  llvm_unreachable("No other scev expressions possible.");
 }
 
 Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -156,10 +156,11 @@
   SmallVector<WeakTrackingVH, 16> DeadInsts;
   SCEVExpander Rewriter(*SE, *DL, "mvetp");
   ReplaceExitVal ReplaceExitValue = AlwaysRepl;
+  unsigned ReplaceExitValueBudget = ~0; // Irrelevant, because AlwaysRepl.
 
   formLCSSARecursively(*L, *DT, LI, SE);
-  rewriteLoopExitValues(L, LI, TLI, SE, Rewriter, DT, ReplaceExitValue,
-                        DeadInsts);
+  rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue,
+                        ReplaceExitValueBudget, DeadInsts);
 }
 
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -116,6 +116,12 @@
                clEnumValN(AlwaysRepl, "always",
                           "always replace exit value whenever possible")));
 
+static cl::opt<unsigned> ReplaceExitValueBudget(
+    "replace-exit-value-budget", cl::Hidden, cl::init(4),
+    cl::desc(
+        "Control the maximal total instruction cost that we are willing to "
+        "emit to replace the exit value in IndVarSimplify (default = 4)"));
+
 static cl::opt<bool> UsePostIncrementRanges(
   "indvars-post-increment-ranges", cl::Hidden,
   cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
@@ -1657,8 +1663,8 @@
       // Information about sign/zero extensions of CurrIV.
       IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
 
-      Changed |=
-          simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, Rewriter, &Visitor);
+      Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
+                                   &Visitor);
 
       if (Visitor.WI.WidestNativeType) {
         WideIVs.push_back(Visitor.WI);
@@ -2691,8 +2697,9 @@
   // loop into any instructions outside of the loop that use the final values
   // of the current expressions.
   if (ReplaceExitValue != NeverRepl) {
-    if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, Rewriter, DT,
-                                             ReplaceExitValue, DeadInsts)) {
+    if (int Rewrites = rewriteLoopExitValues(
+            L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue,
+            ReplaceExitValueBudget, DeadInsts)) {
       NumReplaced += Rewrites;
       Changed = true;
     }
@@ -2750,9 +2757,9 @@
       if (!IndVar)
         continue;
 
-      // Avoid high cost expansions.  Note: This heuristic is questionable in
-      // that our definition of "high cost" is not exactly principled.
-      if (Rewriter.isHighCostExpansion(ExitCount, L))
+      // Avoid high cost expansions.
+      if (Rewriter.isHighCostExpansion(ExitCount, L, ReplaceExitValueBudget,
+                                       TTI))
         continue;
 
       // Check preconditions for proper SCEVExpander operation. SCEV does not
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -388,7 +388,7 @@
   Loop *EpilogueOuterLoop = nullptr;
   LoopUnrollResult UnrollResult = UnrollAndJamLoop(
       L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI,
-      &SE, &DT, &AC, &ORE, &EpilogueOuterLoop);
+      &SE, &DT, &AC, &TTI, &ORE, &EpilogueOuterLoop);
 
   // Assign new loop attributes.
   if (EpilogueOuterLoop) {
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1148,7 +1148,7 @@
       {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
        UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
        ForgetAllSCEV},
-      LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop);
+      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
   if (UnrollResult == LoopUnrollResult::Unmodified)
     return LoopUnrollResult::Unmodified;
 
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -166,11 +166,12 @@
 /// simplify/dce pass of the instructions.
 void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
                                    ScalarEvolution *SE, DominatorTree *DT,
-                                   AssumptionCache *AC) {
+                                   AssumptionCache *AC,
+                                   const TargetTransformInfo *TTI) {
   // Simplify any new induction variables in the partially unrolled loop.
   if (SE && SimplifyIVs) {
     SmallVector<WeakTrackingVH, 16> DeadInsts;
-    simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
+    simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);
 
     // Aggressively clean up dead instructions that simplifyLoopIVs already
     // identified. Any remaining should be cleaned up below.
@@ -245,6 +246,7 @@
 LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                                   ScalarEvolution *SE, DominatorTree *DT,
                                   AssumptionCache *AC,
+                                  const TargetTransformInfo *TTI,
                                   OptimizationRemarkEmitter *ORE,
                                   bool PreserveLCSSA, Loop **RemainderLoop) {
 
@@ -403,7 +405,7 @@
   if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
       !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
                                   EpilogProfitability, ULO.UnrollRemainder,
-                                  ULO.ForgetAllSCEV, LI, SE, DT, AC,
+                                  ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
                                   PreserveLCSSA, RemainderLoop)) {
     if (ULO.Force)
       RuntimeTripCount = false;
@@ -865,7 +867,7 @@
   // At this point, the code is well formed.  We now simplify the unrolled loop,
   // doing constant propagation and dead code elimination as we go.
   simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
-                          SE, DT, AC);
+                          SE, DT, AC, TTI);
 
   NumCompletelyUnrolled += CompletelyUnroll;
   ++NumUnrolled;
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -169,10 +169,12 @@
   If EpilogueLoop is non-null, it receives the epilogue loop (if it was
   necessary to create one and not fully unrolled).
 */
-LoopUnrollResult llvm::UnrollAndJamLoop(
-    Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
-    bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
-    AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+                       unsigned TripMultiple, bool UnrollRemainder,
+                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+                       AssumptionCache *AC, const TargetTransformInfo *TTI,
+                       OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
 
   // When we enter here we should have already checked that it is safe
   BasicBlock *Header = L->getHeader();
@@ -198,7 +200,7 @@
     if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                     /*UseEpilogRemainder*/ true,
                                     UnrollRemainder, /*ForgetAllSCEV*/ false,
-                                    LI, SE, DT, AC, true, EpilogueLoop)) {
+                                    LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                            "generated when assuming runtime trip count\n");
       return LoopUnrollResult::Unmodified;
@@ -562,8 +564,9 @@
   // At this point, the code is well formed.  We now do a quick sweep over the
   // inserted code, doing constant propagation and dead code elimination as we
   // go.
-  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
-  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
+  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
+  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
+                          TTI);
 
   NumCompletelyUnrolledAndJammed += CompletelyUnroll;
   ++NumUnrolledAndJammed;
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -51,6 +51,12 @@
     cl::desc("Allow runtime unrolling for loops with multiple exits, when "
              "epilog is generated"));
 
+static cl::opt<unsigned> RuntimeUnrollTripCountBudget(
+    "unroll-runtime-exit-value-budget", cl::Hidden, cl::init(4),
+    cl::desc(
+        "Control the maximal total instruction cost that we are willing to "
+        "emit to for trip count value in LoopUnrollRuntime (default = 4)"));
+
 /// Connect the unrolling prolog code to the original loop.
 /// The unrolling prolog code contains code to execute the
 /// 'extra' iterations if the run-time trip count modulo the
@@ -543,13 +549,11 @@
 ///        if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
 /// EpilExit:
 
-bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
-                                      bool AllowExpensiveTripCount,
-                                      bool UseEpilogRemainder,
-                                      bool UnrollRemainder, bool ForgetAllSCEV,
-                                      LoopInfo *LI, ScalarEvolution *SE,
-                                      DominatorTree *DT, AssumptionCache *AC,
-                                      bool PreserveLCSSA, Loop **ResultLoop) {
+bool llvm::UnrollRuntimeLoopRemainder(
+    Loop *L, unsigned Count, bool AllowExpensiveTripCount,
+    bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
+    LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
   LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   LLVM_DEBUG(L->dump());
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -637,7 +641,8 @@
   const DataLayout &DL = Header->getModule()->getDataLayout();
   SCEVExpander Expander(*SE, DL, "loop-unroll");
   if (!AllowExpensiveTripCount &&
-      Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
+      Expander.isHighCostExpansion(TripCountSC, L, RuntimeUnrollTripCountBudget,
+                                   TTI, PreHeaderBR)) {
     LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
     return false;
   }
@@ -949,7 +954,7 @@
                     /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
                     /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
                     /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
-                   LI, SE, DT, AC, /*ORE*/ nullptr, PreserveLCSSA);
+                   LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
   }
 
   if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1262,10 +1262,13 @@
   return true;
 }
 
-int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI,
-    TargetLibraryInfo *TLI, ScalarEvolution *SE, SCEVExpander &Rewriter,
-    DominatorTree *DT, ReplaceExitVal ReplaceExitValue,
-    SmallVector<WeakTrackingVH, 16> &DeadInsts) {
+int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
+                                ScalarEvolution *SE,
+                                const TargetTransformInfo *TTI,
+                                SCEVExpander &Rewriter, DominatorTree *DT,
+                                ReplaceExitVal ReplaceExitValue,
+                                unsigned ReplaceExitValueBudget,
+                                SmallVector<WeakTrackingVH, 16> &DeadInsts) {
   // Check a pre-condition.
   assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
          "Indvars did not preserve LCSSA!");
@@ -1347,14 +1350,16 @@
         // Computing the value outside of the loop brings no benefit if it is
         // definitely used inside the loop in a way which can not be optimized
         // away. Avoid doing so unless we know we have a value which computes
-        // the ExitValue already. TODO: This should be merged into SCEV
-        // expander to leverage its knowledge of existing expressions.
-        if (ReplaceExitValue != AlwaysRepl &&
-            !isa<SCEVConstant>(ExitValue) && !isa<SCEVUnknown>(ExitValue) &&
-            hasHardUserWithinLoop(L, Inst))
+        // the ExitValue already, unless the cost of the expansion is deemed
+        // to not be high. TODO: This should be merged into SCEV expander
+        // to leverage its knowledge of existing expressions.
+        bool HighCost = Rewriter.isHighCostExpansion(
+            ExitValue, L, ReplaceExitValueBudget, TTI, Inst);
+        if (ReplaceExitValue != AlwaysRepl && !isa<SCEVConstant>(ExitValue) &&
+            !isa<SCEVUnknown>(ExitValue) && hasHardUserWithinLoop(L, Inst) &&
+            HighCost)
           continue;
 
-        bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
         Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
 
         LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = "
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -24,6 +24,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -44,6 +45,12 @@
     "Number of IV signed remainder operations converted to unsigned remainder");
 STATISTIC(NumElimCmp     , "Number of IV comparisons eliminated");
 
+static cl::opt<unsigned> ReplaceIWUserWithLoopInvariantBudget(
+    "replace-iv-user-with-loop-invariant-budget", cl::Hidden, cl::init(4),
+    cl::desc("Control the maximal total instruction cost that we are willing "
+             "to emit to replace the IW user in SimplifyIndvar with "
+             "loop-invariant (default = 4)"));
+
 namespace {
   /// This is a utility for simplifying induction variables
   /// based on ScalarEvolution. It is the primary instrument of the
@@ -54,6 +61,7 @@
     LoopInfo         *LI;
     ScalarEvolution  *SE;
     DominatorTree    *DT;
+    const TargetTransformInfo *TTI;
     SCEVExpander     &Rewriter;
     SmallVectorImpl<WeakTrackingVH> &DeadInsts;
 
@@ -61,10 +69,11 @@
 
   public:
     SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
-                   LoopInfo *LI, SCEVExpander &Rewriter,
+                   LoopInfo *LI, const TargetTransformInfo *TTI,
+                   SCEVExpander &Rewriter,
                    SmallVectorImpl<WeakTrackingVH> &Dead)
-        : L(Loop), LI(LI), SE(SE), DT(DT), Rewriter(Rewriter), DeadInsts(Dead),
-          Changed(false) {
+        : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter),
+          DeadInsts(Dead), Changed(false) {
       assert(LI && "IV simplification requires LoopInfo");
     }
 
@@ -667,7 +676,8 @@
     return false;
 
   // Do not generate something ridiculous even if S is loop invariant.
-  if (Rewriter.isHighCostExpansion(S, L, I))
+  if (Rewriter.isHighCostExpansion(S, L, ReplaceIWUserWithLoopInvariantBudget,
+                                   TTI, I))
     return false;
 
   auto *IP = GetLoopInvariantInsertPosition(L, I);
@@ -931,10 +941,11 @@
 /// Simplify instructions that use this induction variable
 /// by using ScalarEvolution to analyze the IV's recurrence.
 bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
-                       LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead,
+                       LoopInfo *LI, const TargetTransformInfo *TTI,
+                       SmallVectorImpl<WeakTrackingVH> &Dead,
                        SCEVExpander &Rewriter, IVVisitor *V) {
-  SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Rewriter,
-                     Dead);
+  SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, TTI,
+                     Rewriter, Dead);
   SIV.simplifyUsers(CurrIV, V);
   return SIV.hasChanged();
 }
@@ -942,14 +953,16 @@
 /// Simplify users of induction variables within this
 /// loop. This does not actually change or add IVs.
 bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
-                     LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead) {
+                     LoopInfo *LI, const TargetTransformInfo *TTI,
+                     SmallVectorImpl<WeakTrackingVH> &Dead) {
   SCEVExpander Rewriter(*SE, SE->getDataLayout(), "indvars");
 #ifndef NDEBUG
   Rewriter.setDebugType(DEBUG_TYPE);
 #endif
   bool Changed = false;
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
-    Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead, Rewriter);
+    Changed |=
+        simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, TTI, Dead, Rewriter);
   }
   return Changed;
 }
diff --git a/llvm/test/Transforms/IndVarSimplify/dont-recompute.ll b/llvm/test/Transforms/IndVarSimplify/dont-recompute.ll
--- a/llvm/test/Transforms/IndVarSimplify/dont-recompute.ll
+++ b/llvm/test/Transforms/IndVarSimplify/dont-recompute.ll
@@ -35,8 +35,8 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 186
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    tail call void @func(i32 [[ADD_LCSSA]])
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[M]], 186
+; CHECK-NEXT:    tail call void @func(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -69,8 +69,8 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 186
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[M]], 186
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   br label %for.body
@@ -101,8 +101,8 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 186
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    tail call void @func(i32 [[ADD_LCSSA]])
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[M]], 186
+; CHECK-NEXT:    tail call void @func(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -141,8 +141,8 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 186
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SOFT_USE:%.*]] = add i32 [[ADD_LCSSA]], 123
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[M]], 186
+; CHECK-NEXT:    [[SOFT_USE:%.*]] = add i32 [[TMP0]], 123
 ; CHECK-NEXT:    tail call void @func(i32 [[SOFT_USE]])
 ; CHECK-NEXT:    ret void
 ;
@@ -178,8 +178,8 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 186
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    tail call void @func(i32 [[ADD_LCSSA]])
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[M]], 186
+; CHECK-NEXT:    tail call void @func(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -215,8 +215,8 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 186
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT:    tail call void @func(i32 [[ADD_LCSSA]])
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[M]], 186
+; CHECK-NEXT:    tail call void @func(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/IndVarSimplify/elim-extend.ll b/llvm/test/Transforms/IndVarSimplify/elim-extend.ll
--- a/llvm/test/Transforms/IndVarSimplify/elim-extend.ll
+++ b/llvm/test/Transforms/IndVarSimplify/elim-extend.ll
@@ -8,7 +8,10 @@
 define void @postincConstIV(i8* %base, i32 %limit) nounwind {
 ; CHECK-LABEL: @postincConstIV(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[LIMIT:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[LIMIT:%.*]], 0
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[LIMIT]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
@@ -19,8 +22,8 @@
 ; CHECK-NEXT:    store i8 0, i8* [[POSTADR]]
 ; CHECK-NEXT:    [[POSTADRNSW:%.*]] = getelementptr inbounds i8, i8* [[BASE]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    store i8 0, i8* [[POSTADRNSW]]
-; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i64 [[TMP0]], [[INDVARS_IV]]
-; CHECK-NEXT:    br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    br label [[RETURN:%.*]]
 ; CHECK:       return:
@@ -113,7 +116,9 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LIMITDEC:%.*]] = add i32 [[LIMIT:%.*]], -1
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[LIMITDEC]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[LIMIT]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[LIMIT]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP1]], i32 [[LIMIT]], i32 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[OUTERLOOP:%.*]]
 ; CHECK:       outerloop:
 ; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[OUTERMERGE:%.*]] ], [ 0, [[ENTRY:%.*]] ]
@@ -135,11 +140,10 @@
 ; CHECK-NEXT:    store i8 0, i8* [[ADR2]]
 ; CHECK-NEXT:    [[ADR3:%.*]] = getelementptr i8, i8* [[ADDRESS]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    store i8 0, i8* [[ADR3]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[INNERLOOP]], label [[INNEREXIT:%.*]]
+; CHECK-NEXT:    [[INNERCMP:%.*]] = icmp sgt i64 [[TMP0]], [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    br i1 [[INNERCMP]], label [[INNERLOOP]], label [[INNEREXIT:%.*]]
 ; CHECK:       innerexit:
-; CHECK-NEXT:    [[INNERCOUNT_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV_NEXT]], [[INNERLOOP]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INNERCOUNT_LCSSA_WIDE]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP0]] to i32
 ; CHECK-NEXT:    br label [[OUTERMERGE]]
 ; CHECK:       outermerge:
 ; CHECK-NEXT:    [[INNERCOUNT_MERGE]] = phi i32 [ [[TMP4]], [[INNEREXIT]] ], [ [[INNERCOUNT]], [[INNERPREHEADER]] ]
@@ -149,8 +153,8 @@
 ; CHECK-NEXT:    [[ADR5:%.*]] = getelementptr i8, i8* [[ADDRESS]], i64 [[OFS5]]
 ; CHECK-NEXT:    store i8 0, i8* [[ADR5]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], 1
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT2]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP47]], label [[OUTERLOOP]], label [[RETURN:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT2]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[OUTERLOOP]], label [[RETURN:%.*]]
 ; CHECK:       return:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll
--- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll
+++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll
@@ -529,16 +529,19 @@
 ; CHECK-NEXT:    [[ENTRY_COND:%.*]] = and i1 [[ENTRY_COND_0]], [[ENTRY_COND_1]]
 ; CHECK-NEXT:    br i1 [[ENTRY_COND]], label [[LOOP_PREHEADER:%.*]], label [[LEAVE:%.*]]
 ; CHECK:       loop.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[LEN]], 0
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[LEN]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SMAX]], -5
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[IV_2_INC:%.*]], [[BE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_INC:%.*]], [[BE:%.*]] ], [ -6, [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    call void @side_effect()
-; CHECK-NEXT:    [[IV_2_INC]] = add nuw i32 [[IV_2]], 1
+; CHECK-NEXT:    [[IV_INC]] = add nsw i32 [[IV]], 1
 ; CHECK-NEXT:    br i1 true, label [[BE]], label [[LEAVE_LOOPEXIT:%.*]]
 ; CHECK:       be:
 ; CHECK-NEXT:    call void @side_effect()
-; CHECK-NEXT:    [[BE_COND:%.*]] = icmp slt i32 [[IV_2]], [[LEN]]
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LEAVE_LOOPEXIT]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[IV_INC]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LEAVE_LOOPEXIT]]
 ; CHECK:       leave.loopexit:
 ; CHECK-NEXT:    br label [[LEAVE]]
 ; CHECK:       leave:
@@ -685,6 +688,8 @@
 ; CHECK-NEXT:    [[LENGTH_IS_NONZERO:%.*]] = icmp ne i32 [[LENGTH]], 0
 ; CHECK-NEXT:    br i1 [[LENGTH_IS_NONZERO]], label [[LOOP_PREHEADER:%.*]], label [[LEAVE:%.*]]
 ; CHECK:       loop.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[LENGTH]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[LENGTH]], i32 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_INC:%.*]], [[BE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
@@ -693,8 +698,8 @@
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BE]], label [[LEAVE_LOOPEXIT:%.*]]
 ; CHECK:       be:
 ; CHECK-NEXT:    call void @side_effect()
-; CHECK-NEXT:    [[BE_COND:%.*]] = icmp slt i32 [[IV_INC]], [[LENGTH]]
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LEAVE_LOOPEXIT]]
+; CHECK-NEXT:    [[EXITCOND1:%.*]] = icmp ne i32 [[IV_INC]], [[SMAX]]
+; CHECK-NEXT:    br i1 [[EXITCOND1]], label [[LOOP]], label [[LEAVE_LOOPEXIT]]
 ; CHECK:       leave.loopexit:
 ; CHECK-NEXT:    br label [[LEAVE]]
 ; CHECK:       leave:
diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-trunc.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-trunc.ll
--- a/llvm/test/Transforms/IndVarSimplify/eliminate-trunc.ll
+++ b/llvm/test/Transforms/IndVarSimplify/eliminate-trunc.ll
@@ -36,13 +36,16 @@
 ;
 ; CHECK-LABEL: @test_01(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SEXT:%.*]] = sext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt i64 [[IV]], [[SEXT]]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -63,13 +66,16 @@
 ;
 ; CHECK-LABEL: @test_02(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SEXT:%.*]] = sext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 2147483646
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 2147483646
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 2147483646, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt i64 [[IV]], [[SEXT]]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -113,13 +119,16 @@
 ;
 ; CHECK-LABEL: @test_04(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SEXT:%.*]] = sext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], -2147483647
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 -2147483647
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SMAX]], 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ -2147483647, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt i64 [[IV]], [[SEXT]]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -243,13 +252,16 @@
 define void @test_02_unsigned(i32 %n) {
 ; CHECK-LABEL: @test_02_unsigned(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], -2
+; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 -2
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[UMAX]], 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 4294967294, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[IV]], [[ZEXT]]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -318,13 +330,16 @@
 define void @test_05_unsigned(i32 %n) {
 ; CHECK-LABEL: @test_05_unsigned(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], 1
+; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[UMAX]], 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[IV]], [[ZEXT]]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -366,14 +381,18 @@
 define void @test_07(i32* %p, i32 %n) {
 ; CHECK-LABEL: @test_07(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[NARROW_IV:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-NEXT:    store i32 [[NARROW_IV]], i32* [[P:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[NARROW_IV]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -451,15 +470,17 @@
 define void @test_10(i32 %n) {
 ; CHECK-LABEL: @test_10(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SEXT:%.*]] = sext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], 100
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 90
+; CHECK-NEXT:    [[UMIN:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 90
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[UMIN]], -99
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ -100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne i64 [[IV]], [[SEXT]]
-; CHECK-NEXT:    [[NEGCMP:%.*]] = icmp slt i64 [[IV]], -10
-; CHECK-NEXT:    [[CMP:%.*]] = and i1 [[TMP0]], [[NEGCMP]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -530,13 +551,15 @@
 ; CHECK-LABEL: @test_12(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[N:%.*]] = load i32, i32* [[P:%.*]], !range !0
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[IV_NEXT]], [[ZEXT]]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/full_widening.ll b/llvm/test/Transforms/IndVarSimplify/full_widening.ll
--- a/llvm/test/Transforms/IndVarSimplify/full_widening.ll
+++ b/llvm/test/Transforms/IndVarSimplify/full_widening.ll
@@ -7,7 +7,9 @@
 define i32 @test_01(double* %p, double %x, i32* %np, i32* %mp, i32 %k) {
 ; CHECK-LABEL: @test_01(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[K:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[K:%.*]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[K]], i32 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV_WIDE:%.*]] = phi i64 [ [[CANONICAL_IV_NEXT_I:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
@@ -17,8 +19,8 @@
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[X:%.*]], [[LOAD]]
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds double, double* [[P]], i64 [[IV_WIDE]]
 ; CHECK-NEXT:    store atomic double [[MUL]], double* [[GEP2]] unordered, align 8
-; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i64 [[CANONICAL_IV_NEXT_I]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[CANONICAL_IV_NEXT_I]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/iv-widen.ll b/llvm/test/Transforms/IndVarSimplify/iv-widen.ll
--- a/llvm/test/Transforms/IndVarSimplify/iv-widen.ll
+++ b/llvm/test/Transforms/IndVarSimplify/iv-widen.ll
@@ -122,15 +122,17 @@
 ; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 [[LIM:%.*]], 0
 ; CHECK-NEXT:    br i1 [[ENTRY_COND]], label [[LOOP_PREHEADER:%.*]], label [[LEAVE:%.*]]
 ; CHECK:       loop.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LIM]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i32 [[LIM]], 2
+; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[LIM]], i32 2
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[UMAX]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], -1
 ; CHECK-NEXT:    call void @dummy.i64(i64 [[TMP1]])
-; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LEAVE_LOOPEXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LEAVE_LOOPEXIT:%.*]]
 ; CHECK:       leave.loopexit:
 ; CHECK-NEXT:    br label [[LEAVE]]
 ; CHECK:       leave:
@@ -165,7 +167,9 @@
 ; CHECK-NEXT:    [[BC0:%.*]] = bitcast i32* [[LINED:%.*]] to i8*
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[SIZE]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[HSIZE:%.*]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[NSTEPS:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[NSTEPS:%.*]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP2]], i32 [[NSTEPS]], i32 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT11:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
@@ -200,8 +204,8 @@
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT8]], [[TMP2]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND12:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT8]], [[WIDE_TRIP_COUNT11]]
+; CHECK-NEXT:    br i1 [[EXITCOND12]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll b/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll
--- a/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll
+++ b/llvm/test/Transforms/IndVarSimplify/lftr-multi-exit.ll
@@ -128,18 +128,18 @@
 define void @compound_early_exit(i32 %n, i32 %m) {
 ; CHECK-LABEL: @compound_early_exit(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[M:%.*]], [[N:%.*]]
+; CHECK-NEXT:    [[UMIN:%.*]] = select i1 [[TMP0]], i32 [[M]], i32 [[N]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = icmp ult i32 [[IV]], [[N:%.*]]
-; CHECK-NEXT:    [[EARLYCND2:%.*]] = icmp ult i32 [[IV]], [[M:%.*]]
-; CHECK-NEXT:    [[AND:%.*]] = and i1 [[EARLYCND]], [[EARLYCND2]]
-; CHECK-NEXT:    br i1 [[AND]], label [[LATCH]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[IV]], [[UMIN]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LATCH]], label [[EXIT:%.*]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    store volatile i32 [[IV]], i32* @A
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT]]
+; CHECK-NEXT:    [[EXITCOND1:%.*]] = icmp ne i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND1]], label [[LOOP]], label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll
--- a/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll
+++ b/llvm/test/Transforms/IndVarSimplify/lftr-reuse.ll
@@ -187,13 +187,15 @@
 ;
 ; CHECK-LABEL: @unguardedloop(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[IROW:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[IROW:%.*]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[IROW]], i32 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1
-; CHECK-NEXT:    [[CMP196:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT3]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[CMP196]], label [[LOOP]], label [[RETURN:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[RETURN:%.*]]
 ; CHECK:       return:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/loop-invariant-conditions.ll b/llvm/test/Transforms/IndVarSimplify/loop-invariant-conditions.ll
--- a/llvm/test/Transforms/IndVarSimplify/loop-invariant-conditions.ll
+++ b/llvm/test/Transforms/IndVarSimplify/loop-invariant-conditions.ll
@@ -311,12 +311,15 @@
 define void @test3_neg(i64 %start) {
 ; CHECK-LABEL: @test3_neg(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[START:%.*]], -1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[START]], i64 -1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[SMAX]], 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], -1
-; CHECK-NEXT:    br i1 [[CMP1]], label [[LOOP]], label [[FOR_END:%.*]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[START]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -336,16 +339,19 @@
 define void @test4_neg(i64 %start) {
 ; CHECK-LABEL: @test4_neg(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[START:%.*]], 0
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[START]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[SMAX]], 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[START]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 25
 ; CHECK-NEXT:    br i1 [[CMP]], label [[BACKEDGE]], label [[FOR_END:%.*]]
 ; CHECK:       backedge:
 ; CHECK-NEXT:    call void @foo()
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i64 [[INDVARS_IV]], -1
-; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_END]], label [[LOOP]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[LOOP]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IndVarSimplify/lrev-existing-umin.ll b/llvm/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
--- a/llvm/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
+++ b/llvm/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
@@ -26,8 +26,7 @@
 ; CHECK-NEXT:    [[TMP23:%.*]] = icmp slt i32 [[TMP22]], [[TMP14]]
 ; CHECK-NEXT:    br i1 [[TMP23]], label [[NOT_ZERO11]], label [[MAIN_EXIT_SELECTOR:%.*]]
 ; CHECK:       main.exit.selector:
-; CHECK-NEXT:    [[TMP22_LCSSA:%.*]] = phi i32 [ [[TMP22]], [[NOT_ZERO11]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp slt i32 [[TMP22_LCSSA]], [[LENGTH_I]]
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp slt i32 [[TMP14]], [[LENGTH_I]]
 ; CHECK-NEXT:    br i1 [[TMP24]], label [[NOT_ZERO11_POSTLOOP]], label [[LEAVE:%.*]]
 ; CHECK:       leave:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/IndVarSimplify/pr28705.ll b/llvm/test/Transforms/IndVarSimplify/pr28705.ll
--- a/llvm/test/Transforms/IndVarSimplify/pr28705.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr28705.ll
@@ -16,14 +16,14 @@
 ; CHECK:       for.body650.lr.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY650:%.*]]
 ; CHECK:       loopexit:
-; CHECK-NEXT:    [[INC_I_I_LCSSA:%.*]] = phi i32 [ [[INC_I_I:%.*]], [[FOR_BODY650]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[DOTSROA_SPECULATED]], 1
 ; CHECK-NEXT:    br label [[XZ_EXIT]]
 ; CHECK:       XZ.exit:
-; CHECK-NEXT:    [[DB_SROA_9_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC_I_I_LCSSA]], [[LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[DB_SROA_9_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP0]], [[LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    br label [[END:%.*]]
 ; CHECK:       for.body650:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[FOR_BODY650_LR_PH]] ], [ [[INC655:%.*]], [[FOR_BODY650]] ]
-; CHECK-NEXT:    [[IV2:%.*]] = phi i32 [ 1, [[FOR_BODY650_LR_PH]] ], [ [[INC_I_I]], [[FOR_BODY650]] ]
+; CHECK-NEXT:    [[IV2:%.*]] = phi i32 [ 1, [[FOR_BODY650_LR_PH]] ], [ [[INC_I_I:%.*]], [[FOR_BODY650]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_I_I1105:%.*]] = getelementptr inbounds i8, i8* [[REF_I1174:%.*]], i32 [[IV2]]
 ; CHECK-NEXT:    store i8 7, i8* [[ARRAYIDX_I_I1105]], align 1
 ; CHECK-NEXT:    [[INC_I_I]] = add nuw nsw i32 [[IV2]], 1
diff --git a/llvm/test/Transforms/IndVarSimplify/pr39673.ll b/llvm/test/Transforms/IndVarSimplify/pr39673.ll
--- a/llvm/test/Transforms/IndVarSimplify/pr39673.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pr39673.ll
@@ -72,8 +72,8 @@
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[L2_ADD]], 2
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOP2]], label [[LOOP2_END:%.*]]
 ; CHECK:       loop2.end:
-; CHECK-NEXT:    [[K2_ADD_LCSSA:%.*]] = phi i16 [ [[K2_ADD]], [[LOOP2]] ]
-; CHECK-NEXT:    ret i16 [[K2_ADD_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[ARG2]], 2
+; CHECK-NEXT:    ret i16 [[TMP0]]
 ;
 entry:
   br label %loop1
@@ -121,8 +121,8 @@
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[L2_ADD]], 2
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOP2]], label [[LOOP2_END:%.*]]
 ; CHECK:       loop2.end:
-; CHECK-NEXT:    [[K2_ADD_LCSSA:%.*]] = phi i16 [ [[K2_ADD]], [[LOOP2]] ]
-; CHECK-NEXT:    ret i16 [[K2_ADD_LCSSA]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[DUMMY]], 2
+; CHECK-NEXT:    ret i16 [[TMP0]]
 ;
 entry:
   br label %loop2.preheader
@@ -166,8 +166,8 @@
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[L2_ADD]], 2
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOP2]], label [[LOOP2_END:%.*]]
 ; CHECK:       loop2.end:
-; CHECK-NEXT:    [[K2_ADD_LCSSA:%.*]] = phi i16 [ [[K2_ADD]], [[LOOP2]] ]
-; CHECK-NEXT:    ret i16 [[K2_ADD_LCSSA]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 2
+; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
 entry:
   br label %loop1
diff --git a/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll
--- a/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll
+++ b/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll
@@ -24,30 +24,33 @@
 ; CHECK:       for.body.lr.ph:
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32*, i32** @ptr, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* @e, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw i32 [[SMAX]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV:%.*]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_LOOPEXIT_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_LOOPEXIT_CRIT_EDGE:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV]] = phi i64 [ [[INDVARS_IV_NEXT]], [[FOR_COND:%.*]] ], [ 0, [[FOR_BODY_LR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP5]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_COND]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[I_05_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[I_05_LCSSA_WIDE]] to i32
-; CHECK-NEXT:    store i32 [[TMP5]], i32* @idx, align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[I_05_LCSSA_WIDE]] to i32
+; CHECK-NEXT:    store i32 [[TMP6]], i32* @idx, align 4
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       for.cond.for.end.loopexit_crit_edge:
 ; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* @idx, align 4
-; CHECK-NEXT:    ret i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* @idx, align 4
+; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
 entry:
   store i32 -1, i32* @idx, align 4
@@ -96,7 +99,8 @@
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[LIMIT:%.*]] to i32
 ; CHECK-NEXT:    br i1 undef, label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
 ; CHECK:       for.cond1.preheader.us.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[CONV]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[CONV]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[CONV]], i32 1
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.preheader:
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
@@ -107,8 +111,8 @@
 ; CHECK-NEXT:    br label [[FOR_INC13_US]]
 ; CHECK:       for.inc13.us:
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END_LOOPEXIT1:%.*]]
+; CHECK-NEXT:    [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 4
+; CHECK-NEXT:    br i1 [[EXITCOND4]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END_LOOPEXIT1:%.*]]
 ; CHECK:       for.body4.us:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY4_LR_PH_US]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US:%.*]] ]
 ; CHECK-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds [8 x i8], [8 x i8]* [[A:%.*]], i64 [[INDVARS_IV2]], i64 [[INDVARS_IV]]
@@ -118,9 +122,10 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX8_US]], align 1
 ; CHECK-NEXT:    store i8 [[TMP2]], i8* [[ARRAYIDX6_US]], align 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_INC13_US_LOOPEXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY4_US]], label [[FOR_INC13_US_LOOPEXIT:%.*]]
 ; CHECK:       for.body4.lr.ph.us:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY4_US]]
 ; CHECK:       for.cond1.preheader:
 ; CHECK-NEXT:    br i1 false, label [[FOR_INC13:%.*]], label [[FOR_INC13]]
@@ -180,13 +185,15 @@
 define i32 @test3(i32* %a, i32 %b) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[B]], i32 0
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SUM_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
@@ -300,17 +307,20 @@
 define i32 @test6(i32* %a, i32 %b) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i32 [[B:%.*]], -1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[B]], i32 -1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SMAX]], 1
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SUM_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM_0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    br label [[FOR_COND]]
 ; CHECK:       for.end:
@@ -342,7 +352,10 @@
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[B:%.*]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[B]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[B]], -1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP1]], i32 [[B]], i32 -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[SMAX]], 2
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
@@ -351,11 +364,11 @@
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM_0]], [[TMP3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i64 [[INDVARS_IV]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[FOR_END]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND]], label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_0]], [[FOR_BODY]] ], [ [[SUM_0]], [[FOR_COND]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
@@ -444,13 +457,15 @@
 ; CHECK-NEXT:    br i1 [[E]], label [[FOR_COND_PREHEADER:%.*]], label [[LEAVE:%.*]]
 ; CHECK:       for.cond.preheader:
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[INIT]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[INIT]], [[B:%.*]]
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP1]], i32 [[INIT]], i32 [[B]]
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_COND_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ]
 ; CHECK-NEXT:    [[SUM_0:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -26,29 +26,119 @@
 ; AUTO_VEC-NEXT:    [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
 ; AUTO_VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[CAST_CRD]], 5.000000e-01
 ; AUTO_VEC-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP0]], 1.000000e+00
+; AUTO_VEC-NEXT:    [[TMP1:%.*]] = add nsw i64 [[N_VEC]], -32
+; AUTO_VEC-NEXT:    [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 5
+; AUTO_VEC-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; AUTO_VEC-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP3]], 3
+; AUTO_VEC-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 96
+; AUTO_VEC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
+; AUTO_VEC:       vector.ph.new:
+; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = sub nsw i64 [[TMP3]], [[XTRAITER]]
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC:       vector.body:
-; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AUTO_VEC-NEXT:    [[VEC_IND:%.*]] = phi <8 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00, float 3.000000e+00, float 3.500000e+00, float 4.000000e+00, float 4.500000e+00>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[VEC_IND:%.*]] = phi <8 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00, float 3.000000e+00, float 3.500000e+00, float 4.000000e+00, float 4.500000e+00>, [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ]
 ; AUTO_VEC-NEXT:    [[STEP_ADD:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
 ; AUTO_VEC-NEXT:    [[STEP_ADD5:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
 ; AUTO_VEC-NEXT:    [[STEP_ADD6:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01>
-; AUTO_VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; AUTO_VEC-NEXT:    [[TMP2:%.*]] = bitcast float* [[TMP1]] to <8 x float>*
-; AUTO_VEC-NEXT:    store <8 x float> [[VEC_IND]], <8 x float>* [[TMP2]], align 4
-; AUTO_VEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AUTO_VEC-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
-; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP4]], align 4
-; AUTO_VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 16
+; AUTO_VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
 ; AUTO_VEC-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD5]], <8 x float>* [[TMP6]], align 4
-; AUTO_VEC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 24
+; AUTO_VEC-NEXT:    store <8 x float> [[VEC_IND]], <8 x float>* [[TMP6]], align 4
+; AUTO_VEC-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 8
 ; AUTO_VEC-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>*
-; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD6]], <8 x float>* [[TMP8]], align 4
-; AUTO_VEC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; AUTO_VEC-NEXT:    [[VEC_IND_NEXT]] = fadd fast <8 x float> [[VEC_IND]], <float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01>
-; AUTO_VEC-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AUTO_VEC-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP8]], align 4
+; AUTO_VEC-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 16
+; AUTO_VEC-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD5]], <8 x float>* [[TMP10]], align 4
+; AUTO_VEC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 24
+; AUTO_VEC-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD6]], <8 x float>* [[TMP12]], align 4
+; AUTO_VEC-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 2.000000e+01, float 2.000000e+01, float 2.000000e+01, float 2.000000e+01, float 2.000000e+01, float 2.000000e+01, float 2.000000e+01, float 2.000000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 2.400000e+01, float 2.400000e+01, float 2.400000e+01, float 2.400000e+01, float 2.400000e+01, float 2.400000e+01, float 2.400000e+01, float 2.400000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 2.800000e+01, float 2.800000e+01, float 2.800000e+01, float 2.800000e+01, float 2.800000e+01, float 2.800000e+01, float 2.800000e+01, float 2.800000e+01>
+; AUTO_VEC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT]]
+; AUTO_VEC-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[VEC_IND_NEXT]], <8 x float>* [[TMP14]], align 4
+; AUTO_VEC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 8
+; AUTO_VEC-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP15]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD_1]], <8 x float>* [[TMP16]], align 4
+; AUTO_VEC-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 16
+; AUTO_VEC-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD5_1]], <8 x float>* [[TMP18]], align 4
+; AUTO_VEC-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 24
+; AUTO_VEC-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD6_1]], <8 x float>* [[TMP20]], align 4
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 64
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 3.200000e+01, float 3.200000e+01, float 3.200000e+01, float 3.200000e+01, float 3.200000e+01, float 3.200000e+01, float 3.200000e+01, float 3.200000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 3.600000e+01, float 3.600000e+01, float 3.600000e+01, float 3.600000e+01, float 3.600000e+01, float 3.600000e+01, float 3.600000e+01, float 3.600000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 4.000000e+01, float 4.000000e+01, float 4.000000e+01, float 4.000000e+01, float 4.000000e+01, float 4.000000e+01, float 4.000000e+01, float 4.000000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 4.400000e+01, float 4.400000e+01, float 4.400000e+01, float 4.400000e+01, float 4.400000e+01, float 4.400000e+01, float 4.400000e+01, float 4.400000e+01>
+; AUTO_VEC-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_1]]
+; AUTO_VEC-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[VEC_IND_NEXT_1]], <8 x float>* [[TMP22]], align 4
+; AUTO_VEC-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 8
+; AUTO_VEC-NEXT:    [[TMP24:%.*]] = bitcast float* [[TMP23]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD_2]], <8 x float>* [[TMP24]], align 4
+; AUTO_VEC-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 16
+; AUTO_VEC-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD5_2]], <8 x float>* [[TMP26]], align 4
+; AUTO_VEC-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 24
+; AUTO_VEC-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP27]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD6_2]], <8 x float>* [[TMP28]], align 4
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 96
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 4.800000e+01, float 4.800000e+01, float 4.800000e+01, float 4.800000e+01, float 4.800000e+01, float 4.800000e+01, float 4.800000e+01, float 4.800000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 5.200000e+01, float 5.200000e+01, float 5.200000e+01, float 5.200000e+01, float 5.200000e+01, float 5.200000e+01, float 5.200000e+01, float 5.200000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 5.600000e+01, float 5.600000e+01, float 5.600000e+01, float 5.600000e+01, float 5.600000e+01, float 5.600000e+01, float 5.600000e+01, float 5.600000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 6.000000e+01, float 6.000000e+01, float 6.000000e+01, float 6.000000e+01, float 6.000000e+01, float 6.000000e+01, float 6.000000e+01, float 6.000000e+01>
+; AUTO_VEC-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_2]]
+; AUTO_VEC-NEXT:    [[TMP30:%.*]] = bitcast float* [[TMP29]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[VEC_IND_NEXT_2]], <8 x float>* [[TMP30]], align 4
+; AUTO_VEC-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 8
+; AUTO_VEC-NEXT:    [[TMP32:%.*]] = bitcast float* [[TMP31]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD_3]], <8 x float>* [[TMP32]], align 4
+; AUTO_VEC-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 16
+; AUTO_VEC-NEXT:    [[TMP34:%.*]] = bitcast float* [[TMP33]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD5_3]], <8 x float>* [[TMP34]], align 4
+; AUTO_VEC-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 24
+; AUTO_VEC-NEXT:    [[TMP36:%.*]] = bitcast float* [[TMP35]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD6_3]], <8 x float>* [[TMP36]], align 4
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_3]] = add i64 [[INDEX]], 128
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_3]] = fadd fast <8 x float> [[VEC_IND]], <float 6.400000e+01, float 6.400000e+01, float 6.400000e+01, float 6.400000e+01, float 6.400000e+01, float 6.400000e+01, float 6.400000e+01, float 6.400000e+01>
+; AUTO_VEC-NEXT:    [[NITER_NSUB_3]] = add i64 [[NITER]], -4
+; AUTO_VEC-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0
+; AUTO_VEC-NEXT:    br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop !0
+; AUTO_VEC:       middle.block.unr-lcssa:
+; AUTO_VEC-NEXT:    [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[VEC_IND_UNR:%.*]] = phi <8 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00, float 3.000000e+00, float 3.500000e+00, float 4.000000e+00, float 4.500000e+00>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[LCMP_MOD:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; AUTO_VEC-NEXT:    br i1 [[LCMP_MOD]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
+; AUTO_VEC:       vector.body.epil:
+; AUTO_VEC-NEXT:    [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[VEC_IND_EPIL:%.*]] = phi <8 x float> [ [[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[STEP_ADD_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], <float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01>
+; AUTO_VEC-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_EPIL]]
+; AUTO_VEC-NEXT:    [[TMP38:%.*]] = bitcast float* [[TMP37]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[VEC_IND_EPIL]], <8 x float>* [[TMP38]], align 4
+; AUTO_VEC-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 8
+; AUTO_VEC-NEXT:    [[TMP40:%.*]] = bitcast float* [[TMP39]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD_EPIL]], <8 x float>* [[TMP40]], align 4
+; AUTO_VEC-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 16
+; AUTO_VEC-NEXT:    [[TMP42:%.*]] = bitcast float* [[TMP41]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD5_EPIL]], <8 x float>* [[TMP42]], align 4
+; AUTO_VEC-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 24
+; AUTO_VEC-NEXT:    [[TMP44:%.*]] = bitcast float* [[TMP43]] to <8 x float>*
+; AUTO_VEC-NEXT:    store <8 x float> [[STEP_ADD6_EPIL]], <8 x float>* [[TMP44]], align 4
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 32
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_EPIL]] = fadd fast <8 x float> [[VEC_IND_EPIL]], <float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01>
+; AUTO_VEC-NEXT:    [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
+; AUTO_VEC-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
+; AUTO_VEC-NEXT:    br i1 [[EPIL_ITER_CMP]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop !2
 ; AUTO_VEC:       middle.block:
 ; AUTO_VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]]
 ; AUTO_VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]]
@@ -59,8 +149,8 @@
 ; AUTO_VEC-NEXT:    store float [[X_06]], float* [[ARRAYIDX]], align 4
 ; AUTO_VEC-NEXT:    [[CONV1]] = fadd float [[X_06]], 5.000000e-01
 ; AUTO_VEC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AUTO_VEC-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]]
-; AUTO_VEC-NEXT:    br i1 [[TMP10]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; AUTO_VEC-NEXT:    [[TMP45:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]]
+; AUTO_VEC-NEXT:    br i1 [[TMP45]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !4
 ; AUTO_VEC:       for.end:
 ; AUTO_VEC-NEXT:    ret void
 ;
@@ -167,7 +257,7 @@
 ; AUTO_VEC-NEXT:    [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
 ; AUTO_VEC-NEXT:    [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
 ; AUTO_VEC-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
-; AUTO_VEC-NEXT:    br i1 [[EPIL_ITER_CMP]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop !4
+; AUTO_VEC-NEXT:    br i1 [[EPIL_ITER_CMP]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop !6
 ; AUTO_VEC:       for.end:
 ; AUTO_VEC-NEXT:    ret void
 ;
@@ -207,34 +297,124 @@
 ; AUTO_VEC-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
 ; AUTO_VEC-NEXT:    [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to double
 ; AUTO_VEC-NEXT:    [[TMP1:%.*]] = fmul fast double [[CAST_CRD]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP2:%.*]] = add nsw i64 [[N_VEC]], -16
+; AUTO_VEC-NEXT:    [[TMP3:%.*]] = lshr exact i64 [[TMP2]], 4
+; AUTO_VEC-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; AUTO_VEC-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP4]], 3
+; AUTO_VEC-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], 48
+; AUTO_VEC-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
+; AUTO_VEC:       vector.ph.new:
+; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = sub nsw i64 [[TMP4]], [[XTRAITER]]
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC:       vector.body:
-; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AUTO_VEC-NEXT:    [[VEC_IND:%.*]] = phi <4 x double> [ <double 0.000000e+00, double 3.000000e+00, double 6.000000e+00, double 9.000000e+00>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[VEC_IND:%.*]] = phi <4 x double> [ <double 0.000000e+00, double 3.000000e+00, double 6.000000e+00, double 9.000000e+00>, [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ]
 ; AUTO_VEC-NEXT:    [[STEP_ADD:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.200000e+01, double 1.200000e+01, double 1.200000e+01, double 1.200000e+01>
 ; AUTO_VEC-NEXT:    [[STEP_ADD5:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 2.400000e+01, double 2.400000e+01, double 2.400000e+01, double 2.400000e+01>
 ; AUTO_VEC-NEXT:    [[STEP_ADD6:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 3.600000e+01, double 3.600000e+01, double 3.600000e+01, double 3.600000e+01>
-; AUTO_VEC-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]]
-; AUTO_VEC-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>*
-; AUTO_VEC-NEXT:    store <4 x double> [[VEC_IND]], <4 x double>* [[TMP3]], align 8
-; AUTO_VEC-NEXT:    [[TMP4:%.*]] = getelementptr double, double* [[TMP2]], i64 4
-; AUTO_VEC-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP5]], align 8
-; AUTO_VEC-NEXT:    [[TMP6:%.*]] = getelementptr double, double* [[TMP2]], i64 8
+; AUTO_VEC-NEXT:    [[TMP6:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]]
 ; AUTO_VEC-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>*
-; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD5]], <4 x double>* [[TMP7]], align 8
-; AUTO_VEC-NEXT:    [[TMP8:%.*]] = getelementptr double, double* [[TMP2]], i64 12
+; AUTO_VEC-NEXT:    store <4 x double> [[VEC_IND]], <4 x double>* [[TMP7]], align 8
+; AUTO_VEC-NEXT:    [[TMP8:%.*]] = getelementptr double, double* [[TMP6]], i64 4
 ; AUTO_VEC-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <4 x double>*
-; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD6]], <4 x double>* [[TMP9]], align 8
-; AUTO_VEC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; AUTO_VEC-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], <double 4.800000e+01, double 4.800000e+01, double 4.800000e+01, double 4.800000e+01>
-; AUTO_VEC-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AUTO_VEC-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP9]], align 8
+; AUTO_VEC-NEXT:    [[TMP10:%.*]] = getelementptr double, double* [[TMP6]], i64 8
+; AUTO_VEC-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD5]], <4 x double>* [[TMP11]], align 8
+; AUTO_VEC-NEXT:    [[TMP12:%.*]] = getelementptr double, double* [[TMP6]], i64 12
+; AUTO_VEC-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD6]], <4 x double>* [[TMP13]], align 8
+; AUTO_VEC-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 4.800000e+01, double 4.800000e+01, double 4.800000e+01, double 4.800000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 6.000000e+01, double 6.000000e+01, double 6.000000e+01, double 6.000000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 7.200000e+01, double 7.200000e+01, double 7.200000e+01, double 7.200000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 8.400000e+01, double 8.400000e+01, double 8.400000e+01, double 8.400000e+01>
+; AUTO_VEC-NEXT:    [[TMP14:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT]]
+; AUTO_VEC-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[VEC_IND_NEXT]], <4 x double>* [[TMP15]], align 8
+; AUTO_VEC-NEXT:    [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 4
+; AUTO_VEC-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD_1]], <4 x double>* [[TMP17]], align 8
+; AUTO_VEC-NEXT:    [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 8
+; AUTO_VEC-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD5_1]], <4 x double>* [[TMP19]], align 8
+; AUTO_VEC-NEXT:    [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 12
+; AUTO_VEC-NEXT:    [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD6_1]], <4 x double>* [[TMP21]], align 8
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 32
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 9.600000e+01, double 9.600000e+01, double 9.600000e+01, double 9.600000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.080000e+02, double 1.080000e+02, double 1.080000e+02, double 1.080000e+02>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.200000e+02, double 1.200000e+02, double 1.200000e+02, double 1.200000e+02>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.320000e+02, double 1.320000e+02, double 1.320000e+02, double 1.320000e+02>
+; AUTO_VEC-NEXT:    [[TMP22:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_1]]
+; AUTO_VEC-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[VEC_IND_NEXT_1]], <4 x double>* [[TMP23]], align 8
+; AUTO_VEC-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[TMP22]], i64 4
+; AUTO_VEC-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD_2]], <4 x double>* [[TMP25]], align 8
+; AUTO_VEC-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[TMP22]], i64 8
+; AUTO_VEC-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD5_2]], <4 x double>* [[TMP27]], align 8
+; AUTO_VEC-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP22]], i64 12
+; AUTO_VEC-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD6_2]], <4 x double>* [[TMP29]], align 8
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.440000e+02, double 1.440000e+02, double 1.440000e+02, double 1.440000e+02>
+; AUTO_VEC-NEXT:    [[STEP_ADD_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.560000e+02, double 1.560000e+02, double 1.560000e+02, double 1.560000e+02>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.680000e+02, double 1.680000e+02, double 1.680000e+02, double 1.680000e+02>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 1.800000e+02, double 1.800000e+02, double 1.800000e+02, double 1.800000e+02>
+; AUTO_VEC-NEXT:    [[TMP30:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_2]]
+; AUTO_VEC-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[VEC_IND_NEXT_2]], <4 x double>* [[TMP31]], align 8
+; AUTO_VEC-NEXT:    [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 4
+; AUTO_VEC-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD_3]], <4 x double>* [[TMP33]], align 8
+; AUTO_VEC-NEXT:    [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 8
+; AUTO_VEC-NEXT:    [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD5_3]], <4 x double>* [[TMP35]], align 8
+; AUTO_VEC-NEXT:    [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 12
+; AUTO_VEC-NEXT:    [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD6_3]], <4 x double>* [[TMP37]], align 8
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_3]] = add i64 [[INDEX]], 64
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_3]] = fadd fast <4 x double> [[VEC_IND]], <double 1.920000e+02, double 1.920000e+02, double 1.920000e+02, double 1.920000e+02>
+; AUTO_VEC-NEXT:    [[NITER_NSUB_3]] = add i64 [[NITER]], -4
+; AUTO_VEC-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0
+; AUTO_VEC-NEXT:    br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop !7
+; AUTO_VEC:       middle.block.unr-lcssa:
+; AUTO_VEC-NEXT:    [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[VEC_IND_UNR:%.*]] = phi <4 x double> [ <double 0.000000e+00, double 3.000000e+00, double 6.000000e+00, double 9.000000e+00>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[LCMP_MOD:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; AUTO_VEC-NEXT:    br i1 [[LCMP_MOD]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
+; AUTO_VEC:       vector.body.epil:
+; AUTO_VEC-NEXT:    [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[VEC_IND_EPIL:%.*]] = phi <4 x double> [ [[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[STEP_ADD_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], <double 1.200000e+01, double 1.200000e+01, double 1.200000e+01, double 1.200000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD5_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], <double 2.400000e+01, double 2.400000e+01, double 2.400000e+01, double 2.400000e+01>
+; AUTO_VEC-NEXT:    [[STEP_ADD6_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], <double 3.600000e+01, double 3.600000e+01, double 3.600000e+01, double 3.600000e+01>
+; AUTO_VEC-NEXT:    [[TMP38:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_EPIL]]
+; AUTO_VEC-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[VEC_IND_EPIL]], <4 x double>* [[TMP39]], align 8
+; AUTO_VEC-NEXT:    [[TMP40:%.*]] = getelementptr double, double* [[TMP38]], i64 4
+; AUTO_VEC-NEXT:    [[TMP41:%.*]] = bitcast double* [[TMP40]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD_EPIL]], <4 x double>* [[TMP41]], align 8
+; AUTO_VEC-NEXT:    [[TMP42:%.*]] = getelementptr double, double* [[TMP38]], i64 8
+; AUTO_VEC-NEXT:    [[TMP43:%.*]] = bitcast double* [[TMP42]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD5_EPIL]], <4 x double>* [[TMP43]], align 8
+; AUTO_VEC-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[TMP38]], i64 12
+; AUTO_VEC-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
+; AUTO_VEC-NEXT:    store <4 x double> [[STEP_ADD6_EPIL]], <4 x double>* [[TMP45]], align 8
+; AUTO_VEC-NEXT:    [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 16
+; AUTO_VEC-NEXT:    [[VEC_IND_NEXT_EPIL]] = fadd fast <4 x double> [[VEC_IND_EPIL]], <double 4.800000e+01, double 4.800000e+01, double 4.800000e+01, double 4.800000e+01>
+; AUTO_VEC-NEXT:    [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
+; AUTO_VEC-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
+; AUTO_VEC-NEXT:    br i1 [[EPIL_ITER_CMP]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop !8
 ; AUTO_VEC:       middle.block:
 ; AUTO_VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
-; AUTO_VEC-NEXT:    [[TMP11:%.*]] = add nsw i64 [[N_VEC]], -1
-; AUTO_VEC-NEXT:    [[CAST_CMO:%.*]] = sitofp i64 [[TMP11]] to double
-; AUTO_VEC-NEXT:    [[TMP12:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP46:%.*]] = add nsw i64 [[N_VEC]], -1
+; AUTO_VEC-NEXT:    [[CAST_CMO:%.*]] = sitofp i64 [[TMP46]] to double
+; AUTO_VEC-NEXT:    [[TMP47:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00
 ; AUTO_VEC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; AUTO_VEC:       for.body:
 ; AUTO_VEC-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
@@ -244,9 +424,9 @@
 ; AUTO_VEC-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; AUTO_VEC-NEXT:    [[J_NEXT]] = fadd fast double [[J]], 3.000000e+00
 ; AUTO_VEC-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; AUTO_VEC-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7
+; AUTO_VEC-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !9
 ; AUTO_VEC:       for.end:
-; AUTO_VEC-NEXT:    [[J_LCSSA:%.*]] = phi double [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[J_LCSSA:%.*]] = phi double [ [[TMP47]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ]
 ; AUTO_VEC-NEXT:    ret double [[J_LCSSA]]
 ;
 entry:
@@ -270,18 +450,74 @@
 define double @external_use_without_fast_math(double* %a, i64 %n) {
 ; AUTO_VEC-LABEL: @external_use_without_fast_math(
 ; AUTO_VEC-NEXT:  entry:
+; AUTO_VEC-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
+; AUTO_VEC-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; AUTO_VEC-NEXT:    [[TMP1:%.*]] = add nsw i64 [[SMAX]], -1
+; AUTO_VEC-NEXT:    [[XTRAITER:%.*]] = and i64 [[SMAX]], 7
+; AUTO_VEC-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 7
+; AUTO_VEC-NEXT:    br i1 [[TMP2]], label [[FOR_END_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; AUTO_VEC:       entry.new:
+; AUTO_VEC-NEXT:    [[UNROLL_ITER:%.*]] = sub nsw i64 [[SMAX]], [[XTRAITER]]
 ; AUTO_VEC-NEXT:    br label [[FOR_BODY:%.*]]
 ; AUTO_VEC:       for.body:
-; AUTO_VEC-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
-; AUTO_VEC-NEXT:    [[J:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[I_NEXT_7:%.*]], [[FOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[J:%.*]] = phi double [ 0.000000e+00, [[ENTRY_NEW]] ], [ [[J_NEXT_7:%.*]], [[FOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[ENTRY_NEW]] ], [ [[NITER_NSUB_7:%.*]], [[FOR_BODY]] ]
 ; AUTO_VEC-NEXT:    [[TMP0:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[I]]
 ; AUTO_VEC-NEXT:    store double [[J]], double* [[TMP0]], align 8
-; AUTO_VEC-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; AUTO_VEC-NEXT:    [[J_NEXT]] = fadd double [[J]], 3.000000e+00
-; AUTO_VEC-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N:%.*]]
-; AUTO_VEC-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; AUTO_VEC-NEXT:    [[I_NEXT:%.*]] = or i64 [[I]], 1
+; AUTO_VEC-NEXT:    [[J_NEXT:%.*]] = fadd double [[J]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP0_1:%.*]] = getelementptr double, double* [[A]], i64 [[I_NEXT]]
+; AUTO_VEC-NEXT:    store double [[J_NEXT]], double* [[TMP0_1]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_1:%.*]] = or i64 [[I]], 2
+; AUTO_VEC-NEXT:    [[J_NEXT_1:%.*]] = fadd double [[J_NEXT]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP0_2:%.*]] = getelementptr double, double* [[A]], i64 [[I_NEXT_1]]
+; AUTO_VEC-NEXT:    store double [[J_NEXT_1]], double* [[TMP0_2]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_2:%.*]] = or i64 [[I]], 3
+; AUTO_VEC-NEXT:    [[J_NEXT_2:%.*]] = fadd double [[J_NEXT_1]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP0_3:%.*]] = getelementptr double, double* [[A]], i64 [[I_NEXT_2]]
+; AUTO_VEC-NEXT:    store double [[J_NEXT_2]], double* [[TMP0_3]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_3:%.*]] = or i64 [[I]], 4
+; AUTO_VEC-NEXT:    [[J_NEXT_3:%.*]] = fadd double [[J_NEXT_2]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP0_4:%.*]] = getelementptr double, double* [[A]], i64 [[I_NEXT_3]]
+; AUTO_VEC-NEXT:    store double [[J_NEXT_3]], double* [[TMP0_4]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_4:%.*]] = or i64 [[I]], 5
+; AUTO_VEC-NEXT:    [[J_NEXT_4:%.*]] = fadd double [[J_NEXT_3]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP0_5:%.*]] = getelementptr double, double* [[A]], i64 [[I_NEXT_4]]
+; AUTO_VEC-NEXT:    store double [[J_NEXT_4]], double* [[TMP0_5]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_5:%.*]] = or i64 [[I]], 6
+; AUTO_VEC-NEXT:    [[J_NEXT_5:%.*]] = fadd double [[J_NEXT_4]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP0_6:%.*]] = getelementptr double, double* [[A]], i64 [[I_NEXT_5]]
+; AUTO_VEC-NEXT:    store double [[J_NEXT_5]], double* [[TMP0_6]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_6:%.*]] = or i64 [[I]], 7
+; AUTO_VEC-NEXT:    [[J_NEXT_6:%.*]] = fadd double [[J_NEXT_5]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[TMP0_7:%.*]] = getelementptr double, double* [[A]], i64 [[I_NEXT_6]]
+; AUTO_VEC-NEXT:    store double [[J_NEXT_6]], double* [[TMP0_7]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_7]] = add nuw nsw i64 [[I]], 8
+; AUTO_VEC-NEXT:    [[J_NEXT_7]] = fadd double [[J_NEXT_6]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[NITER_NSUB_7]] = add i64 [[NITER]], -8
+; AUTO_VEC-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NSUB_7]], 0
+; AUTO_VEC-NEXT:    br i1 [[NITER_NCMP_7]], label [[FOR_END_UNR_LCSSA]], label [[FOR_BODY]]
+; AUTO_VEC:       for.end.unr-lcssa:
+; AUTO_VEC-NEXT:    [[J_LCSSA_PH:%.*]] = phi double [ undef, [[ENTRY:%.*]] ], [ [[J_NEXT_6]], [[FOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[I_UNR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[I_NEXT_7]], [[FOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[J_UNR:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[J_NEXT_7]], [[FOR_BODY]] ]
+; AUTO_VEC-NEXT:    [[LCMP_MOD:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; AUTO_VEC-NEXT:    br i1 [[LCMP_MOD]], label [[FOR_END:%.*]], label [[FOR_BODY_EPIL:%.*]]
+; AUTO_VEC:       for.body.epil:
+; AUTO_VEC-NEXT:    [[I_EPIL:%.*]] = phi i64 [ [[I_NEXT_EPIL:%.*]], [[FOR_BODY_EPIL]] ], [ [[I_UNR]], [[FOR_END_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[J_EPIL:%.*]] = phi double [ [[J_NEXT_EPIL:%.*]], [[FOR_BODY_EPIL]] ], [ [[J_UNR]], [[FOR_END_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[FOR_BODY_EPIL]] ], [ [[XTRAITER]], [[FOR_END_UNR_LCSSA]] ]
+; AUTO_VEC-NEXT:    [[TMP0_EPIL:%.*]] = getelementptr double, double* [[A]], i64 [[I_EPIL]]
+; AUTO_VEC-NEXT:    store double [[J_EPIL]], double* [[TMP0_EPIL]], align 8
+; AUTO_VEC-NEXT:    [[I_NEXT_EPIL]] = add nuw nsw i64 [[I_EPIL]], 1
+; AUTO_VEC-NEXT:    [[J_NEXT_EPIL]] = fadd double [[J_EPIL]], 3.000000e+00
+; AUTO_VEC-NEXT:    [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
+; AUTO_VEC-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
+; AUTO_VEC-NEXT:    br i1 [[EPIL_ITER_CMP]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop !10
 ; AUTO_VEC:       for.end:
-; AUTO_VEC-NEXT:    ret double [[J]]
+; AUTO_VEC-NEXT:    [[J_LCSSA:%.*]] = phi double [ [[J_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ [[J_EPIL]], [[FOR_BODY_EPIL]] ]
+; AUTO_VEC-NEXT:    ret double [[J_LCSSA]]
 ;
 entry:
   br label %for.body
diff --git a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
--- a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
+++ b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
@@ -70,7 +70,8 @@
 
   bool PreserveLCSSA = L->isRecursivelyLCSSAForm(DT,LI);
 
-  bool ret = UnrollRuntimeLoopRemainder(L, 4, true, false, false, false, &LI,
-                                        &SE, &DT, &AC, PreserveLCSSA);
+  bool ret =
+      UnrollRuntimeLoopRemainder(L, 4, true, false, false, false, &LI, &SE, &DT,
+                                 &AC, /*TTI=*/nullptr, PreserveLCSSA);
   EXPECT_FALSE(ret);
 }