Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -185,33 +185,21 @@
   void selectThresholds(const Loop *L, bool HasPragma,
                         const TargetTransformInfo::UnrollingPreferences &UP,
                         unsigned &Threshold, unsigned &PartialThreshold,
-                        unsigned NumberOfOptimizedInstructions) {
+                        unsigned &AbsoluteThreshold,
+                        unsigned &PercentOfOptimizedForCompleteUnroll) {
     // Determine the current unrolling threshold.  While this is
     // normally set from UnrollThreshold, it is overridden to a
     // smaller value if the current function is marked as
     // optimize-for-size, and the unroll threshold was not user
     // specified.
     Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
-
-    // If we are allowed to completely unroll if we can remove M% of
-    // instructions, and we know that with complete unrolling we'll be able
-    // to kill N instructions, then we can afford to completely unroll loops
-    // with unrolled size up to N*100/M.
-    // Adjust the threshold according to that:
-    unsigned PercentOfOptimizedForCompleteUnroll =
-        UserPercentOfOptimized ? CurrentMinPercentOfOptimized
-                               : UP.MinPercentOfOptimized;
-    unsigned AbsoluteThreshold = UserAbsoluteThreshold
-                                     ? CurrentAbsoluteThreshold
-                                     : UP.AbsoluteThreshold;
-    if (PercentOfOptimizedForCompleteUnroll)
-      Threshold = std::max(Threshold,
-                           NumberOfOptimizedInstructions * 100 /
-                               PercentOfOptimizedForCompleteUnroll);
-    // But don't allow unrolling loops bigger than absolute threshold.
-    Threshold = std::min(Threshold, AbsoluteThreshold);
-
     PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
+    AbsoluteThreshold = UserAbsoluteThreshold ? CurrentAbsoluteThreshold
+                                              : UP.AbsoluteThreshold;
+    PercentOfOptimizedForCompleteUnroll = UserPercentOfOptimized
+                                              ? CurrentMinPercentOfOptimized
+                                              : UP.MinPercentOfOptimized;
+
     if (!UserThreshold &&
         L->getHeader()->getParent()->hasFnAttribute(
             Attribute::OptimizeForSize)) {
@@ -230,6 +218,10 @@
             std::max(PartialThreshold, PragmaUnrollThreshold);
     }
   }
+  bool canUnrollCompletely(Loop *L, unsigned Threshold,
+                           unsigned AbsoluteThreshold, unsigned UnrolledSize,
+                           unsigned NumberOfOptimizedInstructions,
+                           unsigned PercentOfOptimizedForCompleteUnroll);
 };
 }
@@ -252,32 +244,45 @@
   return llvm::createLoopUnrollPass(-1, -1, 0, 0);
 }
 
-static bool isLoadFromConstantInitializer(Value *V) {
-  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
-    if (GV->isConstant() && GV->hasDefinitiveInitializer())
-      return GV->getInitializer();
-  return false;
-}
-
+/// \brief SCEV expression visitor used for finding expressions that would
+/// become constants if the loop L is unrolled.
 struct FindConstantPointers {
-  bool LoadCanBeConstantFolded;
+  /// \brief Shows whether the expression is ConstAddress+Constant or not.
   bool IndexIsConstant;
-  APInt Step;
-  APInt StartValue;
+
+  /// \brief Used for filtering out SCEV expressions with two or more AddRec
+  /// sub-expressions.
+  ///
+  /// haveSeenAR is used to filter out complicated SCEV expressions that have
+  /// several AddRec sub-expressions. We don't handle them, because unrolling
+  /// one loop would replace only one of these inductions with a constant, and
+  /// consequently, the expression would remain non-constant.
+  bool haveSeenAR;
+
+  /// \brief If the SCEV expression becomes ConstAddress+Constant, this value
+  /// holds ConstAddress. Otherwise, it's nullptr.
   Value *BaseAddress;
+
+  /// \brief The loop that we try to completely unroll.
   const Loop *L;
+
   ScalarEvolution &SE;
+
   FindConstantPointers(const Loop *loop, ScalarEvolution &SE)
-      : LoadCanBeConstantFolded(true), IndexIsConstant(true), L(loop), SE(SE) {}
+      : IndexIsConstant(true), haveSeenAR(false), BaseAddress(nullptr),
+        L(loop), SE(SE) {}
 
   bool follow(const SCEV *S) {
     if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) {
       // We've reached the leaf node of SCEV, it's most probably just a
-      // variable. Now it's time to see if it corresponds to a global constant
-      // global (in which case we can eliminate the load), or not.
+      // variable.
+      // If it's the only SCEV sub-expression, then it might be a base
+      // address of an index expression.
+      // If we've already recorded a base address, then just give up on this
+      // SCEV - it's too complicated.
+      if (BaseAddress)
+        return IndexIsConstant = false;
       BaseAddress = SC->getValue();
-      LoadCanBeConstantFolded =
-          IndexIsConstant && isLoadFromConstantInitializer(BaseAddress);
       return false;
     }
     if (isa<SCEVConstant>(S))
@@ -288,14 +293,13 @@
       // unrolling, and thus, won't be able to eliminate the load.
       if (AR->getLoop() != L)
         return IndexIsConstant = false;
-      // If the step isn't constant, we won't get constant addresses in unrolled
-      // version. Bail out.
-      if (const SCEVConstant *StepSE =
-              dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
-        Step = StepSE->getValue()->getValue();
-      else
+
+      // We don't handle multiple AddRecs here, so give up in this case.
+      if (haveSeenAR)
         return IndexIsConstant = false;
+      haveSeenAR = true;
+      return IndexIsConstant;
     }
     // If Result is true, continue traversal.
@@ -323,26 +327,61 @@
 // And finally:
 //   v = b[1]
 class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
+  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
+                    SmallPtrSet<BasicBlock *, 16>> BBSetVector;
+
+  typedef InstVisitor<UnrollAnalyzer, bool> Base;
   friend class InstVisitor<UnrollAnalyzer, bool>;
+
+  typedef struct {
+    Value *BaseAddr;
+    APInt Start;
+    APInt Step;
+  } SCEVGEPDescriptor;
+
+  /// \brief The loop we're going to analyze.
   const Loop *L;
+
+  /// \brief TripCount of the given loop.
   unsigned TripCount;
+
   ScalarEvolution &SE;
+
   const TargetTransformInfo &TTI;
 
+  // While we walk the loop instructions, we build up and maintain a mapping
+  // of simplified values specific to this iteration. The idea is to propagate
+  // any special information we have about loads that can be replaced with
+  // constants after complete unrolling, and account for likely simplifications
+  // post-unrolling.
   DenseMap<Value *, Constant *> SimplifiedValues;
-  DenseMap<LoadInst *, Value *> LoadBaseAddresses;
-  SmallPtrSet<Instruction *, 32> CountedInstructions;
 
-  /// \brief Count the number of optimized instructions.
-  unsigned NumberOfOptimizedInstructions;
+  // To avoid requesting SCEV info on every iteration, request it once, and
+  // for each value that would become ConstAddress+Constant after loop
+  // unrolling, save the corresponding data.
+  DenseMap<Value *, SCEVGEPDescriptor> SCEVCache;
+
+  /// \brief Number of the currently simulated iteration.
+  ///
+  /// If an expression is ConstAddress+Constant, then the Constant is
+  /// Start + Iteration*Step, where Start and Step can be obtained from
+  /// SCEVCache.
+  unsigned Iteration;
+
+  /// \brief Upper threshold for complete unrolling.
+  unsigned MaxUnrolledLoopSize;
+
+  /// \brief Shows that an early exit would definitely be taken.
+  ///
+  /// We can figure that out after substituting loads with the corresponding
+  /// constants. In this case, we 1) don't need to continue analyzing further
+  /// iterations, and 2) have a better estimate for the unrolled loop size than
+  /// TC*LoopSize.
+  bool EarlyExitFound;
 
   // Provide base case for our instruction visit.
   bool visitInstruction(Instruction &I) { return false; };
 
-  // TODO: We should also visit ICmp, FCmp, GetElementPtr, Trunc, ZExt, SExt,
-  // FPTrunc, FPExt, FPToUI, FPToSI, UIToFP, SIToFP, BitCast, Select,
-  // ExtractElement, InsertElement, ShuffleVector, ExtractValue, InsertValue.
+  // TODO: Add visitors for other instruction types, e.g. ZExt, SExt.
   //
   // Probaly it's worth to hoist the code for estimating the simplifications
   // effects to a separate class, since we have a very similar code in
@@ -362,7 +401,7 @@
     else
       SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);
 
-    if (SimpleV && CountedInstructions.insert(&I).second)
+    if (SimpleV)
       NumberOfOptimizedInstructions += TTI.getUserCost(&I);
 
     if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
@@ -372,205 +411,243 @@
     return false;
   }
 
-  Constant *computeLoadValue(LoadInst *LI, unsigned Iteration) {
-    if (!LI)
-      return nullptr;
-    Value *BaseAddr = LoadBaseAddresses[LI];
-    if (!BaseAddr)
-      return nullptr;
+  bool visitCmpInst(CmpInst &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+    // First try to handle simplified comparisons.
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+    if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
+      if (Constant *CRHS = dyn_cast<Constant>(RHS))
+        if (Constant *C =
+                ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
+          NumberOfOptimizedInstructions += TTI.getUserCost(&I);
+          SimplifiedValues[&I] = C;
+          return true;
+        }
+    }
+
+    if (I.getOpcode() == Instruction::FCmp)
+      return false;
+
+    return false;
+  }
+
+  bool visitLoad(LoadInst &I) {
+    Value *AddrOp = I.getPointerOperand();
+    if (SimplifiedValues[AddrOp])
+      AddrOp = SimplifiedValues[AddrOp];
+    if (!SCEVCache.count(AddrOp))
+      return false;
+    SCEVGEPDescriptor d = SCEVCache[AddrOp];
 
-    auto GV = dyn_cast<GlobalVariable>(BaseAddr);
+    auto GV = dyn_cast_or_null<GlobalVariable>(d.BaseAddr);
     if (!GV)
-      return nullptr;
+      return false;
 
     ConstantDataSequential *CDS =
         dyn_cast<ConstantDataSequential>(GV->getInitializer());
     if (!CDS)
-      return nullptr;
-
-    const SCEV *BaseAddrSE = SE.getSCEV(BaseAddr);
-    const SCEV *S = SE.getSCEV(LI->getPointerOperand());
-    const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);
-
-    APInt StepC, StartC;
-    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);
-    if (!AR)
-      return nullptr;
-
-    if (const SCEVConstant *StepSE =
-            dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
-      StepC = StepSE->getValue()->getValue();
-    else
+      return false;
 
-    if (const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart()))
-      StartC = StartSE->getValue()->getValue();
-    else
-      return nullptr;
+    unsigned Start = d.Start.getLimitedValue();
+    unsigned Step = d.Step.getLimitedValue();
 
     unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
-    unsigned Start = StartC.getLimitedValue();
-    unsigned Step = StepC.getLimitedValue();
-
     unsigned Index = (Start + Step * Iteration) / ElemSize;
     if (Index >= CDS->getNumElements())
-      return nullptr;
+      return false;
 
     Constant *CV = CDS->getElementAsConstant(Index);
+    if (CV)
+      SimplifiedValues[cast<Value>(&I)] = CV;
 
-    return CV;
+    NumberOfOptimizedInstructions += TTI.getUserCost(&I);
+    return true;
   }
 
-public:
-  UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
-                 const TargetTransformInfo &TTI)
-      : L(L), TripCount(TripCount), SE(SE), TTI(TTI),
-        NumberOfOptimizedInstructions(0) {}
+  // Visit all instructions in the given basic block and try to simplify them.
+  // We don't change the actual IR, just count optimization opportunities.
+  void analyzeBlock(BasicBlock *BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      UnrolledLoopSize += TTI.getUserCost(&*I);
+      Base::visit(I);
+      if (UnrolledLoopSize - NumberOfOptimizedInstructions >
+          MaxUnrolledLoopSize)
+        return;
+    }
+  }
 
-  // Visit all loads the loop L, and for those that, after complete loop
-  // unrolling, would have a constant address and it will point to a known
-  // constant initializer, record its base address for future use.  It is used
-  // when we estimate number of potentially simplified instructions.
-  void findConstFoldableLoads() {
-    for (auto BB : L->getBlocks()) {
-      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-        if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-          if (!LI->isSimple())
-            continue;
-          Value *AddrOp = LI->getPointerOperand();
-          const SCEV *S = SE.getSCEV(AddrOp);
-          FindConstantPointers Visitor(L, SE);
-          SCEVTraversal<FindConstantPointers> T(Visitor);
-          T.visitAll(S);
-          if (Visitor.IndexIsConstant && Visitor.LoadCanBeConstantFolded) {
-            LoadBaseAddresses[LI] = Visitor.BaseAddress;
-          }
+  // Add successors of the given basic block to the worklist.
+  // If it's possible to evaluate the condition, do that, and add only the
+  // corresponding successor, dropping the rest.
+  void addBBSuccessors(BasicBlock *BB, BBSetVector *BBWorklist) {
+    TerminatorInst *TI = BB->getTerminator();
+
+    BasicBlock *CertainSucc = nullptr;
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (BI->isConditional()) {
+        Value *Cond = BI->getCondition();
+        if (ConstantInt *SimpleCond =
+                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+          CertainSucc = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
         }
+      } else {
+        CertainSucc = BI->getSuccessor(0);
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *Cond = SI->getCondition();
+      if (ConstantInt *SimpleCond =
+              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+        CertainSucc = SI->findCaseValue(SimpleCond).getCaseSuccessor();
       }
     }
+
+    if (CertainSucc) {
+      if (L->contains(CertainSucc))
+        BBWorklist->insert(CertainSucc);
+      else
+        EarlyExitFound = true;
+      return;
+    }
+
+    for (BasicBlock *Succ : successors(BB))
+      if (L->contains(Succ))
+        BBWorklist->insert(Succ);
   }
 
-  // Given a list of loads that could be constant-folded (LoadBaseAddresses),
-  // estimate number of optimized instructions after substituting the concrete
-  // values for the given Iteration. Also track how many instructions become
-  // dead through this process.
-  unsigned estimateNumberOfOptimizedInstructions(unsigned Iteration) {
-    // We keep a set vector for the worklist so that we don't wast space in the
-    // worklist queuing up the same instruction repeatedly. This can happen due
-    // to multiple operands being the same instruction or due to the same
-    // instruction being an operand of lots of things that end up dead or
-    // simplified.
-    SmallSetVector<Instruction *, 8> Worklist;
-
-    // Clear the simplified values and counts for this iteration.
-    SimplifiedValues.clear();
-    CountedInstructions.clear();
+  // Simulate execution of each iteration of the loop, counting the
+  // instructions that would be simplified.
+  void simulateLoop() {
+    UnrolledLoopSize = 0;
     NumberOfOptimizedInstructions = 0;
+    EarlyExitFound = false;
+    // Estimate the number of instructions that could be simplified if we
+    // replace a load with the corresponding constant. Since the same load
+    // will take different values on different iterations, we have to go
+    // through all of the loop's iterations here.
+    for (Iteration = 0; Iteration < TripCount; ++Iteration) {
+      if (EarlyExitFound)
+        break;
+
+      BBSetVector BBWorklist;
+      BBWorklist.insert(L->getHeader());
+      // Note that we *must not* cache the size, this loop grows the worklist.
+      for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+        BasicBlock *BB = BBWorklist[Idx];
+        if (BB->empty())
+          continue;
+
+        analyzeBlock(BB);
+
+        // If unrolled body turns out to be too big, bail out.
+        if (UnrolledLoopSize - NumberOfOptimizedInstructions >
+            MaxUnrolledLoopSize)
+          return;
+
+        addBBSuccessors(BB, &BBWorklist);
+      }
 
-    // We start by adding all loads to the worklist.
-    for (auto &LoadDescr : LoadBaseAddresses) {
-      LoadInst *LI = LoadDescr.first;
-      SimplifiedValues[LI] = computeLoadValue(LI, Iteration);
-      if (CountedInstructions.insert(LI).second)
-        NumberOfOptimizedInstructions += TTI.getUserCost(LI);
-
-      for (User *U : LI->users())
-        Worklist.insert(cast<Instruction>(U));
+      // If we found no optimization opportunities on the first iteration, we
+      // won't find them on later ones either.
+      if (!NumberOfOptimizedInstructions) {
+        UnrolledLoopSize = UINT_MAX;
+        break;
+      }
     }
+  }
 
-    // And then we try to simplify every user of every instruction from the
-    // worklist. If we do simplify a user, add it to the worklist to process
-    // its users as well.
-    while (!Worklist.empty()) {
-      Instruction *I = Worklist.pop_back_val();
-      if (!L->contains(I))
-        continue;
-      if (!visit(I))
-        continue;
-      for (User *U : I->users())
-        Worklist.insert(cast<Instruction>(U));
-    }
+  // Visit all GEPs in the loop L, and find those that, after complete loop
+  // unrolling, would become constants or BaseAddress+Constant. For this to
+  // happen, a GEP's SCEV expression should contain at most one AddRec
+  // expression, and the loop corresponding to this expression should be L. The
+  // rest of the SCEV sub-expressions should be either constants or a
+  // SCEVUnknown (which would become the base address).
+  // If the expression contains the base address, then after subtracting it, we
+  // should get an AddRec with a constant step and start.
+  void findConstantPointers() {
+    for (auto BB : L->getBlocks()) {
+      for (Instruction &I : *BB) {
+        // TODO: Handle other instructions (esp. phis) here as well.
+        if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+          Value *V = cast<Value>(GEP);
+          const SCEV *S = SE.getSCEV(V);
+          FindConstantPointers Visitor(L, SE);
+          SCEVTraversal<FindConstantPointers> T(Visitor);
+          // Try to find a (BaseAddress, Start, Step) tuple.
+          // If we succeed, save it to the cache - it might help in folding
+          // loads.
+          T.visitAll(S);
+          if (!Visitor.IndexIsConstant || !Visitor.BaseAddress)
+            continue;
 
-    // Now that we know the potentially simplifed instructions, estimate number
-    // of instructions that would become dead if we do perform the
-    // simplification.
-
-    // The dead instructions are held in a separate set. This is used to
-    // prevent us from re-examining instructions and make sure we only count
-    // the benifit once. The worklist's internal set handles insertion
-    // deduplication.
-    SmallPtrSet<Instruction *, 32> DeadInstructions;
-
-    // Lambda to enque operands onto the worklist.
-    auto EnqueueOperands = [&](Instruction &I) {
-      for (auto *Op : I.operand_values())
-        if (auto *OpI = dyn_cast<Instruction>(Op))
-          if (!OpI->use_empty())
-            Worklist.insert(OpI);
-    };
-
-    // Start by initializing worklist with simplified instructions.
-    for (auto &FoldedKeyValue : SimplifiedValues)
-      if (auto *FoldedInst = dyn_cast<Instruction>(FoldedKeyValue.first)) {
-        DeadInstructions.insert(FoldedInst);
-
-        // Add each instruction operand of this dead instruction to the
-        // worklist.
-        EnqueueOperands(*FoldedInst);
-      }
+          SCEVGEPDescriptor d;
+          d.BaseAddr = Visitor.BaseAddress;
+          const SCEV *BaseAddrSE = SE.getSCEV(d.BaseAddr);
+          const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);
+          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);
 
-    // If a definition of an insn is only used by simplified or dead
-    // instructions, it's also dead. Check defs of all instructions from the
-    // worklist.
-    while (!Worklist.empty()) {
-      Instruction *I = Worklist.pop_back_val();
-      if (!L->contains(I))
-        continue;
-      if (DeadInstructions.count(I))
-        continue;
-
-      if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {
-            return DeadInstructions.count(cast<Instruction>(U));
-          })) {
-        NumberOfOptimizedInstructions += TTI.getUserCost(I);
-        DeadInstructions.insert(I);
-        EnqueueOperands(*I);
+          if (!AR)
+            continue;
+
+          const SCEVConstant *StepSE =
+              dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE));
+          const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart());
+          if (!StepSE || !StartSE)
+            continue;
+
+          d.Step = StepSE->getValue()->getValue();
+          d.Start = StartSE->getValue()->getValue();
+
+          SCEVCache[V] = d;
+        }
       }
     }
-    return NumberOfOptimizedInstructions;
   }
-};
 
-// Complete loop unrolling can make some loads constant, and we need to know if
-// that would expose any further optimization opportunities.
-// This routine estimates this optimization effect and returns the number of
-// instructions, that potentially might be optimized away.
-static unsigned
-approximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE,
-                                         unsigned TripCount,
-                                         const TargetTransformInfo &TTI) {
-  if (!TripCount || !UnrollMaxIterationsCountToAnalyze)
-    return 0;
-
-  UnrollAnalyzer UA(L, TripCount, SE, TTI);
-  UA.findConstFoldableLoads();
-
-  // Estimate number of instructions, that could be simplified if we replace a
-  // load with the corresponding constant. Since the same load will take
-  // different values on different iterations, we have to go through all loop's
-  // iterations here. To limit ourselves here, we check only first N
-  // iterations, and then scale the found number, if necessary.
-  unsigned IterationsNumberForEstimate =
-      std::min<unsigned>(UnrollMaxIterationsCountToAnalyze, TripCount);
-  unsigned NumberOfOptimizedInstructions = 0;
-  for (unsigned i = 0; i < IterationsNumberForEstimate; ++i)
-    NumberOfOptimizedInstructions +=
-        UA.estimateNumberOfOptimizedInstructions(i);
-
-  NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate;
-
-  return NumberOfOptimizedInstructions;
-}
+public:
+  UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
+                 const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize)
+      : L(L), TripCount(TripCount), SE(SE), TTI(TTI),
+        MaxUnrolledLoopSize(MaxUnrolledLoopSize) {}
+
+  /// \brief The number of optimized instructions.
+  unsigned NumberOfOptimizedInstructions;
+
+  /// \brief The total number of instructions in the unrolled loop.
+  unsigned UnrolledLoopSize;
+
+  // Complete loop unrolling can make some loads constant, and we need to know
+  // if that would expose any further optimization opportunities.  This routine
+  // estimates this optimization.  It assigns the computed number of
+  // instructions that potentially might be optimized away to
+  // NumberOfOptimizedInstructions, and the total number of instructions to
+  // UnrolledLoopSize (not counting blocks that won't be reached, if we were
+  // able to compute the branch condition).
+  void analyzeLoop() {
+    UnrolledLoopSize = UINT_MAX;
+    NumberOfOptimizedInstructions = 0;
+
+    // Don't simulate loops with a big or unknown trip count.
+    if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
+        TripCount > UnrollMaxIterationsCountToAnalyze)
+      return;
+
+    findConstantPointers();
+    simulateLoop();
+    assert(NumberOfOptimizedInstructions <= UnrolledLoopSize &&
+           "We can't optimize more than total number of instructions.");
+
+    // If we can overflow computing percentage of optimized instructions, just
+    // give a conservative answer. Anyway, we don't want to deal with such
+    // big loops.
+    if (NumberOfOptimizedInstructions > UINT_MAX / 100)
+      NumberOfOptimizedInstructions = 0;
+  }
+};
 
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
@@ -670,6 +747,48 @@
   L->setLoopID(NewLoopID);
 }
 
+bool LoopUnroll::canUnrollCompletely(
+    Loop *L, unsigned Threshold, unsigned AbsoluteThreshold,
+    unsigned UnrolledSize, unsigned NumberOfOptimizedInstructions,
+    unsigned PercentOfOptimizedForCompleteUnroll) {
+
+  if (Threshold == NoThreshold) {
+    DEBUG(dbgs() << "  Can fully unroll, because no threshold is set.\n");
+    return true;
+  }
+
+  if (UnrolledSize <= Threshold) {
+    DEBUG(dbgs() << "  Can fully unroll, because unrolled size: "
                 << UnrolledSize << "<" << Threshold << "\n");
+    return true;
+  }
+
+  unsigned PercentOfOptimizedInstructions =
+      NumberOfOptimizedInstructions * 100 /
+      UnrolledSize; // The previous check guards us from div by 0
+  if (UnrolledSize <= AbsoluteThreshold &&
+      PercentOfOptimizedInstructions >= PercentOfOptimizedForCompleteUnroll) {
+    DEBUG(dbgs() << "  Can fully unroll, because unrolling will help removing "
+                 << PercentOfOptimizedInstructions
+                 << "% instructions (threshold: "
+                 << PercentOfOptimizedForCompleteUnroll << "%)\n");
+    DEBUG(dbgs() << "  Unrolled size (" << UnrolledSize
+                 << ") is less than the threshold (" << AbsoluteThreshold
+                 << ").\n");
+    return true;
+  }
+
+  DEBUG(dbgs() << "  Too large to fully unroll:\n");
+  DEBUG(dbgs() << "    Unrolled size: " << UnrolledSize << "\n");
+  DEBUG(dbgs() << "    Estimated number of optimized instructions: "
+               << NumberOfOptimizedInstructions << "\n");
+  DEBUG(dbgs() << "    Absolute threshold: " << AbsoluteThreshold << "\n");
+  DEBUG(dbgs() << "    Minimum percent of removed instructions: "
+               << PercentOfOptimizedForCompleteUnroll << "\n");
+  DEBUG(dbgs() << "    Threshold for small loops: " << Threshold << "\n");
+  return false;
+}
+
 unsigned LoopUnroll::selectUnrollCount(
     const Loop *L, unsigned TripCount, bool PragmaFullUnroll,
     unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP,
@@ -776,27 +895,26 @@
     return false;
   }
 
-  unsigned NumberOfOptimizedInstructions =
-      approximateNumberOfOptimizedInstructions(L, *SE, TripCount, TTI);
-  DEBUG(dbgs() << "  Complete unrolling could save: "
-               << NumberOfOptimizedInstructions << "\n");
-
   unsigned Threshold, PartialThreshold;
+  unsigned AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll;
   selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold,
-                   NumberOfOptimizedInstructions);
+                   AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll);
 
   // Given Count, TripCount and thresholds determine the type of
   // unrolling which is to be performed.
enum { Full = 0, Partial = 1, Runtime = 2 }; int Unrolling; if (TripCount && Count == TripCount) { - if (Threshold != NoThreshold && UnrolledSize > Threshold) { - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << UnrolledSize << ">" << Threshold - << "\n"); - Unrolling = Partial; - } else { + UnrollAnalyzer UA(L, TripCount, *SE, TTI, AbsoluteThreshold); + UA.analyzeLoop(); + if (canUnrollCompletely( + L, Threshold, AbsoluteThreshold, + std::min(UnrolledSize, UA.UnrolledLoopSize), + UA.NumberOfOptimizedInstructions, + PercentOfOptimizedForCompleteUnroll)) { Unrolling = Full; + } else { + Unrolling = Partial; } } else if (TripCount && Count < TripCount) { Unrolling = Partial; Index: test/Transforms/LoopUnroll/full-unroll-early-exit.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/full-unroll-early-exit.ll @@ -0,0 +1,35 @@ +; The trip count of this loop is 100, and the size of the loop body is 7. +; It might seem unprofitable to unroll the loop because the final code size, if +; computed as TC*Size, is 700. +; However, if the loop is fully unrolled, then the compiler can figure out, +; that we'll exit early from the loop on iteration 2. Thus, the final code size +; would be only TC*2, which is quite affordable. + +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-absolute-threshold=50 -unroll-threshold=50 -unroll-percent-of-optimized-for-complete-unroll=10 | FileCheck %s +; CHECK-NOT: icmp +; CHECK-NOT: getelementptr +; CHECK-NOT: load +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +@a = private unnamed_addr constant [100 x i32] [i32 12, i32 0, i32 32, i32 40, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 0, i32 32, i32 0, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 0, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 43, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9], align 16 + +define i64 @foo() { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds [100 x i32]* @a, i64 0, i64 %iv + %a_i = load i32* %arrayidx, align 4 + %cmp_early_exit = icmp eq i32 %a_i, 0 + br i1 %cmp_early_exit, label %for.end, label %for.inc + +for.inc: ; preds = %for.body + %iv.next = add nuw nsw i64 %iv, 1 + %cmp = icmp slt i64 %iv.next, 100 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %for.inc + %iv.last = phi i64 [ %iv, %for.body ], [ %iv.next, %for.inc ] + ret i64 %iv.last +} Index: test/Transforms/LoopUnroll/full-unroll-no-taken-branch.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/full-unroll-no-taken-branch.ll @@ -0,0 +1,41 @@ +; In this test we check that the loop was unrolled. +; This loop contains a branch that makes loop bigger than we can afford to +; completely unroll. 
However, if we do unroll this loop, we can resolve this +; branch and figure out that in fact we never take it. Thus, we can ignore its +; cost. + +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=20 -unroll-absolute-threshold=150 -unroll-threshold=150 -unroll-percent-of-optimized-for-complete-unroll=10 | FileCheck %s +; CHECK-NOT: icmp +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +@a = private unnamed_addr constant [20 x i32] [i32 12, i32 10, i32 32, i32 201, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9, i32 12, i32 1, i32 32, i32 10, i32 44, i32 65, i32 56, i32 7, i32 8, i32 9] + +; Function Attrs: nounwind ssp uwtable +define void @foo(i32* %b, i32* %c) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.continue ] + %arrayidx = getelementptr inbounds [20 x i32]* @a, i64 0, i64 %indvars.iv + %a_i = load i32* %arrayidx, align 4 + %cmp_expensive_branch = icmp eq i32 %a_i, 0 + br i1 %cmp_expensive_branch, label %expensive.branch, label %for.continue + +expensive.branch: + %arrayidx_b = getelementptr inbounds i32* %b, i64 %indvars.iv + %arrayidx_c = getelementptr inbounds i32* %c, i64 %indvars.iv + %b_i = load i32* %arrayidx_b, align 4 + %c_i = load i32* %arrayidx_c, align 4 + %mul = mul nsw i32 %b_i, %c_i + store i32 %mul, i32* %arrayidx_c, align 4 + br label %for.continue + +for.continue: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 20 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +}
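Reviewer note (not part of the patch): the way the new thresholds interact in LoopUnroll::canUnrollCompletely can be summarized with the small standalone C++ sketch below. The helper function and the numbers in main() are hypothetical and only mirror the inequalities introduced by the patch (the NoThreshold special case is omitted); this is an illustration under those assumptions, not code from LLVM.

#include <cstdio>

// Sketch of the full-unroll decision: a loop may be fully unrolled either when
// its unrolled size fits under the regular threshold, or when a large enough
// percentage of its instructions is expected to be simplified away and the
// unrolled size still fits under the absolute threshold.
static bool sketchCanUnrollCompletely(unsigned Threshold,
                                      unsigned AbsoluteThreshold,
                                      unsigned UnrolledSize,
                                      unsigned OptimizedInsns,
                                      unsigned MinPercentOfOptimized) {
  if (UnrolledSize <= Threshold)
    return true;
  // UnrolledSize > Threshold >= 0 here, so the division is safe.
  unsigned Percent = OptimizedInsns * 100 / UnrolledSize;
  return UnrolledSize <= AbsoluteThreshold && Percent >= MinPercentOfOptimized;
}

int main() {
  // Hypothetical numbers: a 700-instruction unrolled body of which 500
  // instructions fold away, with -unroll-threshold=150,
  // -unroll-absolute-threshold=800 and
  // -unroll-percent-of-optimized-for-complete-unroll=60.
  std::printf("%d\n", sketchCanUnrollCompletely(150, 800, 700, 500, 60));
  return 0;
}

With these example values the loop is too big for the plain threshold (700 > 150), but 71% of its instructions are expected to disappear and 700 <= 800, so full unrolling would be allowed; this mirrors what the two new lit tests exercise.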