Index: include/llvm/Analysis/LoopUnrollAnalyzer.h
===================================================================
--- include/llvm/Analysis/LoopUnrollAnalyzer.h
+++ include/llvm/Analysis/LoopUnrollAnalyzer.h
@@ -48,8 +48,9 @@
 public:
   UnrolledInstAnalyzer(unsigned Iteration,
                        DenseMap<Value *, Constant *> &SimplifiedValues,
-                       ScalarEvolution &SE, const Loop *L)
-      : SimplifiedValues(SimplifiedValues), SE(SE), L(L) {
+                       ScalarEvolution &SE, const Loop *L, bool CompleteUnroll)
+      : SimplifiedValues(SimplifiedValues), SE(SE),
+        L(L), CompleteUnroll(CompleteUnroll) {
       IterationNumber = SE.getConstant(APInt(64, Iteration));
   }
 
@@ -81,6 +82,7 @@
 
   ScalarEvolution &SE;
   const Loop *L;
+  bool CompleteUnroll;
 
   bool simplifyInstWithSCEV(Instruction *I);
 
Index: lib/Analysis/LoopUnrollAnalyzer.cpp
===================================================================
--- lib/Analysis/LoopUnrollAnalyzer.cpp
+++ lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -35,6 +35,8 @@
     SimplifiedValues[I] = SC->getValue();
     return true;
   }
+  if (!CompleteUnroll)
+    return false;
 
   auto *AR = dyn_cast<SCEVAddRecExpr>(S);
   if (!AR || AR->getLoop() != L)
@@ -211,6 +213,10 @@
   if (Base::visitPHINode(PN))
     return true;
 
+  // Consider PHI is foldable only after complete unroll.
+  if (!CompleteUnroll)
+    return false;
+
   // The loop induction PHI nodes are definitionally free.
   return PN.getParent() == L->getHeader();
 }
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -89,6 +89,11 @@
     cl::desc("Allow generation of a loop remainder (extra iterations) "
              "when unrolling a loop."));
 
+static cl::opt<bool> UnrollAllowForce(
+    "unroll-allow-force", cl::Hidden, cl::init(false),
+    cl::desc("Allow generation of a loop remainder (extra iterations) "
+             "when unrolling a loop."));
+
 static cl::opt<bool>
     UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
                   cl::desc("Unroll loops with run-time trip counts"));
@@ -229,6 +234,27 @@
 };
 }
 
+/// Some instructions get the same value during loop execution.
+/// The function find a cycle length for such.
+static unsigned getPhiCycleLength (PHINode *PHI, const Loop *L) {
+  Value *V = PHI->getIncomingValueForBlock(L->getLoopLatch());
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return 0;
+  switch (I->getOpcode()) {
+  case Instruction::Sub:
+    if (ConstantInt *CS = dyn_cast<ConstantInt>(I->getOperand(0)))
+      if (CS->isZero() && I->getOperand(1) == PHI)
+        return 2;
+  case Instruction::Xor:
+    if (I->getOperand(0) == PHI && L->isLoopInvariant(I->getOperand(1)))
+      return 2;
+  default:
+    return 0;
+  }
+  return 0;
+}
+
 namespace {
 struct EstimatedUnrollCost {
   /// \brief The estimated cost after unrolling.
@@ -254,9 +280,9 @@
 /// the analysis failed (no benefits expected from the unrolling, or the loop is
 /// too big to analyze), the returned value is None.
 static Optional<EstimatedUnrollCost>
-analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
+analyzeLoopUnrollCost(const Loop *L, unsigned UnrollCount, DominatorTree &DT,
                       ScalarEvolution &SE, const TargetTransformInfo &TTI,
-                      unsigned MaxUnrolledLoopSize) {
+                      unsigned MaxUnrolledLoopSize, bool CompleteUnroll = true) {
   // We want to be able to scale offsets by the trip count and add more offsets
   // to them without checking for overflows, and we already don't want to
   // analyze *massive* trip counts, so we force the max to be reasonably small.
@@ -268,9 +294,9 @@
   if (!L->empty())
     return None;
 
-  // Don't simulate loops with a big or unknown tripcount
-  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
-      TripCount > UnrollMaxIterationsCountToAnalyze)
+  // Don't simulate loops with a big or unknown UnrollCount
+  if (!UnrollMaxIterationsCountToAnalyze || !UnrollCount ||
+      UnrollCount > UnrollMaxIterationsCountToAnalyze)
     return None;
 
   SmallSetVector<BasicBlock *, 16> BBWorklist;
@@ -331,8 +357,10 @@
         // If this is a PHI node in the loop header, just add it to the PHI set.
         if (auto *PhiI = dyn_cast<PHINode>(I))
           if (PhiI->getParent() == L->getHeader()) {
-            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
-                                  "inherently simplify during unrolling.");
+            if (CompleteUnroll)
+              assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
+                                    "inherently simplify during complete "
+                                    "unrolling.");
             if (Iteration == 0)
               continue;
 
@@ -363,7 +391,15 @@
           auto *OpI = dyn_cast<Instruction>(Op);
           if (!OpI || !L->contains(OpI))
             continue;
-
+          // For regular unroll instruction that depends on PHI also costs.
+          if (!CompleteUnroll && isa<PHINode>(OpI)) {
+            auto *OpIP = dyn_cast<PHINode>(OpI);
+            if(OpIP->getParent() == L->getHeader())
+              if (auto *OpIPI =
+                      dyn_cast<Instruction>(
+                          OpIP->getIncomingValueForBlock(L->getLoopLatch())))
+            CostWorklist.push_back(OpIPI);
+          }
           // Otherwise accumulate its cost.
           CostWorklist.push_back(OpI);
         }
@@ -392,7 +428,7 @@
   // which would be simplified.
   // Since the same load will take different values on different iterations,
   // we literally have to go through all loop's iterations.
-  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+  for (unsigned Iteration = 0; Iteration < UnrollCount; ++Iteration) {
     DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
 
     // Prepare for the iteration by collecting any simplified entry or backedge
@@ -408,10 +444,26 @@
           PHI->getNumIncomingValues() == 2 &&
           "Must have an incoming value only for the preheader and the latch.");
 
+      // If incoming value for PHI is another loop PHI, that means we need to
+      // store previous value (without unroll).
+      if (PHINode *PHIL = dyn_cast<PHINode>(
+          PHI->getIncomingValueForBlock(L->getLoopLatch())))
+        if (Iteration && L->contains(PHIL))
+          RolledDynamicCost++;
+
+      unsigned PhiCycle = getPhiCycleLength (PHI, L);
+      // For complete unroll if we were unable to get PHI cycle length
+      // consider cycle length as full unroll count (TripCount).
+      if (CompleteUnroll && PhiCycle == 0)
+        PhiCycle = UnrollCount;
+      // If PHI has no cycle we are unable to simplify it.
+      if (PhiCycle == 0)
+        continue;
+
       Value *V = PHI->getIncomingValueForBlock(
-          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+          Iteration % PhiCycle ? L->getLoopLatch() : L->getLoopPreheader());
       Constant *C = dyn_cast<Constant>(V);
-      if (Iteration != 0 && !C)
+      if ((Iteration % PhiCycle) && !C)
         C = SimplifiedValues.lookup(V);
       if (C)
         SimplifiedInputValues.push_back({PHI, C});
@@ -421,8 +473,9 @@
     SimplifiedValues.clear();
     while (!SimplifiedInputValues.empty())
       SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues,
+                                  SE, L, CompleteUnroll);
 
-    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
 
     BBWorklist.clear();
     BBWorklist.insert(L->getHeader());
@@ -521,7 +574,7 @@
 
     // If we found no optimization opportunities on the first iteration, we
     // won't find them on later ones too.
-    if (UnrolledCost == RolledDynamicCost) {
+    if (CompleteUnroll && UnrolledCost == RolledDynamicCost) {
       DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
                    << "  UnrolledCost: " << UnrolledCost << "\n");
       return None;
@@ -538,9 +591,12 @@
         break;
 
       Value *Op = PN->getIncomingValueForBlock(ExitingBB);
+      if (auto *OpPN = dyn_cast<PHINode>(Op))
+        if (OpPN->getParent() == L->getHeader())
+          Op = OpPN->getIncomingValueForBlock(L->getLoopLatch());
       if (auto *OpI = dyn_cast<Instruction>(Op))
         if (L->contains(OpI))
-          AddCostRecursively(*OpI, TripCount - 1);
+          AddCostRecursively(*OpI, UnrollCount - 1);
     }
   }
 
@@ -907,6 +963,25 @@
           << NV("UnrollCount", UP.Count) << " time(s).");
   }
 
+  // Estimate if Force unroll could be profitable.
+  // Currently try only 2 times unroll.
+  // Do it only for innermost loops.
+  if (UnrollAllowForce && !ExplicitUnroll &&
+      (UP.Count < 2 ||
+       isa<SCEVCouldNotCompute>(SE->getBackedgeTakenCount(L)))) {
+    Optional<EstimatedUnrollCost> Cost =
+        analyzeLoopUnrollCost(L, 2, DT, *SE, TTI,
+                              UP.PartialThreshold + UP.BEInsns, false);
+    if (Cost && Cost->UnrolledCost < Cost->RolledDynamicCost) {
+      if ((unsigned)Cost->UnrolledCost <= UP.PartialThreshold)
+        UP.Force = true;
+      UP.Count = 2;
+      DEBUG(dbgs() << "  unrolling with count: " << UP.Count << "\n");
+      // Unroll uncountable loops only once.
+      return true;
+    }
+  }
+
   if (UP.Count > UP.MaxCount)
     UP.Count = UP.MaxCount;
   DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count << "\n");
Index: test/Transforms/LoopUnroll/unroll-force.ll
===================================================================
--- test/Transforms/LoopUnroll/unroll-force.ll
+++ test/Transforms/LoopUnroll/unroll-force.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -S -unroll-runtime -loop-unroll -unroll-allow-force | FileCheck %s
+; The test checks unroll for uncountable loop, when previous value is stored.
+
+; CHECK: loop.body.if:
+; CHECK: loop.body.if.1:
+
+%struct.list = type { %struct.list*, i32 }
+
+@head = global %struct.list { %struct.list* null, i32 1 }, align 8
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  %0 = load i32, i32* getelementptr inbounds (%struct.list, %struct.list* @head, i64 0, i32 1), align 8
+  br label %loop.body
+
+loop.body:                                          ; preds = %entry, %loop.body.if
+  %prev = phi %struct.list* [ @head, %entry ], [ %cur, %loop.body.if ]
+  %cur = phi %struct.list* [ @head, %entry ], [ %2, %loop.body.if ]
+  %1 = phi i32 [ %0, %entry ], [ %3, %loop.body.if ]
+  %next.cur = getelementptr inbounds %struct.list, %struct.list* %cur, i64 0, i32 0
+  %2 = load %struct.list*, %struct.list** %next.cur, align 8
+  %tobool = icmp eq %struct.list* %2, null
+  br i1 %tobool, label %if.else, label %loop.body.if
+
+loop.body.if:                       ; preds = %loop.body
+  %val = getelementptr inbounds %struct.list, %struct.list* %cur, i64 0, i32 1
+  %3 = load i32, i32* %val, align 8
+  %cmp = icmp sgt i32 %3, %0
+  br i1 %cmp, label %loopexit, label %loop.body
+
+if.else:                                          ; preds = %if.then
+  %next.prev = getelementptr inbounds %struct.list, %struct.list* %prev, i64 0, i32 0
+  store %struct.list* null, %struct.list** %next.prev, align 8
+  br label %end
+
+loopexit:                                 ; preds = %loop.body.if
+  br label %end
+
+end:                                          ; preds = %loopexit, %if.else
+  %4 = phi i32 [ %1, %if.else ], [ %3, %loopexit ]
+  ret i32 %4
+}
+
+; The test checks unroll for uncountable loop, when XOR is simplified.
+
+; CHECK: while.body:
+; CHECK: while.body.1:
+
+; Function Attrs: norecurse nounwind uwtable
+define void @xor(i32* nocapture %a) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %entry.cmp = icmp eq i32 %0, -1
+  br i1 %entry.cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %1 = phi i32 [ %2, %while.body ], [ %0, %entry ]
+  %arrayidx12 = phi i32* [ %arrayidx, %while.body ], [ %a, %entry ]
+  %s = phi i32 [ %xor, %while.body ], [ 0, %entry ]
+  %i = phi i32 [ %inc, %while.body ], [ undef, %entry ]
+  %add = add nsw i32 %1, %s
+  store i32 %add, i32* %arrayidx12, align 4
+  %xor = xor i32 %s, 1
+  %inc = add nsw i32 %i, 1
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %inc
+  %2 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp eq i32 %2, -1
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
Index: unittests/Analysis/UnrollAnalyzer.cpp
===================================================================
--- unittests/Analysis/UnrollAnalyzer.cpp
+++ unittests/Analysis/UnrollAnalyzer.cpp
@@ -38,7 +38,7 @@
     TripCount = SE->getSmallConstantTripCount(L, Exiting);
     for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) {
       DenseMap<Value *, Constant *> SimplifiedValues;
-      UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L);
+      UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L, true);
       for (auto *BB : L->getBlocks())
         for (Instruction &I : *BB)
           Analyzer.visit(I);