Index: lib/Transforms/Utils/LoopUnrollRuntime.cpp
===================================================================
--- lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -45,6 +45,10 @@
 
 STATISTIC(NumRuntimeUnrolled,
           "Number of loops unrolled with run-time trip counts");
+static cl::opt<bool> UnrollRuntimeMultiExit(
+    "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
+    cl::desc("Allow runtime unrolling for loops with multiple exits, when "
+             "epilog is generated"));
 
 /// Connect the unrolling prolog code to the original loop.
 /// The unrolling prolog code contains code to execute the
@@ -465,14 +469,48 @@
                                       LoopInfo *LI, ScalarEvolution *SE,
                                       DominatorTree *DT, bool PreserveLCSSA) {
   // for now, only unroll loops that contain a single exit
-  if (!L->getExitingBlock())
+  if (!UnrollRuntimeMultiExit && !L->getExitingBlock())
     return false;
 
-  // Make sure the loop is in canonical form, and there is a single
-  // exit block only.
+  // Make sure the loop is in canonical form.
   if (!L->isLoopSimplifyForm())
     return false;
+
+  // Since loop is in simplify form, we'll have exactly one latch.
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Header = L->getHeader();
+
   BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop
+  // These are exit blocks other than the target of the latch exiting block.
+  SmallVector<BasicBlock *, 4> OtherExits;
+  BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+  unsigned int ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+  // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
+  // targets of the Latch be an exit block out of the loop. This needs
+  // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
+  assert(!L->contains(LatchBR->getSuccessor(ExitIndex)) &&
+         "one of the loop latch successors should be the exit block!");
+  // Support runtime unrolling for multiple exit blocks and multiple exiting
+  // blocks, when all except `Exit` block have no immediate successors.
+  if (!Exit && UnrollRuntimeMultiExit) {
+    assert(UseEpilogRemainder && "Multi exit unrolling is currently supported "
+                                 "unrolling with epilog remainder only!");
+    Exit = LatchBR->getSuccessor(ExitIndex);
+    // TODO: Support multiple exiting blocks jumping to the latch `Exit`. This
+    // will need updating the logic in connectEpilog.
+    if (!Exit->getSinglePredecessor())
+        return false;
+    SmallVector<BasicBlock *, 4> Exits;
+    L->getUniqueExitBlocks(Exits);
+    for (auto *BB : Exits) {
+      if (BB == Exit)
+        continue;
+      if (BB->getTerminator()->getNumSuccessors())
+        return false;
+      OtherExits.push_back(BB);
+    }
+  }
+
   if (!Exit)
     return false;
 
@@ -483,7 +521,12 @@
 
   // Only unroll loops with a computable trip count, and the trip count needs
   // to be an int value (allowing a pointer type is a TODO item).
-  const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
+  // Since we support multiple exit blocks, we cannot use getBackEdgeTakenCount
+  // which finds SCEV for single exit loops. Since the main exit is the latch
+  // exit, we take the exitCount for the latch. This is equivalent to
+  // getBackEdgeTakenCount when we have just one exit, and the exiting block is
+  // the latch.
+  const SCEV *BECountSC = SE->getExitCount(L, Latch);
   if (isa<SCEVCouldNotCompute>(BECountSC) ||
       !BECountSC->getType()->isIntegerTy())
     return false;
@@ -496,7 +539,6 @@
   if (isa<SCEVCouldNotCompute>(TripCountSC))
     return false;
 
-  BasicBlock *Header = L->getHeader();
   BasicBlock *PreHeader = L->getLoopPreheader();
   BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
   const DataLayout &DL = Header->getModule()->getDataLayout();
@@ -509,19 +551,6 @@
   // comment on ModVal below.
   if (Log2_32(Count) > BEWidth)
     return false;
-
-  BasicBlock *Latch = L->getLoopLatch();
-
-  // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
-  // targets of the Latch be the single exit block out of the loop. This needs
-  // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
-  BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
-  assert(
-      (LatchBR->getSuccessor(0) == Exit || LatchBR->getSuccessor(1) == Exit) &&
-      "one of the loop latch successors should be "
-      "the exit block!");
-  // Avoid warning of unused `LatchBR` variable in release builds.
-  (void)LatchBR;
   // Loop structure is the following:
   //
   // PreHeader
@@ -659,6 +688,44 @@
                                 NewBlocks[0]->getIterator(),
                                 F->end());
 
+  // At this stage, the loop blocks are cloned for the prolog/epilog. Clone the
+  // other exit blocks as well for the epilog, and update the Vmap and dominator
+  // tree info.
+  for (auto *BB : OtherExits) {
+
+    BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
+    NewBB->setName(BB->getName() + ".epil");
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+         ++II) {
+      Instruction *NewInst = II->clone();
+      // For phi nodes, get the appropriate values and the block from the VMap.
+      if (isa<PHINode>(II)) {
+        PHINode *OldPHI = cast<PHINode>(II);
+        PHINode *NewPHI = cast<PHINode>(NewInst);
+        for (unsigned int i = 0; i < OldPHI->getNumIncomingValues(); i++) {
+          BasicBlock *NewPred =
+              cast<BasicBlock>(VMap[OldPHI->getIncomingBlock(i)]);
+          Value *InVal = OldPHI->getIncomingValue(i);
+          NewPHI->setIncomingBlock(i, NewPred);
+          if (Value *V = VMap.lookup(InVal))
+            NewPHI->setIncomingValue(i, V);
+        }
+        NewInst = NewPHI;
+      }
+      if (II->hasName())
+        NewInst->setName(II->getName() + ".epil");
+      VMap[&*II] = NewInst;
+      NewBB->getInstList().push_back(NewInst);
+    }
+    VMap[BB] = NewBB;
+    NewBlocks.push_back(NewBB);
+
+    if (DT) {
+      BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+      DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+    }
+  }
+
   // Loop structure should be the following:
   //  Epilog             Prolog
   //
Index: test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
@@ -0,0 +1,131 @@
+; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -instcombine
+
+; test with three exiting and three exit blocks.
+define void @test1(i64 %trip) {
+; CHECK-LABEL: test1
+; CHECK:  %niter.nsub.7 = add i64 %niter, -8
+entry:
+  br label %loop_header
+loop_header:
+  %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop_latch ]
+  br i1 undef, label %loop_latch, label %loop_exiting_bb1
+loop_exiting_bb1:
+  br i1 false, label %loop_exiting_bb2, label %exit1
+loop_exiting_bb2:
+  br i1 false, label %loop_latch, label %exit3
+exit3:
+  ret void
+loop_latch:
+  %iv_next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv_next, %trip
+  br i1 %cmp, label %loop_header, label %exit2.loopexit
+exit1:
+ ret void
+exit2.loopexit:
+  ret void
+}
+
+
+; test with three exiting and two exit blocks
+; There are 2 values for the return that are calculated at every iteration.
+; %sum.02 and %add. Both of these are incoming values for phi from every exiting
+; unrolled block.
+define i32 @test2(i32* nocapture %a, i64 %n) nounwind uwtable readonly {
+; CHECK-LABEL: test2
+; CHECK-LABEL: for.exit2:
+; CHECK-NEXT:    %retval = phi i32 [ %sum.02, %header ], [ 42, %for.exiting_block ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ],
+; CHECK-LABEL: for.exit2.epil:
+; CHECK-NEXT:    %retval.epil = phi i32 [ %sum.02.epil, %header.epil ], [ 42, %for.exiting_block.epil ]
+; CHECK: %niter.nsub.7 = add i64 %niter, -8
+entry:
+  br label %header
+
+header:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  br i1 false, label %for.exit2, label %for.exiting_block
+
+for.exiting_block:
+ %cmp = icmp eq i64 %n, 42
+ br i1 %cmp, label %for.exit2, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.02
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.end, label %header
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+
+for.exit2:
+  %retval = phi i32 [ %sum.02, %header ], [ 42, %for.exiting_block ]
+  ret i32 %retval
+}
+
+; FIXME: Support multiple exiting blocks to the same latch exit block.
+define i32 @test3(i32* nocapture %a, i64 %n) nounwind uwtable readonly {
+; CHECK-LABEL: test3
+; CHECK-NOT: .unr
+entry:
+  br label %header
+
+header:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  br i1 undef, label %for.end, label %for.exiting_block
+  ;br i1 undef, label %for.exit2, label %for.exiting_block
+
+for.exiting_block:
+ %cmp = icmp eq i64 %n, 42
+ br i1 %cmp, label %for.exit2, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.02
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.end, label %header
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 0, %header ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+
+for.exit2:
+  ret i32 42
+}
+
+
+; test with two exiting and three exit blocks.
+; the non-latch exiting block has a switch.
+define void @test4(i64 %trip, i64 %add) {
+; CHECK-LABEL: test4
+; CHECK: %niter.nsub.7 = add i64 %niter, -8
+entry:
+  br label %loop_header
+loop_header:
+  %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop_latch ]
+  %sum = phi i64 [ 0, %entry ], [ %sum.next, %loop_latch ]
+  br i1 undef, label %loop_latch, label %loop_exiting_bb1
+loop_exiting_bb1:
+   switch i64 %sum, label %loop_latch [
+     i64 24, label %exit1
+     i64 42, label %exit3
+   ]
+
+exit3:
+  ret void
+loop_latch:
+  %iv_next = add nuw nsw i64 %iv, 1
+  %sum.next = add i64 %sum, %add
+  %cmp = icmp ne i64 %iv_next, %trip
+  br i1 %cmp, label %loop_header, label %exit2.loopexit
+exit1:
+ ret void
+exit2.loopexit:
+  ret void
+}