Index: include/llvm/Analysis/LoopInfo.h
===================================================================
--- include/llvm/Analysis/LoopInfo.h
+++ include/llvm/Analysis/LoopInfo.h
@@ -561,6 +561,14 @@
   /// Otherwise return null.
   BasicBlock *getUniqueExitBlock() const;
 
+  /// If among getUniqueExitBlocks there is only one block with successors,
+  /// return it, ignoring exit blocks that end in unreachable (e.g. after a
+  /// call to exit(), a failed assert, or a throw). Otherwise return null.
+  BasicBlock *getUniqueReachableExitBlock() const;
+
+  /// Same as above, but return the single exiting block to a reachable exit.
+  BasicBlock *getExitingToReachableBlock() const;
+
   void dump() const;
   void dumpVerbose() const;
Index: include/llvm/Transforms/Utils/UnrollLoop.h
===================================================================
--- include/llvm/Transforms/Utils/UnrollLoop.h
+++ include/llvm/Transforms/Utils/UnrollLoop.h
@@ -69,7 +69,7 @@
                                 AssumptionCache *AC, bool PreserveLCSSA);
 
-void computePeelCount(Loop *L, unsigned LoopSize,
+void computePeelCount(Loop *L, DominatorTree &DT, unsigned LoopSize,
                       TargetTransformInfo::UnrollingPreferences &UP,
                       unsigned &TripCount, ScalarEvolution &SE);
 
Index: lib/Analysis/LoopInfo.cpp
===================================================================
--- lib/Analysis/LoopInfo.cpp
+++ lib/Analysis/LoopInfo.cpp
@@ -432,6 +432,63 @@
   }
 }
 
+BasicBlock *Loop::getUniqueReachableExitBlock() const {
+  SmallVector<BasicBlock *, 8> UniqueExitBlocks;
+  getUniqueExitBlocks(UniqueExitBlocks);
+  if (UniqueExitBlocks.size() == 1)
+    return UniqueExitBlocks[0];
+  BasicBlock *ReachableExit = nullptr;
+  for (BasicBlock *BB : UniqueExitBlocks) {
+    const TerminatorInst *TI = BB->getTerminator();
+    // An exit block that ends in unreachable, or that is terminated by a
+    // call to @llvm.experimental.deoptimize, is treated as unreachable,
+    // since the @llvm.experimental.deoptimize call is expected to
+    // practically never execute.
+    if (TI->getNumSuccessors() == 0 &&
+        (isa<UnreachableInst>(TI) || BB->getTerminatingDeoptimizeCall()))
+      continue;
+
+    // A second reachable exit was found; give up.
+    if (ReachableExit)
+      return nullptr;
+    ReachableExit = BB;
+  }
+  return ReachableExit;
+}
+
+BasicBlock *Loop::getExitingToReachableBlock() const {
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  getExitingBlocks(ExitingBlocks);
+  if (ExitingBlocks.size() == 1)
+    return ExitingBlocks[0];
+  BasicBlock *ExitingToReachable = nullptr;
+  for (BasicBlock *BB : ExitingBlocks) {
+    BasicBlock *EB = nullptr;
+    for (BasicBlock *Succ : successors(BB)) {
+      if (contains(Succ))
+        continue;
+      if (EB)
+        return nullptr;
+      EB = Succ;
+    }
+    const TerminatorInst *TI = EB->getTerminator();
+    // As above, an exit that ends in unreachable or in a call to
+    // @llvm.experimental.deoptimize is treated as unreachable, since such a
+    // call is expected to practically never execute.
+    if (TI->getNumSuccessors() == 0 &&
+        (isa<UnreachableInst>(TI) || EB->getTerminatingDeoptimizeCall()))
+      continue;
+    // A second exiting block leading to a reachable exit was found; give up.
+    if (ExitingToReachable)
+      return nullptr;
+    ExitingToReachable = BB;
+  }
+  return ExitingToReachable;
+}
+
 BasicBlock *Loop::getUniqueExitBlock() const {
   SmallVector<BasicBlock *, 8> UniqueExitBlocks;
   getUniqueExitBlocks(UniqueExitBlocks);
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -795,7 +795,7 @@
   }
 
   // 4th priority is loop peeling
-  computePeelCount(L, LoopSize, UP, TripCount, SE);
+  computePeelCount(L, DT, LoopSize, UP, TripCount, SE);
   if (UP.PeelCount) {
     UP.Runtime = false;
     UP.Count = 1;
Index: lib/Transforms/Utils/LoopUnrollPeel.cpp
===================================================================
--- lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -75,14 +76,14 @@
     return false;
 
-  // Only peel loops that contain a single exit
-  if (!L->getExitingBlock() || !L->getUniqueExitBlock())
+  // Only peel loops that contain a single reachable exit.
+  if (!L->getExitingToReachableBlock() || !L->getUniqueReachableExitBlock())
     return false;
 
   // Don't try to peel loops where the latch is not the exiting block.
   // This can be an indication of two different things:
   // 1) The loop is not rotated.
   // 2) The loop contains irreducible control flow that involves the latch.
-  if (L->getLoopLatch() != L->getExitingBlock())
+  if (L->getLoopLatch() != L->getExitingToReachableBlock())
     return false;
 
   return true;
@@ -217,7 +218,7 @@
 }
 
 // Return the number of iterations we want to peel off.
-void llvm::computePeelCount(Loop *L, unsigned LoopSize,
+void llvm::computePeelCount(Loop *L, DominatorTree &DT, unsigned LoopSize,
                             TargetTransformInfo::UnrollingPreferences &UP,
                             unsigned &TripCount, ScalarEvolution &SE) {
   assert(LoopSize > 0 && "Zero loop size is not allowed!");
@@ -229,6 +230,51 @@
   if (!L->empty())
     return;
 
+  // Consider peeling off the first iteration in the following case.
+  if (2 * LoopSize <= UP.Threshold && !L->getUniqueExitBlock() &&
+      L->getUniqueReachableExitBlock()) {
+    // There is more than one exit block, but only one of them is reachable
+    // (has successors); the others end in unreachable.
+    SmallVector<BasicBlock *, 4> ExitBlocks;
+    L->getUniqueExitBlocks(ExitBlocks);
+    for (BasicBlock *BB : L->blocks()) {
+      // A block that dominates every exit block executes before the first
+      // conditional exit, so all of its instructions are already safe to
+      // speculate; peeling will not help for them.
+      bool SafeToSpeculate = true;
+      for (BasicBlock *EB : ExitBlocks) {
+        if (!DT.dominates(BB, EB)) {
+          SafeToSpeculate = false;
+          break;
+        }
+      }
+      if (SafeToSpeculate)
+        continue;
+
+      for (Instruction &I : *BB) {
+        // Skip store and call instructions as they are unsafe to speculate
+        // even after peeling.
+        if (isa<StoreInst>(&I) || isa<CallInst>(&I))
+          continue;
+        bool OperandsInv = true;
+        for (Value *Op : I.operands()) {
+          if (!L->isLoopInvariant(Op)) {
+            OperandsInv = false;
+            break;
+          }
+        }
+        // An instruction with loop-invariant operands that is still unsafe
+        // to speculate is only unsafe because an unreachable exit block may
+        // be taken before it. Peeling the first iteration proves the
+        // instruction safe to speculate in the remaining iterations.
+        if (OperandsInv && !isSafeToSpeculativelyExecute(&I)) {
+          DEBUG(dbgs() << "Peel one iteration to make instruction " << I
+                       << " safe to speculate.\n");
+          UP.PeelCount = 1;
+          return;
+        }
+      }
+    }
+  }
+
   // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
   // iterations of the loop. For this we compute the number for iterations after
   // which every Phi is guaranteed to become an invariant, and try to peel the
@@ -482,10 +528,13 @@
   BasicBlock *Header = L->getHeader();
   BasicBlock *PreHeader = L->getLoopPreheader();
   BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getUniqueExitBlock();
+  BasicBlock *Exit = L->getUniqueReachableExitBlock();
   Function *F = Header->getParent();
 
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+
   // Set up all the necessary basic blocks. It is convenient to split the
   // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
   // body, and a new preheader for the "real" loop.
@@ -579,6 +628,27 @@
   // previous one.
   remapInstructionsInBlocks(NewBlocks, VMap);
 
+  // Unreachable exit blocks are now allowed, so their phi nodes have to be
+  // adjusted: after peeling, each such exit block gains an extra predecessor
+  // for every peeled iteration.
+  for (BasicBlock *EB : ExitBlocks) {
+    if (EB == Exit)
+      continue;
+    for (BasicBlock::iterator I = EB->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PHI = cast<PHINode>(I);
+      for (BasicBlock *P : predecessors(EB)) {
+        if (!L->contains(P))
+          continue;
+        Value *IncomingVal = PHI->getIncomingValueForBlock(P);
+        // Use the cloned value if the incoming value is defined inside the
+        // loop; values defined outside the loop can be reused directly.
+        if (Value *ClonedVal = VMap.lookup(IncomingVal))
+          PHI->addIncoming(ClonedVal, cast<BasicBlock>(VMap[P]));
+        else
+          PHI->addIncoming(IncomingVal, cast<BasicBlock>(VMap[P]));
+      }
+    }
+  }
+
   if (DT) {
     // Latches of the cloned loops dominate over the loop exit, so idom of the
     // latter is the first cloned loop body, as original PreHeader dominates
Index: test/Transforms/LoopUnroll/peel-loop-spec-load.ll
===================================================================
--- test/Transforms/LoopUnroll/peel-loop-spec-load.ll
+++ test/Transforms/LoopUnroll/peel-loop-spec-load.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-allow-peeling -unroll-threshold=150 | FileCheck %s
+
+; Generated from the following source:
+;
+; int foo(std::vector<int> *a, std::vector<int> *b) {
+;   int s = 0;
+;   for (int i = 0; i < 1024; i++) {
+;     s += a->at(i) + b->at(i);
+;   }
+;   return s;
+; }
+;
+; Peeling is profitable here because every call to at() emits a bounds check
+; that may throw. The bounds check loads loop-invariant values which are
+; unsafe to speculate because of the potential exception.
+; Once the first iteration is peeled out of the loop, all invariant loads
+; have been executed before the loop and thus become safe to speculate.
+;
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%"class.std::vector" = type { %"struct.std::_Vector_base" }
+%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl" }
+%"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl" = type { i32*, i32*, i32* }
+
+@.str = private unnamed_addr constant [23 x i8] c"vector::_M_range_check\00", align 1
+
+; Function Attrs: noinline uwtable
+define dso_local i32 @_Z3fooPSt6vectorIiSaIiEES2_i(%"class.std::vector"* nocapture readonly, %"class.std::vector"* nocapture readonly, i32) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3fooPSt6vectorIiSaIiEES2_i(
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[TMP0:%.*]], i64 0, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast %"class.std::vector"* [[TMP0]] to i64*
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = ashr exact i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[TMP1:%.*]], i64 0, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32** [[TMP11]] to i64*
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast %"class.std::vector"* [[TMP1]] to i64*
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i32*
+; CHECK-NEXT:    br label [[DOTPEEL_BEGIN:%.*]]
+; CHECK:       .peel.begin:
+; CHECK-NEXT:    br label [[TMP15:%.*]]
+; CHECK:         [[TMP16:%.*]] = icmp ugt i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[TMP16]], label [[TMP17:%.*]], label [[TMP38:%.*]]
+; CHECK:         [[TMP18:%.*]] = load i64, i64* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, i64* [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = sub i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = ashr exact i64 [[TMP20]], 2
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ugt i64 [[TMP21]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP23:%.*]], label [[TMP45:%.*]]
+; CHECK:         [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i32*
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP26]], i64 0
+; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[TMP25]], 0
+; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP29]], [[TMP28]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add nuw nsw i64 0, 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ult i64 [[TMP31]], 1024
+; CHECK-NEXT:    br i1 [[TMP32]], label [[DOTPEEL_NEXT:%.*]], label [[TMP33:%.*]]
+; CHECK:       .peel.next:
+; CHECK-NEXT:    br label [[DOTPEEL_NEXT1:%.*]]
+; CHECK:       .peel.next1:
+; CHECK-NEXT:    br label [[DOTPEEL_NEWPH:%.*]]
+; CHECK:       .peel.newph:
+; CHECK-NEXT:    br label [[TMP34:%.*]]
+; CHECK:       .loopexit3:
+; CHECK-NEXT:    [[DOTLCSSA_PH:%.*]] = phi i32 [ [[TMP53:%.*]], [[TMP46:%.*]] ]
+; CHECK-NEXT:    br label [[TMP33]]
+; CHECK:         [[DOTLCSSA:%.*]] = phi i32 [ [[TMP30]], [[TMP23]] ], [ [[DOTLCSSA_PH]], [[DOTLOOPEXIT3:%.*]] ]
+; CHECK-NEXT:    ret i32 [[DOTLCSSA]]
+; CHECK:         [[TMP35:%.*]] = phi i64 [ [[TMP31]], [[DOTPEEL_NEWPH]] ], [ [[TMP54:%.*]], [[TMP46]] ]
+; CHECK-NEXT:    [[TMP36:%.*]] = phi i32 [ [[TMP30]], [[DOTPEEL_NEWPH]] ], [ [[TMP53]], [[TMP46]] ]
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ugt i64 [[TMP10]], [[TMP35]]
+; CHECK-NEXT:    br i1 [[TMP37]], label [[TMP39:%.*]], label [[DOTLOOPEXIT:%.*]]
+; CHECK:       .loopexit:
+; CHECK-NEXT:    br label [[TMP38]]
+; CHECK:         tail call void @_ZSt20__throw_out_of_rangePKc(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i64 0, i64 0))
+; CHECK-NEXT:    unreachable
+; CHECK:         [[TMP40:%.*]] = load i64, i64* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP41:%.*]] = load i64, i64* [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP42:%.*]] = sub i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    [[TMP43:%.*]] = ashr exact i64 [[TMP42]], 2
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ugt i64 [[TMP43]], [[TMP35]]
+; CHECK-NEXT:    br i1 [[TMP44]], label [[TMP46]], label [[DOTLOOPEXIT2:%.*]]
+; CHECK:       .loopexit2:
+; CHECK-NEXT:    br label [[TMP45]]
+; CHECK:         tail call void @_ZSt20__throw_out_of_rangePKc(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i64 0, i64 0))
+; CHECK-NEXT:    unreachable
+; CHECK:         [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i32, i32* [[TMP47]], align 4
+; CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP41]] to i32*
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP49]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP51:%.*]] = load i32, i32* [[TMP50]], align 4
+; CHECK-NEXT:    [[TMP52:%.*]] = add i32 [[TMP48]], [[TMP36]]
+; CHECK-NEXT:    [[TMP53]] = add i32 [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP54]] = add nuw nsw i64 [[TMP35]], 1
+; CHECK-NEXT:    [[TMP55:%.*]] = icmp ult i64 [[TMP54]], 1024
+; CHECK-NEXT:    br i1 [[TMP55]], label [[TMP34]], label [[DOTLOOPEXIT3]], !llvm.loop !0
+;
+  %4 = getelementptr inbounds %"class.std::vector", %"class.std::vector"* %0, i64 0, i32 0, i32 0, i32 1
+  %5 = bitcast i32** %4 to i64*
+  %6 = load i64, i64* %5, align 8
+  %7 = bitcast %"class.std::vector"* %0 to i64*
+  %8 = load i64, i64* %7, align 8
+  %9 = sub i64 %6, %8
+  %10 = ashr exact i64 %9, 2
+  %11 = getelementptr inbounds %"class.std::vector", %"class.std::vector"* %1, i64 0, i32 0, i32 0, i32 1
+  %12 = bitcast i32** %11 to i64*
+  %13 = bitcast %"class.std::vector"* %1 to i64*
+  %14 = inttoptr i64 %8 to i32*
+  br label %16

+;