Index: llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
+++ llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -1515,6 +1516,46 @@
         SafetyInfo->BlockColors = colorEHFunclets(*Fn);
 }
 
+/// Return true if we can prove that the given ExitBlock is not reached on the
+/// first iteration of the given loop.  That is, the backedge of the loop must
+/// be executed before the ExitBlock is executed in any dynamic execution trace.
+static bool CanProveNotTakenFirstIteration(BasicBlock *ExitBlock,
+                                           const DominatorTree *DT,
+                                           const Loop *CurLoop) {
+  auto *CondExitBlock = ExitBlock->getSinglePredecessor();
+  if (!CondExitBlock)
+    // expect unique exits
+    return false;
+  assert(CurLoop->contains(CondExitBlock) && "meaning of exit block");
+  auto *BI = dyn_cast<BranchInst>(CondExitBlock->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+  // todo: handle fcmp someday
+  // todo: this would be a lot more powerful if we used scev, but all the
+  // plumbing is currently missing to pass a pointer in from the pass
+  auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!ICI)
+    return false;
+  // Check for cmp (phi [x, preheader] ...), y where (pred x, y is known
+  auto *LHS = dyn_cast<PHINode>(ICI->getOperand(0));
+  auto *RHS = ICI->getOperand(1);
+  if (!LHS || LHS->getParent() != CurLoop->getHeader())
+    return false;
+  auto DL = ExitBlock->getModule()->getDataLayout();
+  auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+  auto *SimpleValOrNull = SimplifyICmpInst(ICI->getPredicate(),
+                                           IVStart, RHS,
+                                           {DL, /*TLI*/ nullptr,
+                                               DT, /*AC*/ nullptr, BI});
+  auto *SimpleCst = dyn_cast_or_null<Constant>(SimpleValOrNull);
+  if (!SimpleCst)
+    return false;
+  if (ExitBlock == BI->getSuccessor(0))
+    return SimpleCst->isZeroValue();
+  assert(ExitBlock == BI->getSuccessor(1) && "implied by above");
+  return SimpleCst->isAllOnesValue();
+}
+
 /// Returns true if the instruction in a loop is guaranteed to execute at least
 /// once.
 bool llvm::isGuaranteedToExecute(const Instruction &Inst,
@@ -1537,6 +1578,22 @@
   if (SafetyInfo->MayThrow)
     return false;
 
+  // Note: There are two styles of reasoning intermixed below for
+  // implementation efficiency reasons.  They are:
+  // 1) If we can prove that the instruction dominates all exit blocks, then we
+  // know the instruction must have executed on *some* iteration before we
+  // exit.  We do not prove *which* iteration the instruction must execute on.
+  // 2) If we can prove that the instruction dominates the latch and all exits
+  // which might be taken on the first iteration, we know the instruction must
+  // execute on the first iteration.  This second style allows a conditional
+  // exit before the instruction of interest which is provably not taken on the
+  // first iteration.  This is a quite common case for range check like
+  // patterns.  TODO: support loops with multiple latches.
+
+  const bool InstDominatesLatch =
+    CurLoop->getLoopLatch() != nullptr &&
+    DT->dominates(Inst.getParent(), CurLoop->getLoopLatch());
+
   // Get the exit blocks for the current loop.
   SmallVector<BasicBlock *, 8> ExitBlocks;
   CurLoop->getExitBlocks(ExitBlocks);
@@ -1544,7 +1601,9 @@
   // Verify that the block dominates each of the exit blocks of the loop.
   for (BasicBlock *ExitBlock : ExitBlocks)
     if (!DT->dominates(Inst.getParent(), ExitBlock))
-      return false;
+      if (!InstDominatesLatch ||
+          !CanProveNotTakenFirstIteration(ExitBlock, DT, CurLoop))
+        return false;
 
   // As a degenerate case, if the loop is statically infinite then we haven't
   // proven anything since there are no exit blocks.
Index: llvm/trunk/test/Transforms/LICM/hoist-mustexec.ll
===================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-mustexec.ll
+++ llvm/trunk/test/Transforms/LICM/hoist-mustexec.ll
@@ -0,0 +1,249 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f() nounwind
+
+; constant fold on first ieration
+define i32 @test1(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test1(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %r.chk = icmp ult i32 %iv, 2000
+  br i1 %r.chk, label %continue, label %fail
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; Count down from a.length w/entry guard
+; TODO: currently unable to prove the following:
+; ule i32 (add nsw i32 %len, -1), %len where len is [0, 512]
+define i32 @test2(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test2(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.non.pos = icmp eq i32 %len, 0
+  br i1 %is.non.pos, label %fail, label %preheader
+preheader:
+  %lenminusone = add nsw i32 %len, -1
+  br label %for.body
+for.body:
+  %iv = phi i32 [ %lenminusone, %preheader ], [ %dec, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ule i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %dec = add nsw i32 %iv, -1
+  %exitcond = icmp eq i32 %dec, 0
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; trivially true for zero
+define i32 @test3(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test3(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.zero = icmp eq i32 %len, 0
+  br i1 %is.zero, label %fail, label %preheader
+preheader:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+for.body:
+  %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ule i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; requires fact length is non-zero
+; TODO: IsKnownNonNullFromDominatingConditions is currently only be done for
+; pointers; should handle integers too
+define i32 @test4(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test4(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.zero = icmp eq i32 %len, 0
+  br i1 %is.zero, label %fail, label %preheader
+preheader:
+  br label %for.body
+for.body:
+  %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ult i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; variation on test1 with branch swapped
+define i32 @test-brswap(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-brswap(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %r.chk = icmp ugt i32 %iv, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+define i32 @test-nonphi(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-nonphi(
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %xor = xor i32 %iv, 72
+  %r.chk = icmp ugt i32 %xor, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+define i32 @test-wrongphi(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-wrongphi(
+entry:
+  br label %for.body
+  
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  br label %dummy_block
+dummy_block:
+  %wrongphi = phi i32 [12, %for.body]
+  %r.chk = icmp ugt i32 %wrongphi, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; This works because loop-simplify is run implicitly, but test for it anyways
+define i32 @test-multiple-latch(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-multiple-latch(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue1 ], [ %inc, %continue2 ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue1 ], [ %add, %continue2 ]
+  %r.chk = icmp ult i32 %iv, 2000
+  br i1 %r.chk, label %continue1, label %fail
+continue1:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %cmp = icmp eq i32 %add, 0
+  br i1 %cmp, label %continue2, label %for.body
+continue2:
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
Index: llvm/trunk/test/Transforms/LICM/scalar-promote.ll
===================================================================
--- llvm/trunk/test/Transforms/LICM/scalar-promote.ll
+++ llvm/trunk/test/Transforms/LICM/scalar-promote.ll
@@ -405,6 +405,44 @@
 
 }
 
+; Early exit is known not to be taken on first iteration and thus doesn't
+; effect whether load is known to execute.
+define void @test11(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test11(
+; CHECK: Entry:
+; CHECK-NEXT:   load i32, i32* @X
+; CHECK-NEXT:   br label %Loop
+
+
+Loop:   ; preds = %Loop, %0
+  %j = phi i32 [ 0, %Entry ], [ %Next, %body ]    ; <i32> [#uses=1]
+  %early.test = icmp ult i32 %j, 32
+  br i1 %early.test, label %body, label %Early
+body:
+  %x = load i32, i32* @X   ; <i32> [#uses=1]
+  %x2 = add i32 %x, 1   ; <i32> [#uses=1]
+  store i32 %x2, i32* @X
+  %Next = add i32 %j, 1   ; <i32> [#uses=2]
+  %cond = icmp eq i32 %Next, 0    ; <i1> [#uses=1]
+  br i1 %cond, label %Out, label %Loop
+
+Early:
+; CHECK: Early:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+  ret void
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+
+}
+
 !0 = !{!4, !4, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}