Index: llvm/include/llvm/Analysis/ScalarEvolution.h
===================================================================
--- llvm/include/llvm/Analysis/ScalarEvolution.h
+++ llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1173,6 +1173,7 @@
 
   public:
     SCEVCallbackVH(Value *V, ScalarEvolution *SE = nullptr);
+    SCEVCallbackVH(const Value *V, ScalarEvolution *SE = nullptr);
   };
 
   friend class SCEVCallbackVH;
@@ -1459,6 +1460,15 @@
            SmallVector<PointerIntPair<const Loop *, 2, LoopDisposition>, 2>>
       LoopDispositions;
 
+  /// Cache for \c isGuaranteedToTransferExecutionToSuccessor(BB).
+  /// Note: the key must be a wrapped BasicBlock *.
+  DenseMap<SCEVCallbackVH, bool, DenseMapInfo<Value *>>
+      BlockTransferExecutionToSuccessorCache;
+
+  /// A version of the ValueTracking routine, but cached for efficiency, and
+  /// restricted to blocks inside a Loop.
+  bool isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB);
+
   struct LoopProperties {
     /// Set to true if the loop contains no instruction that can abnormally exit
     /// the loop (i.e. via throwing an exception, by terminating the thread
Index: llvm/lib/Analysis/ScalarEvolution.cpp
===================================================================
--- llvm/lib/Analysis/ScalarEvolution.cpp
+++ llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6631,24 +6631,102 @@
   return Bound ? Bound : &*F.getEntryBlock().begin();
 }
 
+bool ScalarEvolution::
+isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB) {
+  assert(LI.getLoopFor(BB) && "must be in a loop for invalidation!");
+  if (!BlockTransferExecutionToSuccessorCache.count(BB))
+    BlockTransferExecutionToSuccessorCache[{BB, this}] =
+        llvm::isGuaranteedToTransferExecutionToSuccessor(BB);
+
+  return BlockTransferExecutionToSuccessorCache[BB];
+}
+
 bool ScalarEvolution::isGuaranteedToTransferExecutionTo(const Instruction *A,
                                                         const Instruction *B) {
   if (A->getParent() == B->getParent() &&
-      isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
-                                                 B->getIterator()))
+      llvm::isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
+                                                       B->getIterator()))
     return true;
 
-  auto *BLoop = LI.getLoopFor(B->getParent());
-  if (BLoop && BLoop->getHeader() == B->getParent() &&
-      BLoop->getLoopPreheader() == A->getParent() &&
-      isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
-                                                 A->getParent()->end()) &&
-      isGuaranteedToTransferExecutionToSuccessor(B->getParent()->begin(),
-                                                 B->getIterator()))
-    return true;
-  return false;
-}
+  // The current implementation only handles the case where A dominates B.
+  // Note that cases where A does not dominate B, but where A must reach B,
+  // do exist.  An example would be B in the header of some loop L, and A in
+  // the (unconditional) latch block of L.
+  if (!DT.dominates(A->getParent(), B->getParent()))
+    return false;
+
+  // Unreachable CFGs can get very weird.  To avoid infinite loops below,
+  // require reachability.
+  if (!DT.isReachableFromEntry(B->getParent()))
+    return false;
+
+  // First find a path from B back to A such that, if all blocks along the
+  // path are transparent, we can prove that A reaches B.  Defer the actual
+  // transparency checks until the end, as (even cached) they are expensive.
+  SmallVector<const BasicBlock *> Path;
+  Path.push_back(B->getParent());
+
+  auto getPrevBB = [&](const BasicBlock *BB) -> const BasicBlock * {
+    auto *L = LI.getLoopFor(BB);
+    if (L && L->getHeader() == BB)
+      return L->getLoopPreheader();
+    auto *PrevBB = BB->getUniquePredecessor();
+    return PrevBB && PrevBB->getUniqueSuccessor() ? PrevBB : nullptr;
+  };
+
+  auto *PrevBB = getPrevBB(B->getParent());
+  if (!PrevBB)
+    return false;
+
+  while (true) {
+    Path.push_back(PrevBB);
+    if (PrevBB == A->getParent())
+      break;
+
+    PrevBB = getPrevBB(PrevBB);
+    if (!PrevBB)
+      return false;
+  }
+  assert(Path.front() == B->getParent());
+  assert(Path.back() == A->getParent());
+  assert(Path.size() >= 2);
+
+  // We rely on forgetLoop for invalidation of the cache; as a result, we can
+  // only query blocks inside loops.  This restriction can be removed once we
+  // find a better cache update mechanism.
+  for (unsigned i = 1; i < Path.size() - 1; i++)
+    if (!LI.getLoopFor(Path[i]))
+      return false;
+
+  // Do the cacheable part first.
+  for (unsigned i = 1; i < Path.size() - 1; i++)
+    if (!isGuaranteedToTransferExecutionToSuccessor(Path[i]))
+      return false;
+
+  // Finally, check the prefix of B's block and the suffix of A's.  For the
+  // local search, we use the per-block cache as a fast path.
+  auto doLocalSearch = [&](BasicBlock::const_iterator Begin,
+                           BasicBlock::const_iterator End) {
+    if (Begin == End)
+      return true;
+
+    auto *BB = Begin->getParent();
+
+    // Knowing the block isn't transparent isn't enough to answer this query;
+    // we'd need to know where in the block the blockage is.
+    if (BlockTransferExecutionToSuccessorCache.count(BB) &&
+        BlockTransferExecutionToSuccessorCache[BB])
+      return true;
+
+    // It's tempting to cache this, but since we're using a bounded search
+    // here, we'd risk saving false positives into the block cache.
+    return ::isGuaranteedToTransferExecutionToSuccessor(Begin, End);
+  };
+
+  return (doLocalSearch(B->getParent()->begin(), B->getIterator()) &&
+          doLocalSearch(A->getIterator(), A->getParent()->end()));
+}
 
 bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
   // Only proceed if we can prove that I does not yield poison.
@@ -6755,15 +6833,23 @@
     LoopProperties LP = {/* HasNoAbnormalExits */ true,
                          /*HasNoSideEffects*/ true};
 
-    for (auto *BB : L->getBlocks())
+    for (auto *BB : L->getBlocks()) {
+      LoopProperties Local = {true, true};
       for (auto &I : *BB) {
-        if (!isGuaranteedToTransferExecutionToSuccessor(&I))
-          LP.HasNoAbnormalExits = false;
+        if (!llvm::isGuaranteedToTransferExecutionToSuccessor(&I))
+          Local.HasNoAbnormalExits = false;
         if (HasSideEffects(&I))
-          LP.HasNoSideEffects = false;
-        if (!LP.HasNoAbnormalExits && !LP.HasNoSideEffects)
+          Local.HasNoSideEffects = false;
+        if (!Local.HasNoAbnormalExits && !Local.HasNoSideEffects)
           break; // We're already as pessimistic as we can get.
       }
+      BlockTransferExecutionToSuccessorCache[{BB, this}] =
+          Local.HasNoAbnormalExits;
+      LP.HasNoAbnormalExits &= Local.HasNoAbnormalExits;
+      LP.HasNoSideEffects &= Local.HasNoSideEffects;
+      if (!LP.HasNoAbnormalExits && !LP.HasNoSideEffects)
+        break; // We're already as pessimistic as we can get.
+    }
 
     auto InsertPair = LoopPropertiesCache.insert({L, LP});
     assert(InsertPair.second && "We just checked!");
@@ -7506,6 +7592,7 @@
   // result.
   BackedgeTakenCounts.clear();
   PredicatedBackedgeTakenCounts.clear();
+  BlockTransferExecutionToSuccessorCache.clear();
   LoopPropertiesCache.clear();
   ConstantEvolutionLoopExitValue.clear();
   ValueExprMap.clear();
@@ -7570,6 +7657,13 @@
       PushDefUseChildren(I, Worklist);
     }
 
+    // Removing these blocks is probably overly conservative here, but it is
+    // not clear whether any client code assumes that forgetLoop handles the
+    // invalidation required after code motion.
+    if (auto *BB = CurrL->getLoopPreheader())
+      BlockTransferExecutionToSuccessorCache.erase(BB);
+    for (auto *BB : CurrL->getBlocks())
+      BlockTransferExecutionToSuccessorCache.erase(BB);
     LoopPropertiesCache.erase(CurrL);
     // Forget all contained loops too, to avoid dangling entries in the
     // ValuesAtScopes map.
@@ -12305,6 +12399,12 @@
 
 void ScalarEvolution::SCEVCallbackVH::deleted() {
   assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
+
+  if (isa<BasicBlock>(getValPtr())) {
+    SE->BlockTransferExecutionToSuccessorCache.erase(*this);
+    return;
+  }
+
   if (PHINode *PN = dyn_cast<PHINode>(getValPtr()))
     SE->ConstantEvolutionLoopExitValue.erase(PN);
   SE->eraseValueFromMap(getValPtr());
@@ -12314,6 +12414,10 @@
 void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) {
   assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
 
+  if (isa<BasicBlock>(getValPtr()))
+    // Not isSCEVable, and thus no further handling needed.
+    return;
+
   // Forget all the expressions associated with users of the old value,
   // so that future queries will recompute the expressions using the new
   // value.
@@ -12343,6 +12447,10 @@
 ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
     : CallbackVH(V), SE(se) {}
 
+ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(const Value *V,
+                                                ScalarEvolution *se)
+    : CallbackVH(const_cast<Value *>(V)), SE(se) {}
+
 //===----------------------------------------------------------------------===//
 //                   ScalarEvolution Class Implementation
 //===----------------------------------------------------------------------===//
@@ -12383,6 +12491,7 @@
           std::move(Arg.ConstantEvolutionLoopExitValue)),
       ValuesAtScopes(std::move(Arg.ValuesAtScopes)),
       LoopDispositions(std::move(Arg.LoopDispositions)),
+      BlockTransferExecutionToSuccessorCache(std::move(Arg.BlockTransferExecutionToSuccessorCache)),
       LoopPropertiesCache(std::move(Arg.LoopPropertiesCache)),
       BlockDispositions(std::move(Arg.BlockDispositions)),
       UnsignedRanges(std::move(Arg.UnsignedRanges)),
@@ -12945,6 +13054,17 @@
       assert(ValidLoops.contains(AR->getLoop()) &&
              "AddRec references invalid loop");
   }
+
+  // Make sure the block transfer cache is correct.
+  // FIXME: It appears LoopFusion does not update SCEV correctly.
+  for (auto KVPair : BlockTransferExecutionToSuccessorCache) {
+    auto *BB = cast_or_null<BasicBlock>(&*KVPair.first);
+    if (!BB)
+      continue;
+    bool Uncached = llvm::isGuaranteedToTransferExecutionToSuccessor(BB);
+    assert(Uncached == KVPair.second);
+    (void)Uncached;
+  }
 }
 
 bool ScalarEvolution::invalidate(
Index: llvm/lib/Transforms/Scalar/LoopFuse.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1478,6 +1478,8 @@
     // and rebuild the information in subsequent passes of fusion?
     // Note: Need to forget the loops before merging the loop latches, as
    // mergeLatch may remove the only block in FC1.
+    //if (auto *L = FC0.L->getParentLoop())
+    //  SE.forgetLoop(L);
    SE.forgetLoop(FC1.L);
    SE.forgetLoop(FC0.L);
Index: llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
===================================================================
--- llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
+++ llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
@@ -11,7 +11,7 @@
 ; AddRec: {{{(56 + (8 * (-4 + (3 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
 ; CHECK: Base offset: %A
 ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of 8 bytes.
-; CHECK: ArrayRef[{3,+,1}<%for.i>][{-4,+,1}<%for.j>][{7,+,1}<%for.k>]
+; CHECK: ArrayRef[{3,+,1}<%for.i>][{-4,+,1}<%for.j>][{7,+,1}<%for.k>]
 
 define void @foo(i64 %n, i64 %m, i64 %o, double* %A) {
 entry:
Index: llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
===================================================================
--- llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
+++ llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -11,7 +11,7 @@
 ; AddRec: {{{((8 * ((((%m * %p) + %q) * %o) + %r)) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
 ; CHECK: Base offset: %A
 ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of 8 bytes.
-; CHECK: ArrayRef[{%p,+,1}<%for.i>][{%q,+,1}<%for.j>][{%r,+,1}<%for.k>]
+; CHECK: ArrayRef[{%p,+,1}<%for.i>][{%q,+,1}<%for.j>][{%r,+,1}<%for.k>]
 
 define void @foo(i64 %n, i64 %m, i64 %o, double* %A, i64 %p, i64 %q, i64 %r) {
 entry:
Index: llvm/test/Analysis/ScalarEvolution/flags-from-poison.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/flags-from-poison.ll
+++ llvm/test/Analysis/ScalarEvolution/flags-from-poison.ll
@@ -451,9 +451,9 @@
 ; CHECK-NEXT:    %i = phi i32 [ %nexti, %loop2 ], [ 0, %entry ]
 ; CHECK-NEXT:    --> {0,+,1}<%loop> U: [0,-2147483648) S: [0,-2147483648) Exits: (-1 + %numIterations) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index32 = add nsw i32 %i, %offset
-; CHECK-NEXT:    --> {%offset,+,1}<%loop> U: full-set S: full-set Exits: (-1 + %offset + %numIterations) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {%offset,+,1}<%loop> U: full-set S: full-set Exits: (-1 + %offset + %numIterations) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %ptr = getelementptr inbounds float, float* %input, i32 %index32
-; CHECK-NEXT:    --> ((4 * (sext i32 {%offset,+,1}<%loop> to i64)) + %input) U: full-set S: full-set Exits: ((4 * (sext i32 (-1 + %offset + %numIterations) to i64)) + %input) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {((4 * (sext i32 %offset to i64)) + %input),+,4}<%loop> U: full-set S: full-set Exits: ((4 * (zext i32 (-1 + %numIterations) to i64)) + (4 * (sext i32 %offset to i64)) + %input) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %nexti = add nsw i32 %i, 1
 ; CHECK-NEXT:    --> {1,+,1}<%loop> U: [1,-2147483648) S: [1,-2147483648) Exits: %numIterations LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @test-add-not-header
Index: llvm/test/Analysis/ScalarEvolution/incorrect-exit-count.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/incorrect-exit-count.ll
+++ llvm/test/Analysis/ScalarEvolution/incorrect-exit-count.ll
@@ -21,7 +21,7 @@
 ; CHECK-NEXT:    %idxprom20 = zext i32 %storemerge1921 to i64
 ; CHECK-NEXT:    --> {3,+,4294967295}<%for.cond6> U: [3,4) S: [3,4) Exits: <<Unknown>> LoopDispositions: { %for.cond6: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %arrayidx7 = getelementptr inbounds [1 x [4 x i16]], [1 x [4 x i16]]* @__const.f.g, i64 0, i64 0, i64 %idxprom20
-; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%for.cond6> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %for.cond6: Computable, %outer.loop: Variant }
+; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%for.cond6> U: [6,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %for.cond6: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %i = load i16, i16* %arrayidx7, align 2
 ; CHECK-NEXT:    --> %i U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond6: Variant, %outer.loop: Variant }
 ; CHECK-NEXT:    %storemerge1822.lcssa.ph = phi i32 [ 0, %for.cond6 ]
@@ -45,7 +45,7 @@
 ; CHECK-NEXT:    %idxprom20.3 = zext i32 %storemerge1921.3 to i64
 ; CHECK-NEXT:    --> {3,+,4294967295}<%inner.loop> U: [3,4) S: [3,4) Exits: <<Unknown>> LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %arrayidx7.3 = getelementptr inbounds [1 x [4 x i16]], [1 x [4 x i16]]* @__const.f.g, i64 0, i64 0, i64 %idxprom20.3
-; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%inner.loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant }
+; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%inner.loop> U: [6,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %i7 = load i16, i16* %arrayidx7.3, align 2
 ; CHECK-NEXT:    --> %i7 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %inner.loop: Variant, %outer.loop: Variant }
 ; CHECK-NEXT:    %i8 = load volatile i32, i32* @b, align 4
Index: llvm/test/Analysis/ScalarEvolution/nsw.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/nsw.ll
+++ llvm/test/Analysis/ScalarEvolution/nsw.ll
@@ -25,7 +25,7 @@
 ; CHECK-NEXT:    %phitmp = sext i32 %tmp8 to i64
 ; CHECK-NEXT:    --> {1,+,1}<%bb> U: [1,-9223372036854775808) S: [1,-9223372036854775808) Exits: <<Unknown>> LoopDispositions: { %bb: Computable }
 ; CHECK-NEXT:    %tmp9 = getelementptr inbounds double, double* %p, i64 %phitmp
-; CHECK-NEXT:    --> {(8 + %p),+,8}<%bb> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb: Computable }
+; CHECK-NEXT:    --> {(8 + %p),+,8}<%bb> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @test1
 ; CHECK-NEXT:  Loop %bb: Unpredictable backedge-taken count.
 ; CHECK-NEXT:  Loop %bb: Unpredictable max backedge-taken count.
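
Note (illustration, not part of the patch): the diff memoizes the per-block result of llvm::isGuaranteedToTransferExecutionToSuccessor inside ScalarEvolution, keyed by a SCEVCallbackVH so that deleted blocks drop out of the cache, and answers "is B guaranteed to execute once A executes?" by walking loop-preheader / unique-predecessor edges from B's block back to A's block, consulting the cache only for whole interior blocks. The standalone sketch below restates that idea under simplified assumptions; the names BlockTransparencyCache, mustExecuteBefore, and forgetBlock are invented for illustration, invalidation is an explicit call rather than a value handle, and the same-block fast path is folded into the entry check.

// Illustrative only: a simplified, standalone version of the cached
// "A transfers execution to B" query that the patch adds to ScalarEvolution.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical cache; the patch keys the real one by SCEVCallbackVH so that
// deleted blocks are evicted automatically, and clears entries in forgetLoop.
class BlockTransparencyCache {
  DenseMap<const BasicBlock *, bool> Transparent;

public:
  // True if every instruction in BB is guaranteed to transfer execution to
  // its successor (no throw, abort, or divergence); memoized per block.
  bool isTransparent(const BasicBlock *BB) {
    auto It = Transparent.find(BB);
    if (It != Transparent.end())
      return It->second;
    bool Result = isGuaranteedToTransferExecutionToSuccessor(BB);
    Transparent[BB] = Result;
    return Result;
  }

  // Must be called whenever BB is modified or about to be deleted.
  void forgetBlock(const BasicBlock *BB) { Transparent.erase(BB); }
};

// Returns true if executing A guarantees that B is executed, mirroring the
// dominance check plus backwards path walk used in the patch.
static bool mustExecuteBefore(const Instruction *A, const Instruction *B,
                              const DominatorTree &DT, const LoopInfo &LI,
                              BlockTransparencyCache &Cache) {
  const BasicBlock *ABB = A->getParent(), *BBB = B->getParent();

  // Same-block fast path: a bounded local scan from A to B.
  if (ABB == BBB)
    return isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
                                                      B->getIterator());

  if (!DT.dominates(ABB, BBB) || !DT.isReachableFromEntry(BBB))
    return false;

  // Step backwards from B's block: a loop header steps to its preheader,
  // otherwise follow a unique predecessor/successor edge.
  auto GetPrevBB = [&](const BasicBlock *BB) -> const BasicBlock * {
    if (const Loop *L = LI.getLoopFor(BB))
      if (L->getHeader() == BB)
        return L->getLoopPreheader();
    const BasicBlock *Pred = BB->getUniquePredecessor();
    return Pred && Pred->getUniqueSuccessor() ? Pred : nullptr;
  };

  // Collect the candidate path first; defer the transparency checks.
  SmallVector<const BasicBlock *, 8> Path;
  Path.push_back(BBB);
  while (Path.back() != ABB) {
    const BasicBlock *Prev = GetPrevBB(Path.back());
    if (!Prev)
      return false;
    Path.push_back(Prev);
  }

  // Interior blocks must be fully transparent (whole-block cached query).
  for (unsigned I = 1, E = Path.size() - 1; I < E; ++I)
    if (!Cache.isTransparent(Path[I]))
      return false;

  // Endpoints only need their relevant half checked; this bounded scan stays
  // uncached, as in the patch's doLocalSearch lambda.
  return isGuaranteedToTransferExecutionToSuccessor(BBB->begin(),
                                                    B->getIterator()) &&
         isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
                                                    ABB->end());
}

Only whole-block answers are ever written to the cache: a transparent prefix or suffix of a block says nothing about the rest of that block, so caching the partial endpoint scans would risk storing false positives, which is exactly the concern noted in the patch's comments.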