Index: llvm/include/llvm/Analysis/ScalarEvolution.h
===================================================================
--- llvm/include/llvm/Analysis/ScalarEvolution.h
+++ llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1173,6 +1173,7 @@
 
   public:
     SCEVCallbackVH(Value *V, ScalarEvolution *SE = nullptr);
+    SCEVCallbackVH(const Value *V, ScalarEvolution *SE = nullptr);
   };
 
   friend class SCEVCallbackVH;
@@ -1459,6 +1460,15 @@
            SmallVector<PointerIntPair<const Loop *, 2, LoopDisposition>, 2>>
       LoopDispositions;
 
+  /// Cache for \c isGuaranteedToTransferExecutionToSuccessor(BB).
+  /// Note: the key must be a wrapped BasicBlock *.
+  DenseMap<SCEVCallbackVH, bool, DenseMapInfo<Value *>>
+      BlockTransferExecutionToSuccessorCache;
+
+  /// A version of the ValueTracking routine, but cached for efficiency, and
+  /// restricted to blocks inside a Loop.
+  bool isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB);
+
   struct LoopProperties {
     /// Set to true if the loop contains no instruction that can abnormally exit
     /// the loop (i.e. via throwing an exception, by terminating the thread
Index: llvm/lib/Analysis/ScalarEvolution.cpp
===================================================================
--- llvm/lib/Analysis/ScalarEvolution.cpp
+++ llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6631,24 +6631,102 @@
   return Bound ? Bound : &*F.getEntryBlock().begin();
 }
 
+bool ScalarEvolution::
+isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB) {
+  assert(LI.getLoopFor(BB) && "must be in a loop for invalidation!");
+  if (!BlockTransferExecutionToSuccessorCache.count(BB))
+    BlockTransferExecutionToSuccessorCache[{BB, this}] =
+        llvm::isGuaranteedToTransferExecutionToSuccessor(BB);
+
+  return BlockTransferExecutionToSuccessorCache[BB];
+}
+
 bool ScalarEvolution::isGuaranteedToTransferExecutionTo(const Instruction *A,
                                                         const Instruction *B) {
   if (A->getParent() == B->getParent() &&
-      isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
-                                                 B->getIterator()))
+      llvm::isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
+                                                       B->getIterator()))
     return true;
 
-  auto *BLoop = LI.getLoopFor(B->getParent());
-  if (BLoop && BLoop->getHeader() == B->getParent() &&
-      BLoop->getLoopPreheader() == A->getParent() &&
-      isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
-                                                 A->getParent()->end()) &&
-      isGuaranteedToTransferExecutionToSuccessor(B->getParent()->begin(),
-                                                 B->getIterator()))
-    return true;
-  return false;
-}
+  // The current implementation only handles the case where A dominates B.
+  // Note that cases where A does not dominate B, but where A must reach B,
+  // do exist.  An example would be B in the header of some loop L, and A in
+  // the (unconditional) latch block of L.
+  if (!DT.dominates(A->getParent(), B->getParent()))
+    return false;
+
+  // Unreachable CFGs can get very weird.  To avoid infinite loops below,
+  // require reachability.
+  if (!DT.isReachableFromEntry(B->getParent()))
+    return false;
+
+  // First find a path from B back to A such that, if all blocks along the
+  // path are transparent, we can prove that A reaches B.  Defer the actual
+  // transparency checks until the end, as (even cached) they are expensive.
+  SmallVector<const BasicBlock *> Path;
+  Path.push_back(B->getParent());
+
+  auto getPrevBB = [&](const BasicBlock *BB) -> const BasicBlock * {
+    auto *L = LI.getLoopFor(BB);
+    if (L && L->getHeader() == BB)
+      return L->getLoopPreheader();
+    auto *PrevBB = BB->getUniquePredecessor();
+    return PrevBB && PrevBB->getUniqueSuccessor() ? PrevBB : nullptr;
+  };
+
+  auto *PrevBB = getPrevBB(B->getParent());
+  if (!PrevBB)
+    return false;
+
+  while (true) {
+    Path.push_back(PrevBB);
+    if (PrevBB == A->getParent())
+      break;
+
+    PrevBB = getPrevBB(PrevBB);
+    if (!PrevBB)
+      return false;
+  }
+  assert(Path.front() == B->getParent());
+  assert(Path.back() == A->getParent());
+  assert(Path.size() >= 2);
+
+  // We rely on forgetLoop for invalidation of the cache; as a result, we can
+  // only query blocks inside loops.  This restriction can be removed once we
+  // find a better cache update mechanism.
+  for (unsigned i = 1; i < Path.size() - 1; i++)
+    if (!LI.getLoopFor(Path[i]))
+      return false;
+
+  // Do the cacheable part first.
+  for (unsigned i = 1; i < Path.size() - 1; i++)
+    if (!isGuaranteedToTransferExecutionToSuccessor(Path[i]))
+      return false;
+
+  // Finally, check the prefix of B's block and the suffix of A's.  For the
+  // local search, we use the per-block cache as a fast path.
+  auto doLocalSearch = [&](BasicBlock::const_iterator Begin,
+                           BasicBlock::const_iterator End) {
+    if (Begin == End)
+      return true;
+
+    auto *BB = Begin->getParent();
+
+    // Knowing the block isn't transparent isn't enough to answer this query;
+    // we'd need to know where in the block the blockage is.
+    if (BlockTransferExecutionToSuccessorCache.count(BB) &&
+        BlockTransferExecutionToSuccessorCache[BB])
+      return true;
+
+    // It's tempting to cache this, but since we're using a bounded search
+    // here, we'd risk saving false positives into the block cache.
+    return ::isGuaranteedToTransferExecutionToSuccessor(Begin, End);
+  };
+
+  return (doLocalSearch(B->getParent()->begin(), B->getIterator()) &&
+          doLocalSearch(A->getIterator(), A->getParent()->end()));
+}
 
 bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
   // Only proceed if we can prove that I does not yield poison.
@@ -6755,15 +6833,23 @@
     LoopProperties LP = {/* HasNoAbnormalExits */ true,
                          /*HasNoSideEffects*/ true};
 
-    for (auto *BB : L->getBlocks())
+    for (auto *BB : L->getBlocks()) {
+      LoopProperties Local = {true, true};
       for (auto &I : *BB) {
-        if (!isGuaranteedToTransferExecutionToSuccessor(&I))
-          LP.HasNoAbnormalExits = false;
+        if (!llvm::isGuaranteedToTransferExecutionToSuccessor(&I))
+          Local.HasNoAbnormalExits = false;
         if (HasSideEffects(&I))
-          LP.HasNoSideEffects = false;
-        if (!LP.HasNoAbnormalExits && !LP.HasNoSideEffects)
+          Local.HasNoSideEffects = false;
+        if (!Local.HasNoAbnormalExits && !Local.HasNoSideEffects)
           break; // We're already as pessimistic as we can get.
       }
+      BlockTransferExecutionToSuccessorCache[{BB, this}] =
+          Local.HasNoAbnormalExits;
+      LP.HasNoAbnormalExits &= Local.HasNoAbnormalExits;
+      LP.HasNoSideEffects &= Local.HasNoSideEffects;
+      if (!LP.HasNoAbnormalExits && !LP.HasNoSideEffects)
+        break; // We're already as pessimistic as we can get.
+    }
 
     auto InsertPair = LoopPropertiesCache.insert({L, LP});
     assert(InsertPair.second && "We just checked!");
@@ -7506,6 +7592,7 @@
   // result.
   BackedgeTakenCounts.clear();
   PredicatedBackedgeTakenCounts.clear();
+  BlockTransferExecutionToSuccessorCache.clear();
   LoopPropertiesCache.clear();
   ConstantEvolutionLoopExitValue.clear();
   ValueExprMap.clear();
@@ -7570,6 +7657,13 @@
       PushDefUseChildren(I, Worklist);
     }
 
+    // Removing these blocks is probably overly conservative here, but it is
+    // not clear whether any client code assumes that forgetLoop handles the
+    // invalidation required after code motion.
+    if (auto *BB = CurrL->getLoopPreheader())
+      BlockTransferExecutionToSuccessorCache.erase(BB);
+    for (auto *BB : CurrL->getBlocks())
+      BlockTransferExecutionToSuccessorCache.erase(BB);
     LoopPropertiesCache.erase(CurrL);
     // Forget all contained loops too, to avoid dangling entries in the
     // ValuesAtScopes map.
@@ -12305,6 +12399,12 @@
 
 void ScalarEvolution::SCEVCallbackVH::deleted() {
   assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
+
+  if (isa<BasicBlock>(getValPtr())) {
+    SE->BlockTransferExecutionToSuccessorCache.erase(*this);
+    return;
+  }
+
   if (PHINode *PN = dyn_cast<PHINode>(getValPtr()))
     SE->ConstantEvolutionLoopExitValue.erase(PN);
   SE->eraseValueFromMap(getValPtr());
@@ -12314,6 +12414,10 @@
 void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) {
   assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
 
+  if (isa<BasicBlock>(getValPtr()))
+    // Not isSCEVable, and thus no further handling needed.
+    return;
+
   // Forget all the expressions associated with users of the old value,
   // so that future queries will recompute the expressions using the new
   // value.
@@ -12343,6 +12447,10 @@
 ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
     : CallbackVH(V), SE(se) {}
 
+ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(const Value *V,
+                                                ScalarEvolution *se)
+    : CallbackVH(const_cast<Value *>(V)), SE(se) {}
+
 //===----------------------------------------------------------------------===//
 //                   ScalarEvolution Class Implementation
 //===----------------------------------------------------------------------===//
@@ -12383,6 +12491,7 @@
           std::move(Arg.ConstantEvolutionLoopExitValue)),
       ValuesAtScopes(std::move(Arg.ValuesAtScopes)),
       LoopDispositions(std::move(Arg.LoopDispositions)),
+      BlockTransferExecutionToSuccessorCache(std::move(Arg.BlockTransferExecutionToSuccessorCache)),
       LoopPropertiesCache(std::move(Arg.LoopPropertiesCache)),
       BlockDispositions(std::move(Arg.BlockDispositions)),
       UnsignedRanges(std::move(Arg.UnsignedRanges)),
@@ -12945,6 +13054,17 @@
       assert(ValidLoops.contains(AR->getLoop()) &&
              "AddRec references invalid loop");
   }
+
+  // Make sure the block transfer cache is correct.
+  // FIXME: It appears LoopFusion does not update SCEV correctly.
+  for (auto KVPair : BlockTransferExecutionToSuccessorCache) {
+    auto *BB = cast_or_null<BasicBlock>(&*KVPair.first);
+    if (!BB)
+      continue;
+    bool Uncached = llvm::isGuaranteedToTransferExecutionToSuccessor(BB);
+    assert(Uncached == KVPair.second);
+    (void)Uncached;
+  }
 }
 
 bool ScalarEvolution::invalidate(
Index: llvm/lib/Transforms/Scalar/LoopFuse.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1478,6 +1478,8 @@
     // and rebuild the information in subsequent passes of fusion?
     // Note: Need to forget the loops before merging the loop latches, as
    // mergeLatch may remove the only block in FC1.
+    //if (auto *L = FC0.L->getParentLoop())
+    //  SE.forgetLoop(L);
    SE.forgetLoop(FC1.L);
    SE.forgetLoop(FC0.L);
Index: llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
===================================================================
--- llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
+++ llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
@@ -11,7 +11,7 @@
 ; AddRec: {{{(56 + (8 * (-4 + (3 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
 ; CHECK: Base offset: %A
 ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of 8 bytes.
-; CHECK: ArrayRef[{3,+,1}<%for.i>][{-4,+,1}<%for.j>][{7,+,1}<%for.k>]
+; CHECK: ArrayRef[{3,+,1}<%for.i>][{-4,+,1}<%for.j>][{7,+,1}<%for.k>]
 
 define void @foo(i64 %n, i64 %m, i64 %o, double* %A) {
 entry:
Index: llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
===================================================================
--- llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
+++ llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -11,7 +11,7 @@
 ; AddRec: {{{((8 * ((((%m * %p) + %q) * %o) + %r)) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
 ; CHECK: Base offset: %A
 ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of 8 bytes.
-; CHECK: ArrayRef[{%p,+,1}<%for.i>][{%q,+,1}<%for.j>][{%r,+,1}<%for.k>]
+; CHECK: ArrayRef[{%p,+,1}<%for.i>][{%q,+,1}<%for.j>][{%r,+,1}<%for.k>]
 
 define void @foo(i64 %n, i64 %m, i64 %o, double* %A, i64 %p, i64 %q, i64 %r) {
 entry:
Index: llvm/test/Analysis/ScalarEvolution/flags-from-poison.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/flags-from-poison.ll
+++ llvm/test/Analysis/ScalarEvolution/flags-from-poison.ll
@@ -451,9 +451,9 @@
 ; CHECK-NEXT:    %i = phi i32 [ %nexti, %loop2 ], [ 0, %entry ]
 ; CHECK-NEXT:    --> {0,+,1}<%loop> U: [0,-2147483648) S: [0,-2147483648) Exits: (-1 + %numIterations) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index32 = add nsw i32 %i, %offset
-; CHECK-NEXT:    --> {%offset,+,1}<%loop> U: full-set S: full-set Exits: (-1 + %offset + %numIterations) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {%offset,+,1}<%loop> U: full-set S: full-set Exits: (-1 + %offset + %numIterations) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %ptr = getelementptr inbounds float, float* %input, i32 %index32
-; CHECK-NEXT:    --> ((4 * (sext i32 {%offset,+,1}<%loop> to i64)) + %input) U: full-set S: full-set Exits: ((4 * (sext i32 (-1 + %offset + %numIterations) to i64)) + %input) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {((4 * (sext i32 %offset to i64)) + %input),+,4}<%loop> U: full-set S: full-set Exits: ((4 * (zext i32 (-1 + %numIterations) to i64)) + (4 * (sext i32 %offset to i64)) + %input) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %nexti = add nsw i32 %i, 1
 ; CHECK-NEXT:    --> {1,+,1}<%loop> U: [1,-2147483648) S: [1,-2147483648) Exits: %numIterations LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @test-add-not-header
Index: llvm/test/Analysis/ScalarEvolution/incorrect-exit-count.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/incorrect-exit-count.ll
+++ llvm/test/Analysis/ScalarEvolution/incorrect-exit-count.ll
@@ -21,7 +21,7 @@
 ; CHECK-NEXT:    %idxprom20 = zext i32 %storemerge1921 to i64
 ; CHECK-NEXT:    --> {3,+,4294967295}<%for.cond6> U: [3,4) S: [3,4) Exits: <<Unknown>> LoopDispositions: { %for.cond6: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %arrayidx7 = getelementptr inbounds [1 x [4 x i16]], [1 x [4 x i16]]* @__const.f.g, i64 0, i64 0, i64 %idxprom20
-; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%for.cond6> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %for.cond6: Computable, %outer.loop: Variant }
+; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%for.cond6> U: [6,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %for.cond6: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %i = load i16, i16* %arrayidx7, align 2
 ; CHECK-NEXT:    --> %i U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.cond6: Variant, %outer.loop: Variant }
 ; CHECK-NEXT:    %storemerge1822.lcssa.ph = phi i32 [ 0, %for.cond6 ]
@@ -45,7 +45,7 @@
 ; CHECK-NEXT:    %idxprom20.3 = zext i32 %storemerge1921.3 to i64
 ; CHECK-NEXT:    --> {3,+,4294967295}<%inner.loop> U: [3,4) S: [3,4) Exits: <<Unknown>> LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %arrayidx7.3 = getelementptr inbounds [1 x [4 x i16]], [1 x [4 x i16]]* @__const.f.g, i64 0, i64 0, i64 %idxprom20.3
-; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%inner.loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant }
+; CHECK-NEXT:    --> {(6 + @__const.f.g),+,8589934590}<%inner.loop> U: [6,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant }
 ; CHECK-NEXT:    %i7 = load i16, i16* %arrayidx7.3, align 2
 ; CHECK-NEXT:    --> %i7 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %inner.loop: Variant, %outer.loop: Variant }
 ; CHECK-NEXT:    %i8 = load volatile i32, i32* @b, align 4
Index: llvm/test/Analysis/ScalarEvolution/nsw.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/nsw.ll
+++ llvm/test/Analysis/ScalarEvolution/nsw.ll
@@ -25,7 +25,7 @@
 ; CHECK-NEXT:    %phitmp = sext i32 %tmp8 to i64
 ; CHECK-NEXT:    --> {1,+,1}<%bb> U: [1,-9223372036854775808) S: [1,-9223372036854775808) Exits: <<Unknown>> LoopDispositions: { %bb: Computable }
 ; CHECK-NEXT:    %tmp9 = getelementptr inbounds double, double* %p, i64 %phitmp
-; CHECK-NEXT:    --> {(8 + %p),+,8}<%bb> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb: Computable }
+; CHECK-NEXT:    --> {(8 + %p),+,8}<%bb> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @test1
 ; CHECK-NEXT:  Loop %bb: Unpredictable backedge-taken count.
 ; CHECK-NEXT:  Loop %bb: Unpredictable max backedge-taken count.
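
Note (illustration, not part of the patch): the diff memoizes the per-block result of llvm::isGuaranteedToTransferExecutionToSuccessor inside ScalarEvolution, keyed by a SCEVCallbackVH so that deleted blocks drop out of the cache, and answers "is B guaranteed to execute once A executes?" by walking loop-preheader / unique-predecessor edges from B's block back to A's block, consulting the cache only for whole interior blocks. The standalone sketch below restates that idea under simplified assumptions; the names BlockTransparencyCache, mustExecuteBefore, and forgetBlock are invented for illustration, invalidation is an explicit call rather than a value handle, and the same-block fast path is folded into the entry check.

// Illustrative only: a simplified, standalone version of the cached
// "A transfers execution to B" query that the patch adds to ScalarEvolution.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical cache; the patch keys the real one by SCEVCallbackVH so that
// deleted blocks are evicted automatically, and clears entries in forgetLoop.
class BlockTransparencyCache {
  DenseMap<const BasicBlock *, bool> Transparent;

public:
  // True if every instruction in BB is guaranteed to transfer execution to
  // its successor (no throw, abort, or divergence); memoized per block.
  bool isTransparent(const BasicBlock *BB) {
    auto It = Transparent.find(BB);
    if (It != Transparent.end())
      return It->second;
    bool Result = isGuaranteedToTransferExecutionToSuccessor(BB);
    Transparent[BB] = Result;
    return Result;
  }

  // Must be called whenever BB is modified or about to be deleted.
  void forgetBlock(const BasicBlock *BB) { Transparent.erase(BB); }
};

// Returns true if executing A guarantees that B is executed, mirroring the
// dominance check plus backwards path walk used in the patch.
static bool mustExecuteBefore(const Instruction *A, const Instruction *B,
                              const DominatorTree &DT, const LoopInfo &LI,
                              BlockTransparencyCache &Cache) {
  const BasicBlock *ABB = A->getParent(), *BBB = B->getParent();

  // Same-block fast path: a bounded local scan from A to B.
  if (ABB == BBB)
    return isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
                                                      B->getIterator());

  if (!DT.dominates(ABB, BBB) || !DT.isReachableFromEntry(BBB))
    return false;

  // Step backwards from B's block: a loop header steps to its preheader,
  // otherwise follow a unique predecessor/successor edge.
  auto GetPrevBB = [&](const BasicBlock *BB) -> const BasicBlock * {
    if (const Loop *L = LI.getLoopFor(BB))
      if (L->getHeader() == BB)
        return L->getLoopPreheader();
    const BasicBlock *Pred = BB->getUniquePredecessor();
    return Pred && Pred->getUniqueSuccessor() ? Pred : nullptr;
  };

  // Collect the candidate path first; defer the transparency checks.
  SmallVector<const BasicBlock *, 8> Path;
  Path.push_back(BBB);
  while (Path.back() != ABB) {
    const BasicBlock *Prev = GetPrevBB(Path.back());
    if (!Prev)
      return false;
    Path.push_back(Prev);
  }

  // Interior blocks must be fully transparent (whole-block cached query).
  for (unsigned I = 1, E = Path.size() - 1; I < E; ++I)
    if (!Cache.isTransparent(Path[I]))
      return false;

  // Endpoints only need their relevant half checked; this bounded scan stays
  // uncached, as in the patch's doLocalSearch lambda.
  return isGuaranteedToTransferExecutionToSuccessor(BBB->begin(),
                                                    B->getIterator()) &&
         isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
                                                    ABB->end());
}

Only whole-block answers are ever written to the cache: a transparent prefix or suffix of a block says nothing about the rest of that block, so caching the partial endpoint scans would risk storing false positives, which is exactly the concern noted in the patch's comments.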