diff --git a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h --- a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h +++ b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -15,7 +15,9 @@ #ifndef LLVM_ANALYSIS_LOOPUNROLLANALYZER_H #define LLVM_ANALYSIS_LOOPUNROLLANALYZER_H +#include "llvm/ADT/Optional.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/InstVisitor.h" @@ -47,9 +49,9 @@ public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap &SimplifiedValues, - ScalarEvolution &SE, const Loop *L) - : SimplifiedValues(SimplifiedValues), SE(SE), L(L) { - IterationNumber = SE.getConstant(APInt(64, Iteration)); + ScalarEvolution &SE, const Loop *L, MemorySSA *MSSA) + : SimplifiedValues(SimplifiedValues), SE(SE), L(L), MSSA(MSSA) { + IterationNumber = SE.getConstant(APInt(64, Iteration)); } // Allow access to the initial visit method. @@ -80,12 +82,13 @@ ScalarEvolution &SE; const Loop *L; + MemorySSA *MSSA; bool simplifyInstWithSCEV(Instruction *I); /// Try to simplify an address computation outside of the loop body, so we can /// compare it with simplified addresses in the loop. - void simplifyNonLoopAddress(Value *V); + Optional simplifyNonLoopAddress(Value *V); bool visitInstruction(Instruction &I) { return simplifyInstWithSCEV(&I); } bool visitBinaryOperator(BinaryOperator &I); diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -113,9 +114,9 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, - OptimizationRemarkEmitter *ORE, unsigned &TripCount, - unsigned MaxTripCount, unsigned &TripMultiple, - unsigned LoopSize, + OptimizationRemarkEmitter *ORE, MemorySSA *MSSA, + unsigned &TripCount, unsigned MaxTripCount, + unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound); diff --git a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp --- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -60,30 +60,32 @@ return false; } -void UnrolledInstAnalyzer::simplifyNonLoopAddress(Value *V) { +Optional +UnrolledInstAnalyzer::simplifyNonLoopAddress(Value *V) { Instruction *Inst = dyn_cast(V); // For now we only try to simplify address computation instructions outside // the current loop. if (!Inst || !Inst->getType()->isPointerTy() || !SE.isSCEVable(Inst->getType()) || L->contains(Inst)) - return; + return {}; auto Iter = SimplifiedAddresses.find(Inst); if (Iter != SimplifiedAddresses.end()) - return; + return {}; const SCEV *S = SE.getSCEV(Inst); // Check if the offset from the base address becomes a constant. auto *Base = dyn_cast(SE.getPointerBase(S)); if (!Base) - return; + return {}; auto *Offset = dyn_cast(SE.getMinusSCEV(S, Base)); if (!Offset) - return; + return {}; SimplifiedAddress Address; Address.Base = Base->getValue(); Address.Offset = Offset->getValue(); SimplifiedAddresses[V] = Address; + return Address; } /// Try to simplify binary operator I. @@ -124,8 +126,32 @@ if (AddressIt == SimplifiedAddresses.end()) return false; ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset; + Value *AddrBase = AddressIt->second.Base; + + if (MSSA) { + // Try to find a defining store using the simplified address of the load. + const MemoryAccess *MemAcc = MSSA->getMemoryAccess(&I); + for (auto *D : def_chain(MemAcc)) { + if (isa(D)) + continue; + const MemoryDef *MD = dyn_cast(D); + if (!MD) + break; + StoreInst *SI = dyn_cast_or_null(MD->getMemoryInst()); + if (!SI) + break; + Optional MaybeSimpAddr = + simplifyNonLoopAddress(SI->getPointerOperand()); + if (!MaybeSimpAddr) + break; + + if (MaybeSimpAddr->Base == AddrBase && + MaybeSimpAddr->Offset == SimplifiedAddrOp) + return true; + } + } - auto *GV = dyn_cast(AddressIt->second.Base); + auto *GV = dyn_cast(AddrBase); // We're only interested in loads that can be completely folded to a // constant. if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant()) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -165,7 +165,7 @@ unsigned MaxTripCount = 0; bool UseUpperBound = false; bool ExplicitUnroll = computeUnrollCount( - L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount, + L, TTI, DT, LI, SE, EphValues, ORE, nullptr, OuterTripCount, MaxTripCount, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); if (ExplicitUnroll || UseUpperBound) { // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -327,10 +328,12 @@ /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. -static Optional analyzeLoopUnrollCost( - const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, - const SmallPtrSetImpl &EphValues, - const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) { +static Optional +analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, + ScalarEvolution &SE, + const SmallPtrSetImpl &EphValues, + const TargetTransformInfo &TTI, MemorySSA *MSSA, + unsigned MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -498,7 +501,7 @@ while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L, MSSA); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -734,8 +737,8 @@ bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, - OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, - unsigned &TripMultiple, unsigned LoopSize, + OptimizationRemarkEmitter *ORE, MemorySSA *MSSA, unsigned &TripCount, + unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -806,7 +809,7 @@ // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, SE, EphValues, TTI, + L, FullUnrollTripCount, DT, SE, EphValues, TTI, MSSA, UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); @@ -982,7 +985,7 @@ Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, + ProfileSummaryInfo *PSI, MemorySSA *MSSA, bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, bool ForgetAllSCEV, Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, Optional ProvidedRuntime, Optional ProvidedUpperBound, @@ -1096,7 +1099,7 @@ // fully unroll the loop. bool UseUpperBound = false; bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + L, TTI, DT, LI, SE, EphValues, &ORE, MSSA, TripCount, MaxTripCount, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; @@ -1209,12 +1212,14 @@ // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(&F); bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + MemorySSA *MSSA = &getAnalysis().getMSSA(); - LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel, - OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold, - ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, - ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling); + LoopUnrollResult Result = + tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, MSSA, + PreserveLCSSA, OptLevel, OnlyWhenForced, ForgetAllSCEV, + ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime, ProvidedUpperBound, + ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling); if (Result == LoopUnrollResult::FullyUnrolled) LPM.markLoopAsDeleted(*L); @@ -1227,6 +1232,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); // FIXME: Loop passes are required to preserve domtree, and for now we just // recreate dom info if anything gets unrolled. getLoopAnalysisUsage(AU); @@ -1240,6 +1246,7 @@ INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) @@ -1280,6 +1287,8 @@ "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); + auto *MSSAA = FAM.getCachedResult(*F); + MemorySSA *MSSA = MSSAA ? &MSSAA->getMSSA() : nullptr; // Keep track of the previous loop structure so we can identify new loops // created by unrolling. Loop *ParentL = L.getParentLoop(); @@ -1292,7 +1301,7 @@ std::string LoopName = L.getName(); bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*BFI*/ nullptr, /*PSI*/ nullptr, + /*BFI*/ nullptr, /*PSI*/ nullptr, MSSA, /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, ForgetSCEV, /*Count*/ None, /*Threshold*/ None, /*AllowPartial*/ false, @@ -1388,6 +1397,7 @@ auto &DT = AM.getResult(F); auto &AC = AM.getResult(F); auto &ORE = AM.getResult(F); + auto &MSSA = AM.getResult(F).getMSSA(); LoopAnalysisManager *LAM = nullptr; if (auto *LAMProxy = AM.getCachedResult(F)) @@ -1435,7 +1445,7 @@ // The API here is quite complex to call and we allow to select some // flavors of unrolling during construction time (by setting UnrollOpts). LoopUnrollResult Result = tryToUnrollLoop( - &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI, + &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI, &MSSA, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, diff --git a/llvm/test/Transforms/LoopUnroll/unrolled-inst-analyzer-pointers.ll b/llvm/test/Transforms/LoopUnroll/unrolled-inst-analyzer-pointers.ll --- a/llvm/test/Transforms/LoopUnroll/unrolled-inst-analyzer-pointers.ll +++ b/llvm/test/Transforms/LoopUnroll/unrolled-inst-analyzer-pointers.ll @@ -3,7 +3,7 @@ ; REQUIRES: asserts ; Check that we simplify pointers outside of the loop for comparisons. -; %cmp1.i.i is free when unrolling. +; %cmp1.i.i is free when unrolling, so is the load %.pre3 define i32 @test_2_iters(i32 %a, i32 %b, i32 %c, i32 %d) optsize { ; CHECK-LABEL: Loop Unroll: F[test_2_iters] Loop %loop @@ -13,6 +13,60 @@ ; CHECK-NEXT: Analyzing iteration 1 ; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %spec.select.i.i = select i1 %cmp.i.i.i, i32* %incdec.ptr.i.i8, i32* %spec.select.i.i7 ; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %cmp.i.i.i = icmp slt i32 %.pre, %.pre3 +; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %.pre = load i32, i32* %spec.select.i.i7, align 4 +; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %incdec.ptr.i.i = getelementptr inbounds i32, i32* %incdec.ptr.i.i8, i64 1 +; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %spec.select.i.i = select i1 %cmp.i.i.i, i32* %incdec.ptr.i.i8, i32* %spec.select.i.i7 +; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %cmp.i.i.i = icmp slt i32 %.pre, %.pre3 +; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %.pre = load i32, i32* %spec.select.i.i7, align 4 +; CHECK-NEXT: Analysis finished: +; CHECK-NEXT: UnrolledCost: 7, RolledDynamicCost: 14 + + +entry: + %ref.tmp = alloca [4 x i32], align 4 + %0 = bitcast [4 x i32]* %ref.tmp to i8* + %arrayinit.begin = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 0 + store i32 %a, i32* %arrayinit.begin, align 4 + %arrayinit.element = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 1 + store i32 %b, i32* %arrayinit.element, align 4 + %arrayinit.element1 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 2 + store i32 %c, i32* %arrayinit.element1, align 4 + %arrayinit.element2 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 3 + store i32 %d, i32* %arrayinit.element2, align 4 + %add.ptr.i.i = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 4 + %cmp.i.i.i4 = icmp slt i32 %a, %b + %spec.select.i.i5 = select i1 %cmp.i.i.i4, i32* %arrayinit.element, i32* %arrayinit.begin + %incdec.ptr.i.i6 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 2 + br label %loop + +loop: ; preds = %entry, %loop + %incdec.ptr.i.i8 = phi i32* [ %incdec.ptr.i.i6, %entry ], [ %incdec.ptr.i.i, %loop ] + %spec.select.i.i7 = phi i32* [ %spec.select.i.i5, %entry ], [ %spec.select.i.i, %loop ] + %.pre = load i32, i32* %spec.select.i.i7, align 4 + %.pre3 = load i32, i32* %incdec.ptr.i.i8, align 4 + %cmp.i.i.i = icmp slt i32 %.pre, %.pre3 + %spec.select.i.i = select i1 %cmp.i.i.i, i32* %incdec.ptr.i.i8, i32* %spec.select.i.i7 + %incdec.ptr.i.i = getelementptr inbounds i32, i32* %incdec.ptr.i.i8, i64 1 + %cmp1.i.i = icmp eq i32* %incdec.ptr.i.i, %add.ptr.i.i + br i1 %cmp1.i.i, label %exit, label %loop + +exit: ; preds = %loop + %1 = load i32, i32* %spec.select.i.i, align 4 + ret i32 %1 +} + + +; Same as test_2_iters, except that we have a store to an unknown index, +; potentially clobbering all stores with known indices. + +define i32 @test_2_iters_unknown_clobber(i32 %a, i32 %b, i32 %c, i32 %d, i64 %idx) optsize { +; CHECK-LABEL: Loop Unroll: F[test_2_iters_unknown_clobber] Loop %loop +; CHECK-NEXT: Loop Size = 7 +; CHECK-NEXT: Starting LoopUnroll profitability analysis... +; CHECK-NEXT: Analyzing iteration 0 +; CHECK-NEXT: Analyzing iteration 1 +; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %spec.select.i.i = select i1 %cmp.i.i.i, i32* %incdec.ptr.i.i8, i32* %spec.select.i.i7 +; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %cmp.i.i.i = icmp slt i32 %.pre, %.pre3 ; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %.pre3 = load i32, i32* %incdec.ptr.i.i8, align 4 ; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %.pre = load i32, i32* %spec.select.i.i7, align 4 ; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %incdec.ptr.i.i = getelementptr inbounds i32, i32* %incdec.ptr.i.i8, i64 1 @@ -22,8 +76,6 @@ ; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %.pre = load i32, i32* %spec.select.i.i7, align 4 ; CHECK-NEXT: Analysis finished: ; CHECK-NEXT: UnrolledCost: 9, RolledDynamicCost: 14 - - entry: %ref.tmp = alloca [4 x i32], align 4 %0 = bitcast [4 x i32]* %ref.tmp to i8* @@ -35,6 +87,8 @@ store i32 %c, i32* %arrayinit.element1, align 4 %arrayinit.element2 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 3 store i32 %d, i32* %arrayinit.element2, align 4 + %arrayinit.element3 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 %idx + store i32 %d, i32* %arrayinit.element3, align 4 %add.ptr.i.i = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 4 %cmp.i.i.i4 = icmp slt i32 %a, %b %spec.select.i.i5 = select i1 %cmp.i.i.i4, i32* %arrayinit.element, i32* %arrayinit.begin diff --git a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp --- a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp +++ b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp @@ -37,7 +37,8 @@ TripCount = SE->getSmallConstantTripCount(L, Exiting); for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { DenseMap SimplifiedValues; - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L, + nullptr); for (auto *BB : L->getBlocks()) for (Instruction &I : *BB) Analyzer.visit(I);