diff --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h
@@ -15,6 +15,7 @@
 #define LLVM_ANALYSIS_LOOPCACHEANALYSIS_H
 
 #include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/raw_ostream.h"
@@ -28,7 +29,7 @@
 class SCEV;
 class TargetTransformInfo;
 
-using CacheCostTy = int64_t;
+using CacheCostTy = std::pair<int64_t, int64_t>;
 using LoopVectorTy = SmallVector<Loop *, 8>;
 
 /// Represents a memory reference as a base pointer and a set of indexing
@@ -53,6 +54,27 @@
   bool isValid() const { return IsValid; }
   const SCEV *getBasePointer() const { return BasePointer; }
   size_t getNumSubscripts() const { return Subscripts.size(); }
+  const SCEV *getStepRecurrence(const Loop *L) const {
+    return StepRecurrences.lookup(L);
+  }
+  // Set up the mapping between loops and array access strides for the memory
+  // reference. For example,
+  //   void foo(long n, long m, long o, int A[n][m][o]) {
+  //     for (long j = 0; j < m; j++)
+  //       for (long i = 0; i < n; i++)
+  //         for (long k = 0; k < o; k++)
+  //           A[2*i+3][2*j-4][2*k+7] = 1;
+  //   }
+  // Loop 'for.i' maps to stride = 8*m*o,
+  // Loop 'for.j' maps to stride = 8*o,
+  // Loop 'for.k' maps to stride = 8.
+  // If m and o are not known constants, we will try to provide estimated
+  // strides later.
+  void setStepRecurrence(const Loop *L, const SCEV *S) {
+    // Non-AddRec accesses have no step recurrence; record nullptr for them.
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+    StepRecurrences[L] = AR == nullptr ? nullptr : AR->getStepRecurrence(SE);
+  }
   const SCEV *getSubscript(unsigned SubNum) const {
     assert(SubNum < getNumSubscripts() && "Invalid subscript number");
     return Subscripts[SubNum];
@@ -90,7 +112,10 @@
   ///     + the coefficient of this loop's index variable used in all other
   ///       subscripts is zero
   ///   - or otherwise equal to 'TripCount'.
-  CacheCostTy computeRefCost(const Loop &L, unsigned CLS) const;
+  CacheCostTy
+  computeRefCost(const Loop &L,
+                 const DenseMap<const SCEV *, unsigned> &EstimatedTripCounts,
+                 unsigned CLS) const;
 
 private:
   /// Attempt to delinearize the indexed reference.
@@ -138,6 +163,9 @@
   SmallVector<const SCEV *, 3> Sizes;
 
   ScalarEvolution &SE;
+
+  // The mapping between loops and array access strides for the memory
+  // reference.
+  SmallDenseMap<const Loop *, const SCEV *> StepRecurrences;
 };
 
 /// A reference group represents a set of memory references that exhibit
@@ -177,7 +205,7 @@
   using LoopCacheCostTy = std::pair<const Loop *, CacheCostTy>;
 
 public:
-  static CacheCostTy constexpr InvalidCost = -1;
+  static CacheCostTy constexpr InvalidCost = {-1, -1};
 
   /// Construct a CacheCost object for the loop nest described by \p Loops.
   /// The optional parameter \p TRT can be used to specify the max. distance
@@ -201,7 +229,10 @@
     auto IT = llvm::find_if(LoopCosts, [&L](const LoopCacheCostTy &LCC) {
       return LCC.first == &L;
     });
-    return (IT != LoopCosts.end()) ? (*IT).second : -1;
+    if (IT != LoopCosts.end())
+      return (*IT).second;
+
+    return InvalidCost;
   }
 
   /// Return the estimated ordered loop costs.
@@ -235,10 +266,17 @@
   CacheCostTy computeRefGroupCacheCost(const ReferenceGroupTy &RG,
                                        const Loop &L) const;
 
-  /// Sort the LoopCosts vector by decreasing cache cost.
+  /// Sort the LoopCosts vector by decreasing cache cost, and decreasing stride
+  /// if cache costs are the same.
   void sortLoopCosts() {
-    sort(LoopCosts, [](const LoopCacheCostTy &A, const LoopCacheCostTy &B) {
-      return A.second > B.second;
+    sort(LoopCosts, [this](const LoopCacheCostTy &A, const LoopCacheCostTy &B) {
+      const CacheCostTy &CostA = A.second;
+      const CacheCostTy &CostB = B.second;
+      // Tie-break on stride only when costs match and both strides are valid.
+      if (CostA.first == CostB.first && CostA.second != InvalidCost.second &&
+          CostB.second != InvalidCost.second) {
+        return CostA.second > CostB.second;
+      }
+      return CostA.first > CostB.first;
     });
   }
 
@@ -249,6 +287,9 @@
 
   /// Trip counts for the loops in the loop nest associated with this object.
   SmallVector<std::pair<const Loop *, unsigned>, 3> TripCounts;
 
+  /// Mapping between the symbolic (possibly non-constant) trip counts and the
+  /// estimated constant trip counts; used to estimate strides for
+  /// parametric-sized arrays.
+  DenseMap<const SCEV *, unsigned> EstimatedTripCounts;
+
   /// Cache costs for the loops in the loop nest associated with this object.
   SmallVector<LoopCacheCostTy, 3> LoopCosts;
 
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -33,7 +33,6 @@
 #include "llvm/Analysis/Delinearization.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -113,6 +112,34 @@
   return SE.getTripCountFromExitCount(BackedgeTakenCount);
 }
 
+/// Estimate a constant integer stride for a memory reference given its actual
+/// \p Stride. This would facilitate sorting of memory references (based on
+/// costs and strides). Return \p true if we are able to get an estimated
+/// integer stride.
+static bool
+getEstimatedStride(const SCEV *Stride,
+                   const DenseMap<const SCEV *, unsigned> &EstimatedTripCounts,
+                   int64_t &EstimatedStride) {
+  auto MulStride = dyn_cast<SCEVMulExpr>(Stride);
+  if (MulStride == nullptr)
+    return false;
+
+  for (unsigned i = 0; i < MulStride->getNumOperands(); i++) {
+    const auto *Op = MulStride->getOperand(i);
+    if (auto ConstantOp = dyn_cast<SCEVConstant>(Op)) {
+      EstimatedStride *= ConstantOp->getValue()->getSExtValue();
+      continue;
+    } else if (EstimatedTripCounts.find(Op) != EstimatedTripCounts.end()) {
+      // Substitute the estimated trip count for the symbolic operand.
+      EstimatedStride *= EstimatedTripCounts.lookup(Op);
+      continue;
+    } else {
+      EstimatedStride = -1;
+      break;
+    }
+  }
+  return EstimatedStride != -1;
+}
+
 //===----------------------------------------------------------------------===//
 // IndexedReference implementation
 //
@@ -260,18 +287,20 @@
   return true;
 }
 
-CacheCostTy IndexedReference::computeRefCost(const Loop &L,
-                                             unsigned CLS) const {
+CacheCostTy IndexedReference::computeRefCost(
    const Loop &L, const DenseMap<const SCEV *, unsigned> &EstimatedTripCounts,
+    unsigned CLS) const {
   assert(IsValid && "Expecting a valid reference");
   LLVM_DEBUG({
     dbgs().indent(2) << "Computing cache cost for:\n";
     dbgs().indent(4) << *this << "\n";
   });
 
-  // If the indexed reference is loop invariant the cost is one.
+  // If the indexed reference is loop invariant the cost is one
+  // and the stride is zero.
   if (isLoopInvariant(L)) {
     LLVM_DEBUG(dbgs().indent(4) << "Reference is loop invariant: RefCost=1\n");
-    return 1;
+    return {1, 0};
   }
 
   const SCEV *TripCount = computeTripCount(L, SE);
@@ -286,11 +315,11 @@
   // If the indexed reference is 'consecutive' the cost is
   // (TripCount*Stride)/CLS, otherwise the cost is TripCount.
   const SCEV *RefCost = TripCount;
-
+  const SCEV *Stride = nullptr;
   if (isConsecutive(L, CLS)) {
     const SCEV *Coeff = getLastCoefficient();
     const SCEV *ElemSize = Sizes.back();
-    const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
+    Stride = SE.getMulExpr(Coeff, ElemSize);
     Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
     const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS);
     if (SE.isKnownNegative(Stride))
@@ -303,14 +332,28 @@
     LLVM_DEBUG(dbgs().indent(4)
                << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
                << *RefCost << "\n");
-  } else
+  } else {
+    // May be null when no step recurrence was recorded for this loop.
+    Stride = getStepRecurrence(&L);
     LLVM_DEBUG(dbgs().indent(4)
                << "Access is not consecutive: RefCost=TripCount=" << *RefCost
                << "\n");
+  }
 
   // Attempt to fold RefCost into a constant.
-  if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
-    return ConstantCost->getValue()->getSExtValue();
+  if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost)) {
+    // Stride may be null (non-consecutive access with no recorded step
+    // recurrence), so guard every use of it.
+    if (auto ConstantStride = dyn_cast_or_null<SCEVConstant>(Stride))
+      return {ConstantCost->getValue()->getSExtValue(),
+              ConstantStride->getValue()->getSExtValue()};
+    if (Stride != nullptr) {
+      // For most parametric-sized loops the strides and trip counts
+      // are not known constants. Try to estimate the strides using the
+      // estimated trip counts.
+      int64_t EstimatedStride = 1;
+      if (getEstimatedStride(Stride, EstimatedTripCounts, EstimatedStride))
+        return {ConstantCost->getValue()->getSExtValue(), EstimatedStride};
+    }
+    return {ConstantCost->getValue()->getSExtValue(), -1};
+  }
 
   LLVM_DEBUG(dbgs().indent(4)
              << "RefCost is not a constant! Setting to RefCost=InvalidCost "
@@ -332,6 +375,15 @@
   const SCEV *AccessFn =
       SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), L);
 
+  // Populate the access stride for each loop.
+  Loop *CurLoop = L;
+  while (CurLoop) {
+    const SCEV *CurAccessFn =
+        SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), CurLoop);
+    setStepRecurrence(CurLoop, CurAccessFn);
+    CurLoop = CurLoop->getParentLoop();
+  }
+
   BasePointer = dyn_cast<SCEVUnknown>(SE.getPointerBase(AccessFn));
   if (BasePointer == nullptr) {
     LLVM_DEBUG(
@@ -467,10 +519,14 @@
 //===----------------------------------------------------------------------===//
 // CacheCost implementation
 //
+CacheCostTy constexpr CacheCost::InvalidCost;
+
 raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) {
   for (const auto &LC : CC.LoopCosts) {
     const Loop *L = LC.first;
-    OS << "Loop '" << L->getName() << "' has cost = " << LC.second << "\n";
+    OS << "Loop '" << L->getName() << "' has cost = " << LC.second.first;
+    if (LC.second.second != -1)
+      OS << " and stride = " << LC.second.second;
+    OS << "\n";
   }
   return OS;
 }
@@ -487,6 +543,12 @@
   unsigned TripCount = SE.getSmallConstantTripCount(L);
   TripCount = (TripCount == 0) ?
DefaultTripCount : TripCount; TripCounts.push_back({L, TripCount}); + // Keep track of the mapping between the actual (possibly non-constant) trip + // counts and the estimated trip counts, i.e., DefaultTripCount. This will + // be useful later when estimating the strides for parametric-sized arrays. + const SCEV *S = SE.getTripCountFromExitCount( + SE.getExitCount(L, L->getExitingBlock()), /*Extend=*/false); + EstimatedTripCounts[S] = TripCount; } calculateCacheFootprint(); @@ -618,21 +680,23 @@ << "' as innermost loop.\n"); // Compute the product of the trip counts of each other loop in the nest. - CacheCostTy TripCountsProduct = 1; + uint64_t TripCountsProduct = 1; for (const auto &TC : TripCounts) { if (TC.first == &L) continue; TripCountsProduct *= TC.second; } - CacheCostTy LoopCost = 0; + CacheCostTy LoopCost = {0, 0}; for (const ReferenceGroupTy &RG : RefGroups) { CacheCostTy RefGroupCost = computeRefGroupCacheCost(RG, L); - LoopCost += RefGroupCost * TripCountsProduct; + LoopCost.first += RefGroupCost.first * TripCountsProduct; + LoopCost.second = std::max(LoopCost.second, RefGroupCost.second); } - LLVM_DEBUG(dbgs().indent(2) << "Loop '" << L.getName() - << "' has cost=" << LoopCost << "\n"); + LLVM_DEBUG(dbgs().indent(2) + << "Loop '" << L.getName() << "' has cost=" << LoopCost.first + << " and stride= " << LoopCost.second << " \n"); return LoopCost; } @@ -642,7 +706,8 @@ assert(!RG.empty() && "Reference group should have at least one member."); const IndexedReference *Representative = RG.front().get(); - return Representative->computeRefCost(L, TTI.getCacheLineSize()); + return Representative->computeRefCost(L, EstimatedTripCounts, + TTI.getCacheLineSize()); } //===----------------------------------------------------------------------===// diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll +++ 
b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll @@ -75,3 +75,74 @@ ret void } +; void foo(long n, long m, long o, int A[n][m][o]) { +; for (long j = 0; j < m; j++) +; for (long i = 0; i < n; i++) +; for (long k = 0; k < o; k++) +; A[2*i+3][2*j-4][2*k+7] = 1; +; } + +; CHECK-DAG: Loop 'for.i' has cost = 1000000 and stride = 80000 +; CHECK-DAG: Loop 'for.j' has cost = 1000000 and stride = 800 +; CHECK-DAG: Loop 'for.k' has cost = 60000 and stride = 8 + +define void @foo2(i64 %n, i64 %m, i64 %o, i32* %A) { +entry: + %cmp32 = icmp sgt i64 %n, 0 + %cmp230 = icmp sgt i64 %m, 0 + %cmp528 = icmp sgt i64 %o, 0 + br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end + +for.cond1.preheader.lr.ph: ; preds = %entry + br i1 %cmp230, label %for.j.preheader, label %for.end + +for.j.preheader: ; preds = %for.cond1.preheader.lr.ph + br i1 %cmp528, label %for.j.preheader.split, label %for.end + +for.j.preheader.split: ; preds = %for.j.preheader + br label %for.j + +for.i: ; preds = %for.inci, %for.j + %i = phi i64 [ %inci, %for.inci ], [ 0, %for.j ] + %mul8 = shl i64 %i, 1 + %add9 = add nsw i64 %mul8, 3 + %0 = mul i64 %add9, %m + %sub = add i64 %0, -4 + %mul7 = mul nsw i64 %j, 2 + %tmp = add i64 %sub, %mul7 + %tmp27 = mul i64 %tmp, %o + br label %for.k + +for.j: ; preds = %for.incj, %for.j.preheader.split + %j = phi i64 [ %incj, %for.incj ], [ 0, %for.j.preheader.split ] + br label %for.i + +for.k: ; preds = %for.k, %for.i + %k = phi i64 [ 0, %for.i ], [ %inck, %for.k ] + + %mul = mul nsw i64 %k, 2 + %arrayidx.sum = add i64 %mul, 7 + %arrayidx10.sum = add i64 %arrayidx.sum, %tmp27 + %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %arrayidx10.sum + store i32 1, i32* %arrayidx11, align 4 + + %inck = add nsw i64 %k, 1 + %exitcond.us = icmp eq i64 %inck, %o + br i1 %exitcond.us, label %for.inci, label %for.k + +for.incj: ; preds = %for.inci + %incj = add nsw i64 %j, 1 + %exitcond54.us = icmp eq i64 %incj, %m + br i1 %exitcond54.us, label %for.end.loopexit, 
label %for.j + +for.inci: ; preds = %for.k + %inci = add nsw i64 %i, 1 + %exitcond55.us = icmp eq i64 %inci, %n + br i1 %exitcond55.us, label %for.incj, label %for.i + +for.end.loopexit: ; preds = %for.incj + br label %for.end + +for.end: ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry + ret void +}