Index: llvm/include/llvm/Analysis/LoopCacheAnalysis.h =================================================================== --- /dev/null +++ llvm/include/llvm/Analysis/LoopCacheAnalysis.h @@ -0,0 +1,279 @@ +//===- llvm/Analysis/LoopCacheAnalysis.h ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the interface for the loop cache analysis. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_LOOPCACHEANALYSIS_H +#define LLVM_ANALYSIS_LOOPCACHEANALYSIS_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class LPMUpdater; +using CacheCostTy = int64_t; +using LoopVectorTy = SmallVector; + +/// Represents a memory reference as a base pointer and a set of indexing +/// operations. For example given the array reference A[i][2j+1][3k+2] in a +/// 3-dim loop nest: +/// for(i=0;i A +/// Subscripts -> [{0,+,1}<%for.i>][{1,+,2}<%for.j>][{2,+,3}<%for.k>] +/// Sizes -> [m][o][4] +class IndexedReference { + friend raw_ostream &operator<<(raw_ostream &OS, const IndexedReference &R); + +public: + /// Construct an indexed reference given a \p StoreOrLoadInst instruction. + IndexedReference(Instruction &StoreOrLoadInst, const LoopInfo &LI, + ScalarEvolution &SE); + + bool isValid() const { return IsValid; } + const SCEV *getBasePointer() const { return BasePointer; } + size_t getNumSubscripts() const { return Subscripts.size(); } + const SCEV *getSubscript(unsigned SubNum) const { + assert(SubNum < getNumSubscripts() && "Invalid subscript number"); + return Subscripts[SubNum]; + } + const SCEV *getFirstSubscript() const { + assert(!Subscripts.empty() && "Expecting non-empty container"); + return Subscripts.front(); + } + const SCEV *getLastSubscript() const { + assert(!Subscripts.empty() && "Expecting non-empty container"); + return Subscripts.back(); + } + + /// Return true if the current object and the indexed reference \p Other are + /// in the same cache line of size \p CLS. This is true iff the distance + /// between the 2 indexed references in the innermost dimension is less than + /// the cache line size. + bool hasSpacialReuse(const IndexedReference &Other, unsigned CLS, + AliasAnalysis &AA) const; + + /// Return true if the current object and the indexed reference \p Other + /// have distance smaller than 'TemporalReuseThreashold' in the dimension + /// associated with the given loop \p L. + bool hasTemporalReuse(const IndexedReference &Other, const Loop &L, + DependenceInfo &DI, AliasAnalysis &AA) const; + + /// Compute the cost of the reference w.r.t. the given loop \p L when it is + /// considered in the innermost position in the loop nest. + /// The cost is defined as: + /// - equal to one if the reference is loop invariant, or + /// - equal to '(TripCount * stride) / cache_line_size' if: + /// + the reference stride is less than the cache line size, and + /// + the coefficient of this loop's index variable used in all other + /// subscripts is zero + /// - or otherwise equal to 'TripCount'. + CacheCostTy computeRefCost(const Loop &L, unsigned CLS) const; + +private: + /// Attempt to delinearize the indexed reference. + bool delinearize(const LoopInfo &LI); + + /// Return true if the index reference is invariant with respect to loop \p L. + bool isLoopInvariant(const Loop &L) const; + + /// Return true if the indexed reference is 'consecutive' in loop \p L. + /// An indexed reference is 'consecutive' if the only coefficient that uses + /// the loop induction variable is the rightmost one, and the access stride is + /// smaller than the cache line size \p CLS. + bool isConsecutive(const Loop &L, unsigned CLS) const; + + /// Compute the trip count for the given loop \p L. Return the SCEV expression + /// for the trip count or nullptr if it cannot be computed. + const SCEV *computeTripCount(const Loop &L) const; + + /// Return the coefficient used in the rightmost dimension. + const SCEV *getLastCoefficient() const; + + /// Return true if the coefficient corresponding to induction variable of + /// loop \p L in the given \p Subscript is zero or is loop invariant in \p L. + bool isCoeffForLoopZeroOrInvariant(const SCEV &Subscript, + const Loop &L) const; + + /// Verify that the given \p Subscript is 'well formed' (must be a simple add + /// recurrence). + bool isSimpleAddRecurrence(const SCEV &Subscript, const Loop &L) const; + + /// Return true if the given reference \p Other is definetely aliased with + /// the indexed reference represented by this class. + bool isAliased(const IndexedReference &Other, AliasAnalysis &AA) const; + +private: + /// True if the reference can be delinearized, false otherwise. + bool IsValid = false; + + /// Represent the memory reference instruction. + Instruction &StoreOrLoadInst; + + /// The base pointer of the memory reference. + const SCEV *BasePointer = nullptr; + + /// The subscript (indexes) of the memory reference. + SmallVector Subscripts; + + /// The dimensions of the memory reference. + SmallVector Sizes; + + ScalarEvolution &SE; +}; + +/// A reference group represents a set of memory references that exhibit +/// temporal or spacial reuse. Two references belong to the same +/// reference group with respect to a inner loop L iff: +/// 1. they have a loop independent dependency, or +/// 2. they have a loop carried dependence with a small dependence distance (e.g. +/// less than 2) carried by the inner loop, or +/// 3. they refer to the same array, and the subscript in their innermost +/// dimension is less than or equal to 'd' (where 'd' is less than the cache +/// line size) +/// +/// Intuitively a reference group represents memory references that access +/// the same cache line. Conditions 1,2 above account for temporal reuse, while +/// contition 3 accounts for spacial reuse. +using ReferenceGroupTy = SmallVector, 8>; +using ReferenceGroupsTy = SmallVector; + +/// \c CacheCost represents the estimated cost of a inner loop as the number of +/// cache lines used by the memory references it contains. +/// The 'cache cost' of a loop 'L' in a loop nest 'LN' is computed as the sum of +/// the cache costs of all of its reference groups when the loop is considered +/// to be in the innermost position in the nest. +class CacheCost { + friend raw_ostream &operator<<(raw_ostream &OS, const CacheCost &CC); + using LoopTripCountTy = std::pair; + using LoopCacheCostTy = std::pair; + +public: + static CacheCostTy constexpr InvalidCost = ULLONG_MAX; + + /// Construct a CacheCost object for the loop nest described by \p Loops. + CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE, + TargetTransformInfo &TTI, AliasAnalysis &AA, DependenceInfo &DI); + + /// Calculate the cache footprint of each loop in the nest (when it is + /// considered to be in the innermost position). + void calculateCacheFootprint(); + + /// Return the estimated cost of loop \p L if the given loop is part of the + /// loop nest associated with this object. Return -1 otherwise. + CacheCostTy getLoopCost(const Loop &L) const { + auto IT = std::find_if( + LoopCosts.begin(), LoopCosts.end(), + [&L](const LoopCacheCostTy &LCC) { return LCC.first == &L; }); + return (IT != LoopCosts.end()) ? (*IT).second : -1; + } + + const SmallVector &getLoopCosts() const { + return LoopCosts; + } + +private: + /// Partition store/load instructions in the loop nest into reference groups. + /// Two or more memory accesses belong in the same reference group if they + /// share the same cache line. + bool populateReferenceGroups(ReferenceGroupsTy &RefGroups) const; + + /// Calculate the cost of the given loop \p L assuming it is the innermost + /// loop in nest. + CacheCostTy computeLoopCacheCost(const Loop &L, + const ReferenceGroupsTy &RefGroups) const; + + /// Compute the cost of a representative reference in reference group \p RG + /// when the given loop \p L is considered as the innermost loop in the nest. + /// The computed cost is an estimate for the number of cache lines used by the + /// reference group. The representative reference cost is defined as: + /// - equal to one if the reference is loop invariant, or + /// - equal to '(TripCount * stride) / cache_line_size' if (a) loop \p L's + /// induction variable is used only in the reference subscript associated + /// with loop \p L, and (b) the reference stride is less than the cache + /// line size, or + /// - TripCount otherwise + CacheCostTy computeRefGroupCacheCost(const ReferenceGroupTy &RG, + const Loop &L) const; + + /// Sort the LoopCosts vector by decreasing cache cost. + void sortLoopCosts() { + sort(LoopCosts, [](const LoopCacheCostTy &A, const LoopCacheCostTy &B) { + return A.second > B.second; + }); + } + +private: + /// Loops in the loop nest associated with this object. + LoopVectorTy Loops; + + /// Trip counts for the loops in the loop nest associated with this object. + SmallVector TripCounts; + + /// Cache costs for the loops in the loop nest associated with this object. + SmallVector LoopCosts; + + const LoopInfo &LI; + ScalarEvolution &SE; + TargetTransformInfo &TTI; + AliasAnalysis &AA; + DependenceInfo &DI; +}; + +/// Estimate the number of cache lines used by the memory references in a loop. +/// This analysis starts by classifing memory references in a loop into +/// reference groups. A reference group represents memory references that fall +/// into the same cache line. Each reference group is analysed with respect to +/// the innermost loop in a loop nest. The cost of a reference is defined as +/// follow: +/// - one if it is loop invariant w.r.t the innermost loop, +/// - equal to the loop trip count divided by the cache line times the +/// reference stride if the reference stride is less than the cache line +/// size (CLS), and the coefficient of this loop's index variable used in all +/// other subscripts is zero (e.g. RefCost = TripCount/(CLS/RefStride)) +/// - equal to the innermost loop trip count if the reference stride is greater +/// or equal to the cache line size CLS. +class LoopCacheAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + LoopCacheAnalysis() = default; + + using Result = std::unique_ptr; + Result run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR); +}; + +/// Printer pass for the \c CacheCost results. +class LoopCachePrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit LoopCachePrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_LOOPCACHEANALYSIS_H Index: llvm/lib/Analysis/CMakeLists.txt =================================================================== --- llvm/lib/Analysis/CMakeLists.txt +++ llvm/lib/Analysis/CMakeLists.txt @@ -51,6 +51,7 @@ Loads.cpp LoopAccessAnalysis.cpp LoopAnalysisManager.cpp + LoopCacheAnalysis.cpp LoopUnrollAnalyzer.cpp LoopInfo.cpp LoopPass.cpp Index: llvm/lib/Analysis/LoopCacheAnalysis.cpp =================================================================== --- /dev/null +++ llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -0,0 +1,632 @@ +//===- LoopCacheAnalysis.cpp - Loop Cache Analysis -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the implementation for the loop cache analysis. +/// The implementation is largely based on the following paper: +/// +/// Compiler Optimizations for Improving Data Locality +/// By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng +/// http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf +/// +/// The general approach taken to estimate the number of cache lines used by the +/// memory references in an inner loop is: +/// 1. Partition memory references that exhibit temporal or spacial reuse +/// into reference groups. +/// 2. For each loop L in the a loop nest LN: +/// a. Compute the cost of the reference group +/// b. Compute the loop cost by summing up the reference groups costs +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopCacheAnalysis.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "loop-cache-cost" + +static cl::opt DefaultTripCount( + "default-trip-count", cl::init(100), cl::Hidden, + cl::desc("Use this to specify the default trip count of a loop")); + +static cl::opt TemporalReuseThreshold( + "temporal-reuse-threshold", cl::init(2), cl::Hidden, + cl::desc("Use this to specify the temporal reuse distance threshold")); + +/// Populate \p Loops with the loops in the loop nest rooted by loop \p Root. +static void getLoops(Loop &Root, LoopVectorTy &Loops) { + assert(Root.getParentLoop() == nullptr && "Root should be a top level loop"); + assert(Loops.empty() && "Expecting Loops to be empty"); + + Loop *CurrentLoop = &Root; + Loops.push_back(CurrentLoop); + + // Collect the loops in the incoming nest in breadth-first order. + SmallVector WL; + WL.push_back(CurrentLoop); + + while (!WL.empty()) { + CurrentLoop = WL.back(); + WL.pop_back(); + const auto *SubLoops = &CurrentLoop->getSubLoops(); + if (SubLoops->empty()) + continue; + + Loops.insert(Loops.end(), SubLoops->begin(), SubLoops->end()); + WL.insert(WL.begin(), SubLoops->rbegin(), SubLoops->rend()); + } +} + +/// Retrieve the innermost loop in the given loop nest \p Loops. It returns a +/// nullptr if any loops in the loop vector supplied has more than one sibling. +/// The loop vector is expected to contain loops collected in breadth-first +/// order. +static Loop *getInnerMostLoop(const LoopVectorTy &Loops) { + assert(!Loops.empty() && "Expecting a non-empy loop vector"); + + Loop *LastLoop = Loops.back(); + Loop *ParentLoop = LastLoop->getParentLoop(); + + if (ParentLoop == nullptr) { + assert(Loops.size() == 1 && "Expecting a single loop"); + return LastLoop; + } + + return (std::is_sorted(Loops.begin(), Loops.end(), + [](const Loop* L1, const Loop *L2) { + return L1->getLoopDepth() < L2->getLoopDepth(); + })) ? LastLoop : nullptr; +} + +//===----------------------------------------------------------------------===// +// IndexedReference implementation +// +raw_ostream &llvm::operator<<(raw_ostream &OS, const IndexedReference &R) { + if (!R.IsValid) { + OS << R.StoreOrLoadInst; + OS << ", IsValid=false."; + return OS; + } + + OS << *R.BasePointer; + for (const SCEV *Subscript : R.Subscripts) + OS << "[" << *Subscript << "]"; + + OS << ", Sizes: "; + for (const SCEV *Size : R.Sizes) + OS << "[" << *Size << "]"; + + return OS; +} + +IndexedReference::IndexedReference(Instruction &StoreOrLoadInst, + const LoopInfo &LI, ScalarEvolution &SE) + : StoreOrLoadInst(StoreOrLoadInst), SE(SE) { + assert((isa(StoreOrLoadInst) || isa(StoreOrLoadInst)) && + "Expecting a load or store instruction"); + + IsValid = delinearize(LI); + if (IsValid) + LLVM_DEBUG(dbgs().indent(2) + << "Succesfully delinearized: " << *this << "\n"); +} + +bool IndexedReference::hasSpacialReuse(const IndexedReference &Other, + unsigned CLS, AliasAnalysis &AA) const { + assert(IsValid && "Expecting a valid reference"); + + if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) { + LLVM_DEBUG(dbgs().indent(2) + << "No spacial reuse: different base pointer\n"); + return false; + } + + unsigned NumSubscripts = getNumSubscripts(); + if (NumSubscripts != Other.getNumSubscripts()) { + LLVM_DEBUG( + dbgs().indent(2) + << "No spacial reuse: different number of subscripts\n"); + return false; + } + + // all subscripts must be equal, except the leftmost one (the last one). + for (auto SubNum : seq(0, NumSubscripts - 1)) { + if (getSubscript(SubNum) != Other.getSubscript(SubNum)) { + LLVM_DEBUG(dbgs().indent(2) + << "No spacial reuse, different subscripts: " + << "\n\t" << *getSubscript(SubNum) << "\n\t" + << *Other.getSubscript(SubNum) << "\n"); + return false; + } + } + + // the difference between the last subscripts must be less than the cache line + // size + const SCEV *LastSubscript = getLastSubscript(); + const SCEV *OtherLastSubscript = Other.getLastSubscript(); + const SCEVConstant *Diff = dyn_cast( + SE.getMinusSCEV(LastSubscript, OtherLastSubscript)); + if (Diff == nullptr) { + LLVM_DEBUG( + dbgs().indent(2) + << "No spacial reuse, difference between subscript:\n\t" + << *LastSubscript << "\n\t" << OtherLastSubscript + << "\nis not constant.\n"); + return false; + } + + bool InSameCacheLine = (Diff->getValue()->getSExtValue() < CLS); + + LLVM_DEBUG({ + if (InSameCacheLine) + dbgs().indent(2) << "Found spacial reuse.\n"; + else + dbgs().indent(2) << "No spacial reuse.\n"; + }); + + return InSameCacheLine; +} + +bool IndexedReference::hasTemporalReuse(const IndexedReference &Other, + const Loop &L, DependenceInfo &DI, + AliasAnalysis &AA) const { + assert(IsValid && "Expecting a valid reference"); + + if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) { + LLVM_DEBUG(dbgs().indent(2) + << "No temporal reuse: different base pointer\n"); + return false; + } + + auto D = DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst, true); + if (D == nullptr) { + LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: no dependence\n"); + return false; + } + + if (D->isLoopIndependent()) { + LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n"); + return true; + } + + // Check the dependence distance at every loop level. There is temporal reuse + // if the distance at the given loop's depth is small (|d| <= 2) and it is + // zero at every other loop level. + int LoopDepth = L.getLoopDepth(); + int Levels = D->getLevels(); + for (int Level = 1; Level <= Levels; ++Level) { + const SCEV *Distance = D->getDistance(Level); + const SCEVConstant *SCEVConst = dyn_cast_or_null(Distance); + + if (SCEVConst == nullptr) { + LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: distance unknown\n"); + return false; + } + + const ConstantInt &CI = *SCEVConst->getValue(); + if (Level != LoopDepth && !CI.isZero()) { + LLVM_DEBUG(dbgs().indent(2) + << "No temporal reuse: distance is not zero at depth=" << Level + << "\n"); + return false; + } else if (Level == LoopDepth && + CI.getSExtValue() > TemporalReuseThreshold) { + LLVM_DEBUG(dbgs().indent(2) + << "No temporal reuse: distance is greater than 2 at depth=" + << Level << "\n"); + return false; + } + } + + LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n"); + return true; +} + +CacheCostTy IndexedReference::computeRefCost(const Loop &L, + unsigned CLS) const { + assert(IsValid && "Expecting a valid reference"); + LLVM_DEBUG({ + dbgs().indent(2) << "Computing cache cost for:\n"; + dbgs().indent(4) << *this << "\n"; + }); + + // If the indexed reference is loop invariant the cost is one. + if (isLoopInvariant(L)) { + LLVM_DEBUG(dbgs().indent(4) << "Reference is loop invariant: RefCost=1\n"); + return 1; + } + + // If the indexed reference is 'consecutive' the cost is + // (TripCount*Stride)/CLS, otherwise the cost is TripCount. + const SCEV *TripCount = computeTripCount(L); + const SCEV *RefCost = TripCount; + + LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n"); + + if (isConsecutive(L, CLS)) { + const SCEV *Coeff = getLastCoefficient(); + const SCEV *ElemSize = Sizes.back(); + const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); + const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS); + const SCEV *Numerator = SE.getMulExpr(Stride, TripCount); + RefCost = SE.getUDivExpr(Numerator, CacheLineSize); + LLVM_DEBUG(dbgs().indent(4) + << "Access is consecutive: RefCost=(TripCount*Stride)/CLS=" + << *RefCost << "\n"); + } else + LLVM_DEBUG(dbgs().indent(4) + << "Access is not consecutive: RefCost=TripCount=" << *RefCost + << "\n"); + + // Attempt to fold RefCost into a constant. + RefCost = dyn_cast(RefCost); + if (RefCost == nullptr) { + LLVM_DEBUG(dbgs().indent(4) + << "RefCost is not a constant! Setting to RefCost=InvalidCost " + "(invalid value).\n"); + return CacheCost::InvalidCost; + } + + return dyn_cast(RefCost)->getValue()->getSExtValue(); +} + +bool IndexedReference::delinearize(const LoopInfo &LI) { + assert(Subscripts.empty() && "Subscripts should be empty"); + assert(Sizes.empty() && "Sizes should be empty"); + assert(!IsValid && "Should be called once from the constructor"); + LLVM_DEBUG(dbgs() << "Delinearizing: " << StoreOrLoadInst << "\n"); + + const SCEV *ElemSize = SE.getElementSize(&StoreOrLoadInst); + + auto isOneDimensionalArray = [&](const SCEV *AccessFn, const Loop *L) { + const SCEVAddRecExpr *AR = dyn_cast(AccessFn); + if (AR == nullptr || !AR->isAffine()) + return false; + + assert(AR->getLoop() && "AR should have a loop"); + + // check that start and increment are not add recurrences. + const SCEV *Start = AR->getStart(); + const SCEV *Step = AR->getStepRecurrence(SE); + if (isa(Start) || isa(Step)) + return false; + + // check that start and increment are both invariant in the loop. + if (!SE.isLoopInvariant(Start, L) || !SE.isLoopInvariant(Step, L)) + return false; + + return AR->getStepRecurrence(SE) == ElemSize; + }; + + const BasicBlock *BB = StoreOrLoadInst.getParent(); + for (Loop *L = LI.getLoopFor(BB); L != nullptr; L = L->getParentLoop()) { + const SCEV *AccessFn = + SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), L); + + BasePointer = dyn_cast(SE.getPointerBase(AccessFn)); + if (BasePointer == nullptr) { + LLVM_DEBUG( + dbgs().indent(2) + << "ERROR: failed to delinearize, can't identify base pointer\n"); + return false; + } + + AccessFn = SE.getMinusSCEV(AccessFn, BasePointer); + + LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName() + << "', AccessFn: " << *AccessFn << "\n"); + + SE.delinearize(AccessFn, Subscripts, Sizes, + SE.getElementSize(&StoreOrLoadInst)); + + if (Subscripts.empty() || Sizes.empty() || + Subscripts.size() != Sizes.size()) { + // Attempt to determine whether we have a single dimensional array access. + // before giving up. + if (!isOneDimensionalArray(AccessFn, L)) { + LLVM_DEBUG(dbgs().indent(2) + << "ERROR: failed to delinearize reference\n"); + Subscripts.clear(); + Sizes.clear(); + break; + } + + const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize); + Subscripts.push_back(Div); + Sizes.push_back(ElemSize); + } + + return all_of(Subscripts, [&](const SCEV *Subscript) { + return isSimpleAddRecurrence(*Subscript, *L); + }); + } + + return false; +} + +bool IndexedReference::isLoopInvariant(const Loop &L) const { + Value *Addr = getPointerOperand(&StoreOrLoadInst); + assert(Addr != nullptr && "Expecting either a load or a store instruction"); + assert(SE.isSCEVable(Addr->getType()) && "Addr should be SCEVable"); + + if (SE.isLoopInvariant(SE.getSCEV(Addr), &L)) + return true; + + // The indexed reference is loop invariant if none of the coefficients use + // the loop induction variable. + bool allCoeffForLoopAreZero = all_of(Subscripts, [&](const SCEV *Subscript) { + return isCoeffForLoopZeroOrInvariant(*Subscript, L); + }); + + return allCoeffForLoopAreZero; +} + +bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const { + // The indexed reference is 'consecutive' if the only coefficient that uses + // the loop induction variable is the last one... + const SCEV *LastSubscript = Subscripts.back(); + for (const SCEV *Subscript : Subscripts) { + if (Subscript == LastSubscript) + continue; + if (!isCoeffForLoopZeroOrInvariant(*Subscript, L)) + return false; + } + + // ...and the access stride is less than the cache line size. + const SCEV *Coeff = getLastCoefficient(); + const SCEV *ElemSize = Sizes.back(); + const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); + const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS); + + return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize); +} + +const SCEV *IndexedReference::computeTripCount(const Loop &L) const { + const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L); + if (isa(BackedgeTakenCount) || + !isa(BackedgeTakenCount)) { + LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() + << " could not be computed"); + const SCEV *ElemSize = Sizes.back(); + return SE.getConstant(ElemSize->getType(), DefaultTripCount); + } + + return SE.getAddExpr(BackedgeTakenCount, + SE.getOne(BackedgeTakenCount->getType())); +} + +const SCEV *IndexedReference::getLastCoefficient() const { + const SCEV *LastSubscript = getLastSubscript(); + assert(isa(LastSubscript) && + "Expecting a SCEV add recurrence expression"); + const SCEVAddRecExpr *AR = dyn_cast(LastSubscript); + return AR->getStepRecurrence(SE); +} + +bool IndexedReference::isCoeffForLoopZeroOrInvariant(const SCEV &Subscript, + const Loop &L) const { + const SCEVAddRecExpr *AR = dyn_cast(&Subscript); + return (AR != nullptr) ? AR->getLoop() != &L + : SE.isLoopInvariant(&Subscript, &L); +} + +bool IndexedReference::isSimpleAddRecurrence(const SCEV &Subscript, + const Loop &L) const { + if (!isa(Subscript)) + return false; + + const SCEVAddRecExpr *AR = cast(&Subscript); + assert(AR->getLoop() && "AR should have a loop"); + + if (!AR->isAffine()) + return false; + + const SCEV *Start = AR->getStart(); + const SCEV *Step = AR->getStepRecurrence(SE); + + if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L)) + return false; + + return true; +} + +bool IndexedReference::isAliased(const IndexedReference &Other, + AliasAnalysis &AA) const { + const auto &Loc1 = MemoryLocation::get(&StoreOrLoadInst); + const auto &Loc2 = MemoryLocation::get(&Other.StoreOrLoadInst); + return AA.isMustAlias(Loc1, Loc2); +} + +//===----------------------------------------------------------------------===// +// CacheCost implementation +// +raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) { + for (const auto &LC : CC.LoopCosts) { + const Loop *L = LC.first; + OS << "Loop '" << L->getName() << "' has cost = " << LC.second << "\n"; + } + return OS; +} + +CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, + ScalarEvolution &SE, TargetTransformInfo &TTI, + AliasAnalysis &AA, DependenceInfo &DI) + : Loops(Loops), TripCounts(), LoopCosts(), LI(LI), SE(SE), TTI(TTI), AA(AA), + DI(DI) { + assert(!Loops.empty() && "Expecting a non-empty loop vector."); + + for (const Loop *L : Loops) { + unsigned TripCount = SE.getSmallConstantTripCount(L); + TripCount = (TripCount == 0) ? DefaultTripCount : TripCount; + TripCounts.push_back({L, TripCount}); + } + + calculateCacheFootprint(); +} + +void CacheCost::calculateCacheFootprint() { + LLVM_DEBUG(dbgs() << "POPULATING REFERENCE GROUPS\n"); + ReferenceGroupsTy RefGroups; + if (!populateReferenceGroups(RefGroups)) + return; + + LLVM_DEBUG(dbgs() << "COMPUTING LOOP CACHE COSTS\n"); + for (const Loop *L : Loops) { + auto IT = + std::find_if(LoopCosts.begin(), LoopCosts.end(), + [L](const LoopCacheCostTy &LCC) { return LCC.first == L; }); + assert(IT == LoopCosts.end() && "Should not add duplicate element"); + CacheCostTy LoopCost = computeLoopCacheCost(*L, RefGroups); + LoopCosts.push_back(std::make_pair(L, LoopCost)); + } + + sortLoopCosts(); + RefGroups.clear(); +} + +bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const { + assert(RefGroups.empty() && "Reference groups should be empty"); + + unsigned CLS = TTI.getCacheLineSize(); + Loop *InnerMostLoop = getInnerMostLoop(Loops); + assert(InnerMostLoop != nullptr && "Expecting a valid innermost loop"); + + for (BasicBlock *BB : InnerMostLoop->getBlocks()) { + for (Instruction &I : *BB) { + if (!isa(I) && !isa(I)) + continue; + + std::unique_ptr R(new IndexedReference(I, LI, SE)); + if (!R->isValid()) + continue; + + bool Added = false; + for (ReferenceGroupTy &RefGroup : RefGroups) { + const IndexedReference &Representative = *RefGroup.front().get(); + LLVM_DEBUG({ + dbgs() << "References:\n"; + dbgs().indent(2) << *R << "\n"; + dbgs().indent(2) << Representative << "\n"; + }); + + if (R->hasTemporalReuse(Representative, *InnerMostLoop, DI, AA) || + R->hasSpacialReuse(Representative, CLS, AA)) { + RefGroup.push_back(std::move(R)); + Added = true; + break; + } + } + + if (!Added) { + ReferenceGroupTy RG; + RG.push_back(std::move(R)); + RefGroups.push_back(std::move(RG)); + } + } + } + + if (RefGroups.empty()) + return false; + + LLVM_DEBUG({ + dbgs() << "\nIDENTIFIED REFERENCE GROUPS:\n"; + int n = 1; + for (const ReferenceGroupTy &RG : RefGroups) { + dbgs().indent(2) << "RefGroup " << n << ":\n"; + for (const auto &IR : RG) + dbgs().indent(4) << *IR << "\n"; + n++; + } + dbgs() << "\n" + }); + + return true; +} + +CacheCostTy +CacheCost::computeLoopCacheCost(const Loop &L, + const ReferenceGroupsTy &RefGroups) const { + if (!L.isLoopSimplifyForm()) + return InvalidCost; + + LLVM_DEBUG(dbgs() << "Considering loop '" << L.getName() + << "' as innermost loop.\n"); + + // Compute the product of the trip counts of each other loop in the nest. + CacheCostTy TripCountsProduct = 1; + for (const auto &TC : TripCounts) { + if (TC.first == &L) + continue; + TripCountsProduct *= TC.second; + } + + CacheCostTy LoopCost = 0; + for (const ReferenceGroupTy &RG : RefGroups) { + CacheCostTy RefGroupCost = computeRefGroupCacheCost(RG, L); + LoopCost += RefGroupCost * TripCountsProduct; + } + + LLVM_DEBUG(dbgs().indent(2) + << "Loop '" << L.getName() << "' has cost=" << LoopCost << "\n"); + + return LoopCost; +} + +CacheCostTy CacheCost::computeRefGroupCacheCost(const ReferenceGroupTy &RG, + const Loop &L) const { + assert(!RG.empty() && "Reference group should have at least one member."); + + const IndexedReference *Representative = RG.front().get(); + return Representative->computeRefCost(L, TTI.getCacheLineSize()); +} + +//===----------------------------------------------------------------------===// +// LoopCacheAnalysis implementation +// +AnalysisKey LoopCacheAnalysis::Key; + +LoopCacheAnalysis::Result +LoopCacheAnalysis::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR) { + // Analyze loop nests in their entirety. + if (L.getParentLoop() != nullptr) + return nullptr; + + LoopVectorTy Loops; + getLoops(L, Loops); + assert(!Loops.empty() && "Failed to retrieve loops in the nest"); + + if (getInnerMostLoop(Loops) == nullptr) { + LLVM_DEBUG(dbgs() << "Cannot compute cache cost of loop nest with more " + "than one innermost loop\n"); + return nullptr; + } + + Function *F = L.getHeader()->getParent(); + DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); + + return make_unique(Loops, AR.LI, AR.SE, AR.TTI, AR.AA, DI); +} + +//===----------------------------------------------------------------------===// +// LoopCachePrinterPass implementation +// +PreservedAnalyses LoopCachePrinterPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + auto &CacheCost = AM.getResult(L, AR); + if (CacheCost != nullptr) + OS << *CacheCost; + + return PreservedAnalyses::all(); +} Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -35,6 +35,7 @@ #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -276,6 +276,7 @@ LOOP_ANALYSIS("no-op-loop", NoOpLoopAnalysis()) LOOP_ANALYSIS("access-info", LoopAccessAnalysis()) LOOP_ANALYSIS("ivusers", IVUsersAnalysis()) +LOOP_ANALYSIS("loop-cache-cost", LoopCacheAnalysis()) LOOP_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) #undef LOOP_ANALYSIS @@ -298,6 +299,7 @@ LOOP_PASS("unroll-full", LoopFullUnrollPass()) LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs())) LOOP_PASS("print", IVUsersPrinterPass(dbgs())) +LOOP_PASS("print", LoopCachePrinterPass(dbgs())) LOOP_PASS("loop-predication", LoopPredicationPass()) LOOP_PASS("guard-widening", GuardWideningPass()) #undef LOOP_PASS Index: llvm/test/Analysis/LoopCacheAnalysis/loads-store.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/LoopCacheAnalysis/loads-store.ll @@ -0,0 +1,88 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; void foo(long n, long m, long o, int A[n][m][o], int B[n][m][o], int C[n][m][o]) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; for (long k = 0; k < o; k++) +; A[i][k][j] += B[i][k][j] + C[i][j][k]; +; } + +; CHECK-DAG:Loop 'for.i' has cost = 3000000 +; CHECK-DAG:Loop 'for.k' has cost = 2030000 +; CHECK-DAG:Loop 'for.j' has cost = 1060000 + +define void @foo(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) { +entry: + %cmp32 = icmp sgt i64 %n, 0 + %cmp230 = icmp sgt i64 %m, 0 + %cmp528 = icmp sgt i64 %o, 0 + br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end + +for.cond1.preheader.lr.ph: ; preds = %entry + br i1 %cmp230, label %for.i.preheader, label %for.end + +for.i.preheader: ; preds = %for.cond1.preheader.lr.ph + br i1 %cmp528, label %for.i.preheader.split, label %for.end + +for.i.preheader.split: ; preds = %for.i.preheader + br label %for.i + +for.i: ; preds = %for.inci, %for.i.preheader.split + %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ] + %muli = mul i64 %i, %m + br label %for.j + +for.j: ; preds = %for.incj, %for.i + %j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ] + %addj = add i64 %muli, %j + %mulj = mul i64 %addj, %o + br label %for.k + +for.k: ; preds = %for.k, %for.j + %k = phi i64 [ 0, %for.j ], [ %inck, %for.k ] + + ; B[i][k][j] + %addk = add i64 %muli, %k + %mulk = mul i64 %addk, %o + %arrayidx1 = add i64 %j, %mulk + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %arrayidx1 + %elem_B = load i32, i32* %arrayidx2, align 4 + + ; C[i][j][k] + %arrayidx3 = add i64 %k, %mulj + %arrayidx4 = getelementptr inbounds i32, i32* %C, i64 %arrayidx3 + %elem_C = load i32, i32* %arrayidx4, align 4 + + ; A[i][k][j] + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %arrayidx1 + %elem_A = load i32, i32* %arrayidx5, align 4 + + ; A[i][k][j] += B[i][k][j] + C[i][j][k] + %add1 = add i32 %elem_B, %elem_C + %add2 = add i32 %add1, %elem_A + %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 %arrayidx1 + store i32 %add2, i32* %arrayidx6, align 4 + + %inck = add nsw i64 %k, 1 + %exitcond.us = icmp eq i64 %inck, %o + br i1 %exitcond.us, label %for.incj, label %for.k + +for.incj: ; preds = %for.k + %incj = add nsw i64 %j, 1 + %exitcond54.us = icmp eq i64 %incj, %m + br i1 %exitcond54.us, label %for.inci, label %for.j + +for.inci: ; preds = %for.incj + %inci = add nsw i64 %i, 1 + %exitcond55.us = icmp eq i64 %inci, %n + br i1 %exitcond55.us, label %for.end.loopexit, label %for.i + +for.end.loopexit: ; preds = %for.inci + br label %for.end + +for.end: ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry + ret void +} Index: llvm/test/Analysis/LoopCacheAnalysis/matmul.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/LoopCacheAnalysis/matmul.ll @@ -0,0 +1,81 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; void matmul(long n, long m, long o, int A[n][m], int B[n][m], int C[n]) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; for (long k = 0; k < o; k++) +; C[i][j] = C[i][j] + A[i][k] * B[k][j]; +; } + +; CHECK-DAG:Loop 'for.i' has cost = 2010000 +; CHECK-DAG:Loop 'for.j' has cost = 70000 +; CHECK-DAG:Loop 'for.k' has cost = 1040000 + +define void @matmul(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) { +entry: + br label %for.i + +for.i: ; preds = %entry, %for.inc.i + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc.i ] + %muli = mul i64 %i, %m + br label %for.j + +for.j: ; preds = %for.i, %for.inc.j + %j = phi i64 [ 0, %for.i ], [ %j.next, %for.inc.j ] + %addj = add i64 %muli, %j + %mulj = mul i64 %addj, %o + br label %for.k + +for.k: ; preds = %for.j, %for.inc.k + %k = phi i64 [ 0, %for.j ], [ %k.next, %for.inc.k ] + + ; A[i][k] + %arrayidx3 = add i64 %k, %muli + %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %arrayidx3 + %elem_A = load i32, i32* %arrayidx4, align 4 + + ; B[k][j] + %mulk = mul i64 %k, %o + %arrayidx5 = add i64 %j, %mulk + %arrayidx6 = getelementptr inbounds i32, i32* %B, i64 %arrayidx5 + %elem_B = load i32, i32* %arrayidx6, align 4 + + ; C[i][k] + %arrayidx7 = add i64 %j, %muli + %arrayidx8 = getelementptr inbounds i32, i32* %C, i64 %arrayidx7 + %elem_C = load i32, i32* %arrayidx8, align 4 + + ; C[i][j] = C[i][j] + A[i][k] * B[k][j]; + %mul = mul nsw i32 %elem_A, %elem_B + %add = add nsw i32 %elem_C, %mul + store i32 %add, i32* %arrayidx8, align 4 + + br label %for.inc.k + +for.inc.k: ; preds = %for.k + %k.next = add nuw nsw i64 %k, 1 + %exitcond = icmp ne i64 %k.next, %o + br i1 %exitcond, label %for.k, label %for.end + +for.end: ; preds = %for.inc + br label %for.inc.j + +for.inc.j: ; preds = %for.end + %j.next = add nuw nsw i64 %j, 1 + %exitcond5 = icmp ne i64 %j.next, %m + br i1 %exitcond5, label %for.j, label %for.end23 + +for.end23: ; preds = %for.inc.j + br label %for.inc.i + +for.inc.i: ; preds = %for.end23 + %i.next = add nuw nsw i64 %i, 1 + %exitcond8 = icmp ne i64 %i.next, %n + br i1 %exitcond8, label %for.i, label %for.end26 + +for.end26: ; preds = %for.inc.i + ret void +} Index: llvm/test/Analysis/LoopCacheAnalysis/matvecmul.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/LoopCacheAnalysis/matvecmul.ll @@ -0,0 +1,185 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; void matvecmul(const double *__restrict y, const double * __restrict x, const double * __restrict b, +; const int * __restrict nb, const int * __restrict nx, const int * __restrict ny, const int * __restrict nz) { +; +; for (int k=1;k + +; Function Attrs: norecurse nounwind +define void @mat_vec_mpy([0 x %_elem_type_of_double]* noalias %y, [0 x %_elem_type_of_double]* noalias readonly %x, + [0 x %_elem_type_of_double]* noalias readonly %b, i32* noalias readonly %nb, i32* noalias readonly %nx, + i32* noalias readonly %ny, i32* noalias readonly %nz) { +mat_times_vec_entry: + %_ind_val = load i32, i32* %nb, align 4 + %_conv = sext i32 %_ind_val to i64 + %_grt_tmp.i = icmp sgt i64 %_conv, 0 + %a_b.i = select i1 %_grt_tmp.i, i64 %_conv, i64 0 + %_ind_val1 = load i32, i32* %nx, align 4 + %_conv2 = sext i32 %_ind_val1 to i64 + %_grt_tmp.i266 = icmp sgt i64 %_conv2, 0 + %a_b.i267 = select i1 %_grt_tmp.i266, i64 %_conv2, i64 0 + %_ind_val3 = load i32, i32* %ny, align 4 + %_conv4 = sext i32 %_ind_val3 to i64 + %_grt_tmp.i264 = icmp sgt i64 %_conv4, 0 + %a_b.i265 = select i1 %_grt_tmp.i264, i64 %_conv4, i64 0 + %_ind_val5 = load i32, i32* %nz, align 4 + %_mult_tmp = shl nsw i64 %a_b.i, 3 + %_mult_tmp7 = mul i64 %_mult_tmp, %a_b.i267 + %_mult_tmp8 = mul i64 %_mult_tmp7, %a_b.i265 + %_sub_tmp = sub nuw nsw i64 -8, %_mult_tmp + %_sub_tmp21 = sub i64 %_sub_tmp, %_mult_tmp7 + %_sub_tmp23 = sub i64 %_sub_tmp21, %_mult_tmp8 + %_mult_tmp73 = mul i64 %_mult_tmp, %a_b.i + %_mult_tmp74 = mul i64 %_mult_tmp73, %a_b.i267 + %_mult_tmp75 = mul i64 %_mult_tmp74, %a_b.i265 + %_sub_tmp93 = sub i64 %_sub_tmp, %_mult_tmp73 + %_sub_tmp95 = sub i64 %_sub_tmp93, %_mult_tmp74 + %_sub_tmp97 = sub i64 %_sub_tmp95, %_mult_tmp75 + %_grt_tmp853288 = icmp slt i32 %_ind_val5, 1 + br i1 %_grt_tmp853288, label %_return_bb, label %k_loop.lr.ph + +k_loop.lr.ph: ; preds = %mat_times_vec_entry + %_grt_tmp851279 = icmp slt i32 %_ind_val3, 1 + %_grt_tmp847270 = icmp slt i32 %_ind_val, 1 + %_aa_conv = bitcast [0 x %_elem_type_of_double]* %y to i8* + %_adda_ = getelementptr inbounds i8, i8* %_aa_conv, i64 %_sub_tmp23 + %_aa_conv434 = bitcast [0 x %_elem_type_of_double]* %x to i8* + %_adda_435 = getelementptr inbounds i8, i8* %_aa_conv434, i64 %_sub_tmp23 + %_aa_conv785 = bitcast [0 x %_elem_type_of_double]* %b to i8* + %_adda_786 = getelementptr inbounds i8, i8* %_aa_conv785, i64 %_sub_tmp97 + br i1 %_grt_tmp851279, label %k_loop.us.preheader, label %k_loop.lr.ph.split + +k_loop.us.preheader: ; preds = %k_loop.lr.ph + br label %_return_bb.loopexit + +k_loop.lr.ph.split: ; preds = %k_loop.lr.ph + %_grt_tmp849273 = icmp slt i32 %_ind_val1, 1 + br i1 %_grt_tmp849273, label %k_loop.us291.preheader, label %k_loop.lr.ph.split.split + +k_loop.us291.preheader: ; preds = %k_loop.lr.ph.split + br label %_return_bb.loopexit300 + +k_loop.lr.ph.split.split: ; preds = %k_loop.lr.ph.split + br i1 %_grt_tmp847270, label %k_loop.us294.preheader, label %k_loop.preheader + +k_loop.preheader: ; preds = %k_loop.lr.ph.split.split + %0 = add i32 %_ind_val, 1 + %1 = add i32 %_ind_val1, 1 + %2 = add i32 %_ind_val3, 1 + %3 = add i32 %_ind_val5, 1 + br label %k_loop + +k_loop.us294.preheader: ; preds = %k_loop.lr.ph.split.split + br label %_return_bb.loopexit301 + +k_loop: ; preds = %k_loop._label_18_crit_edge.split.split.split, %k_loop.preheader + %indvars.iv316 = phi i64 [ 1, %k_loop.preheader ], [ %indvars.iv.next317, %k_loop._label_18_crit_edge.split.split.split ] + %indvars.iv.next317 = add nuw nsw i64 %indvars.iv316, 1 + %_ix_x_len = mul i64 %_mult_tmp8, %indvars.iv.next317 + %_ix_x_len410 = mul i64 %_mult_tmp75, %indvars.iv316 + %_ix_x_len822 = mul i64 %_mult_tmp8, %indvars.iv316 + br label %j_loop + +j_loop: ; preds = %j_loop._label_15_crit_edge.split.split, %k_loop + %indvars.iv312 = phi i64 [ %indvars.iv.next313, %j_loop._label_15_crit_edge.split.split ], [ 1, %k_loop ] + %_ix_x_len371 = mul i64 %_mult_tmp7, %indvars.iv312 + %_ix_x_len415 = mul i64 %_mult_tmp74, %indvars.iv312 + br label %i_loop + +i_loop: ; preds = %i_loop._label_12_crit_edge.split, %j_loop + %indvars.iv307 = phi i64 [ %indvars.iv.next308, %i_loop._label_12_crit_edge.split ], [ 1, %j_loop ] + %_ix_x_len375 = mul i64 %_mult_tmp, %indvars.iv307 + %_ix_x_len420 = mul i64 %_mult_tmp73, %indvars.iv307 + br label %l_loop + +l_loop: ; preds = %l_loop._label_9_crit_edge, %i_loop + %indvars.iv303 = phi i64 [ %indvars.iv.next304, %l_loop._label_9_crit_edge ], [ 1, %i_loop ] + %_ix_x_len378 = shl nuw nsw i64 %indvars.iv303, 3 + br label %m_loop + +m_loop: ; preds = %m_loop, %l_loop + %indvars.iv = phi i64 [ %indvars.iv.next, %m_loop ], [ 1, %l_loop ] + %_ix_x_len424 = mul i64 %_mult_tmp, %indvars.iv + %_ix_x_len454 = shl nuw nsw i64 %indvars.iv, 3 + %_ixa_gep = getelementptr inbounds i8, i8* %_adda_, i64 %_ix_x_len + %_ixa_gep791 = getelementptr inbounds i8, i8* %_adda_786, i64 %_ix_x_len410 + %_ixa_gep823 = getelementptr inbounds i8, i8* %_adda_435, i64 %_ix_x_len822 + %_ixa_gep372 = getelementptr inbounds i8, i8* %_ixa_gep, i64 %_ix_x_len371 + %_ixa_gep376 = getelementptr inbounds i8, i8* %_ixa_gep372, i64 %_ix_x_len375 + %_ixa_gep796 = getelementptr inbounds i8, i8* %_ixa_gep791, i64 %_ix_x_len415 + %_ixa_gep828 = getelementptr inbounds i8, i8* %_ixa_gep823, i64 %_ix_x_len371 + %_ixa_gep379 = getelementptr inbounds i8, i8* %_ixa_gep376, i64 %_ix_x_len378 + %_ixa_gep801 = getelementptr inbounds i8, i8* %_ixa_gep796, i64 %_ix_x_len420 + %_ixa_gep833 = getelementptr inbounds i8, i8* %_ixa_gep828, i64 %_ix_x_len375 + %_ixa_gep806 = getelementptr inbounds i8, i8* %_ixa_gep801, i64 %_ix_x_len378 + %_ixa_gep810 = getelementptr inbounds i8, i8* %_ixa_gep806, i64 %_ix_x_len424 + %_gepp = bitcast i8* %_ixa_gep379 to double* + %_gepp813 = bitcast i8* %_ixa_gep810 to double* + %_ind_val814 = load double, double* %_gepp813, align 8 + %_ixa_gep837 = getelementptr inbounds i8, i8* %_ixa_gep833, i64 %_ix_x_len454 + %_gepp840 = bitcast i8* %_ixa_gep837 to double* + %_ind_val841 = load double, double* %_gepp840, align 8 + %_mult_tmp842 = fmul double %_ind_val814, %_ind_val841 + store double %_mult_tmp842, double* %_gepp, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %0 to i64 + %wide.trip.count305 = zext i32 %0 to i64 + %wide.trip.count309 = zext i32 %1 to i64 + %wide.trip.count314 = zext i32 %2 to i64 + %wide.trip.count319 = zext i32 %3 to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %m_loop, label %l_loop._label_9_crit_edge + +l_loop._label_9_crit_edge: ; preds = %m_loop + %indvars.iv.next304 = add nuw nsw i64 %indvars.iv303, 1 + %exitcond306 = icmp ne i64 %indvars.iv.next304, %wide.trip.count305 + br i1 %exitcond306, label %l_loop, label %i_loop._label_12_crit_edge.split + +i_loop._label_12_crit_edge.split: ; preds = %l_loop._label_9_crit_edge + %indvars.iv.next308 = add nuw nsw i64 %indvars.iv307, 1 + %exitcond310 = icmp ne i64 %indvars.iv.next308, %wide.trip.count309 + br i1 %exitcond310, label %i_loop, label %j_loop._label_15_crit_edge.split.split + +j_loop._label_15_crit_edge.split.split: ; preds = %i_loop._label_12_crit_edge.split + %indvars.iv.next313 = add nuw nsw i64 %indvars.iv312, 1 + %exitcond315 = icmp ne i64 %indvars.iv.next313, %wide.trip.count314 + br i1 %exitcond315, label %j_loop, label %k_loop._label_18_crit_edge.split.split.split + +k_loop._label_18_crit_edge.split.split.split: ; preds = %j_loop._label_15_crit_edge.split.split + %exitcond320 = icmp ne i64 %indvars.iv.next317, %wide.trip.count319 + br i1 %exitcond320, label %k_loop, label %_return_bb.loopexit302 + +_return_bb.loopexit: ; preds = %k_loop.us.preheader + br label %_return_bb + +_return_bb.loopexit300: ; preds = %k_loop.us291.preheader + br label %_return_bb + +_return_bb.loopexit301: ; preds = %k_loop.us294.preheader + br label %_return_bb + +_return_bb.loopexit302: ; preds = %k_loop._label_18_crit_edge.split.split.split + br label %_return_bb + +_return_bb: ; preds = %_return_bb.loopexit302, %_return_bb.loopexit301, %_return_bb.loopexit300, %_return_bb.loopexit, %mat_times_vec_entry + ret void +} + + Index: llvm/test/Analysis/LoopCacheAnalysis/single-store.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/LoopCacheAnalysis/single-store.ll @@ -0,0 +1,77 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; void foo(long n, long m, long o, int A[n][m][o]) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; for (long k = 0; k < o; k++) +; A[2*i+3][3*j-4][2*k+7] = 1; +; } + +; CHECK-DAG: Loop 'for.i' has cost = 1000000 +; CHECK-DAG: Loop 'for.j' has cost = 1000000 +; CHECK-DAG: Loop 'for.k' has cost = 60000 + +define void @foo(i64 %n, i64 %m, i64 %o, i32* %A) { +entry: + %cmp32 = icmp sgt i64 %n, 0 + %cmp230 = icmp sgt i64 %m, 0 + %cmp528 = icmp sgt i64 %o, 0 + br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end + +for.cond1.preheader.lr.ph: ; preds = %entry + br i1 %cmp230, label %for.i.preheader, label %for.end + +for.i.preheader: ; preds = %for.cond1.preheader.lr.ph + br i1 %cmp528, label %for.i.preheader.split, label %for.end + +for.i.preheader.split: ; preds = %for.i.preheader + br label %for.i + +for.i: ; preds = %for.inci, %for.i.preheader.split + %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader.split ] + %mul8 = shl i64 %i, 1 + %add9 = add nsw i64 %mul8, 3 + %0 = mul i64 %add9, %m + %sub = add i64 %0, -4 + br label %for.j + +for.j: ; preds = %for.incj, %for.i + %j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ] + %mul7 = mul nsw i64 %j, 3 + %tmp = add i64 %sub, %mul7 + %tmp27 = mul i64 %tmp, %o + br label %for.k + +for.k: ; preds = %for.k, %for.j.us + %k = phi i64 [ 0, %for.j ], [ %inck, %for.k ] + + %mul = mul nsw i64 %k, 2 + %arrayidx.sum = add i64 %mul, 7 + %arrayidx10.sum = add i64 %arrayidx.sum, %tmp27 + %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %arrayidx10.sum + store i32 1, i32* %arrayidx11, align 4 + + %inck = add nsw i64 %k, 1 + %exitcond.us = icmp eq i64 %inck, %o + br i1 %exitcond.us, label %for.incj, label %for.k + +for.incj: ; preds = %for.k + %incj = add nsw i64 %j, 1 + %exitcond54.us = icmp eq i64 %incj, %m + br i1 %exitcond54.us, label %for.inci, label %for.j + +for.inci: ; preds = %for.incj + %inci = add nsw i64 %i, 1 + %exitcond55.us = icmp eq i64 %inci, %n + br i1 %exitcond55.us, label %for.end.loopexit, label %for.i + +for.end.loopexit: ; preds = %for.inci + br label %for.end + +for.end: ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry + ret void +} + Index: llvm/test/Analysis/LoopCacheAnalysis/stencil.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/LoopCacheAnalysis/stencil.ll @@ -0,0 +1,98 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; void foo(long n, long m, long o, int A[n][m], int B[n][m], int C[n]) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) { +; A[i][j] = A[i][j+1] + B[i-1][j] + B[i+1][j+1] + C[i]; +; A[i][j] += B[i][i]; +; } +; } + +; CHECK-DAG: Loop 'for.i' has cost = 20600 +; CHECK-DAG: Loop 'for.j' has cost = 800 + +define void @foo(i64 %n, i64 %m, i32* %A, i32* %B, i32* %C) { +entry: + %cmp32 = icmp sgt i64 %n, 0 + %cmp230 = icmp sgt i64 %m, 0 + br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end + +for.cond1.preheader.lr.ph: ; preds = %entry + br i1 %cmp230, label %for.i.preheader, label %for.end + +for.i.preheader: ; preds = %for.cond1.preheader.lr.ph + br label %for.i + +for.i: ; preds = %for.inci, %for.i.preheader.split + %i = phi i64 [ %inci, %for.inci ], [ 0, %for.i.preheader ] + %subione = sub i64 %i, 1 + %addione = add i64 %i, 1 + %muli = mul i64 %i, %m + %muliminusone = mul i64 %subione, %m + %muliplusone = mul i64 %addione, %m + br label %for.j + +for.j: ; preds = %for.incj, %for.i + %j = phi i64 [ %incj, %for.incj ], [ 0, %for.i ] + %addj = add i64 %muli, %j + + ; B[i-1][j] + %arrayidx1 = add i64 %j, %muliminusone + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %arrayidx1 + %elem_B1 = load i32, i32* %arrayidx2, align 4 + + ; B[i-1][j+1] + %addjone = add i64 %j, 1 + %arrayidx3 = add i64 %addjone, %muliminusone + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %arrayidx3 + %elem_B2 = load i32, i32* %arrayidx4, align 4 + + ; C[i] + %arrayidx6 = getelementptr inbounds i32, i32* %C, i64 %i + %elem_C = load i32, i32* %arrayidx6, align 4 + + ; A[i][j+1] + %arrayidx7 = add i64 %addjone, %muli + %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %arrayidx7 + %elem_A = load i32, i32* %arrayidx8, align 4 + + ; A[i][j] = A[i][j+1] + B[i-1][j] + B[i-1][j+1] + C[i] + %addB = add i32 %elem_B1, %elem_B2 + %addC = add i32 %addB, %elem_C + %addA = add i32 %elem_A, %elem_C + %arrayidx9 = add i64 %j, %muli + %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 %arrayidx9 + store i32 %addA, i32* %arrayidx10, align 4 + + ; A[i][j] += B[i][i]; + %arrayidx11 = add i64 %j, %muli + %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 %arrayidx11 + %elem_A1 = load i32, i32* %arrayidx12, align 4 + %arrayidx13 = add i64 %i, %muli + %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 %arrayidx13 + %elem_B3 = load i32, i32* %arrayidx14, align 4 + %addA1 = add i32 %elem_A1, %elem_B3 + store i32 %addA1, i32* %arrayidx12, align 4 + + br label %for.incj + +for.incj: ; preds = %for.j + %incj = add nsw i64 %j, 1 + %exitcond54.us = icmp eq i64 %incj, %m + br i1 %exitcond54.us, label %for.inci, label %for.j + +for.inci: ; preds = %for.incj + %inci = add nsw i64 %i, 1 + %exitcond55.us = icmp eq i64 %inci, %n + br i1 %exitcond55.us, label %for.end.loopexit, label %for.i + +for.end.loopexit: ; preds = %for.inci + br label %for.end + +for.end: ; preds = %for.end.loopexit, %for.cond1.preheader.lr.ph, %entry + ret void +} +