diff --git a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h --- a/llvm/include/llvm/Analysis/LoopCacheAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopCacheAnalysis.h @@ -111,6 +111,13 @@ /// smaller than the cache line size \p CLS. bool isConsecutive(const Loop &L, unsigned CLS) const; + /// Retrieve the index of the subscript corresponding to the given loop \p + /// L. Return a zero-based positive index if the subscript index is + /// succesfully located and a negative value otherwise. For example given the + /// indexed reference 'A[i][2j+1][3k+2]', the call + /// 'getSubscriptIndex(loop-k)' would return value 2. + unsigned getSubscriptIndex(const Loop &L) const; + /// Return the coefficient used in the rightmost dimension. const SCEV *getLastCoefficient() const; diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -103,14 +103,24 @@ return StepRec == &ElemSize; } -/// Compute the trip count for the given loop \p L. Return the SCEV expression -/// for the trip count or nullptr if it cannot be computed. -static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) { +/// Compute the trip count for the given loop \p L or assume a default value if +/// it is not a compile time constant. Return the SCEV expression for the trip +/// count. +static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize, + ScalarEvolution &SE) { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L); - if (isa(BackedgeTakenCount) || - !isa(BackedgeTakenCount)) - return nullptr; - return SE.getTripCountFromExitCount(BackedgeTakenCount); + const SCEV *TripCount = (!isa(BackedgeTakenCount) && + isa(BackedgeTakenCount)) + ? SE.getTripCountFromExitCount(BackedgeTakenCount) + : nullptr; + + if (!TripCount) { + LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() + << " could not be computed, using DefaultTripCount\n"); + TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount); + } + + return TripCount; } //===----------------------------------------------------------------------===// @@ -274,22 +284,18 @@ return 1; } - const SCEV *TripCount = computeTripCount(L, SE); - if (!TripCount) { - LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() - << " could not be computed, using DefaultTripCount\n"); - const SCEV *ElemSize = Sizes.back(); - TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount); - } + const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE); + assert(TripCount && "Expecting valid TripCount"); LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n"); - // If the indexed reference is 'consecutive' the cost is - // (TripCount*Stride)/CLS, otherwise the cost is TripCount. - const SCEV *RefCost = TripCount; - + const SCEV *RefCost = nullptr; if (isConsecutive(L, CLS)) { + // If the indexed reference is 'consecutive' the cost is + // (TripCount*Stride)/CLS. const SCEV *Coeff = getLastCoefficient(); const SCEV *ElemSize = Sizes.back(); + assert(Coeff->getType() == ElemSize->getType() && + "Expecting the same type"); const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType()); const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS); @@ -303,10 +309,33 @@ LLVM_DEBUG(dbgs().indent(4) << "Access is consecutive: RefCost=(TripCount*Stride)/CLS=" << *RefCost << "\n"); - } else + } else { + // If the indexed reference is not 'consecutive' the cost is proportional to + // the trip count and the depth of the dimension which the subject loop + // subscript is accessing. We try to estimate this by multiplying the cost + // by the trip counts of loops corresponding to the inner dimensions. For + // example, given the indexed reference 'A[i][j][k]', and assuming the + // i-loop is in the innermost position, the cost would be equal to the + // iterations of the i-loop multiplied by iterations of the j-loop. + RefCost = TripCount; + + unsigned Index = getSubscriptIndex(L); + assert(Index >= 0 && "Cound not locate a valid Index"); + + for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) { + const SCEVAddRecExpr *AR = dyn_cast(getSubscript(I)); + assert(AR && AR->getLoop() && "Expecting valid loop"); + const SCEV *TripCount = + computeTripCount(*AR->getLoop(), *Sizes.back(), SE); + Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType()); + RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType), + SE.getNoopOrAnyExtend(TripCount, WiderType)); + } + LLVM_DEBUG(dbgs().indent(4) - << "Access is not consecutive: RefCost=TripCount=" << *RefCost - << "\n"); + << "Access is not consecutive: RefCost=" << *RefCost << "\n"); + } + assert(RefCost && "Expecting a valid RefCost"); // Attempt to fold RefCost into a constant. if (auto ConstantCost = dyn_cast(RefCost)) @@ -481,6 +510,16 @@ return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize); } +unsigned IndexedReference::getSubscriptIndex(const Loop &L) const { + for (auto Idx : seq(0, getNumSubscripts())) { + const SCEVAddRecExpr *AR = dyn_cast(getSubscript(Idx)); + if (AR && AR->getLoop() == &L) { + return Idx; + } + } + return -1; +} + const SCEV *IndexedReference::getLastCoefficient() const { const SCEV *LastSubscript = getLastSubscript(); auto *AR = cast(LastSubscript); diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll @@ -83,14 +83,13 @@ declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg) -; CHECK: Loop 'for.body' has cost = 4472886244958208 -; CHECK: Loop 'for.body4' has cost = 4472886244958208 -; CHECK: Loop 'for.body8' has cost = 4472886244958208 -; CHECK: Loop 'for.body12' has cost = 4472886244958208 -; CHECK: Loop 'for.body16' has cost = 137728168833024 +; CHECK: Loop 'for.body' has cost = 2112128815104000000 +; CHECK: Loop 'for.body4' has cost = 16762927104000000 +; CHECK: Loop 'for.body8' has cost = 130960368000000 +; CHECK: Loop 'for.body12' has cost = 1047682944000 +; CHECK: Loop 'for.body16' has cost = 32260032000 - -;; #define N 1024 +;; #define N 128 ;; #define M 2048 ;; void t3(int a[][N][N][N][M]) { ;; for (int i1 = 0; i1 < N-1; ++i1) @@ -101,7 +100,7 @@ ;; a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2]; ;; } -define void @t3([1024 x [1024 x [1024 x [2048 x i32]]]]* %a) { +define void @t3([128 x [128 x [128 x [2048 x i32]]]]* %a) { entry: br label %for.body @@ -127,9 +126,9 @@ %1 = add nsw i64 %indvars.iv14, -2 %2 = add nsw i64 %indvars.iv7, -3 %3 = add nuw nsw i64 %indvars.iv, 2 - %arrayidx26 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3 + %arrayidx26 = getelementptr inbounds [128 x [128 x [128 x [2048 x i32]]]], [128 x [128 x [128 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3 %4 = load i32, i32* %arrayidx26, align 4 - %arrayidx36 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv + %arrayidx36 = getelementptr inbounds [128 x [128 x [128 x [2048 x i32]]]], [128 x [128 x [128 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv store i32 %4, i32* %arrayidx36, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp ne i64 %indvars.iv.next, 2046 @@ -137,22 +136,22 @@ for.inc37: ; preds = %for.body16 %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 - %exitcond10 = icmp ne i64 %indvars.iv.next8, 1024 + %exitcond10 = icmp ne i64 %indvars.iv.next8, 128 br i1 %exitcond10, label %for.body12, label %for.inc40 for.inc40: ; preds = %for.inc37 %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1 - %exitcond13 = icmp ne i64 %indvars.iv.next12, 1024 + %exitcond13 = icmp ne i64 %indvars.iv.next12, 128 br i1 %exitcond13, label %for.body8, label %for.inc43 for.inc43: ; preds = %for.inc40 %indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1 - %exitcond17 = icmp ne i64 %indvars.iv.next15, 1024 + %exitcond17 = icmp ne i64 %indvars.iv.next15, 128 br i1 %exitcond17, label %for.body4, label %for.inc46 for.inc46: ; preds = %for.inc43 %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1 - %exitcond21 = icmp ne i64 %indvars.iv.next19, 1023 + %exitcond21 = icmp ne i64 %indvars.iv.next19, 127 br i1 %exitcond21, label %for.body, label %for.end48 for.end48: ; preds = %for.inc46 diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll @@ -14,9 +14,9 @@ ; y[k+1][j][i][l] = y[k+1][j][i][l] + b[k][j][i][m][l]*x[k][j][i][m] ; } -; CHECK: Loop 'k_loop' has cost = 30000000000 -; CHECK: Loop 'j_loop' has cost = 30000000000 -; CHECK: Loop 'i_loop' has cost = 30000000000 +; CHECK: Loop 'k_loop' has cost = 10200000000000000 +; CHECK: Loop 'j_loop' has cost = 102000000000000 +; CHECK: Loop 'i_loop' has cost = 1020000000000 ; CHECK: Loop 'm_loop' has cost = 10700000000 ; CHECK: Loop 'l_loop' has cost = 1300000000 diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll @@ -0,0 +1,102 @@ +; RUN: opt < %s -opaque-pointers -passes='print' -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" +target triple = "powerpc64le-unknown-linux-gnu" + +; CHECK-DAG: Loop 'for.j' has cost = 201000000 +; CHECK-DAG: Loop 'for.i' has cost = 102000000 +; CHECK-DAG: Loop 'for.k' has cost = 90000 + +;; Test to make sure when we have multiple conflicting access patterns, the +;; chosen loop configuration favours the majority of those accesses. +;; For example this nest should be ordered as j-i-k. +;; for (int i = 0; i < n; i++) +;; for (int j = 0; j < n; j++) +;; for (int k = 0; k < n; k++) { +;; A[i][j][k] = 1; +;; B[j][i][k] = 2; +;; C[j][i][k] = 3; +;; } + +define void @foo(i32 noundef signext %n, ptr noalias noundef %A, ptr noalias noundef %B, ptr noalias noundef %C) { +entry: + %0 = zext i32 %n to i64 + %1 = zext i32 %n to i64 + %2 = zext i32 %n to i64 + %3 = zext i32 %n to i64 + %4 = zext i32 %n to i64 + %5 = zext i32 %n to i64 + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.i.preheader, label %for.end30 + +for.i.preheader: ; preds = %entry + %wide.trip.count16 = zext i32 %n to i64 + br label %for.i + +for.i: ; preds = %for.i.preheader, %for.inc28 + %indvars.iv13 = phi i64 [ 0, %for.i.preheader ], [ %indvars.iv.next14, %for.inc28 ] + %cmp23 = icmp sgt i32 %n, 0 + br i1 %cmp23, label %for.j.preheader, label %for.inc28 + +for.j.preheader: ; preds = %for.i + %wide.trip.count11 = zext i32 %n to i64 + br label %for.j + +for.j: ; preds = %for.j.preheader, %for.inc25 + %indvars.iv8 = phi i64 [ 0, %for.j.preheader ], [ %indvars.iv.next9, %for.inc25 ] + %cmp61 = icmp sgt i32 %n, 0 + br i1 %cmp61, label %for.k.preheader, label %for.inc25 + +for.k.preheader: ; preds = %for.j + %wide.trip.count = zext i32 %n to i64 + br label %for.k + +for.k: ; preds = %for.k.preheader, %for.k + %indvars.iv = phi i64 [ 0, %for.k.preheader ], [ %indvars.iv.next, %for.k ] + %6 = mul nuw i64 %0, %1 + %7 = mul nsw i64 %6, %indvars.iv13 + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %7 + %8 = mul nuw nsw i64 %indvars.iv8, %1 + %arrayidx10 = getelementptr inbounds i32, ptr %arrayidx, i64 %8 + %arrayidx12 = getelementptr inbounds i32, ptr %arrayidx10, i64 %indvars.iv + store i32 1, ptr %arrayidx12, align 4 + %9 = mul nuw i64 %2, %3 + %10 = mul nsw i64 %9, %indvars.iv8 + %arrayidx14 = getelementptr inbounds i32, ptr %B, i64 %10 + %11 = mul nuw nsw i64 %indvars.iv13, %3 + %arrayidx16 = getelementptr inbounds i32, ptr %arrayidx14, i64 %11 + %arrayidx18 = getelementptr inbounds i32, ptr %arrayidx16, i64 %indvars.iv + store i32 2, ptr %arrayidx18, align 4 + %12 = mul nuw i64 %4, %5 + %13 = mul nsw i64 %12, %indvars.iv8 + %arrayidx20 = getelementptr inbounds i32, ptr %C, i64 %13 + %14 = mul nuw nsw i64 %indvars.iv13, %5 + %arrayidx22 = getelementptr inbounds i32, ptr %arrayidx20, i64 %14 + %arrayidx24 = getelementptr inbounds i32, ptr %arrayidx22, i64 %indvars.iv + store i32 3, ptr %arrayidx24, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.k, label %for.inc25.loopexit + +for.inc25.loopexit: ; preds = %for.k + br label %for.inc25 + +for.inc25: ; preds = %for.inc25.loopexit, %for.j + %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1 + %exitcond12 = icmp ne i64 %indvars.iv.next9, %wide.trip.count11 + br i1 %exitcond12, label %for.j, label %for.inc28.loopexit + +for.inc28.loopexit: ; preds = %for.inc25 + br label %for.inc28 + +for.inc28: ; preds = %for.inc28.loopexit, %for.i + %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 + %exitcond17 = icmp ne i64 %indvars.iv.next14, %wide.trip.count16 + br i1 %exitcond17, label %for.i, label %for.end30.loopexit + +for.end30.loopexit: ; preds = %for.inc28 + br label %for.end30 + +for.end30: ; preds = %for.end30.loopexit, %entry + ret void +} diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll @@ -10,7 +10,7 @@ ; A[2*i+3][3*j-4][2*k+7] = 1; ; } -; CHECK: Loop 'for.i' has cost = 1000000 +; CHECK: Loop 'for.i' has cost = 100000000 ; CHECK: Loop 'for.j' has cost = 1000000 ; CHECK: Loop 'for.k' has cost = 60000