diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -313,24 +313,25 @@ // If the indexed reference is not 'consecutive' the cost is proportional to // the trip count and the depth of the dimension which the subject loop // subscript is accessing. We try to estimate this by multiplying the cost - // by the trip counts of loops corresponding to the inner dimensions. For - // example, given the indexed reference 'A[i][j][k]', and assuming the - // i-loop is in the innermost position, the cost would be equal to the - // iterations of the i-loop multiplied by iterations of the j-loop. + // by "relative depth" of loops w.r.t. to the innermost loop. For example, + // given the indexed reference 'A[i][j][k][m][n]', the relative depth of + // loops i, j, k, m with regard to the innermost loop is 4, 3, 2, 1, + // respectively. + // + // If we assume the i-loop is in the innermost position, the cost would be + // equal to the iterations of the i-loop multiplied by 4. Similarly if we + // assume the i-loop is in the innermost position, the cost would be equal + // to the iterations of the i-loop multiplied by 3. RefCost = TripCount; int Index = getSubscriptIndex(L); assert(Index >= 0 && "Cound not locate a valid Index"); - for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) { - const SCEVAddRecExpr *AR = dyn_cast(getSubscript(I)); - assert(AR && AR->getLoop() && "Expecting valid loop"); - const SCEV *TripCount = - computeTripCount(*AR->getLoop(), *Sizes.back(), SE); - Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType()); - RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType), - SE.getNoopOrAnyExtend(TripCount, WiderType)); - } + Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType()); + const SCEV *DepthFactor = + SE.getConstant(WiderType, getNumSubscripts() - Index - 1); + RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType), + SE.getNoopOrAnyExtend(DepthFactor, WiderType)); LLVM_DEBUG(dbgs().indent(4) << "Access is not consecutive: RefCost=" << *RefCost << "\n"); diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll @@ -7,7 +7,7 @@ ; The IR is copied from llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll ; CHECK: Loop 'for.body' has cost = 4186116 -; CHECK: Loop 'for.body4' has cost = 128898 +; CHECK-NEXT: Loop 'for.body4' has cost = 128898 ;; #define N 1024 ;; #define M 2048 @@ -49,7 +49,7 @@ ; CHECK: Loop 'for.body' has cost = 4186116 -; CHECK: Loop 'for.body4' has cost = 128898 +; CHECK-NEXT: Loop 'for.body4' has cost = 128898 define void @t2([2048 x i32]* %a) { entry: @@ -83,11 +83,11 @@ declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg) -; CHECK: Loop 'for.body' has cost = 2112128815104000000 -; CHECK: Loop 'for.body4' has cost = 16762927104000000 -; CHECK: Loop 'for.body8' has cost = 130960368000000 -; CHECK: Loop 'for.body12' has cost = 1047682944000 -; CHECK: Loop 'for.body16' has cost = 32260032000 +; CHECK: Loop 'for.body' has cost = 4190731776000 +; CHECK-NEXT: Loop 'for.body4' has cost = 3143048832000 +; CHECK-NEXT: Loop 'for.body8' has cost = 2095365888000 +; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000 +; CHECK-NEXT: Loop 'for.body16' has cost = 32260032000 ;; #define N 128 ;; #define M 2048 @@ -157,3 +157,81 @@ for.end48: ; preds = %for.inc46 ret void } + +; Similar to t3() but with larger array size. +; +; CHECK: Loop 'for.body' has cost = 9205370278971310080 +; CHECK-NEXT: Loop 'for.body4' has cost = 6904027709228482560 +; CHECK-NEXT: Loop 'for.body8' has cost = 4602685139485655040 +; CHECK-NEXT: Loop 'for.body12' has cost = 2301342569742827520 +; CHECK-NEXT: Loop 'for.body16' has cost = 71389962471260160 + +;; #define N 4096 +;; #define M 4096 +;; void t3(int a[][N][N][N][M]) { +;; for (int i1 = 0; i1 < N-1; ++i1) +;; for (int i2 = 2; i2 < N; ++i2) +;; for (int i3 = 0; i3 < N; ++i3) +;; for (int i4 = 3; i4 < N; ++i4) +;; for (int i5 = 0; i5 < M-2; ++i5) +;; a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2]; +;; } + +define void @t4([4096 x [4096 x [4096 x [4096 x i32]]]]* %a) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc46 + %indvars.iv18 = phi i64 [ 0, %entry ], [ %indvars.iv.next19, %for.inc46 ] + br label %for.body4 + +for.body4: ; preds = %for.body, %for.inc43 + %indvars.iv14 = phi i64 [ 2, %for.body ], [ %indvars.iv.next15, %for.inc43 ] + br label %for.body8 + +for.body8: ; preds = %for.body4, %for.inc40 + %indvars.iv11 = phi i64 [ 0, %for.body4 ], [ %indvars.iv.next12, %for.inc40 ] + br label %for.body12 + +for.body12: ; preds = %for.body8, %for.inc37 + %indvars.iv7 = phi i64 [ 3, %for.body8 ], [ %indvars.iv.next8, %for.inc37 ] + br label %for.body16 + +for.body16: ; preds = %for.body12, %for.body16 + %indvars.iv = phi i64 [ 0, %for.body12 ], [ %indvars.iv.next, %for.body16 ] + %0 = add nuw nsw i64 %indvars.iv18, 1 + %1 = add nsw i64 %indvars.iv14, -2 + %2 = add nsw i64 %indvars.iv7, -3 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx26 = getelementptr inbounds [4096 x [4096 x [4096 x [4096 x i32]]]], [4096 x [4096 x [4096 x [4096 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3 + %4 = load i32, i32* %arrayidx26, align 4 + %arrayidx36 = getelementptr inbounds [4096 x [4096 x [4096 x [4096 x i32]]]], [4096 x [4096 x [4096 x [4096 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv + store i32 %4, i32* %arrayidx36, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 4094 + br i1 %exitcond, label %for.body16, label %for.inc37 + +for.inc37: ; preds = %for.body16 + %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 + %exitcond10 = icmp ne i64 %indvars.iv.next8, 4096 + br i1 %exitcond10, label %for.body12, label %for.inc40 + +for.inc40: ; preds = %for.inc37 + %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1 + %exitcond13 = icmp ne i64 %indvars.iv.next12, 4096 + br i1 %exitcond13, label %for.body8, label %for.inc43 + +for.inc43: ; preds = %for.inc40 + %indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1 + %exitcond17 = icmp ne i64 %indvars.iv.next15, 4096 + br i1 %exitcond17, label %for.body4, label %for.inc46 + +for.inc46: ; preds = %for.inc43 + %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1 + %exitcond21 = icmp ne i64 %indvars.iv.next19, 4095 + br i1 %exitcond21, label %for.body, label %for.end48 + +for.end48: ; preds = %for.inc46 + ret void +} + diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll @@ -10,9 +10,9 @@ ; A[i][k][j] += B[i][k][j] + C[i][j][k]; ; } -; CHECK: Loop 'for.i' has cost = 3000000 -; CHECK: Loop 'for.k' has cost = 2030000 -; CHECK: Loop 'for.j' has cost = 1060000 +; CHECK: Loop 'for.i' has cost = 6000000 +; CHECK-NEXT: Loop 'for.k' has cost = 2030000 +; CHECK-NEXT: Loop 'for.j' has cost = 1060000 define void @foo(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) { entry: diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll @@ -4,16 +4,16 @@ target triple = "powerpc64le-unknown-linux-gnu" ; void matmul(long n, long m, long o, int A[n][m], int B[n][m], int C[n]) { -; for (long i = 0; i < n; i++) -; for (long j = 0; j < m; j++) -; for (long k = 0; k < o; k++) +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; for (long k = 0; k < o; k++) ; C[i][j] = C[i][j] + A[i][k] * B[k][j]; ; } ; CHECK:Loop 'for.i' has cost = 2010000 -; CHECK:Loop 'for.k' has cost = 1040000 -; CHECK:Loop 'for.j' has cost = 70000 - +; CHECK-NEXT:Loop 'for.k' has cost = 1040000 +; CHECK-NEXT:Loop 'for.j' has cost = 70000 + define void @matmul(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) { entry: br label %for.i diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll @@ -14,9 +14,9 @@ ; y[k+1][j][i][l] = y[k+1][j][i][l] + b[k][j][i][m][l]*x[k][j][i][m] ; } -; CHECK: Loop 'k_loop' has cost = 10200000000000000 -; CHECK: Loop 'j_loop' has cost = 102000000000000 -; CHECK: Loop 'i_loop' has cost = 1020000000000 +; CHECK: Loop 'k_loop' has cost = 100000000000 +; CHECK: Loop 'j_loop' has cost = 70000000000 +; CHECK: Loop 'i_loop' has cost = 40000000000 ; CHECK: Loop 'm_loop' has cost = 10700000000 ; CHECK: Loop 'l_loop' has cost = 1300000000 diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll @@ -3,12 +3,12 @@ target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512" target triple = "powerpc64le-unknown-linux-gnu" -; CHECK-DAG: Loop 'for.j' has cost = 201000000 -; CHECK-DAG: Loop 'for.i' has cost = 102000000 -; CHECK-DAG: Loop 'for.k' has cost = 90000 +; CHECK: Loop 'for.j' has cost = 5000000 +; CHECK-NEXT: Loop 'for.i' has cost = 4000000 +; CHECK-NEXT: Loop 'for.k' has cost = 90000 -;; Test to make sure when we have multiple conflicting access patterns, the -;; chosen loop configuration favours the majority of those accesses. +;; Test to make sure when we have multiple conflicting access patterns, the +;; chosen loop configuration favours the majority of those accesses. ;; For example this nest should be ordered as j-i-k. ;; for (int i = 0; i < n; i++) ;; for (int j = 0; j < n; j++) @@ -16,7 +16,7 @@ ;; A[i][j][k] = 1; ;; B[j][i][k] = 2; ;; C[j][i][k] = 3; -;; } +;; } define void @foo(i32 noundef signext %n, ptr noalias noundef %A, ptr noalias noundef %B, ptr noalias noundef %C) { entry: diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll @@ -10,9 +10,9 @@ ; A[2*i+3][3*j-4][2*k+7] = 1; ; } -; CHECK: Loop 'for.i' has cost = 100000000 -; CHECK: Loop 'for.j' has cost = 1000000 -; CHECK: Loop 'for.k' has cost = 60000 +; CHECK: Loop 'for.i' has cost = 2000000 +; CHECK-NEXT: Loop 'for.j' has cost = 1000000 +; CHECK-NEXT: Loop 'for.k' has cost = 60000 define void @foo(i64 %n, i64 %m, i64 %o, i32* %A) { entry: @@ -88,7 +88,7 @@ ; A[2*i+3][2*j-4][2*k+7] = 1; ; } -; CHECK: Loop 'for.i' has cost = 100000000 +; CHECK: Loop 'for.i' has cost = 2000000 ; CHECK-NEXT: Loop 'for.j' has cost = 1000000 ; CHECK-NEXT: Loop 'for.k' has cost = 60000 diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll @@ -8,11 +8,11 @@ ; for (long j = 0; j < m; j++) { ; A[i][j] = A[i][j+1] + B[i-1][j] + B[i+1][j+1] + C[i]; ; A[i][j] += B[i][i]; -; } +; } ; } ; CHECK: Loop 'for.i' has cost = 20600 -; CHECK: Loop 'for.j' has cost = 800 +; CHECK-NEXT: Loop 'for.j' has cost = 800 define void @foo(i64 %n, i64 %m, i32* %A, i32* %B, i32* %C) { entry: