diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -313,24 +313,25 @@
     // If the indexed reference is not 'consecutive' the cost is proportional to
     // the trip count and the depth of the dimension which the subject loop
     // subscript is accessing. We try to estimate this by multiplying the cost
-    // by the trip counts of loops corresponding to the inner dimensions. For
-    // example, given the indexed reference 'A[i][j][k]', and assuming the
-    // i-loop is in the innermost position, the cost would be equal to the
-    // iterations of the i-loop multiplied by iterations of the j-loop.
+    // by "relative depth" of loops w.r.t. to the innermost loop. For example,
+    // given the indexed reference 'A[i][j][k][m][n]', the relative depth of
+    // loops i, j, k, m with regard to the innermost loop is 4, 3, 2, 1,
+    // respectively.
+    //
+    // If we assume the i-loop is in the innermost position, the cost would be
+    // equal to the iterations of the i-loop multiplied by 4. Similarly if we
+    // assume the i-loop is in the innermost position, the cost would be equal
+    // to the iterations of the i-loop multiplied by 3.
     RefCost = TripCount;
 
     int Index = getSubscriptIndex(L);
     assert(Index >= 0 && "Cound not locate a valid Index");
 
-    for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) {
-      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(I));
-      assert(AR && AR->getLoop() && "Expecting valid loop");
-      const SCEV *TripCount =
-          computeTripCount(*AR->getLoop(), *Sizes.back(), SE);
-      Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType());
-      RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType),
-                              SE.getNoopOrAnyExtend(TripCount, WiderType));
-    }
+    Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType());
+    const SCEV *DepthFactor =
+        SE.getConstant(WiderType, getNumSubscripts() - Index - 1);
+    RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType),
+                            SE.getNoopOrAnyExtend(DepthFactor, WiderType));
 
     LLVM_DEBUG(dbgs().indent(4)
                << "Access is not consecutive: RefCost=" << *RefCost << "\n");
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/LoopnestFixedSize.ll
@@ -7,7 +7,7 @@
 ; The IR is copied from llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll
 
 ; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 128898
 
 ;; #define N 1024
 ;; #define M 2048
@@ -49,7 +49,7 @@
 
 
 ; CHECK: Loop 'for.body' has cost = 4186116
-; CHECK: Loop 'for.body4' has cost = 128898
+; CHECK-NEXT: Loop 'for.body4' has cost = 128898
 
 define void @t2([2048 x i32]* %a) {
 entry:
@@ -83,11 +83,11 @@
 
 declare [2048 x i32]* @func_with_returned_arg([2048 x i32]* returned %arg)
 
-; CHECK: Loop 'for.body' has cost = 2112128815104000000
-; CHECK: Loop 'for.body4' has cost = 16762927104000000
-; CHECK: Loop 'for.body8' has cost = 130960368000000
-; CHECK: Loop 'for.body12' has cost = 1047682944000
-; CHECK: Loop 'for.body16' has cost = 32260032000
+; CHECK: Loop 'for.body' has cost = 4190731776000
+; CHECK-NEXT: Loop 'for.body4' has cost = 3143048832000
+; CHECK-NEXT: Loop 'for.body8' has cost = 2095365888000
+; CHECK-NEXT: Loop 'for.body12' has cost = 1047682944000
+; CHECK-NEXT: Loop 'for.body16' has cost = 32260032000
 
 ;; #define N 128
 ;; #define M 2048
@@ -157,3 +157,81 @@
 for.end48:                                        ; preds = %for.inc46
   ret void
 }
+
+; Similar to t3() but with larger array size.
+;
+; CHECK: Loop 'for.body' has cost = 9205370278971310080
+; CHECK-NEXT: Loop 'for.body4' has cost = 6904027709228482560
+; CHECK-NEXT: Loop 'for.body8' has cost = 4602685139485655040
+; CHECK-NEXT: Loop 'for.body12' has cost = 2301342569742827520
+; CHECK-NEXT: Loop 'for.body16' has cost = 71389962471260160
+
+;; #define N 4096
+;; #define M 4096
+;; void t3(int a[][N][N][N][M]) {
+;;   for (int i1 = 0; i1 < N-1; ++i1)
+;;     for (int i2 = 2; i2 < N; ++i2)
+;;       for (int i3 = 0; i3 < N; ++i3)
+;;         for (int i4 = 3; i4 < N; ++i4)
+;;           for (int i5 = 0; i5 < M-2; ++i5)
+;;             a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2];
+;; }
+
+define void @t4([4096 x [4096 x [4096 x [4096 x i32]]]]* %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc46
+  %indvars.iv18 = phi i64 [ 0, %entry ], [ %indvars.iv.next19, %for.inc46 ]
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body, %for.inc43
+  %indvars.iv14 = phi i64 [ 2, %for.body ], [ %indvars.iv.next15, %for.inc43 ]
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.body4, %for.inc40
+  %indvars.iv11 = phi i64 [ 0, %for.body4 ], [ %indvars.iv.next12, %for.inc40 ]
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body8, %for.inc37
+  %indvars.iv7 = phi i64 [ 3, %for.body8 ], [ %indvars.iv.next8, %for.inc37 ]
+  br label %for.body16
+
+for.body16:                                       ; preds = %for.body12, %for.body16
+  %indvars.iv = phi i64 [ 0, %for.body12 ], [ %indvars.iv.next, %for.body16 ]
+  %0 = add nuw nsw i64 %indvars.iv18, 1
+  %1 = add nsw i64 %indvars.iv14, -2
+  %2 = add nsw i64 %indvars.iv7, -3
+  %3 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx26 = getelementptr inbounds [4096 x [4096 x [4096 x [4096 x i32]]]], [4096 x [4096 x [4096 x [4096 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3
+  %4 = load i32, i32* %arrayidx26, align 4
+  %arrayidx36 = getelementptr inbounds [4096 x [4096 x [4096 x [4096 x i32]]]], [4096 x [4096 x [4096 x [4096 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv
+  store i32 %4, i32* %arrayidx36, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 4094
+  br i1 %exitcond, label %for.body16, label %for.inc37
+
+for.inc37:                                        ; preds = %for.body16
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+  %exitcond10 = icmp ne i64 %indvars.iv.next8, 4096
+  br i1 %exitcond10, label %for.body12, label %for.inc40
+
+for.inc40:                                        ; preds = %for.inc37
+  %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1
+  %exitcond13 = icmp ne i64 %indvars.iv.next12, 4096
+  br i1 %exitcond13, label %for.body8, label %for.inc43
+
+for.inc43:                                        ; preds = %for.inc40
+  %indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1
+  %exitcond17 = icmp ne i64 %indvars.iv.next15, 4096
+  br i1 %exitcond17, label %for.body4, label %for.inc46
+
+for.inc46:                                        ; preds = %for.inc43
+  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+  %exitcond21 = icmp ne i64 %indvars.iv.next19, 4095
+  br i1 %exitcond21, label %for.body, label %for.end48
+
+for.end48:                                        ; preds = %for.inc46
+  ret void
+}
+
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/loads-store.ll
@@ -10,9 +10,9 @@
 ;         A[i][k][j] += B[i][k][j] + C[i][j][k];
 ; }
 
-; CHECK: Loop 'for.i' has cost = 3000000
-; CHECK: Loop 'for.k' has cost = 2030000
-; CHECK: Loop 'for.j' has cost = 1060000
+; CHECK: Loop 'for.i' has cost = 6000000
+; CHECK-NEXT: Loop 'for.k' has cost = 2030000
+; CHECK-NEXT: Loop 'for.j' has cost = 1060000
 
 define void @foo(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matmul.ll
@@ -4,16 +4,16 @@
 target triple = "powerpc64le-unknown-linux-gnu"
 
 ; void matmul(long n, long m, long o, int A[n][m], int B[n][m], int C[n]) {
-;   for (long i = 0; i < n; i++) 
-;     for (long j = 0; j < m; j++) 
-;       for (long k = 0; k < o; k++) 
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
 ;         C[i][j] = C[i][j] + A[i][k] * B[k][j];
 ; }
 
 ; CHECK:Loop 'for.i' has cost = 2010000
-; CHECK:Loop 'for.k' has cost = 1040000
-; CHECK:Loop 'for.j' has cost = 70000
-    
+; CHECK-NEXT:Loop 'for.k' has cost = 1040000
+; CHECK-NEXT:Loop 'for.j' has cost = 70000
+
 define void @matmul(i64 %n, i64 %m, i64 %o, i32* %A, i32* %B, i32* %C) {
 entry:
   br label %for.i
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/matvecmul.ll
@@ -14,9 +14,9 @@
 ;                 y[k+1][j][i][l] = y[k+1][j][i][l] + b[k][j][i][m][l]*x[k][j][i][m]
 ; }
 
-; CHECK: Loop 'k_loop' has cost = 10200000000000000
-; CHECK: Loop 'j_loop' has cost = 102000000000000
-; CHECK: Loop 'i_loop' has cost = 1020000000000
+; CHECK: Loop 'k_loop' has cost = 100000000000
+; CHECK: Loop 'j_loop' has cost = 70000000000
+; CHECK: Loop 'i_loop' has cost = 40000000000
 ; CHECK: Loop 'm_loop' has cost = 10700000000
 ; CHECK: Loop 'l_loop' has cost = 1300000000
 
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/multi-store.ll
@@ -3,12 +3,12 @@
 target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512"
 target triple = "powerpc64le-unknown-linux-gnu"
 
-; CHECK-DAG: Loop 'for.j' has cost = 201000000
-; CHECK-DAG: Loop 'for.i' has cost = 102000000
-; CHECK-DAG: Loop 'for.k' has cost = 90000
+; CHECK: Loop 'for.j' has cost = 5000000
+; CHECK-NEXT: Loop 'for.i' has cost = 4000000
+; CHECK-NEXT: Loop 'for.k' has cost = 90000
 
-;; Test to make sure when we have multiple conflicting access patterns, the 
-;; chosen loop configuration favours the majority of those accesses. 
+;; Test to make sure when we have multiple conflicting access patterns, the
+;; chosen loop configuration favours the majority of those accesses.
 ;; For example this nest should be ordered as j-i-k.
 ;;  for (int i = 0; i < n; i++)
 ;;    for (int j = 0; j < n; j++)
@@ -16,7 +16,7 @@
 ;;        A[i][j][k] = 1;
 ;;        B[j][i][k] = 2;
 ;;        C[j][i][k] = 3;
-;;      }                            
+;;      }
 
 define void @foo(i32 noundef signext %n, ptr noalias noundef %A, ptr noalias noundef %B, ptr noalias noundef %C) {
 entry:
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/single-store.ll
@@ -10,9 +10,9 @@
 ;         A[2*i+3][3*j-4][2*k+7] = 1;
 ; }
 
-; CHECK: Loop 'for.i' has cost = 100000000
-; CHECK: Loop 'for.j' has cost = 1000000
-; CHECK: Loop 'for.k' has cost = 60000
+; CHECK: Loop 'for.i' has cost = 2000000
+; CHECK-NEXT: Loop 'for.j' has cost = 1000000
+; CHECK-NEXT: Loop 'for.k' has cost = 60000
 
 define void @foo(i64 %n, i64 %m, i64 %o, i32* %A) {
 entry:
@@ -88,7 +88,7 @@
 ;         A[2*i+3][2*j-4][2*k+7] = 1;
 ; }
 
-; CHECK: Loop 'for.i' has cost = 100000000
+; CHECK: Loop 'for.i' has cost = 2000000
 ; CHECK-NEXT: Loop 'for.j' has cost = 1000000
 ; CHECK-NEXT: Loop 'for.k' has cost = 60000
 
diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
--- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
+++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/stencil.ll
@@ -8,11 +8,11 @@
 ;     for (long j = 0; j < m; j++) {
 ;         A[i][j] = A[i][j+1] + B[i-1][j] + B[i+1][j+1] + C[i];
 ;         A[i][j] += B[i][i];
-;     }   
+;     }
 ; }
 
 ; CHECK: Loop 'for.i' has cost = 20600
-; CHECK: Loop 'for.j' has cost = 800
+; CHECK-NEXT: Loop 'for.j' has cost = 800
 
 define void @foo(i64 %n, i64 %m, i32* %A, i32* %B, i32* %C) {
 entry: