Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopNestAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -358,8 +359,10 @@
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
 
   /// Check if the loop interchange is profitable.
-  bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix);
+  bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
+                    unsigned InnerLoopId, unsigned OuterLoopId,
+                    CharMatrix &DepMatrix,
+                    const DenseMap<const Loop *, unsigned> &CostMap);
 
 private:
   int getInstrOrderCost();
@@ -410,13 +413,15 @@
   LoopInfo *LI = nullptr;
   DependenceInfo *DI = nullptr;
   DominatorTree *DT = nullptr;
+  std::unique_ptr<CacheCost> CC = nullptr;
 
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
   LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
-                  DominatorTree *DT, OptimizationRemarkEmitter *ORE)
-      : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
+                  DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
+                  OptimizationRemarkEmitter *ORE)
+      : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
 
   bool run(Loop *L) {
     if (L->getParentLoop())
@@ -499,6 +504,17 @@
     }
 
     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+    // Obtain the loop vector from loop cache analysis beforehand and put each
+    // <Loop, index> pair into a map for constant time query later. An index is
+    // the loop's optimal position: for a loopnest of depth N, index 0 is the
+    // outermost position and index N-1 the innermost. CC is null under the
+    // legacy pass manager; CostMap then stays empty (legacy cost model used).
+    DenseMap<const Loop *, unsigned> CostMap;
+    if (CC != nullptr) {
+      const auto &LoopCosts = CC->getLoopCosts();
+      for (unsigned i = 0; i < LoopCosts.size(); i++)
+        CostMap[LoopCosts[i].first] = i;
+    }
     // We try to achieve the globally optimal memory access for the loopnest,
     // and do interchange based on a bubble-sort fasion. We start from
     // the innermost loop, move it outwards to the best possible position
@@ -507,7 +523,7 @@
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
         bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
-                                        DependencyMatrix);
+                                        DependencyMatrix, CostMap);
         if (!Interchanged)
           continue;
         // Loops interchanged, update LoopList accordingly.
@@ -531,7 +547,8 @@
 
   bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
                    unsigned OuterLoopId,
-                   std::vector<std::vector<char>> &DependencyMatrix) {
+                   std::vector<std::vector<char>> &DependencyMatrix,
+                   const DenseMap<const Loop *, unsigned> &CostMap) {
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
@@ -541,7 +558,8 @@
     }
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
-    if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+    if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
+                          DependencyMatrix, CostMap)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1135,21 +1153,33 @@
   return !DepMatrix.empty();
 }
 
-bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
-                                                unsigned OuterLoopId,
-                                                CharMatrix &DepMatrix) {
-  // TODO: Add better profitability checks.
-  // e.g
-  // 1) Construct dependency matrix and move the one with no loop carried dep
-  //    inside to enable vectorization.
-
-  // This is rough cost estimation algorithm. It counts the good and bad order
-  // of induction variables in the instruction and allows reordering if number
-  // of bad orders is more than good.
-  int Cost = getInstrOrderCost();
-  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
-  if (Cost < -LoopInterchangeCostThreshold)
-    return true;
+bool LoopInterchangeProfitability::isProfitable(
+    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const DenseMap<const Loop *, unsigned> &CostMap) {
+  // TODO: Remove the legacy cost model.
+
+  // This is the new cost model returned from loop cache analysis. A smaller
+  // index means the loop should be placed as an outer loop, and vice versa.
+  unsigned IndexInner = 0, IndexOuter = 0;
+  auto InnerIt = CostMap.find(InnerLoop);
+  auto OuterIt = CostMap.find(OuterLoop);
+  if (InnerIt != CostMap.end() && OuterIt != CostMap.end()) {
+    IndexInner = InnerIt->second;
+    IndexOuter = OuterIt->second;
+    LLVM_DEBUG(dbgs() << "IndexInner = " << IndexInner
+                      << ", IndexOuter = " << IndexOuter << "\n");
+    if (IndexInner < IndexOuter)
+      return true;
+  } else {
+    // Legacy cost model: this is a rough cost estimation algorithm. It counts
+    // the good and bad order of induction variables in the instruction and
+    // allows reordering if number of bad orders is more than good.
+    int Cost = getInstrOrderCost();
+    LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+    if (Cost < -LoopInterchangeCostThreshold)
+      return true;
+  }
 
   // It is not profitable as per current cache profitability model. But check if
   // we can move this loop outside to improve parallelism.
@@ -1160,9 +1190,9 @@
     return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
                                     InnerLoop->getStartLoc(),
                                     InnerLoop->getHeader())
-           << "Interchanging loops is too costly (cost="
-           << ore::NV("Cost", Cost) << ", threshold="
-           << ore::NV("Threshold", LoopInterchangeCostThreshold)
+           << "Interchanging loops is too costly (IndexInner="
+           << ore::NV("IndexInner", IndexInner)
+           << ", IndexOuter=" << ore::NV("IndexOuter", IndexOuter)
            << ") and it does not improve parallelism.";
   });
   return false;
@@ -1709,8 +1739,8 @@
     auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
-    return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
+    std::unique_ptr<CacheCost> CC = nullptr;
+    return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L);
   }
 };
 } // namespace
@@ -1737,8 +1767,10 @@
   Function &F = *LN.getParent();
 
   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+  std::unique_ptr<CacheCost> CC =
+      CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
   OptimizationRemarkEmitter ORE(&F);
-  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN))
+  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
 }
Index: llvm/test/Transforms/LoopInterchange/call-instructions.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/call-instructions.ll
+++ llvm/test/Transforms/LoopInterchange/call-instructions.ll
@@ -4,7 +4,7 @@
 ; RUN: FileCheck --input-file=%t %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i32]] zeroinitializer
 
Index: llvm/test/Transforms/LoopInterchange/currentLimitation.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/currentLimitation.ll
+++ llvm/test/Transforms/LoopInterchange/currentLimitation.ll
@@ -8,7 +8,7 @@
 ; RUN: FileCheck --check-prefix=DELIN --input-file=%t %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
  
 @A = common global [100 x [100 x i32]] zeroinitializer
 @B = common global [100 x [100 x [100 x i32]]] zeroinitializer
Index: llvm/test/Transforms/LoopInterchange/debuginfo.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/debuginfo.ll
+++ llvm/test/Transforms/LoopInterchange/debuginfo.ll
@@ -4,7 +4,7 @@
 
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i64]] zeroinitializer
 
Index: llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
+++ llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
@@ -2,6 +2,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
 ; RUN:     -S -debug 2>&1 | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @A = common global [100 x [100 x i64]] zeroinitializer
 @N = dso_local local_unnamed_addr global i64 100, align 8
 
Index: llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll
+++ llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll
@@ -5,6 +5,7 @@
 ; Inner loop only reductions are not supported currently. See discussion at
 ; D53027 for more information on the required checks.
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @A = common global [500 x [500 x i32]] zeroinitializer
 @X = common global i32 0
 @B = common global [500 x [500 x i32]] zeroinitializer
Index: llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
+++ llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
@@ -2,6 +2,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
 ; RUN:     -S -debug 2>&1 | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @a = common global i32 0, align 4
 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4
 
Index: llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
+++ llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
@@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i32]] zeroinitializer
 @B = common global [100 x i32] zeroinitializer
Index: llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll
+++ llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
 ; RUN:     -S -pass-remarks=loop-interchange 2>&1 | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16
 
 ;; Test to make sure we can handle zext instructions introduced by
Index: llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
+++ llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
@@ -4,6 +4,7 @@
 ; RUN:     | FileCheck -check-prefix=STATS %s
 ; RUN: FileCheck -input-file %t %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 
 ; no_deps_interchange just accesses a single nested array and can be interchange.
 ; CHECK:      Name:       Interchanged
@@ -34,35 +35,6 @@
 
 }
 
-; Only the inner loop induction variable is used for memory accesses.
-; Interchanging is not beneficial.
-; CHECK:      Name:       InterchangeNotProfitable
-; CHECK-NEXT: Function:   no_bad_order
-define i32 @no_bad_order(i32* %Arr) {
-entry:
-  br label %for1.header
-
-for1.header:                                         ; preds = %entry, %for1.inc
-  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
-  br label %for2
-
-for2:                                        ; preds = %for1.header, %for2
-  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
-  %arrayidx6 = getelementptr inbounds i32, i32* %Arr, i64 %indvars.iv
-  store i32 0, i32* %arrayidx6, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 1024
-  br i1 %exitcond, label %for2, label %for1.inc
-
-for1.inc:
-  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
-  %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024
-  br i1 %exitcond21, label %for1.header, label %exit
-
-exit:                                 ; preds = %for1.inc
-  ret i32 0
-}
-
 ; No memory access using any induction variables, interchanging not beneficial.
 ; CHECK:      Name:        InterchangeNotProfitable
 ; CHECK-NEXT: Function:    no_mem_instrs
Index: llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
+++ llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @b = common dso_local local_unnamed_addr global [200 x [200 x i32]] zeroinitializer, align 4
 @a = common dso_local local_unnamed_addr global i32 0, align 4
 
Index: llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll
+++ llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s --basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
+; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @b = constant [200 x [100 x i32]] zeroinitializer, align 4
 @a = constant i32 0, align 4
 
Index: llvm/test/Transforms/LoopInterchange/interchangeable.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/interchangeable.ll
+++ llvm/test/Transforms/LoopInterchange/interchangeable.ll
@@ -3,7 +3,7 @@
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i64]] zeroinitializer
 @B = common global [100 x i64] zeroinitializer
Index: llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
+++ llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
@@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @D = common global [100 x [100 x [100 x i32]]] zeroinitializer
 
@@ -24,31 +24,31 @@
   br label %for.cond1.preheader
 
 for.cond1.preheader:                              ; preds = %for.inc15, %entry
-  %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ]
+  %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ]
   br label %for.cond4.preheader
 
 for.cond4.preheader:                              ; preds = %for.inc12, %for.cond1.preheader
-  %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
+  %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
   br label %for.body6
 
 for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
-  %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %k.026, i32 %j.027, i32 %i.028
+  %k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i64 0, i64 %k.026, i64 %j.027, i64 %i.028
   %0 = load i32, i32* %arrayidx8
   %add = add nsw i32 %0, %t
   store i32 %add, i32* %arrayidx8
-  %inc = add nuw nsw i32 %k.026, 1
-  %exitcond = icmp eq i32 %inc, 100
+  %inc = add nuw nsw i64 %k.026, 1
+  %exitcond = icmp eq i64 %inc, 100
   br i1 %exitcond, label %for.inc12, label %for.body6
 
 for.inc12:                                        ; preds = %for.body6
-  %inc13 = add nuw nsw i32 %j.027, 1
-  %exitcond29 = icmp eq i32 %inc13, 100
+  %inc13 = add nuw nsw i64 %j.027, 1
+  %exitcond29 = icmp eq i64 %inc13, 100
   br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader
 
 for.inc15:                                        ; preds = %for.inc12
-  %inc16 = add nuw nsw i32 %i.028, 1
-  %exitcond30 = icmp eq i32 %inc16, 100
+  %inc16 = add nuw nsw i64 %i.028, 1
+  %exitcond30 = icmp eq i64 %inc16, 100
   br i1 %exitcond30, label %for.end17, label %for.cond1.preheader
 
 for.end17:                                        ; preds = %for.inc15
Index: llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll
+++ llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll
@@ -3,6 +3,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -da-disable-delinearization-checks -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -S | FileCheck -check-prefix=CHECK-DELIN %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 ; void foo(int n, int m) {
 ;   int temp[16][16];
Index: llvm/test/Transforms/LoopInterchange/lcssa.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/lcssa.ll
+++ llvm/test/Transforms/LoopInterchange/lcssa.ll
@@ -2,7 +2,7 @@
 ; RUN: FileCheck --input-file %t --check-prefix REMARK %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i32]] zeroinitializer
 @C = common global [100 x [100 x i32]] zeroinitializer
Index: llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -10,6 +10,7 @@
 ; RUN:     -pass-remarks='loop-interchange' -S -da-disable-delinearization-checks
 ; RUN: cat %t |  FileCheck --check-prefix=DELIN %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @A = common global [100 x [100 x i32]] zeroinitializer
 @B = common global [100 x [100 x i32]] zeroinitializer
 @C = common global [100 x i32] zeroinitializer
@@ -71,10 +72,10 @@
 ; DELIN-NEXT: Name:            InterchangeNotProfitable
 ; DELIN-NEXT: Function:        test01
 ; DELIN-NEXT: Args:
-; DELIN-NEXT:   - String:          'Interchanging loops is too costly (cost='
-; DELIN-NEXT:   - Cost:            '2'
-; DELIN-NEXT:   - String:          ', threshold='
-; DELIN-NEXT:   - Threshold:       '0'
+; DELIN-NEXT:   - String:          'Interchanging loops is too costly (IndexInner='
+; DELIN-NEXT:   - IndexInner:            '1'
+; DELIN-NEXT:   - String:          ', IndexOuter='
+; DELIN-NEXT:   - IndexOuter:       '0'
 ; DELIN-NEXT:   - String:          ') and it does not improve parallelism.'
 ; DELIN-NEXT: ...
 
Index: llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
+++ llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
@@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i32]] zeroinitializer
 @B = common global [100 x i32] zeroinitializer
Index: llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
+++ llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
@@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @D = common global [100 x [100 x [100 x i32]]] zeroinitializer
 
@@ -24,31 +24,31 @@
   br label %for.cond1.preheader
 
 for.cond1.preheader:                              ; preds = %for.inc15, %entry
-  %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ]
+  %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ]
   br label %for.cond4.preheader
 
 for.cond4.preheader:                              ; preds = %for.inc12, %for.cond1.preheader
-  %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
+  %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
   br label %for.body6
 
 for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
-  %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027
+  %k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i64 0, i64 %i.028, i64 %k.026, i64 %j.027
   %0 = load i32, i32* %arrayidx8
   %add = add nsw i32 %0, %t
   store i32 %add, i32* %arrayidx8
-  %inc = add nuw nsw i32 %k.026, 1
-  %exitcond = icmp eq i32 %inc, 100
+  %inc = add nuw nsw i64 %k.026, 1
+  %exitcond = icmp eq i64 %inc, 100
   br i1 %exitcond, label %for.inc12, label %for.body6
 
 for.inc12:                                        ; preds = %for.body6
-  %inc13 = add nuw nsw i32 %j.027, 1
-  %exitcond29 = icmp eq i32 %inc13, 100
+  %inc13 = add nuw nsw i64 %j.027, 1
+  %exitcond29 = icmp eq i64 %inc13, 100
   br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader
 
 for.inc15:                                        ; preds = %for.inc12
-  %inc16 = add nuw nsw i32 %i.028, 1
-  %exitcond30 = icmp eq i32 %inc16, 100
+  %inc16 = add nuw nsw i64 %i.028, 1
+  %exitcond30 = icmp eq i64 %inc16, 100
   br i1 %exitcond30, label %for.end17, label %for.cond1.preheader
 
 for.end17:                                        ; preds = %for.inc15
Index: llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
+++ llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
@@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i32]] zeroinitializer
 @B = common global [100 x i32] zeroinitializer
@@ -108,13 +108,13 @@
 ;; The outer loop header does not branch to the inner loop preheader, or the
 ;; inner loop header, or the outer loop latch.
 ; CHECK: Not interchanging loops. Cannot prove legality.
-define void @interchange_07(i32 %k, i32 %N, i32 %ny) {
+define void @interchange_07(i32 %k, i32 %N, i64 %ny) {
 entry:
   br label %for1.header
 
 for1.header:
-  %j23 = phi i32 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
-  %cmp21 = icmp slt i32 0, %ny
+  %j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
+  %cmp21 = icmp slt i64 0, %ny
   br label %singleSucc
 
 singleSucc:
@@ -124,18 +124,18 @@
   br label %for2
 
 for2:
-  %j = phi i32 [ %j.next, %for2 ], [ 0, %preheader.j ]
-  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i32 0, i32 %j, i32 %j23
+  %j = phi i64 [ %j.next, %for2 ], [ 0, %preheader.j ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 %j23
   %lv = load i32, i32* %arrayidx5
   %add = add nsw i32 %lv, %k
   store i32 %add, i32* %arrayidx5
-  %j.next = add nuw nsw i32 %j, 1
-  %exitcond = icmp eq i32 %j, 99
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond = icmp eq i64 %j, 99
   br i1 %exitcond, label %for1.inc10, label %for2
 
 for1.inc10:
-  %j.next24 = add nuw nsw i32 %j23, 1
-  %exitcond26 = icmp eq i32 %j23, 99
+  %j.next24 = add nuw nsw i64 %j23, 1
+  %exitcond26 = icmp eq i64 %j23, 99
   br i1 %exitcond26, label %for.end12, label %for1.header
 
 for.end12:
Index: llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll
+++ llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S %s | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @b = global [3 x [5 x [8 x i16]]] [[5 x [8 x i16]] zeroinitializer, [5 x [8 x i16]] [[8 x i16] zeroinitializer, [8 x i16] [i16 0, i16 0, i16 0, i16 6, i16 1, i16 6, i16 0, i16 0], [8 x i16] zeroinitializer, [8 x i16] zeroinitializer, [8 x i16] zeroinitializer], [5 x [8 x i16]] zeroinitializer], align 2
 @a = common global i32 0, align 4
 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4
Index: llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll
+++ llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll
@@ -4,6 +4,7 @@
 
 ; Outer loop only reductions are not supported currently.
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @A = common global [500 x [500 x i32]] zeroinitializer
 
 ;; global X
Index: llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
+++ llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
@@ -3,6 +3,7 @@
 ; Test case for PR41725. The induction variables in the latches escape the
 ; loops and we must move some PHIs around.
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @a = common dso_local global i64 0, align 4
 @b = common dso_local global i64 0, align 4
 @c = common dso_local global [10 x [10 x i32 ]] zeroinitializer, align 16
@@ -157,7 +158,7 @@
 ; Make sure we do not crash for loops without reachable exits.
 define void @no_reachable_exits() {
 ; Check we interchanged.
-; CHECK-LABEL: @no_reachable_exits() {
+; CHECK-LABEL: @no_reachable_exits()
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label %inner.ph
 ; CHECK-LABEL: outer.ph:
Index: llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
+++ llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @b = external dso_local global [5 x i32], align 16
 
 define void @test1() {
Index: llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
+++ llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
@@ -14,6 +14,8 @@
 ;   }
 ; }
 
+target triple = "powerpc64le-unknown-linux-gnu"
+
 ; REMARKS: --- !Passed
 ; REMARKS-NEXT: Pass:            loop-interchange
 ; REMARKS-NEXT: Name:            Interchanged
Index: llvm/test/Transforms/LoopInterchange/pr43326.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr43326.ll
+++ llvm/test/Transforms/LoopInterchange/pr43326.ll
@@ -2,6 +2,7 @@
 ; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1
 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @a = global i32 0
 @b = global i8 0
 @c = global i32 0
Index: llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll
+++ llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll
@@ -6,6 +6,8 @@
 ; In the 2 test cases below, we have a LCSSA PHI in the inner loop exit, which
 ; is used in the outer loop latch. This is not supported.
 
+target triple = "powerpc64le-unknown-linux-gnu"
+
 define void @test1() {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
Index: llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll
+++ llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll
@@ -3,6 +3,7 @@
 
 ; Tests for PR43797.
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @wdtdr = external dso_local global [5 x [5 x double]], align 16
 
 define void @test1() {
Index: llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll
+++ llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -S %s | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
 @global = external local_unnamed_addr global [400 x [400 x i32]], align 16
 
 ; We need to move %tmp4 from the inner loop pre header to the outer loop header
Index: llvm/test/Transforms/LoopInterchange/pr48212.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr48212.ll
+++ llvm/test/Transforms/LoopInterchange/pr48212.ll
@@ -2,6 +2,8 @@
 ; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1
 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
+
 ; REMARKS: --- !Passed
 ; REMARKS-NEXT: Pass:            loop-interchange
 ; REMARKS-NEXT: Name:            Interchanged
Index: llvm/test/Transforms/LoopInterchange/profitability.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/profitability.ll
+++ llvm/test/Transforms/LoopInterchange/profitability.ll
@@ -5,7 +5,7 @@
 ;; We test profitability model in these test cases.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 @A = common global [100 x [100 x i32]] zeroinitializer
 @B = common global [100 x [100 x i32]] zeroinitializer
Index: llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
+++ llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@@ -4,7 +4,7 @@
 
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
 ; REMARKS: --- !Passed
 ; REMARKS-NEXT: Pass:            loop-interchange
Index: llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll
+++ llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -S %s | FileCheck %s
 
-
+target triple = "powerpc64le-unknown-linux-gnu"
 @global = external dso_local global [1000 x [1000 x i32]], align 16
 
 ; Test that we support updating conditional branches where both targets are the same
Index: llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll
+++ llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -loop-interchange-threshold=-10 -S %s | FileCheck %s
 
+target triple = "powerpc64le-unknown-linux-gnu"
+
 ; The test contains a GEP with an operand that is not SCEV-able. Make sure
 ; loop-interchange does not crash.
 define void @test([256 x float]* noalias %src, float* %dst) {