Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -338,10 +338,18 @@
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix,
-                    const DenseMap<const Loop *, unsigned> &CostMap);
+                    const DenseMap<const Loop *, unsigned> &CostMap,
+                    std::unique_ptr<CacheCost>& CC);
 
 private:
   int getInstrOrderCost();
+  std::optional<bool> isProfitableAccordingLoopCacheAnalysis(
+                                const DenseMap<const Loop *, unsigned> &CostMap);
+  std::optional<bool> isProfitableAccordingInstrOrderCost();
+  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
+                                                   unsigned OuterLoopId,
+                                                   CharMatrix &DepMatrix,
+                                                   std::unique_ptr<CacheCost>& CC);
 
   Loop *OuterLoop;
   Loop *InnerLoop;
@@ -539,7 +547,7 @@
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap)) {
+                          DependencyMatrix, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1110,31 +1118,8 @@
   return GoodOrder - BadOrder;
 }
 
-static bool isProfitableForVectorization(unsigned InnerLoopId,
-                                         unsigned OuterLoopId,
-                                         CharMatrix &DepMatrix) {
-  // TODO: Improve this heuristic to catch more cases.
-  // If the inner loop is loop independent or doesn't carry any dependency it is
-  // profitable to move this to outer position.
-  for (auto &Row : DepMatrix) {
-    if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
-      return false;
-    // TODO: We need to improve this heuristic.
-    if (Row[OuterLoopId] != '=')
-      return false;
-  }
-  // If outer loop has dependence and inner loop is loop independent then it is
-  // profitable to interchange to enable parallelism.
-  // If there are no dependences, interchanging will not improve anything.
-  return !DepMatrix.empty();
-}
-
-bool LoopInterchangeProfitability::isProfitable(
-    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const DenseMap<const Loop *, unsigned> &CostMap) {
-  // TODO: Remove the legacy cost model.
-
+std::optional<bool> LoopInterchangeProfitability::isProfitableAccordingLoopCacheAnalysis(
+       const DenseMap<const Loop *, unsigned> &CostMap) {
   // This is the new cost model returned from loop cache analysis.
   // A smaller index means the loop should be placed an outer loop, and vice
   // versa.
@@ -1146,30 +1131,87 @@
     LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
                       << ", OuterIndex = " << OuterIndex << "\n");
     if (InnerIndex < OuterIndex)
-      return true;
-  } else {
-    // Legacy cost model: this is rough cost estimation algorithm. It counts the
-    // good and bad order of induction variables in the instruction and allows
-    // reordering if number of bad orders is more than good.
-    int Cost = getInstrOrderCost();
-    LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
-    if (Cost < -LoopInterchangeCostThreshold)
-      return true;
+      return std::optional<bool>(true);
+    else
+      return std::optional<bool>(false);
   }
+  return std::nullopt;
+}
 
-  // It is not profitable as per current cache profitability model. But check if
-  // we can move this loop outside to improve parallelism.
-  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
-    return true;
+std::optional<bool> LoopInterchangeProfitability::isProfitableAccordingInstrOrderCost() {
+  // Legacy cost model: this is rough cost estimation algorithm. It counts the
+  // good and bad order of induction variables in the instruction and allows
+  // reordering if number of bad orders is more than good.
+  const int LoopInterchangeCostMeaningfulnessThreshold = 10;
+  int Cost = getInstrOrderCost();
+  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+  if (abs(Cost) < LoopInterchangeCostMeaningfulnessThreshold)
+    if (Cost < -LoopInterchangeCostThreshold)
+      return std::optional<bool>(true);
+  return std::nullopt;
+}
 
-  ORE->emit([&]() {
-    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
-                                    InnerLoop->getStartLoc(),
-                                    InnerLoop->getHeader())
-           << "Interchanging loops is too costly and it does not improve "
-              "parallelism.";
-  });
-  return false;
+std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
+                                                        unsigned InnerLoopId,
+                                                        unsigned OuterLoopId,
+                                                        CharMatrix &DepMatrix,
+                                                        std::unique_ptr<CacheCost>& CC) {
+  // To prevent endless interchange, only check whether it is profitable for vectorization
+  // when InnerLoop and OuterLoop has the same locality, or loop cache analysis failed to
+  // analyze the loopnest (e.g., due to delinearization issues).
+  if (CC && CC->getLoopCost(*InnerLoop) == CC->getLoopCost(*OuterLoop)) {
+    for (auto &Row : DepMatrix) {
+      // If the inner loop is loop independent or doesn't carry any dependency it is
+      // not profitable to move this to outer position, since we are likely able to do
+      // inner loop vectorization already.
+      if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
+        return std::optional<bool>(false);
+
+      // If the outer loop is not loop independent it is not profitable to move this
+      // to inner position, since doing so would not enable inner loop parallelism.
+      if (Row[OuterLoopId] != '=')
+        return std::optional<bool>(false);
+    }
+    // If inner loop has dependence and outer loop is loop independent then it is
+    // profitable to interchange to enable inner loop parallelism.
+    // If there are no dependences, interchanging will not improve anything.
+    return std::optional<bool>(!DepMatrix.empty());
+  }
+  return std::nullopt;
+}
+
+bool LoopInterchangeProfitability::isProfitable(
+    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const DenseMap<const Loop *, unsigned> &CostMap,
+    std::unique_ptr<CacheCost>& CC) {
+  std::optional<bool> shouldInterchange = isProfitableAccordingLoopCacheAnalysis(CostMap);
+  if (!shouldInterchange.has_value()) {
+    shouldInterchange = isProfitableAccordingInstrOrderCost();
+    if (!shouldInterchange.has_value()) {
+      shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix, CC);
+    }
+  }
+  if (!shouldInterchange.has_value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Insufficient information to calculate the cost of loop for interchange.";
+    });
+    return false;
+  }
+  else if (!shouldInterchange.value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Interchanging loops not considered to improve cache locality "
+                "nor vectorization.";
+    });
+    return false;
+  }
+  return true;
 }
 
 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
Index: llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -71,7 +71,7 @@
 ; DELIN-NEXT: Name:            InterchangeNotProfitable
 ; DELIN-NEXT: Function:        test01
 ; DELIN-NEXT: Args:
-; DELIN-NEXT:   - String:          Interchanging loops is too costly and it does not improve parallelism.
+; DELIN-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
 ; DELIN-NEXT: ...
 
 ;;--------------------------------------Test case 02------------------------------------
Index: llvm/test/Transforms/LoopInterchange/profitability.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/profitability.ll
+++ llvm/test/Transforms/LoopInterchange/profitability.ll
@@ -2,6 +2,10 @@
 ; RUN:     -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange
 ; RUN: FileCheck -input-file %t %s
 
+; RUN: opt < %s -passes=loop-interchange,loop-interchange -cache-line-size=64 \
+; RUN:     -pass-remarks-output=%t -pass-remarks='loop-interchange' -S
+; RUN: cat %t |  FileCheck --check-prefix=PROFIT %s
+
 ;; We test profitability model in these test cases.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,7 +14,7 @@
 @B = common global [100 x [100 x i32]] zeroinitializer
 
 ;;---------------------------------------Test case 01---------------------------------
-;; Loops interchange will result in code vectorization and hence profitable. Check for interchange.
+;; Loops interchange will result in better cache locality and hence profitable. Check for interchange.
 ;;   for(int i=1;i<100;i++)
 ;;     for(int j=1;j<100;j++)
 ;;       A[j][i] = A[j - 1][i] + B[j][i];
@@ -18,6 +22,20 @@
 ; CHECK:      Name:            Interchanged
 ; CHECK-NEXT: Function:        interchange_01
 
+; PROFIT: --- !Passed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            Interchanged
+; PROFIT-NEXT: Function:        interchange_01
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Loop interchanged with enclosing loop.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_01
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
 define void @interchange_01() {
 entry:
   br label %for2.preheader
@@ -59,6 +77,21 @@
 
 ; CHECK:      Name:            Interchanged
 ; CHECK-NEXT: Function:        interchange_02
+
+; PROFIT: --- !Passed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            Interchanged
+; PROFIT-NEXT: Function:        interchange_02
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Loop interchanged with enclosing loop.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_02
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
 define void @interchange_02() {
 entry:
   br label %for1.header
@@ -98,6 +131,21 @@
 
 ; CHECK:      Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        interchange_03
+
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_03
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_03
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
 define void @interchange_03(){
 entry:
   br label %for1.header
@@ -136,6 +184,21 @@
 
 ; CHECK:      Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        interchange_04
+
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_04
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_04
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
 define void @interchange_04(i32 %k) {
 entry:
   br label %for.cond1.preheader
@@ -162,3 +225,58 @@
 for.end12:
   ret void
 }
+
+;;---------------------------------------Test case 05---------------------------------
+;; Loops interchange will result in code cache cost and hence profitable. Check for interchange.
+;;   for(int i=1;i<100;i++)
+;;     for(int j=1;j<100;j++)
+;;       A[j][0] = A[j][0] + B[j][i];
+
+; CHECK:      Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_05
+
+; PROFIT: --- !Passed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            Interchanged
+; PROFIT-NEXT: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Loop interchanged with enclosing loop.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
+define void @interchange_05() {
+entry:
+  br label %for2.preheader
+
+for2.preheader:
+  %i30 = phi i64 [ 1, %entry ], [ %i.next31, %for1.inc14 ]
+  br label %for2
+
+for2:
+  %j = phi i64 [ %i.next, %for2 ], [ 1, %for2.preheader ]
+  %j.prev = add nsw i64 %j,  -1
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0
+  %lv1 = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %j,  i64 %i30
+  %lv2 = load i32, i32* %arrayidx9
+  %add = add nsw i32 %lv1, %lv2
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j,  i64 0
+  store i32 %add, i32* %arrayidx13
+  %i.next = add nuw nsw i64 %j,  1
+  %exitcond = icmp eq i64 %j,  99
+  br i1 %exitcond, label %for1.inc14, label %for2
+
+for1.inc14:
+  %i.next31 = add nuw nsw i64 %i30, 1
+  %exitcond33 = icmp eq i64 %i30, 99
+  br i1 %exitcond33, label %for.end16, label %for2.preheader
+
+for.end16:
+  ret void
+}
+