Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -311,11 +311,18 @@
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix,
-                    const DenseMap<const Loop *, unsigned> &CostMap);
+                    const DenseMap<const Loop *, unsigned> &CostMap,
+                    std::unique_ptr<CacheCost> &CC);
 
 private:
   int getInstrOrderCost();
-
+  std::optional<bool> isProfitablePerLoopCacheAnalysis(
+      const DenseMap<const Loop *, unsigned> &CostMap,
+      std::unique_ptr<CacheCost> &CC);
+  std::optional<bool> isProfitablePerInstrOrderCost();
+  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
+                                                   unsigned OuterLoopId,
+                                                   CharMatrix &DepMatrix);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -512,7 +519,7 @@
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap)) {
+                          DependencyMatrix, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1091,31 +1098,10 @@
   return GoodOrder - BadOrder;
 }
 
-static bool isProfitableForVectorization(unsigned InnerLoopId,
-                                         unsigned OuterLoopId,
-                                         CharMatrix &DepMatrix) {
-  // TODO: Improve this heuristic to catch more cases.
-  // If the inner loop is loop independent or doesn't carry any dependency it is
-  // profitable to move this to outer position.
-  for (auto &Row : DepMatrix) {
-    if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
-      return false;
-    // TODO: We need to improve this heuristic.
-    if (Row[OuterLoopId] != '=')
-      return false;
-  }
-  // If outer loop has dependence and inner loop is loop independent then it is
-  // profitable to interchange to enable parallelism.
-  // If there are no dependences, interchanging will not improve anything.
-  return !DepMatrix.empty();
-}
-
-bool LoopInterchangeProfitability::isProfitable(
-    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const DenseMap<const Loop *, unsigned> &CostMap) {
-  // TODO: Remove the legacy cost model.
-
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
+    const DenseMap<const Loop *, unsigned> &CostMap,
+    std::unique_ptr<CacheCost> &CC) {
   // This is the new cost model returned from loop cache analysis.
   // A smaller index means the loop should be placed an outer loop, and vice
   // versa.
@@ -1127,30 +1113,90 @@
     LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
                       << ", OuterIndex = " << OuterIndex << "\n");
     if (InnerIndex < OuterIndex)
-      return true;
-  } else {
-    // Legacy cost model: this is rough cost estimation algorithm. It counts the
-    // good and bad order of induction variables in the instruction and allows
-    // reordering if number of bad orders is more than good.
-    int Cost = getInstrOrderCost();
-    LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
-    if (Cost < -LoopInterchangeCostThreshold)
-      return true;
+      return std::optional<bool>(true);
+
+    if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
+      return std::nullopt;
+    return std::optional<bool>(false);
   }
+  return std::nullopt;
+}
 
-  // It is not profitable as per current cache profitability model. But check if
-  // we can move this loop outside to improve parallelism.
-  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
-    return true;
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
+  // Legacy cost model: this is rough cost estimation algorithm. It counts the
+  // good and bad order of induction variables in the instruction and allows
+  // reordering if number of bad orders is more than good.
+  int Cost = getInstrOrderCost();
+  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+  if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
+    return std::optional<bool>(true);
+
+  return std::nullopt;
+}
 
-  ORE->emit([&]() {
-    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
-                                    InnerLoop->getStartLoc(),
-                                    InnerLoop->getHeader())
-           << "Interchanging loops is too costly and it does not improve "
-              "parallelism.";
-  });
-  return false;
+std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
+    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
+  for (auto &Row : DepMatrix) {
+    // If the inner loop is loop independent or doesn't carry any dependency
+    // it is not profitable to move this to outer position, since we are
+    // likely able to do inner loop vectorization already.
+    if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
+      return std::optional<bool>(false);
+
+    // If the outer loop is not loop independent it is not profitable to move
+    // this to inner position, since doing so would not enable inner loop
+    // parallelism.
+    if (Row[OuterLoopId] != '=')
+      return std::optional<bool>(false);
+  }
+  // If inner loop has dependence and outer loop is loop independent then it
+  // is/ profitable to interchange to enable inner loop parallelism.
+  // If there are no dependences, interchanging will not improve anything.
+  return std::optional<bool>(!DepMatrix.empty());
+}
+
+bool LoopInterchangeProfitability::isProfitable(
+    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const DenseMap<const Loop *, unsigned> &CostMap,
+    std::unique_ptr<CacheCost> &CC) {
+  // isProfitable() is structured to avoid endless loop interchange.
+  // If loop cache analysis could decide the profitability then,
+  // profitability check will stop and return the analysis result.
+  // If cache analysis failed to analyze the loopnest (e.g.,
+  // due to delinearization issues) then only check whether it is
+  // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
+  // analysis the profitability then only, isProfitableForVectorization
+  // will decide.
+  std::optional<bool> shouldInterchange =
+      isProfitablePerLoopCacheAnalysis(CostMap, CC);
+  if (!shouldInterchange.has_value()) {
+    shouldInterchange = isProfitablePerInstrOrderCost();
+    if (!shouldInterchange.has_value())
+      shouldInterchange =
+          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+  }
+  if (!shouldInterchange.has_value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Insufficient information to calculate the cost of loop for "
+                "interchange.";
+    });
+    return false;
+  } else if (!shouldInterchange.value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Interchanging loops is not considered to improve cache locality "
+                "nor vectorization.";
+    });
+    return false;
+  }
+  return true;
 }
 
 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
Index: llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -71,7 +71,7 @@
 ; DELIN-NEXT: Name:            InterchangeNotProfitable
 ; DELIN-NEXT: Function:        test01
 ; DELIN-NEXT: Args:
-; DELIN-NEXT:   - String:          Interchanging loops is too costly and it does not improve parallelism.
+; DELIN-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
 ; DELIN-NEXT: ...
 
 ;;--------------------------------------Test case 02------------------------------------
Index: llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
+++ llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
@@ -156,20 +156,29 @@
 
 ; Make sure we do not crash for loops without reachable exits.
 define void @no_reachable_exits() {
-; Check we interchanged.
-; CHECK-LABEL: @no_reachable_exits() {
+; Check we do not crash.
+; CHECK-LABEL: @no_reachable_exits(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label %inner.ph
-; CHECK-LABEL: outer.ph:
-; CHECK-NEXT:    br label %outer.header
-; CHECK-LABEL: inner.ph:
-; CHECK-NEXT:    br label %inner.body
-; CHECK-LABEL: inner.body:
-; CHECK-NEXT:    %tmp31 = phi i32 [ 0, %inner.ph ], [ %[[IVNEXT:[0-9]]], %inner.body.split ]
-; CHECK-NEXT:    br label %outer.ph
-; CHECK-LABEL: inner.body.split:
-; CHECK-NEXT:    %[[IVNEXT]] = add nsw i32 %tmp31, 1
-; CHECK-NEXT:    br i1 false, label %inner.body, label %exit
+; CHECK-NEXT:    br label [[OUTER_PH:%.*]]
+; CHECK:       outer.ph:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ 0, [[OUTER_PH]] ], [ [[TMP8:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    br i1 undef, label [[INNER_PH:%.*]], label [[OUTER_LATCH]]
+; CHECK:       inner.ph:
+; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
+; CHECK:       inner.body:
+; CHECK-NEXT:    [[TMP31:%.*]] = phi i32 [ 0, [[INNER_PH]] ], [ [[TMP6:%.*]], [[INNER_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32*, i32** undef, align 8
+; CHECK-NEXT:    [[TMP6]] = add nsw i32 [[TMP31]], 1
+; CHECK-NEXT:    br i1 false, label [[INNER_BODY]], label [[OUTER_LATCH_LOOPEXIT:%.*]]
+; CHECK:       outer.latch.loopexit:
+; CHECK-NEXT:    br label [[OUTER_LATCH]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[TMP8]] = add nsw i32 [[TMP2]], 1
+; CHECK-NEXT:    br i1 false, label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    unreachable
 
 
 bb:
Index: llvm/test/Transforms/LoopInterchange/pr57148.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/pr57148.ll
+++ llvm/test/Transforms/LoopInterchange/pr57148.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -loop-interchange-threshold=-100 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
 
 ; Make sure the loops are in LCSSA form after loop interchange,
 ; and loop interchange does not hit assertion errors and crash.
Index: llvm/test/Transforms/LoopInterchange/profitability.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/profitability.ll
+++ llvm/test/Transforms/LoopInterchange/profitability.ll
@@ -2,6 +2,10 @@
 ; RUN:     -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange
 ; RUN: FileCheck -input-file %t %s
 
+; RUN: opt < %s -passes=loop-interchange,loop-interchange -cache-line-size=64 \
+; RUN:     -pass-remarks-output=%t -pass-remarks='loop-interchange' -S
+; RUN: cat %t |  FileCheck --check-prefix=PROFIT %s
+
 ;; We test profitability model in these test cases.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,7 +14,7 @@
 @B = common global [100 x [100 x i32]] zeroinitializer
 
 ;;---------------------------------------Test case 01---------------------------------
-;; Loops interchange will result in code vectorization and hence profitable. Check for interchange.
+;; Loops interchange will result in better cache locality and hence profitable. Check for interchange.
 ;;   for(int i=1;i<100;i++)
 ;;     for(int j=1;j<100;j++)
 ;;       A[j][i] = A[j - 1][i] + B[j][i];
@@ -162,3 +166,60 @@
 for.end12:
   ret void
 }
+
+;;---------------------------------------Test case 05---------------------------------
+;; This test is to make sure, that multiple invocations of loop interchange will not
+;; undo previous interchange and will converge to a particular order determined by the
+;; profitability analysis.
+;;   for(int i=1;i<100;i++)
+;;     for(int j=1;j<100;j++)
+;;       A[j][0] = A[j][0] + B[j][i];
+
+; CHECK:      Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_05
+
+; PROFIT-LABEL: --- !Passed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            Interchanged
+; PROFIT-LABEL: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Loop interchanged with enclosing loop.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
+define void @interchange_05() {
+entry:
+  br label %for2.preheader
+
+for2.preheader:
+  %i30 = phi i64 [ 1, %entry ], [ %i.next31, %for1.inc14 ]
+  br label %for2
+
+for2:
+  %j = phi i64 [ %i.next, %for2 ], [ 1, %for2.preheader ]
+  %j.prev = add nsw i64 %j,  -1
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0
+  %lv1 = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %j,  i64 %i30
+  %lv2 = load i32, i32* %arrayidx9
+  %add = add nsw i32 %lv1, %lv2
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j,  i64 0
+  store i32 %add, i32* %arrayidx13
+  %i.next = add nuw nsw i64 %j,  1
+  %exitcond = icmp eq i64 %j,  99
+  br i1 %exitcond, label %for1.inc14, label %for2
+
+for1.inc14:
+  %i.next31 = add nuw nsw i64 %i30, 1
+  %exitcond33 = icmp eq i64 %i30, 99
+  br i1 %exitcond33, label %for.end16, label %for2.preheader
+
+for.end16:
+  ret void
+}
+