Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -61,6 +61,9 @@
     "loop-interchange-threshold", cl::init(0), cl::Hidden,
     cl::desc("Interchange if you gain more than this number"));
 
+static cl::opt<int> LegacyCostModelThreshold(
+    "legacy-cost-threshold", cl::init(10), cl::Hidden,
+    cl::desc("The threshold for the legacy cost model to be considered."));
 namespace {
 
 using LoopVector = SmallVector<Loop *, 8>;
@@ -338,11 +341,18 @@
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix,
-                    const DenseMap<const Loop *, unsigned> &CostMap);
+                    const DenseMap<const Loop *, unsigned> &CostMap,
+                    std::unique_ptr<CacheCost> &CC);
 
 private:
   int getInstrOrderCost();
-
+  std::optional<bool> isProfitablePerLoopCacheAnalysis(
+      const DenseMap<const Loop *, unsigned> &CostMap);
+  std::optional<bool> isProfitablePerInstrOrderCost();
+  std::optional<bool>
+  isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
+                               CharMatrix &DepMatrix,
+                               std::unique_ptr<CacheCost> &CC);
   Loop *OuterLoop;
   Loop *InnerLoop;
 
@@ -539,7 +549,7 @@
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap)) {
+                          DependencyMatrix, CostMap, CC)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1110,31 +1120,9 @@
   return GoodOrder - BadOrder;
 }
 
-static bool isProfitableForVectorization(unsigned InnerLoopId,
-                                         unsigned OuterLoopId,
-                                         CharMatrix &DepMatrix) {
-  // TODO: Improve this heuristic to catch more cases.
-  // If the inner loop is loop independent or doesn't carry any dependency it is
-  // profitable to move this to outer position.
-  for (auto &Row : DepMatrix) {
-    if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
-      return false;
-    // TODO: We need to improve this heuristic.
-    if (Row[OuterLoopId] != '=')
-      return false;
-  }
-  // If outer loop has dependence and inner loop is loop independent then it is
-  // profitable to interchange to enable parallelism.
-  // If there are no dependences, interchanging will not improve anything.
-  return !DepMatrix.empty();
-}
-
-bool LoopInterchangeProfitability::isProfitable(
-    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
-    unsigned OuterLoopId, CharMatrix &DepMatrix,
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
     const DenseMap<const Loop *, unsigned> &CostMap) {
-  // TODO: Remove the legacy cost model.
-
   // This is the new cost model returned from loop cache analysis.
   // A smaller index means the loop should be placed an outer loop, and vice
   // versa.
@@ -1146,30 +1134,96 @@
     LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
                       << ", OuterIndex = " << OuterIndex << "\n");
     if (InnerIndex < OuterIndex)
-      return true;
-  } else {
-    // Legacy cost model: this is rough cost estimation algorithm. It counts the
-    // good and bad order of induction variables in the instruction and allows
-    // reordering if number of bad orders is more than good.
-    int Cost = getInstrOrderCost();
-    LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
-    if (Cost < -LoopInterchangeCostThreshold)
-      return true;
+      return std::optional<bool>(true);
+    else
+      return std::optional<bool>(false);
   }
+  return std::nullopt;
+}
 
-  // It is not profitable as per current cache profitability model. But check if
-  // we can move this loop outside to improve parallelism.
-  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
-    return true;
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
+  // Legacy cost model: this is rough cost estimation algorithm. It counts the
+  // good and bad order of induction variables in the instruction and allows
+  // reordering if number of bad orders is more than good.
+  int Cost = getInstrOrderCost();
+  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+  if (abs(Cost) < LegacyCostModelThreshold)
+    if (Cost < -LoopInterchangeCostThreshold)
+      return std::optional<bool>(true);
+  return std::nullopt;
+}
 
-  ORE->emit([&]() {
-    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
-                                    InnerLoop->getStartLoc(),
-                                    InnerLoop->getHeader())
-           << "Interchanging loops is too costly and it does not improve "
-              "parallelism.";
-  });
-  return false;
+std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
+    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
+    std::unique_ptr<CacheCost> &CC) {
+  // To prevent endless interchange, only check whether it is profitable for
+  // vectorization when InnerLoop and OuterLoop has the same locality, or loop
+  // cache analysis failed to analyze the loopnest (e.g., due to delinearization
+  // issues).
+  if (CC && CC->getLoopCost(*InnerLoop) == CC->getLoopCost(*OuterLoop)) {
+    for (auto &Row : DepMatrix) {
+      // If the inner loop is loop independent or doesn't carry any dependency
+      // it is not profitable to move this to outer position, since we are
+      // likely able to do inner loop vectorization already.
+      if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
+        return std::optional<bool>(false);
+
+      // If the outer loop is not loop independent it is not profitable to move
+      // this to inner position, since doing so would not enable inner loop
+      // parallelism.
+      if (Row[OuterLoopId] != '=')
+        return std::optional<bool>(false);
+    }
+    // If inner loop has dependence and outer loop is loop independent then it
+    // is/ profitable to interchange to enable inner loop parallelism.
+    // If there are no dependences, interchanging will not improve anything.
+    return std::optional<bool>(!DepMatrix.empty());
+  }
+  return std::nullopt;
+}
+
+bool LoopInterchangeProfitability::isProfitable(
+    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const DenseMap<const Loop *, unsigned> &CostMap,
+    std::unique_ptr<CacheCost> &CC) {
+  // isProfitable() is structured to avoid endless loop interchange.
+  // If loop cache analysis could decide the profitability then,
+  // profitability check will stop and return the analysis result.
+  // If cache analysis failed to analyze the loopnest (e.g.,
+  // due to delinearization issues) then only check whether it is
+  // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
+  // analysis the profitability then only, isProfitableForVectorization
+  // will decide.
+  std::optional<bool> shouldInterchange =
+      isProfitablePerLoopCacheAnalysis(CostMap);
+  if (!shouldInterchange.has_value()) {
+    shouldInterchange = isProfitablePerInstrOrderCost();
+    if (!shouldInterchange.has_value())
+      shouldInterchange =
+          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix, CC);
+  }
+  if (!shouldInterchange.has_value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Insufficient information to calculate the cost of loop for "
+                "interchange.";
+    });
+    return false;
+  } else if (!shouldInterchange.value()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Interchanging loops not considered to improve cache locality "
+                "nor vectorization.";
+    });
+    return false;
+  }
+  return true;
 }
 
 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
Index: llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -71,7 +71,7 @@
 ; DELIN-NEXT: Name:            InterchangeNotProfitable
 ; DELIN-NEXT: Function:        test01
 ; DELIN-NEXT: Args:
-; DELIN-NEXT:   - String:          Interchanging loops is too costly and it does not improve parallelism.
+; DELIN-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
 ; DELIN-NEXT: ...
 
 ;;--------------------------------------Test case 02------------------------------------
Index: llvm/test/Transforms/LoopInterchange/profitability.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/profitability.ll
+++ llvm/test/Transforms/LoopInterchange/profitability.ll
@@ -2,6 +2,10 @@
 ; RUN:     -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange
 ; RUN: FileCheck -input-file %t %s
 
+; RUN: opt < %s -passes=loop-interchange,loop-interchange -cache-line-size=64 \
+; RUN:     -pass-remarks-output=%t -pass-remarks='loop-interchange' -S
+; RUN: cat %t |  FileCheck --check-prefix=PROFIT %s
+
 ;; We test profitability model in these test cases.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,7 +14,7 @@
 @B = common global [100 x [100 x i32]] zeroinitializer
 
 ;;---------------------------------------Test case 01---------------------------------
-;; Loops interchange will result in code vectorization and hence profitable. Check for interchange.
+;; Loops interchange will result in better cache locality and hence profitable. Check for interchange.
 ;;   for(int i=1;i<100;i++)
 ;;     for(int j=1;j<100;j++)
 ;;       A[j][i] = A[j - 1][i] + B[j][i];
@@ -162,3 +166,63 @@
 for.end12:
   ret void
 }
+
+;;---------------------------------------Test case 05---------------------------------
+;; Loop interchange needed for better locality as per Cache analysis.
+;; Again loop-interchange is profitable as per isProfitableForVectorization.
+;; There would be endless interchange (no proper priority was not there in profitability check.
+;; Any profitable check may lead to loop-interchange) before D135808.
+;; And there is no endless interchange  after D135808 (priority in profitability check is defined.
+;; Order of decision is Cache cost check, InstrOrderCost , Vectorization ).
+;;   for(int i=1;i<100;i++)
+;;     for(int j=1;j<100;j++)
+;;       A[j][0] = A[j][0] + B[j][i];
+
+; CHECK:      Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_05
+
+; PROFIT-LABEL: --- !Passed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            Interchanged
+; PROFIT-LABEL: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Loop interchanged with enclosing loop.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass:            loop-interchange
+; PROFIT-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-NEXT: Function:        interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT:   - String:          Interchanging loops not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
+define void @interchange_05() {
+entry:
+  br label %for2.preheader
+
+for2.preheader:
+  %i30 = phi i64 [ 1, %entry ], [ %i.next31, %for1.inc14 ]
+  br label %for2
+
+for2:
+  %j = phi i64 [ %i.next, %for2 ], [ 1, %for2.preheader ]
+  %j.prev = add nsw i64 %j,  -1
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0
+  %lv1 = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %j,  i64 %i30
+  %lv2 = load i32, i32* %arrayidx9
+  %add = add nsw i32 %lv1, %lv2
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j,  i64 0
+  store i32 %add, i32* %arrayidx13
+  %i.next = add nuw nsw i64 %j,  1
+  %exitcond = icmp eq i64 %j,  99
+  br i1 %exitcond, label %for1.inc14, label %for2
+
+for1.inc14:
+  %i.next31 = add nuw nsw i64 %i30, 1
+  %exitcond33 = icmp eq i64 %i30, 99
+  br i1 %exitcond33, label %for.end16, label %for2.preheader
+
+for.end16:
+  ret void
+}
+