Index: lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- lib/Transforms/Scalar/LoopInterchange.cpp
+++ lib/Transforms/Scalar/LoopInterchange.cpp
@@ -366,15 +366,17 @@
 class LoopInterchangeProfitability {
 public:
   LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
-                               OptimizationRemarkEmitter *ORE)
-      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+                               OptimizationRemarkEmitter *ORE,
+                               unsigned CacheLineSize)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE),
+        CacheLineSize(CacheLineSize) {}
 
   /// Check if the loop interchange is profitable.
   bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix);
 
 private:
-  int getInstrOrderCost();
+  unsigned getLoopOrderCost(Loop *Loop);
 
   Loop *OuterLoop;
   Loop *InnerLoop;
@@ -383,6 +385,7 @@
   ScalarEvolution *SE;
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
+  const unsigned CacheLineSize;
 };
 
 /// LoopInterchangeTransform interchanges the loop.
@@ -432,6 +435,7 @@
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
+  unsigned CacheLineSize = 0;
   LoopInterchange()
       : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
     initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
@@ -446,6 +450,7 @@
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override {
@@ -459,6 +464,11 @@
     DT = DTWP ? &DTWP->getDomTree() : nullptr;
     ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
     PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+    CacheLineSize =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)
+                                                   .getCacheLineSize();
+    if (CacheLineSize == 0)
+      CacheLineSize = 64; // Assume 64 byte cache lines as default.
 
     // Build up a worklist of loop pairs to analyze.
     SmallVector<LoopVector, 8> Worklist;
@@ -593,7 +603,8 @@
       return false;
     }
     DEBUG(dbgs() << "Loops are legal to interchange\n");
-    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE,
+                                     CacheLineSize);
     if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       DEBUG(dbgs() << "Interchanging loops not profitable\n");
       return false;
@@ -1007,55 +1018,56 @@
   return true;
 }
 
-int LoopInterchangeProfitability::getInstrOrderCost() {
-  unsigned GoodOrder, BadOrder;
-  BadOrder = GoodOrder = 0;
-  for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
-       BI != BE; ++BI) {
-    for (Instruction &Ins : **BI) {
-      if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
-        unsigned NumOp = GEP->getNumOperands();
-        bool FoundInnerInduction = false;
-        bool FoundOuterInduction = false;
-        for (unsigned i = 0; i < NumOp; ++i) {
-          const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
-          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
-          if (!AR)
-            continue;
-
-          // If we find the inner induction after an outer induction e.g.
-          // for(int i=0;i<N;i++)
-          //   for(int j=0;j<N;j++)
-          //     A[i][j] = A[i-1][j-1]+k;
-          // then it is a good order.
-          if (AR->getLoop() == InnerLoop) {
-            // We found an InnerLoop induction after OuterLoop induction. It is
-            // a good order.
-            FoundInnerInduction = true;
-            if (FoundOuterInduction) {
-              GoodOrder++;
-              break;
-            }
-          }
-          // If we find the outer induction after an inner induction e.g.
-          // for(int i=0;i<N;i++)
-          //   for(int j=0;j<N;j++)
-          //     A[j][i] = A[j-1][i-1]+k;
-          // then it is a bad order.
-          if (AR->getLoop() == OuterLoop) {
-            // We found an OuterLoop induction after InnerLoop induction. It is
-            // a bad order.
-            FoundOuterInduction = true;
-            if (FoundInnerInduction) {
-              BadOrder++;
-              break;
-            }
-          }
-        }
-      }
+// Returns an estimate of how many bytes of each cache line are put to use when
+// Addr is accessed while iterating L. Accesses that cannot be analyzed are
+// assumed to use only a single element per cache line.
+static unsigned getBytesInCache(Loop *L, ScalarEvolution *SE, Value *Addr,
+                                unsigned CacheLineSize) {
+  const Type *AddrTy = Addr->getType();
+  assert(AddrTy->isPointerTy() && "Addr must be a pointer type");
+  const SCEV *Expr = SE->getSCEVAtScope(Addr, L);
+  // An address that does not change in L keeps hitting the same cache line.
+  if (SE->isLoopInvariant(Expr, L))
+    return CacheLineSize;
+  const unsigned ValueSizeBytes =
+    std::min(CacheLineSize,
+             AddrTy->getPointerElementType()->getScalarSizeInBits() / 8);
+  // Only a recurrence of L itself describes the stride between iterations of
+  // L; a recurrence of any other loop says nothing about it.
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr);
+  if (!AR || AR->getLoop() != L || !AR->isAffine()) return ValueSizeBytes;
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+  if (!C) return ValueSizeBytes;
+  // This roughly models the number of cache misses when the loop trip count is
+  // very large. Strides beyond a cache line are clamped: they all use a single
+  // element per line, and clamping avoids truncating wide constants to zero.
+  const uint64_t Stride =
+    C->getValue()->getValue().abs().getLimitedValue(CacheLineSize + 1ULL);
+  assert(Stride != 0 && "Stride is 0");
+  const unsigned Elements = CacheLineSize / Stride;
+  return Elements ? std::min(CacheLineSize, Elements * ValueSizeBytes)
+                  : ValueSizeBytes;
+}
+
+/// \brief Adds up, for every load and store in the inner loop's blocks, the
+/// per-access cache-line byte estimate computed with respect to Loop.
+unsigned LoopInterchangeProfitability::getLoopOrderCost(Loop *Loop) {
+  unsigned TotalBytes = 0;
+  for (BasicBlock *Block : InnerLoop->blocks()) {
+    for (Instruction &I : *Block) {
+      // Only loads and stores access memory; fetch their pointer operand and
+      // skip every other kind of instruction.
+      Value *Ptr = nullptr;
+      if (auto *Load = dyn_cast<LoadInst>(&I))
+        Ptr = Load->getPointerOperand();
+      else if (auto *Store = dyn_cast<StoreInst>(&I))
+        Ptr = Store->getPointerOperand();
+      if (!Ptr)
+        continue;
+      TotalBytes += getBytesInCache(Loop, SE, Ptr, CacheLineSize);
     }
   }
-  return GoodOrder - BadOrder;
+  return TotalBytes;
 }
 
 static bool isProfitableForVectorization(unsigned InnerLoopId,
@@ -1086,12 +1098,14 @@
   // 1) Construct dependency matrix and move the one with no loop carried dep
   //    inside to enable vectorization.
 
-  // This is rough cost estimation algorithm. It counts the good and bad order
-  // of induction variables in the instruction and allows reordering if number
-  // of bad orders is more than good.
-  int Cost = getInstrOrderCost();
-  DEBUG(dbgs() << "Cost = " << Cost << "\n");
-  if (Cost < -LoopInterchangeCostThreshold)
+  // To estimate the cost of a loop order, we calculate the number of bytes that
+  // fit into cache lines for memory accesses in the loop. If the interchanged
+  // order fits more bytes in cache lines, we interchange the loops.
+  int OriginalCost = getLoopOrderCost(InnerLoop);
+  int InterchangedCost = getLoopOrderCost(OuterLoop);
+  DEBUG(dbgs() << "OriginalCost = " << OriginalCost << " InterchangedCost "
+               << InterchangedCost << "\n");
+  if ((OriginalCost - InterchangedCost) < LoopInterchangeCostThreshold)
     return true;
 
   // It is not profitable as per current cache profitability model. But check if
@@ -1104,7 +1118,8 @@
                                      InnerLoop->getStartLoc(),
                                      InnerLoop->getHeader())
             << "Interchanging loops is too costly (cost="
-            << ore::NV("Cost", Cost) << ", threshold="
+            << ore::NV("Cost", (OriginalCost - InterchangedCost))
+            << ", threshold="
             << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
             ") and it does not improve parallelism.");
   return false;
Index: test/Transforms/LoopInterchange/interchange-single-comp.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopInterchange/interchange-single-comp.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -loop-interchange -S | FileCheck %s
+
+;;--------------------------------------Test case 01------------------------------------
+;; It is profitable to interchange these loops: once i drives the inner loop,
+;; both A[j*25+i] and B[n+i] access memory sequentially.
+;;  for(int i=0;i<25;i++)
+;;    for(int j=0;j<n;j++)
+;;      A[j*25+i] += B[n+i]
+
+define void @foo(i32* %A, i32* %B, i64 %n) {
+entry:
+  br label %outer.preheader
+
+outer.preheader:                           ; preds = %entry, %outer.inc8_crit_edge.us
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.inc8_crit_edge.us ]
+  %0 = add i64 %i, %n
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i64 %0 ; &B[n+i], computed outside the inner loop
+  br label %inner.body
+
+inner.body:                                     ; preds = %outer.preheader, %inner.body
+  %j = phi i64 [ %j.next, %inner.body ], [ 0, %outer.preheader ]
+  %1 = load i32, i32* %arrayidx.us, align 4
+  %2 = mul i64 %j, 25
+  %3 = add i64 %2, %i
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i64 %3 ; &A[j*25+i]
+  %4 = load i32, i32* %arrayidx6.us, align 4
+  %add7.us = add i32 %4, %1
+  store i32 %add7.us, i32* %arrayidx6.us, align 4
+  %j.next = add i64 %j, 1
+  %exitcond = icmp ne i64 %j.next, %n
+  br i1 %exitcond, label %inner.body, label %outer.inc8_crit_edge.us
+
+outer.inc8_crit_edge.us:                  ; preds = %inner.body
+  %i.next = add i64 %i, 1
+  %exitcond29 = icmp ne i64 %i.next, 25
+  br i1 %exitcond29, label %outer.preheader, label %for.end10.loopexit
+
+for.end10.loopexit:                               ; preds = %outer.inc8_crit_edge.us
+  br label %for.end10
+
+for.end10:                                        ; preds = %for.end10.loopexit
+  ret void
+}
+
+; CHECK: split
Index: test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
===================================================================
--- test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -9,7 +9,14 @@
 @C = common global [100 x i32] zeroinitializer
 
 ;;---------------------------------------Test case 01---------------------------------
-;; Loops interchange is not profitable.
+;; Loop interchange is not profitable, as the induction variable of the inner
+;; loop is already used to access contiguous elements in the array.
+;;
+;; The cost of interchanging the loops is 180, because with the original loop
+;; order, all 3 accesses can make use of full cache lines (192 bytes in cache).
+;; When the loops are interchanged, only a single element fits in cache lines
+;; for each access (12 bytes in cache).
+;;
 ;;   for(int i=1;i<N;i++)
 ;;     for(int j=1;j<N;j++)
 ;;       A[i-1][j-1] = A[i - 1][j-1] + B[i][j];
@@ -58,163 +65,8 @@
 ; CHECK-NEXT: Function:        test01
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:  - String:          'Interchanging loops is too costly (cost='
-; CHECK-NEXT:  - Cost:            '2'
+; CHECK-NEXT:  - Cost:            '180'
 ; CHECK-NEXT:  - String:          ', threshold='
 ; CHECK-NEXT:  - Threshold:       '0'
 ; CHECK-NEXT:  - String:          ') and it does not improve parallelism.'
 ; CHECK-NEXT: ...
-
-;;--------------------------------------Test case 02------------------------------------
-;; [FIXME] This loop though valid is currently not interchanged due to the
-;; limitation that we cannot split the inner loop latch due to multiple use of inner induction
-;; variable.(used to increment the loop counter and to access A[j+1][i+1]
-;;  for(int i=0;i<N-1;i++)
-;;    for(int j=1;j<N-1;j++)
-;;      A[j+1][i+1] = A[j+1][i+1] + k;
-
-define void @test02(i32 %k, i32 %N) {
- entry:
-   %sub = add nsw i32 %N, -1
-   %cmp26 = icmp sgt i32 %N, 1
-   br i1 %cmp26, label %for.cond1.preheader.lr.ph, label %for.end17
-
- for.cond1.preheader.lr.ph:
-   %cmp324 = icmp sgt i32 %sub, 1
-   %0 = add i32 %N, -2
-   %1 = sext i32 %sub to i64
-   br label %for.cond1.preheader
-
- for.cond.loopexit:
-   %cmp = icmp slt i64 %indvars.iv.next29, %1
-   br i1 %cmp, label %for.cond1.preheader, label %for.end17
-
- for.cond1.preheader:
-   %indvars.iv28 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next29, %for.cond.loopexit ]
-   %indvars.iv.next29 = add nuw nsw i64 %indvars.iv28, 1
-   br i1 %cmp324, label %for.body4, label %for.cond.loopexit
-
- for.body4:
-   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 1, %for.cond1.preheader ]
-   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-   %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next, i64 %indvars.iv.next29
-   %2 = load i32, i32* %arrayidx7
-   %add8 = add nsw i32 %2, %k
-   store i32 %add8, i32* %arrayidx7
-   %lftr.wideiv = trunc i64 %indvars.iv to i32
-   %exitcond = icmp eq i32 %lftr.wideiv, %0
-   br i1 %exitcond, label %for.cond.loopexit, label %for.body4
-
- for.end17:
-   ret void
-}
-
-; CHECK: --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            UnsupportedInsBetweenInduction
-; CHECK-NEXT: Function:        test02
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Found unsupported instruction between induction variable increment and branch.
-; CHECK-NEXT: ...
-
-;;-----------------------------------Test case 03-------------------------------
-;; Test to make sure we can handle output dependencies.
-;;
-;;  for (int i = 0; i < 2; ++i)
-;;    for(int j = 0; j < 3; ++j) {
-;;      A[j][i] = i;
-;;      A[j][i+1] = j;
-;;    }
-
-@A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16
-
-define void @test03() {
-entry:
-  br label %for.cond1.preheader
-
-for.cond.loopexit:                                ; preds = %for.body4
-  %exitcond28 = icmp ne i64 %indvars.iv.next27, 2
-  br i1 %exitcond28, label %for.cond1.preheader, label %for.cond.cleanup
-
-for.cond1.preheader:                              ; preds = %for.cond.loopexit, %entry
-  %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ]
-  %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1
-  br label %for.body4
-
-for.cond.cleanup:                                 ; preds = %for.cond.loopexit
-  ret void
-
-for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
-  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
-  %arrayidx6 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv26
-  %tmp = trunc i64 %indvars.iv26 to i32
-  store i32 %tmp, i32* %arrayidx6, align 4
-  %arrayidx10 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv.next27
-  %tmp1 = trunc i64 %indvars.iv to i32
-  store i32 %tmp1, i32* %arrayidx10, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 3
-  br i1 %exitcond, label %for.body4, label %for.cond.loopexit
-}
-
-; CHECK: --- !Passed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        test03
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
-; CHECK-NEXT: ...
-
-;;--------------------------------------Test case 04-------------------------------------
-;; Loops not tightly nested are not interchanged
-;;  for(int j=0;j<N;j++) {
-;;    B[j] = j+k;
-;;    for(int i=0;i<N;i++)
-;;      A[j][i] = A[j][i]+B[j];
-;;  }
-
-define void @test04(i32 %k, i32 %N){
-entry:
-  %cmp30 = icmp sgt i32 %N, 0
-  br i1 %cmp30, label %for.body.lr.ph, label %for.end17
-
-for.body.lr.ph:
-  %0 = add i32 %N, -1
-  %1 = zext i32 %k to i64
-  br label %for.body
-
-for.body:
-  %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ]
-  %2 = add nsw i64 %indvars.iv32, %1
-  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @C, i64 0, i64 %indvars.iv32
-  %3 = trunc i64 %2 to i32
-  store i32 %3, i32* %arrayidx
-  br label %for.body3
-
-for.body3:
-  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
-  %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv
-  %4 = load i32, i32* %arrayidx7
-  %add10 = add nsw i32 %3, %4
-  store i32 %add10, i32* %arrayidx7
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %0
-  br i1 %exitcond, label %for.inc15, label %for.body3
-
-for.inc15:
-  %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
-  %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32
-  %exitcond36 = icmp eq i32 %lftr.wideiv35, %0
-  br i1 %exitcond36, label %for.end17, label %for.body
-
-for.end17:
-  ret void
-}
-
-; CHECK: --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            NotTightlyNested
-; CHECK-NEXT: Function:        test04
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Cannot interchange loops because they are not tightly nested.
-; CHECK-NEXT: ...
Index: test/Transforms/LoopInterchange/profitability.ll
===================================================================
--- test/Transforms/LoopInterchange/profitability.ll
+++ test/Transforms/LoopInterchange/profitability.ll
@@ -203,3 +203,51 @@
 ; CHECK:    %3 = load i32, i32* %arrayidx6
 ; CHECK:    %arrayidx10 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv34, i64 %indvars.iv
 ; CHECK:    %4 = load i32, i32* %arrayidx10
+
+;;---------------------------------------Test case 04---------------------------------
+;; Interchanging the loops is not profitable, because the outer loop increments
+;; the induction variable by 6, so only a single element will be in the cache.
+;;   for(int i=1;i<N;i+=6)
+;;     for(int j=1;j<N;j++)
+;;       A[j][i] = A[j - 1][i] + B[j][i];
+
+define void @interchange_04(i32 %N) {
+entry:
+  %cmp27 = icmp sgt i32 %N, 1
+  br i1 %cmp27, label %for.cond1.preheader.lr.ph, label %for.end16
+
+for.cond1.preheader.lr.ph:
+  %0 = add i32 %N, -1
+  br label %for.body3.preheader
+
+for.body3.preheader:
+  %indvars.iv30 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next31, %for.inc14 ] ; i (outer IV)
+  br label %for.body3
+
+for.body3:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.body3.preheader ] ; j (inner IV)
+  %1 = add nsw i64 %indvars.iv, -1
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %1, i64 %indvars.iv30 ; &A[j-1][i]
+  %2 = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv30 ; &B[j][i]
+  %3 = load i32, i32* %arrayidx9
+  %add = add nsw i32 %3, %2
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv30 ; &A[j][i]
+  store i32 %add, i32* %arrayidx13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %0
+  br i1 %exitcond, label %for.inc14, label %for.body3
+
+for.inc14:
+  %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 6 ; i += 6
+  %lftr.wideiv32 = trunc i64 %indvars.iv30 to i32
+  %exitcond33 = icmp eq i32 %lftr.wideiv32, %0
+  br i1 %exitcond33, label %for.end16, label %for.body3.preheader
+
+for.end16:
+  ret void
+}
+;; Check part of the output to make sure the loops are not interchanged.
+; CHECK-LABEL: @interchange_04
+; CHECK-NOT: split