Index: lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- lib/Transforms/Scalar/LoopInterchange.cpp
+++ lib/Transforms/Scalar/LoopInterchange.cpp
@@ -361,21 +361,24 @@
 /// loop.
 class LoopInterchangeProfitability {
 public:
-  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
-      : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+                               unsigned CacheLineSize)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE),
+        CacheLineSize(CacheLineSize) {}
 
   /// Check if the loop interchange is profitable.
   bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix);
 
 private:
-  int getInstrOrderCost();
+  unsigned getInstrOrderCost(Loop *Loop); // Cache-byte estimate for Loop.
 
   Loop *OuterLoop;
   Loop *InnerLoop;
 
   /// Scev analysis.
   ScalarEvolution *SE;
+  const unsigned CacheLineSize; ///< Cache line size in bytes.
 };
 
 /// LoopInterchangeTransform interchanges the loop.
@@ -422,6 +425,7 @@
   DependenceInfo *DI;
   DominatorTree *DT;
   bool PreserveLCSSA;
+  unsigned CacheLineSize = 0;
   LoopInterchange()
       : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
     initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
@@ -435,6 +439,7 @@
     AU.addRequired<DependenceAnalysisWrapperPass>();
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
+    AU.addRequired<TargetTransformInfoWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override {
@@ -447,6 +452,9 @@
     auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
     DT = DTWP ? &DTWP->getDomTree() : nullptr;
     PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+    CacheLineSize = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F).getCacheLineSize();
+    if (CacheLineSize == 0)
+      CacheLineSize = 64; // Assume 64 byte cache lines as default.
 
     // Build up a worklist of loop pairs to analyze.
     SmallVector<LoopVector, 8> Worklist;
@@ -581,7 +589,7 @@
       return false;
     }
     DEBUG(dbgs() << "Loops are legal to interchange\n");
-    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, CacheLineSize);
     if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       DEBUG(dbgs() << "Interchanging loops not profitable\n");
       return false;
@@ -844,7 +852,8 @@
 
   bool FoundInduction = false;
   for (const Instruction &I : reverse(*InnerLoopLatch)) {
-    if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I))
+    if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
+        isa<ZExtInst>(I))
       continue;
 
     // We found an instruction. If this is not induction variable then it is not
@@ -916,55 +925,62 @@
   return true;
 }
 
-int LoopInterchangeProfitability::getInstrOrderCost() {
-  unsigned GoodOrder, BadOrder;
-  BadOrder = GoodOrder = 0;
-  for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
-       BI != BE; ++BI) {
-    for (Instruction &Ins : **BI) {
-      if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
-        unsigned NumOp = GEP->getNumOperands();
-        bool FoundInnerInduction = false;
-        bool FoundOuterInduction = false;
-        for (unsigned i = 0; i < NumOp; ++i) {
-          const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
-          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
-          if (!AR)
-            continue;
-
-          // If we find the inner induction after an outer induction e.g.
-          // for(int i=0;i<N;i++)
-          //   for(int j=0;j<N;j++)
-          //     A[i][j] = A[i-1][j-1]+k;
-          // then it is a good order.
-          if (AR->getLoop() == InnerLoop) {
-            // We found an InnerLoop induction after OuterLoop induction. It is
-            // a good order.
-            FoundInnerInduction = true;
-            if (FoundOuterInduction) {
-              GoodOrder++;
-              break;
-            }
-          }
-          // If we find the outer induction after an inner induction e.g.
-          // for(int i=0;i<N;i++)
-          //   for(int j=0;j<N;j++)
-          //     A[j][i] = A[j-1][i-1]+k;
-          // then it is a bad order.
-          if (AR->getLoop() == OuterLoop) {
-            // We found an OuterLoop induction after InnerLoop induction. It is
-            // a bad order.
-            FoundOuterInduction = true;
-            if (FoundInnerInduction) {
-              BadOrder++;
-              break;
-            }
-          }
-        }
-      }
-    }
+/// \brief Estimates how many bytes of each fetched cache line are useful when
+/// \p Addr is accessed on consecutive iterations of \p L. The result is at
+/// most \p CacheLineSize; a larger value means better spatial locality.
+static unsigned getBytesInCache(Loop *L, ScalarEvolution *SE, Value *Addr,
+                                unsigned CacheLineSize) {
+  const Type *AddrTy = Addr->getType();
+  assert(AddrTy->isPointerTy() && "Addr must be a pointer type");
+  const SCEV *Expr = SE->getSCEVAtScope(Addr, L);
+
+  // An address that does not change in L keeps hitting the same cache line.
+  if (SE->isLoopInvariant(Expr, L))
+    return CacheLineSize;
+
+  // Bytes touched by a single element access, capped at one cache line.
+  const unsigned ValueSizeBytes =
+      std::min(CacheLineSize,
+               AddrTy->getPointerElementType()->getScalarSizeInBits() / 8);
+
+  // For complex address calculations, assume only a single element fits in a
+  // cache line.
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr);
+  if (!AR || !AR->isAffine())
+    return ValueSizeBytes;
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+  if (!C)
+    return ValueSizeBytes;
+
+  // For constant steps, count the elements that land in one cache line; a zero
+  // or larger-than-line step falls back to a single element.
+  const uint64_t Step = C->getValue()->getValue().abs().getZExtValue();
+  if (Step == 0 || Step > CacheLineSize)
+    return ValueSizeBytes;
+  const unsigned ElementsPerLine = static_cast<unsigned>(CacheLineSize / Step);
+  return std::min(CacheLineSize, ElementsPerLine * ValueSizeBytes);
+}
+
+/// \brief Sums, over every load and store in the inner loop, the number of
+/// bytes that fit into a cache line when the access is iterated by \p Loop.
+unsigned LoopInterchangeProfitability::getInstrOrderCost(Loop *Loop) {
+  unsigned TotalBytes = 0;
+  for (BasicBlock *Block : InnerLoop->blocks()) {
+    for (Instruction &I : *Block) {
+      // Only loads and stores are considered; pick up the pointer they
+      // dereference and leave it null for every other instruction.
+      Value *Ptr = nullptr;
+      if (auto *Load = dyn_cast<LoadInst>(&I))
+        Ptr = Load->getPointerOperand();
+      else if (auto *Store = dyn_cast<StoreInst>(&I))
+        Ptr = Store->getPointerOperand();
+
+      // Accumulate the per-access estimate for the loop being evaluated.
+      if (Ptr)
+        TotalBytes += getBytesInCache(Loop, SE, Ptr, CacheLineSize);
+    }
   }
-  return GoodOrder - BadOrder;
+  return TotalBytes;
 }
 
 static bool isProfitableForVectorization(unsigned InnerLoopId,
@@ -998,9 +1014,11 @@
   // This is rough cost estimation algorithm. It counts the good and bad order
   // of induction variables in the instruction and allows reordering if number
   // of bad orders is more than good.
-  int Cost = getInstrOrderCost();
-  DEBUG(dbgs() << "Cost = " << Cost << "\n");
-  if (Cost < -LoopInterchangeCostThreshold)
+  int OriginalCost = getInstrOrderCost(InnerLoop);
+  int InterchangedCost = getInstrOrderCost(OuterLoop);
+  DEBUG(dbgs() << "OriginalCost = " << OriginalCost << " InterchangedCost "
+               << InterchangedCost << "\n");
+  if ((InterchangedCost - OriginalCost) > LoopInterchangeCostThreshold)
     return true;
 
   // It is not profitable as per current cache profitability model. But check if
Index: test/Transforms/LoopInterchange/interchange-single-comp.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopInterchange/interchange-single-comp.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -loop-interchange -S | FileCheck %s
+
+;;--------------------------------------Test case 01------------------------------------
+;; It is profitable to interchange these loops: once i is the inner induction
+;; variable, both A[j*25+i] and B[n+i] access memory sequentially.
+;;  for(int i=0;i<25;i++)
+;;    for(int j=0;j<n;j++)
+;;      A[j*25+i] += B[n+i]
+
+define void @foo(i32* %A, i32* %B, i64 %n) {
+entry:
+  br label %outer.preheader
+
+outer.preheader:                           ; preds = %entry, %outer.inc8_crit_edge.us
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.inc8_crit_edge.us ]
+  %0 = add i64 %i, %n                      ; index n + i into B
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i64 %0
+  br label %inner.body
+
+inner.body:                                     ; preds = %outer.preheader, %inner.body
+  %j = phi i64 [ %j.next, %inner.body ], [ 0, %outer.preheader ]
+  %1 = load i32, i32* %arrayidx.us, align 4
+  %2 = mul i64 %j, 25
+  %3 = add i64 %2, %i                      ; index j*25 + i into A
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i64 %3
+  %4 = load i32, i32* %arrayidx6.us, align 4
+  %add7.us = add i32 %4, %1
+  store i32 %add7.us, i32* %arrayidx6.us, align 4
+  %j.next = add i64 %j, 1
+  %exitcond = icmp ne i64 %j.next, %n      ; inner loop runs until j.next == n
+  br i1 %exitcond, label %inner.body, label %outer.inc8_crit_edge.us
+
+outer.inc8_crit_edge.us:                  ; preds = %inner.body
+  %i.next = add i64 %i, 1
+  %exitcond29 = icmp ne i64 %i.next, 25    ; outer loop runs until i.next == 25
+  br i1 %exitcond29, label %outer.preheader, label %for.end10.loopexit
+
+for.end10.loopexit:                               ; preds = %outer.inc8_crit_edge.us
+  br label %for.end10
+
+for.end10:                                        ; preds = %for.end10.loopexit
+  ret void
+}
+
+; CHECK: split
Index: test/Transforms/LoopInterchange/profitability.ll
===================================================================
--- test/Transforms/LoopInterchange/profitability.ll
+++ test/Transforms/LoopInterchange/profitability.ll
@@ -203,3 +203,51 @@
 ; CHECK:    %3 = load i32, i32* %arrayidx6
 ; CHECK:    %arrayidx10 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv34, i64 %indvars.iv
 ; CHECK:    %4 = load i32, i32* %arrayidx10
+
+;;---------------------------------------Test case 04---------------------------------
+;; Interchanging the loops is not profitable, because the outer loop increments
+;; the induction variable by 6, so only a single element will be in the cache.
+;;   for(int i=1;i<N;i+=6)
+;;     for(int j=1;j<N;j++)
+;;       A[j][i] = A[j - 1][i] + B[j][i];
+
+define void @interchange_04(i32 %N) {
+entry:
+  %cmp27 = icmp sgt i32 %N, 1              ; skip both loops unless N > 1
+  br i1 %cmp27, label %for.cond1.preheader.lr.ph, label %for.end16
+
+for.cond1.preheader.lr.ph:
+  %0 = add i32 %N, -1                      ; N - 1, compared against by both exit tests
+  br label %for.body3.preheader
+
+for.body3.preheader:                       ; outer loop header (i)
+  %indvars.iv30 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next31, %for.inc14 ]
+  br label %for.body3
+
+for.body3:                                 ; inner loop body (j)
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.body3.preheader ]
+  %1 = add nsw i64 %indvars.iv, -1         ; j - 1
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %1, i64 %indvars.iv30 ; A[j-1][i]
+  %2 = load i32, i32* %arrayidx5
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv30 ; B[j][i]
+  %3 = load i32, i32* %arrayidx9
+  %add = add nsw i32 %3, %2
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv30 ; A[j][i]
+  store i32 %add, i32* %arrayidx13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %0
+  br i1 %exitcond, label %for.inc14, label %for.body3
+
+for.inc14:                                 ; outer loop latch
+  %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 6 ; i += 6
+  %lftr.wideiv32 = trunc i64 %indvars.iv30 to i32
+  %exitcond33 = icmp eq i32 %lftr.wideiv32, %0
+  br i1 %exitcond33, label %for.end16, label %for.body3.preheader
+
+for.end16:
+  ret void
+}
+;; Check part of the output to verify that the loops are not interchanged.
+; CHECK-LABEL: @interchange_04
+; CHECK-NOT: split