Index: lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- lib/Transforms/Scalar/LoopInterchange.cpp
+++ lib/Transforms/Scalar/LoopInterchange.cpp
@@ -347,7 +347,8 @@
   bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
   bool findInductionAndReductions(Loop *L,
                                   SmallVector<PHINode *, 8> &Inductions,
-                                  SmallVector<PHINode *, 8> &Reductions);
+                                  SmallVector<PHINode *, 8> &Reductions,
+                                  Loop *InnerLoop);
 
   Loop *OuterLoop;
   Loop *InnerLoop;
@@ -703,9 +704,39 @@
   return true;
 }
 
+static Value *followLCSSA(Value *SV) {
+  PHINode *PHI = dyn_cast<PHINode>(SV);
+  if (!PHI)
+    return SV;
+
+  if (PHI->getNumIncomingValues() != 1)
+    return SV;
+  return followLCSSA(PHI->getIncomingValue(0));
+}
+
+/// \brief Checks if V is part of a reduction in L by following its operands
+/// and trying to find an reduction PHI in L.
+static bool isPartOfReduction(Loop *L, Value *V,
+                              SmallPtrSetImpl<Value *> &Visited) {
+  if (!Visited.insert(V).second)
+    return false;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I || !L->contains(I->getParent()))
+    return false;
+
+  if (PHINode *PHI = dyn_cast<PHINode>(I)) {
+    RecurrenceDescriptor RD;
+    return RecurrenceDescriptor::isReductionPHI(PHI, L, RD);
+  }
+  return llvm::any_of(I->operands(), [L, &Visited](Value *Op) {
+    return isPartOfReduction(L, Op, Visited);
+  });
+}
+
 bool LoopInterchangeLegality::findInductionAndReductions(
     Loop *L, SmallVector<PHINode *, 8> &Inductions,
-    SmallVector<PHINode *, 8> &Reductions) {
+    SmallVector<PHINode *, 8> &Reductions, Loop *InnerLoop) {
   if (!L->getLoopLatch() || !L->getLoopPredecessor())
     return false;
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
@@ -716,7 +747,15 @@
       Inductions.push_back(PHI);
     else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
       Reductions.push_back(PHI);
-    else {
+    else if (InnerLoop && PHI->getNumIncomingValues() == 2) {
+      // Check if we have a PHI node in the outer loop that has a reduction
+      // result from the inner loop as an incoming value.
+      Value *V = followLCSSA(PHI->getIncomingValueForBlock(L->getLoopLatch()));
+      SmallPtrSet<Value *, 10> Visited;
+      if (!isPartOfReduction(InnerLoop, V, Visited))
+        return false;
+      Reductions.push_back(PHI);
+    } else {
       DEBUG(
           dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
       return false;
@@ -766,7 +805,7 @@
   PHINode *InnerInductionVar;
   SmallVector<PHINode *, 8> Inductions;
   SmallVector<PHINode *, 8> Reductions;
-  if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
+  if (!findInductionAndReductions(InnerLoop, Inductions, Reductions, nullptr)) {
     DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
     ORE->emit([&]() {
@@ -797,7 +836,8 @@
 
   InnerInductionVar = Inductions.pop_back_val();
   Reductions.clear();
-  if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
+  if (!findInductionAndReductions(OuterLoop, Inductions, Reductions,
+                                  InnerLoop)) {
     DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
     ORE->emit([&]() {
@@ -810,20 +850,6 @@
     return true;
   }
 
-  // Outer loop cannot have reduction because then loops will not be tightly
-  // nested.
-  if (!Reductions.empty()) {
-    DEBUG(dbgs() << "Outer loops with reductions are not supported "
-                 << "currently.\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
-                                      OuterLoop->getStartLoc(),
-                                      OuterLoop->getHeader())
-             << "Outer loops with reductions cannot be interchangeed "
-                "currently.";
-    });
-    return true;
-  }
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
@@ -1199,8 +1225,7 @@
     else
       InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
 
-    // Ensure that InductionPHI is the first Phi node as required by
-    // splitInnerLoopHeader
+    // Ensure that InductionPHI is the first Phi node.
     if (&InductionPHI->getParent()->front() != InductionPHI)
       InductionPHI->moveBefore(&InductionPHI->getParent()->front());
 
@@ -1211,8 +1236,9 @@
     DEBUG(dbgs() << "splitInnerLoopLatch done\n");
 
     // Splits the inner loops phi nodes out into a separate basic block.
-    splitInnerLoopHeader();
-    DEBUG(dbgs() << "splitInnerLoopHeader done\n");
+    BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+    SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+    DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
   }
 
   Transformed |= adjustLoopLinks();
@@ -1231,31 +1257,6 @@
   InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
 }
 
-void LoopInterchangeTransform::splitInnerLoopHeader() {
-  // Split the inner loop header out. Here make sure that the reduction PHI's
-  // stay in the innerloop body.
-  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
-  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
-  SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
-  if (InnerLoopHasReduction) {
-    // Adjust Reduction PHI's in the block. The induction PHI must be the first
-    // PHI in InnerLoopHeader for this to work.
-    SmallVector<PHINode *, 8> PHIVec;
-    for (auto I = std::next(InnerLoopHeader->begin()); isa<PHINode>(I); ++I) {
-      PHINode *PHI = dyn_cast<PHINode>(I);
-      Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
-      PHI->replaceAllUsesWith(V);
-      PHIVec.push_back((PHI));
-    }
-    for (PHINode *P : PHIVec) {
-      P->eraseFromParent();
-    }
-  }
-
-  DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
-                  "InnerLoopHeader\n");
-}
-
 /// \brief Move all instructions except the terminator from FromBB right before
 /// InsertBefore
 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
@@ -1385,6 +1386,42 @@
     OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
   }
 
+  // Now update the reduction PHIs in the inner and outer loop headers.
+  SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
+  for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
+    InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+  for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
+    OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+
+  for (PHINode *PHI : OuterLoopPHIs)
+    PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
+
+  // Move the PHI nodes from the inner loop header to the outer loop header.
+  // We have to deal with 2 different kind of PHI nodes:
+  //  1) PHI nodes that are part of a reduction in the outer loop
+  //  2) PHI nodes that are part of inner loop-only reductions.
+  //  For 1) we only have to move the PHI node and update the incoming blocks.
+  //  For 2) we can replace the PHI node with the value coming from outside the
+  //  inner loop.
+  for (PHINode *PHI : InnerLoopPHIs) {
+    PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
+    for (BasicBlock *InBB : PHI->blocks()) {
+      if (InnerLoop->contains(InBB) ||
+          isa<PHINode>(PHI->getIncomingValueForBlock(InBB)))
+        continue;
+
+      PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB));
+      PHI->eraseFromParent();
+      break;
+    }
+  }
+
+  // Update the incoming blocks for moved PHI nodes.
+  updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
+  updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch);
+  updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader);
+  updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch);
+
   return true;
 }
 
Index: test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@@ -0,0 +1,64 @@
+; RUN: opt < %s -basicaa -loop-interchange -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @test(i32** nocapture readonly %Arr, i32 %k) local_unnamed_addr #0 {
+entry:
+  br label %for.body
+
+
+for.body:                                         ; preds = %for.cond.cleanup3, %entry
+  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.cond.cleanup3 ]
+  %sum.021 = phi i32 [ 0, %entry ], [ %add7.lcssa, %for.cond.cleanup3 ]
+  br label %for.body4
+
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next.3, %for.body4 ]
+  %sum.119 = phi i32 [ %sum.021, %for.body ], [ %add7, %for.body4 ]
+  %arrayidx = getelementptr inbounds i32*, i32** %Arr, i64 %indvars.iv
+  %0 = load i32*, i32** %arrayidx, align 8
+  %arrayidx6 = getelementptr inbounds i32, i32* %0, i64 %indvars.iv23
+  %1 = load i32, i32* %arrayidx6, align 4
+  %add = add i32 %sum.119, %k
+  %add7 = add i32 %add, %1
+  %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 1024
+  br i1 %exitcond.3, label %for.cond.cleanup3, label %for.body4
+
+for.cond.cleanup3:                                ; preds = %for.body4
+  %add7.lcssa = phi i32 [ %add7, %for.body4 ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %exitcond25 = icmp eq i64 %indvars.iv.next24, 1024
+  br i1 %exitcond25, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3
+  %add7.lcssa.lcssa = phi i32 [ %add7.lcssa, %for.cond.cleanup3 ]
+  ret i32 %add7.lcssa.lcssa
+
+
+}
+
+; CHECK-LABEL: @test
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br label %for.body4
+
+; CHECK-LABEL: for.body:                                         ; preds = %for.body4, %for.body
+; CHECK: %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.body ], [ 0, %for.body4 ]
+; CHECK: %sum.119 = phi i32 [ %add7, %for.body ], [ %sum.021, %for.body4 ]
+; CHECK: br i1 %exitcond25, label %for.body4.split, label %for.body
+
+; CHECK-LABEL: for.body4:
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next.3, %for.body4.split ], [ 0, %entry ]
+; CHECK: %sum.021 = phi i32 [ %add7, %for.body4.split ], [ 0, %entry ]
+; CHECK: br label %for.body
+
+; CHECK-LABEL: for.body4.split:                                  ; preds = %for.body
+; CHECK: %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 1024
+; CHECK: br i1 %exitcond.3, label %for.cond.cleanup, label %for.body4
+
+; CHECK-LABEL: for.cond.cleanup:                                 ; preds = %for.body4.split
+; CHECK: %add7.lcssa.lcssa = phi i32 [ %add7, %for.body4.split ]
+; CHECK: ret i32 %add7.lcssa.lcssa