Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -411,8 +411,6 @@ bool adjustLoopLinks(); void adjustLoopPreheaders(); bool adjustLoopBranches(); - void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, - BasicBlock *NewPred); Loop *OuterLoop; Loop *InnerLoop; @@ -455,6 +453,7 @@ AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); + AU.addPreservedID(LCSSAID); } bool runOnFunction(Function &F) override { @@ -1297,9 +1296,8 @@ FromBB->getTerminator()->getIterator()); } -void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock, - BasicBlock *OldPred, - BasicBlock *NewPred) { +static void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, + BasicBlock *NewPred) { for (PHINode &PHI : CurrBlock->phis()) { unsigned Num = PHI.getNumIncomingValues(); for (unsigned i = 0; i < Num; ++i) { @@ -1330,6 +1328,53 @@ } } +// Move Lcssa PHIs to the right place. +static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerLatch, + BasicBlock *OuterLatch) { + SmallVector LcssaInnerExit; + for (PHINode &P : InnerExit->phis()) + LcssaInnerExit.push_back(&P); + + SmallVector LcssaInnerLatch; + for (PHINode &P : InnerLatch->phis()) + LcssaInnerLatch.push_back(&P); + + // Lcssa PHIs for values used outside the inner loop are in InnerExit. + // If a PHI node has users outside of InnerExit, it has a use outside the + // interchanged loop and we have to preserve it. We move these to + // InnerLatch, which will become the new exit block for the innermost + // loop after interchanging. For PHIs only used in InnerExit, we can just + // replace them with the incoming value. + for (PHINode *P : LcssaInnerExit) { + bool hasUsersOutside = false; + auto UI = P->use_begin(), E = P->use_end(); + for (; UI != E;) { + Use &U = *UI; + ++UI; + auto *Usr = dyn_cast(U.getUser()); + if (Usr && Usr->getParent() != InnerExit) { + hasUsersOutside = true; + continue; + } + U.set(P->getIncomingValueForBlock(InnerLatch)); + } + if (hasUsersOutside) + P->moveBefore(InnerLatch->getFirstNonPHI()); + else + P->eraseFromParent(); + } + + // If the inner loop latch contains LCSSA PHIs, those come from a child loop + // and we have to move them to the new inner latch. + for (PHINode *P : LcssaInnerLatch) + P->moveBefore(InnerExit->getFirstNonPHI()); + + // Now adjust the incoming blocks for the LCSSA PHIs. + // For PHIs moved from Inner's exit block, we need to replace Inner's latch + // with the new latch. + updateIncomingBlock(InnerLatch, InnerLatch, OuterLatch); +} + bool LoopInterchangeTransform::adjustLoopBranches() { LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n"); std::vector DTUpdates; @@ -1409,17 +1454,6 @@ updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, InnerLoopLatchSuccessor, DTUpdates); - // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with - // the value and remove this PHI node from inner loop. - SmallVector LcssaVec; - for (PHINode &P : InnerLoopLatchSuccessor->phis()) - LcssaVec.push_back(&P); - - for (PHINode *P : LcssaVec) { - Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch); - P->replaceAllUsesWith(Incoming); - P->eraseFromParent(); - } if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); @@ -1431,12 +1465,15 @@ updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, DTUpdates); - updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); - DT->applyUpdates(DTUpdates); restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader, OuterLoopPreHeader); + moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopLatch, OuterLoopLatch); + // For PHIs in the exit block of the outer loop, outer's latch has been + // replaced by Inners'. + updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); + // Now update the reduction PHIs in the inner and outer loop headers. SmallVector InnerLoopPHIs, OuterLoopPHIs; for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1)) Index: test/Transforms/LoopInterchange/interchangeable.ll =================================================================== --- test/Transforms/LoopInterchange/interchangeable.ll +++ test/Transforms/LoopInterchange/interchangeable.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: test/Transforms/LoopInterchange/lcssa.ll =================================================================== --- test/Transforms/LoopInterchange/lcssa.ll +++ test/Transforms/LoopInterchange/lcssa.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t ; RUN: cat %t | FileCheck --check-prefix REMARK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -169,3 +169,123 @@ for.end16: ret void } + +; PHI node in inner latch with multiple predecessors. +; REMARK: Interchanged +; REMARK-NEXT: lcssa_05 + +define void @lcssa_05(i32* %ptr){ +entry: + br label %outer.header + +outer.header: + %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] + br label %for.body3 + +for.body3: + %iv.inner = phi i64 [ %iv.inner.next, %bb3 ], [ 1, %outer.header ] + br i1 undef, label %bb2, label %bb3 + +bb2: + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer + %vC = load i32, i32* %arrayidx9 + %add = add nsw i32 %vA, %vC + br label %bb3 + +bb3: + %addp = phi i32 [ %add, %bb2 ], [ 0, %for.body3 ] + store i32 %addp, i32* %ptr + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: + store i64 %iv.inner, i64 * @Y + br label %for.end16 + +for.end16: + ret void +} + +; REMARK: UnsupportedExitPHI +; REMARK-NEXT: lcssa_06 + +define void @lcssa_06(i64* %ptr, i32* %ptr1){ +entry: + br label %outer.header + +outer.header: + %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] + br i1 undef, label %for.body3, label %outer.inc + +for.body3: + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer + %vC = load i32, i32* %arrayidx9 + %add = add nsw i32 %vA, %vC + store i32 %add, i32* %ptr1 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: + %sv = phi i64 [ 0, %outer.header ], [ 1, %for.body3 ] + store i64 %sv, i64* %ptr + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: + store i64 %sv, i64 * @Y + br label %for.end16 + +for.end16: + ret void +} + +; REMARK: Interchanged +; REMARK-NEXT: lcssa_07 +define void @lcssa_07(){ +entry: + br label %outer.header + +outer.header: + %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] + br label %for.body3 + +for.body3: + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer + %vC = load i32, i32* %arrayidx9 + %add = add nsw i32 %vA, %vC + store i32 %add, i32* %arrayidx5 + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.bb, label %for.body3 + +outer.bb: + br label %outer.inc + +outer.inc: + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: + store i64 %iv.inner, i64 * @Y + br label %for.end16 + +for.end16: + ret void +} Index: test/Transforms/LoopInterchange/phi-ordering.ll =================================================================== --- test/Transforms/LoopInterchange/phi-ordering.ll +++ test/Transforms/LoopInterchange/phi-ordering.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S 2>&1 | FileCheck %s ;; Checks the order of the inner phi nodes does not cause havoc. ;; The inner loop has a reduction into c. The IV is not the first phi. Index: test/Transforms/LoopInterchange/reductions.ll =================================================================== --- test/Transforms/LoopInterchange/reductions.ll +++ test/Transforms/LoopInterchange/reductions.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -S -debug 2>&1 | FileCheck %s +; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S -debug 2>&1 | FileCheck %s @A = common global [500 x [500 x i32]] zeroinitializer @X = common global i32 0 @@ -101,6 +101,8 @@ br i1 %exitcond43, label %for.end19, label %for.cond4.preheader.preheader for.end19: ; preds = %for.inc17, %entry + %res1 = phi i32 [ 0, %entry], [ %add, %for.inc17 ] + store i32 %res1, i32* @X ret void }