Index: llvm/trunk/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopInterchange.cpp @@ -339,10 +339,21 @@ bool currentLimitations(); + const SmallPtrSetImpl &getOuterInnerReductions() const { + return OuterInnerReductions; + } + private: bool tightlyNested(Loop *Outer, Loop *Inner); bool containsUnsafeInstructions(BasicBlock *BB); - bool findInductions(Loop *L, SmallVector &Inductions); + + /// Discover induction and reduction PHIs in the header of \p L. Induction + /// PHIs are added to \p Inductions, reductions are added to + /// OuterInnerReductions. When the outer loop is passed, the inner loop needs + /// to be passed as \p InnerLoop. + bool findInductionAndReductions(Loop *L, + SmallVector &Inductions, + Loop *InnerLoop); Loop *OuterLoop; Loop *InnerLoop; @@ -352,6 +363,9 @@ /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; + /// Set of reduction PHIs taking part of a reduction across the inner and + /// outer loop. + SmallPtrSet OuterInnerReductions; }; /// LoopInterchangeProfitability checks if it is profitable to interchange the @@ -384,9 +398,10 @@ public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, - BasicBlock *LoopNestExit) + BasicBlock *LoopNestExit, + const LoopInterchangeLegality &LIL) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - LoopExit(LoopNestExit) {} + LoopExit(LoopNestExit), LIL(LIL) {} /// Interchange OuterLoop and InnerLoop. bool transform(); @@ -411,6 +426,8 @@ LoopInfo *LI; DominatorTree *DT; BasicBlock *LoopExit; + + const LoopInterchangeLegality &LIL; }; // Main LoopInterchange Pass. @@ -560,8 +577,8 @@ << "Loop interchanged with enclosing loop."; }); - LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, - LoopNestExit); + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, + LIL); LIT.transform(); LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); LoopsInterchanged++; @@ -633,8 +650,36 @@ return true; } -bool LoopInterchangeLegality::findInductions( - Loop *L, SmallVector &Inductions) { +// If SV is a LCSSA PHI node with a single incoming value, return the incoming +// value. +static Value *followLCSSA(Value *SV) { + PHINode *PHI = dyn_cast(SV); + if (!PHI) + return SV; + + if (PHI->getNumIncomingValues() != 1) + return SV; + return followLCSSA(PHI->getIncomingValue(0)); +} + +// Check V's users to see if it is involved in a reduction in L. +static PHINode *findInnerReductionPhi(Loop *L, Value *V) { + for (Value *User : V->users()) { + if (PHINode *PHI = dyn_cast(User)) { + if (PHI->getNumIncomingValues() == 1) + continue; + RecurrenceDescriptor RD; + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + return PHI; + return nullptr; + } + } + + return nullptr; +} + +bool LoopInterchangeLegality::findInductionAndReductions( + Loop *L, SmallVector &Inductions, Loop *InnerLoop) { if (!L->getLoopLatch() || !L->getLoopPredecessor()) return false; for (PHINode &PHI : L->getHeader()->phis()) { @@ -643,8 +688,32 @@ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) Inductions.push_back(&PHI); else { - LLVM_DEBUG(dbgs() << "Failed to recognize PHI as an induction.\n"); - return false; + // PHIs in inner loops need to be part of a reduction in the outer loop, + // discovered when checking the PHIs of the outer loop earlier. + if (!InnerLoop) { + if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) { + LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions " + "across the outer loop.\n"); + return false; + } + } else { + assert(PHI.getNumIncomingValues() == 2 && + "Phis in loop header should have exactly 2 incoming values"); + // Check if we have a PHI node in the outer loop that has a reduction + // result from the inner loop as an incoming value. + Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); + PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V); + if (!InnerRedPhi || + !llvm::any_of(InnerRedPhi->incoming_values(), + [&PHI](Value *V) { return V == &PHI; })) { + LLVM_DEBUG( + dbgs() + << "Failed to recognize PHI as an induction or reduction.\n"); + return false; + } + OuterInnerReductions.insert(&PHI); + OuterInnerReductions.insert(InnerRedPhi); + } } } return true; @@ -693,63 +762,64 @@ PHINode *InnerInductionVar; SmallVector Inductions; - if (!findInductions(InnerLoop, Inductions)) { + if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { LLVM_DEBUG( - dbgs() << "Only inner loops with induction or reduction PHI nodes " + dbgs() << "Only outer loops with induction or reduction PHI nodes " << "are supported currently.\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with induction or reduction PHI nodes can be" - " interchange currently."; + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with induction or reduction PHI nodes can be" + " interchanged currently."; }); return true; } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { - LLVM_DEBUG( - dbgs() << "We currently only support loops with 1 induction variable." - << "Failed to interchange due to current limitation\n"); + LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " + << "supported currently.\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with 1 induction variable can be " + return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with 1 induction variable can be " "interchanged currently."; }); return true; } - InnerInductionVar = Inductions.pop_back_val(); - if (!findInductions(OuterLoop, Inductions)) { + Inductions.clear(); + if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) { LLVM_DEBUG( - dbgs() << "Only outer loops with induction or reduction PHI nodes " + dbgs() << "Only inner loops with induction or reduction PHI nodes " << "are supported currently.\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with induction or reduction PHI nodes can be" - " interchanged currently."; + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with induction or reduction PHI nodes can be" + " interchange currently."; }); return true; } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { - LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " - << "supported currently.\n"); + LLVM_DEBUG( + dbgs() << "We currently only support loops with 1 induction variable." + << "Failed to interchange due to current limitation\n"); ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with 1 induction variable can be " + return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with 1 induction variable can be " "interchanged currently."; }); return true; } + InnerInductionVar = Inductions.pop_back_val(); // TODO: Triangular loops are not handled for now. if (!isLoopStructureUnderstood(InnerInductionVar)) { @@ -1387,13 +1457,29 @@ // replaced by Inners'. updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); - // Make sure we have no other PHIs. - auto InnerPhis = drop_begin(InnerLoopHeader->phis(), 1); - auto OuterPhis = drop_begin(OuterLoopHeader->phis(), 1); - (void) InnerPhis; - (void) OuterPhis; - assert(begin(InnerPhis) == end(InnerPhis) && "Unexpected PHIs in inner loop"); - assert(begin(OuterPhis) == end(OuterPhis) && "Unexpected PHis in outer loop"); + // Now update the reduction PHIs in the inner and outer loop headers. + SmallVector InnerLoopPHIs, OuterLoopPHIs; + for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1)) + InnerLoopPHIs.push_back(cast(&PHI)); + for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1)) + OuterLoopPHIs.push_back(cast(&PHI)); + + auto &OuterInnerReductions = LIL.getOuterInnerReductions(); + (void)OuterInnerReductions; + + // Now move the remaining reduction PHIs from outer to inner loop header and + // vice versa. The PHI nodes must be part of a reduction across the inner and + // outer loop and all the remains to do is and updating the incoming blocks. + for (PHINode *PHI : OuterLoopPHIs) { + PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); + assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() && + "Expected a reduction PHI node"); + } + for (PHINode *PHI : InnerLoopPHIs) { + PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); + assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() && + "Expected a reduction PHI node"); + } // Update the incoming blocks for moved PHI nodes. updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader); Index: llvm/trunk/test/Transforms/LoopInterchange/inner-only-reductions.ll =================================================================== --- llvm/trunk/test/Transforms/LoopInterchange/inner-only-reductions.ll +++ llvm/trunk/test/Transforms/LoopInterchange/inner-only-reductions.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ -; RUN: -verify-dom-info -verify-loop-info 2>&1 | FileCheck -check-prefix=IR %s +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s ; RUN: FileCheck --input-file=%t %s ; Inner loop only reductions are not supported currently. See discussion at Index: llvm/trunk/test/Transforms/LoopInterchange/lcssa.ll =================================================================== --- llvm/trunk/test/Transforms/LoopInterchange/lcssa.ll +++ llvm/trunk/test/Transforms/LoopInterchange/lcssa.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t -; RUN: cat %t | FileCheck --check-prefix REMARK %s +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t -S +; RUN: FileCheck --input-file %t --check-prefix REMARK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: llvm/trunk/test/Transforms/LoopInterchange/outer-only-reductions.ll =================================================================== --- llvm/trunk/test/Transforms/LoopInterchange/outer-only-reductions.ll +++ llvm/trunk/test/Transforms/LoopInterchange/outer-only-reductions.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s +; RUN: FileCheck --input-file=%t %s + +; Outer loop only reductions are not supported currently. + +@A = common global [500 x [500 x i32]] zeroinitializer + +;; global X + +;; for( int i=1;i&1 | FileCheck %s +; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; REMARKS: --- !Passed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: Interchanged +; REMARKS-NEXT: Function: test1 + +define i64 @test1([100 x [100 x i64]]* %Arr) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR2_PREHEADER:%.*]] +; CHECK: for1.header.preheader: +; CHECK-NEXT: br label [[FOR1_HEADER:%.*]] +; CHECK: for1.header: +; CHECK-NEXT: [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR1_INC:%.*]] ], [ 0, [[FOR1_HEADER_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SUM_INNER:%.*]] = phi i64 [ [[SUM_INC:%.*]], [[FOR1_INC]] ], [ [[SUM_OUTER:%.*]], [[FOR1_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR2_SPLIT1:%.*]] +; CHECK: for2.preheader: +; CHECK-NEXT: br label [[FOR2:%.*]] +; CHECK: for2: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR2_SPLIT:%.*]] ], [ 0, [[FOR2_PREHEADER]] ] +; CHECK-NEXT: [[SUM_OUTER]] = phi i64 [ [[SUM_INC_LCSSA:%.*]], [[FOR2_SPLIT]] ], [ 0, [[FOR2_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR1_HEADER_PREHEADER]] +; CHECK: for2.split1: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* [[ARR:%.*]], i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV23]] +; CHECK-NEXT: [[LV:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SUM_INC]] = add i64 [[SUM_INNER]], [[LV]] +; CHECK-NEXT: br label [[FOR1_INC]] +; CHECK: for2.split: +; CHECK-NEXT: [[SUM_INC_LCSSA]] = phi i64 [ [[SUM_INC]], %for1.inc ] +; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXIT1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 100 +; CHECK-NEXT: br i1 [[EXIT1]], label [[FOR1_LOOPEXIT:%.*]], label [[FOR2]] +; CHECK: for1.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1 +; CHECK-NEXT: [[EXIT2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT24]], 100 +; CHECK-NEXT: br i1 [[EXIT2]], label [[FOR2_SPLIT]], label [[FOR1_HEADER]] +; CHECK: for1.loopexit: +; CHECK-NEXT: [[SUM_INC_LCSSA2:%.*]] = phi i64 [ [[SUM_INC_LCSSA]], [[FOR2_SPLIT]] ] +; CHECK-NEXT: ret i64 [[SUM_INC_LCSSA2]] +; +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %sum.inner = phi i64 [ %sum.outer, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %sum.inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + +; In this test case, the inner reduction PHI %inner does not involve the outer +; reduction PHI %sum.outer, do not interchange. +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test2 + +define i64 @test2([100 x [100 x i64]]* %Arr) { +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %inner = phi i64 [ %indvars.iv23, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + +; Check that we do not interchange if there is an additional instruction +; between the outer and inner reduction PHIs. +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test3 + +define i64 @test3([100 x [100 x i64]]* %Arr) { +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + %so = add i64 %sum.outer, 10 + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %sum.inner = phi i64 [ %so, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %sum.inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +}