Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -339,7 +339,13 @@ bool currentLimitations(); - bool hasInnerLoopReduction() { return InnerLoopHasReduction; } + const SmallPtrSet &getInnerReductions() const { + return InnerReductions; + } + + const SmallPtrSet &getOuterInnerReductions() const { + return OuterInnerReductions; + } private: bool tightlyNested(Loop *Outer, Loop *Inner); @@ -348,7 +354,7 @@ bool containsUnsafeInstructionsInLatch(BasicBlock *BB); bool findInductionAndReductions(Loop *L, SmallVector &Inductions, - SmallPtrSet &Reductions); + Loop *InnerLoop); Loop *OuterLoop; Loop *InnerLoop; @@ -359,6 +365,9 @@ OptimizationRemarkEmitter *ORE; bool InnerLoopHasReduction = false; + + SmallPtrSet InnerReductions; + SmallPtrSet OuterInnerReductions; }; /// LoopInterchangeProfitability checks if it is profitable to interchange the @@ -392,10 +401,10 @@ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, BasicBlock *LoopNestExit, - bool InnerLoopContainsReductions) + const LoopInterchangeLegality &LIL) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - LoopExit(LoopNestExit), - InnerLoopHasReduction(InnerLoopContainsReductions) {} + LoopExit(LoopNestExit), InnerReductions(LIL.getInnerReductions()), + OuterInnerReductions(LIL.getOuterInnerReductions()) {} /// Interchange OuterLoop and InnerLoop. bool transform(); @@ -420,7 +429,8 @@ LoopInfo *LI; DominatorTree *DT; BasicBlock *LoopExit; - bool InnerLoopHasReduction; + SmallPtrSet InnerReductions; + SmallPtrSet OuterInnerReductions; }; // Main LoopInterchange Pass. @@ -570,8 +580,8 @@ << "Loop interchanged with enclosing loop."; }); - LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, - LoopNestExit, LIL.hasInnerLoopReduction()); + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, + LIL); LIT.transform(); LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); LoopsInterchanged++; @@ -673,9 +683,36 @@ return true; } +// If SV is a LCSSA PHI node with a single incoming value, return the incoming +// value. +static Value *followLCSSA(Value *SV) { + PHINode *PHI = dyn_cast(SV); + if (!PHI) + return SV; + + if (PHI->getNumIncomingValues() != 1) + return SV; + return followLCSSA(PHI->getIncomingValue(0)); +} + +// Check V's users to see if it is involved in a reduction in L. +static PHINode *findInnerReductionPhi(Loop *L, Value *V) { + for (Value *User : V->users()) { + if (PHINode *PHI = dyn_cast(User)) { + if (PHI->getNumIncomingValues() == 1) + continue; + RecurrenceDescriptor RD; + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + return PHI; + return nullptr; + } + } + + return nullptr; +} + bool LoopInterchangeLegality::findInductionAndReductions( - Loop *L, SmallVector &Inductions, - SmallPtrSet &Reductions) { + Loop *L, SmallVector &Inductions, Loop *InnerLoop) { if (!L->getLoopLatch() || !L->getLoopPredecessor()) return false; for (PHINode &PHI : L->getHeader()->phis()) { @@ -683,7 +720,9 @@ InductionDescriptor ID; if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) Inductions.push_back(&PHI); - else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD)) { + else if (!InnerLoop && RecurrenceDescriptor::isReductionPHI(&PHI, L, RD)) { + if (OuterInnerReductions.find(&PHI) != OuterInnerReductions.end()) + return true; // Detect inner loop reductions across promoted values. We allow reduction // PHIs in the inner loop, if they have an incoming value, that is loaded // in the preheader and the result of the reduction is stored to the same @@ -721,7 +760,22 @@ if (!LI || !SI || SI->getOperand(1) != LI->getOperand(0)) return false; - Reductions.insert(&PHI); + InnerReductions.insert(&PHI); + } else if (InnerLoop && PHI.getNumIncomingValues() == 2) { + // Check if we have a PHI node in the outer loop that has a reduction + // result from the inner loop as an incoming value. + Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); + PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V); + if (!InnerRedPhi || + !llvm::any_of(InnerRedPhi->incoming_values(), + [&PHI](Value *V) { return V == &PHI; })) { + LLVM_DEBUG( + dbgs() + << "Failed to recognize PHI as an induction or reduction.\n"); + return false; + } + OuterInnerReductions.insert(&PHI); + OuterInnerReductions.insert(InnerRedPhi); } else { LLVM_DEBUG( dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); @@ -775,7 +829,37 @@ PHINode *InnerInductionVar; SmallVector Inductions; SmallPtrSet Reductions; - if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { + + if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { + LLVM_DEBUG( + dbgs() << "Only outer loops with induction or reduction PHI nodes " + << "are supported currently.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with induction or reduction PHI nodes can be" + " interchanged currently."; + }); + return true; + } + + // TODO: Currently we handle only loops with 1 induction variable. + if (Inductions.size() != 1) { + LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " + << "supported currently.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with 1 induction variable can be " + "interchanged currently."; + }); + return true; + } + + Inductions.clear(); + if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) { LLVM_DEBUG( dbgs() << "Only inner loops with induction or reduction PHI nodes on" " promoted values are supported currently.\n"); @@ -803,52 +887,7 @@ }); return true; } - if (Reductions.size() > 0) - InnerLoopHasReduction = true; - InnerInductionVar = Inductions.pop_back_val(); - Reductions.clear(); - if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { - LLVM_DEBUG( - dbgs() << "Only outer loops with induction or reduction PHI nodes " - << "are supported currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with induction or reduction PHI nodes can be" - " interchanged currently."; - }); - return true; - } - - // Outer loop cannot have reduction because then loops will not be tightly - // nested. - if (!Reductions.empty()) { - LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported " - << "currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Outer loops with reductions cannot be interchangeed " - "currently."; - }); - return true; - } - // TODO: Currently we handle only loops with 1 induction variable. - if (Inductions.size() != 1) { - LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " - << "supported currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with 1 induction variable can be " - "interchanged currently."; - }); - return true; - } // TODO: Triangular loops are not handled for now. if (!isLoopStructureUnderstood(InnerInductionVar)) { @@ -1497,11 +1536,17 @@ PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); // Move the PHI nodes from the inner loop header to the outer loop header. - // We have to deal with one kind of PHI nodes: - // 1) PHI nodes that are part of inner loop-only reductions. - // We only have to move the PHI node and update the incoming blocks. + // We have to deal with two kinds of PHI nodes: + // 1) PHI nodes that are part of a reduction in the outer loop + // 2) PHI nodes that are part of inner loop-only reductions. + // For 1) we only have to move the PHI node and update the incoming blocks. + // For 2) we can replace the PHI node with the value coming from outside the for (PHINode *PHI : InnerLoopPHIs) { PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); + if (OuterInnerReductions.find(PHI) != OuterInnerReductions.end()) + continue; + assert(InnerReductions.find(PHI) != InnerReductions.end() && + "Unexpected PHI node!"); for (BasicBlock *InBB : PHI->blocks()) { if (InnerLoop->contains(InBB)) continue; Index: test/Transforms/LoopInterchange/inner-only-reductions.ll =================================================================== --- test/Transforms/LoopInterchange/inner-only-reductions.ll +++ test/Transforms/LoopInterchange/inner-only-reductions.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ -; RUN: -verify-dom-info -verify-loop-info 2>&1 | FileCheck -check-prefix=IR %s +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s ; RUN: FileCheck --input-file=%t %s @A = common global [500 x [500 x i32]] zeroinitializer @@ -12,6 +12,7 @@ ;; X+=A[j][i]; ;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi. + ; CHECK: --- !Passed ; CHECK-NEXT: Pass: loop-interchange ; CHECK-NEXT: Name: Interchanged @@ -61,7 +62,7 @@ ;; Y+=B[k][i]; ;; } -; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi. +;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi. ; CHECK: --- !Passed ; CHECK-NEXT: Pass: loop-interchange Index: test/Transforms/LoopInterchange/lcssa.ll =================================================================== --- test/Transforms/LoopInterchange/lcssa.ll +++ test/Transforms/LoopInterchange/lcssa.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t -; RUN: cat %t | FileCheck --check-prefix REMARK %s +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t -S +; RUN: FileCheck --input-file %t --check-prefix REMARK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll =================================================================== --- /dev/null +++ test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -0,0 +1,150 @@ +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 | FileCheck %s +; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; REMARKS: --- !Passed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: Interchanged +; REMARKS-NEXT: Function: test1 + +define i64 @test1([100 x [100 x i64]]* %Arr) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR2_PREHEADER:%.*]] +; CHECK: for1.header.preheader: +; CHECK-NEXT: br label [[FOR1_HEADER:%.*]] +; CHECK: for1.header: +; CHECK-NEXT: [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR1_INC:%.*]] ], [ 0, [[FOR1_HEADER_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SUM_INNER:%.*]] = phi i64 [ [[SUM_INC:%.*]], [[FOR1_INC]] ], [ [[SUM_OUTER:%.*]], [[FOR1_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR2_SPLIT1:%.*]] +; CHECK: for2.preheader: +; CHECK-NEXT: br label [[FOR2:%.*]] +; CHECK: for2: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR2_SPLIT:%.*]] ], [ 0, [[FOR2_PREHEADER]] ] +; CHECK-NEXT: [[SUM_OUTER]] = phi i64 [ [[SUM_INC_LCSSA:%.*]], [[FOR2_SPLIT]] ], [ 0, [[FOR2_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR1_HEADER_PREHEADER]] +; CHECK: for2.split1: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* [[ARR:%.*]], i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV23]] +; CHECK-NEXT: [[LV:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SUM_INC]] = add i64 [[SUM_INNER]], [[LV]] +; CHECK-NEXT: br label [[FOR1_INC]] +; CHECK: for2.split: +; CHECK-NEXT: [[SUM_INC_LCSSA]] = phi i64 [ [[SUM_INC]], %for1.inc ] +; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXIT1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 100 +; CHECK-NEXT: br i1 [[EXIT1]], label [[FOR1_LOOPEXIT:%.*]], label [[FOR2]] +; CHECK: for1.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1 +; CHECK-NEXT: [[EXIT2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT24]], 100 +; CHECK-NEXT: br i1 [[EXIT2]], label [[FOR2_SPLIT]], label [[FOR1_HEADER]] +; CHECK: for1.loopexit: +; CHECK-NEXT: [[SUM_INC_LCSSA2:%.*]] = phi i64 [ [[SUM_INC_LCSSA]], [[FOR2_SPLIT]] ] +; CHECK-NEXT: ret i64 [[SUM_INC_LCSSA2]] +; +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %sum.inner = phi i64 [ %sum.outer, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %sum.inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + +; In this test case, the inner reduction PHI %inner does not involve the outer +; reduction PHI %sum.outer, do not interchange. +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test2 + +define i64 @test2([100 x [100 x i64]]* %Arr) { +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %inner = phi i64 [ %indvars.iv23, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + +; Check that we do not interchange if there is an additional instruction +; between the outer and inner reduction PHIs. +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test3 + +define i64 @test3([100 x [100 x i64]]* %Arr) { +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + %so = add i64 %sum.outer, 10 + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %sum.inner = phi i64 [ %so, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %sum.inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +}