Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -347,7 +347,8 @@ bool containsUnsafeInstructionsInLatch(BasicBlock *BB); bool findInductionAndReductions(Loop *L, SmallVector<PHINode *, 8> &Inductions, - SmallVector<PHINode *, 8> &Reductions); + SmallVector<PHINode *, 8> &Reductions, + Loop *InnerLoop); Loop *OuterLoop; Loop *InnerLoop; @@ -703,9 +704,39 @@ return true; } +static Value *followLCSSA(Value *SV) { + PHINode *PHI = dyn_cast<PHINode>(SV); + if (!PHI) + return SV; + + if (PHI->getNumIncomingValues() != 1) + return SV; + return followLCSSA(PHI->getIncomingValue(0)); +} + +/// \brief Checks if V is part of a reduction in L by following its operands +/// and trying to find an reduction PHI in L. +static bool isPartOfReduction(Loop *L, Value *V, + SmallPtrSetImpl<Value *> &Visited) { + if (!Visited.insert(V).second) + return false; + + Instruction *I = dyn_cast<Instruction>(V); + if (!I || !L->contains(I->getParent())) + return false; + + if (PHINode *PHI = dyn_cast<PHINode>(I)) { + RecurrenceDescriptor RD; + return RecurrenceDescriptor::isReductionPHI(PHI, L, RD); + } + return llvm::any_of(I->operands(), [L, &Visited](Value *Op) { + return isPartOfReduction(L, Op, Visited); + }); +} + bool LoopInterchangeLegality::findInductionAndReductions( Loop *L, SmallVector<PHINode *, 8> &Inductions, - SmallVector<PHINode *, 8> &Reductions) { + SmallVector<PHINode *, 8> &Reductions, Loop *InnerLoop) { if (!L->getLoopLatch() || !L->getLoopPredecessor()) return false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { @@ -716,7 +747,15 @@ Inductions.push_back(PHI); else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) Reductions.push_back(PHI); - else { + else if (InnerLoop && PHI->getNumIncomingValues() == 2) { + // Check if we have a PHI node in the outer loop that has a reduction + // result from the inner loop as an incoming value. + Value *V = followLCSSA(PHI->getIncomingValueForBlock(L->getLoopLatch())); + SmallPtrSet<Value *, 10> Visited; + if (!isPartOfReduction(InnerLoop, V, Visited)) + return false; + Reductions.push_back(PHI); + } else { DEBUG( dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); return false; @@ -766,7 +805,7 @@ PHINode *InnerInductionVar; SmallVector<PHINode *, 8> Inductions; SmallVector<PHINode *, 8> Reductions; - if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { + if (!findInductionAndReductions(InnerLoop, Inductions, Reductions, nullptr)) { DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " << "are supported currently.\n"); ORE->emit([&]() { @@ -797,7 +836,8 @@ InnerInductionVar = Inductions.pop_back_val(); Reductions.clear(); - if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { + if (!findInductionAndReductions(OuterLoop, Inductions, Reductions, + InnerLoop)) { DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " << "are supported currently.\n"); ORE->emit([&]() { @@ -810,20 +850,6 @@ return true; } - // Outer loop cannot have reduction because then loops will not be tightly - // nested. - if (!Reductions.empty()) { - DEBUG(dbgs() << "Outer loops with reductions are not supported " - << "currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Outer loops with reductions cannot be interchangeed " - "currently."; - }); - return true; - } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { DEBUG(dbgs() << "Loops with more than 1 induction variables are not " @@ -1199,8 +1225,7 @@ else InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); - // Ensure that InductionPHI is the first Phi node as required by - // splitInnerLoopHeader + // Ensure that InductionPHI is the first Phi node. if (&InductionPHI->getParent()->front() != InductionPHI) InductionPHI->moveBefore(&InductionPHI->getParent()->front()); @@ -1211,8 +1236,9 @@ DEBUG(dbgs() << "splitInnerLoopLatch done\n"); // Splits the inner loops phi nodes out into a separate basic block. - splitInnerLoopHeader(); - DEBUG(dbgs() << "splitInnerLoopHeader done\n"); + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); + DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); } Transformed |= adjustLoopLinks(); @@ -1231,31 +1257,6 @@ InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); } -void LoopInterchangeTransform::splitInnerLoopHeader() { - // Split the inner loop header out. Here make sure that the reduction PHI's - // stay in the innerloop body. - BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); - BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); - if (InnerLoopHasReduction) { - // Adjust Reduction PHI's in the block. The induction PHI must be the first - // PHI in InnerLoopHeader for this to work. - SmallVector<PHINode *, 8> PHIVec; - for (auto I = std::next(InnerLoopHeader->begin()); isa<PHINode>(I); ++I) { - PHINode *PHI = dyn_cast<PHINode>(I); - Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader); - PHI->replaceAllUsesWith(V); - PHIVec.push_back((PHI)); - } - for (PHINode *P : PHIVec) { - P->eraseFromParent(); - } - } - - DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " - "InnerLoopHeader\n"); -} - /// \brief Move all instructions except the terminator from FromBB right before /// InsertBefore static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { @@ -1385,6 +1386,42 @@ OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch); } + // Now update the reduction PHIs in the inner and outer loop headers. + SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; + for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1)) + InnerLoopPHIs.push_back(cast<PHINode>(&PHI)); + for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1)) + OuterLoopPHIs.push_back(cast<PHINode>(&PHI)); + + for (PHINode *PHI : OuterLoopPHIs) + PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); + + // Move the PHI nodes from the inner loop header to the outer loop header. + // We have to deal with 2 different kind of PHI nodes: + // 1) PHI nodes that are part of a reduction in the outer loop + // 2) PHI nodes that are part of inner loop-only reductions. + // For 1) we only have to move the PHI node and update the incoming blocks. + // For 2) we can replace the PHI node with the value coming from outside the + // inner loop. + for (PHINode *PHI : InnerLoopPHIs) { + PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); + for (BasicBlock *InBB : PHI->blocks()) { + if (InnerLoop->contains(InBB) || + isa<PHINode>(PHI->getIncomingValueForBlock(InBB))) + continue; + + PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB)); + PHI->eraseFromParent(); + break; + } + } + + // Update the incoming blocks for moved PHI nodes. + updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader); + updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch); + updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader); + updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch); + return true; } Index: test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll =================================================================== --- /dev/null +++ test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -0,0 +1,64 @@ +; RUN: opt < %s -basicaa -loop-interchange -simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @test(i32** nocapture readonly %Arr, i32 %k) local_unnamed_addr #0 { +entry: + br label %for.body + + +for.body: ; preds = %for.cond.cleanup3, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.cond.cleanup3 ] + %sum.021 = phi i32 [ 0, %entry ], [ %add7.lcssa, %for.cond.cleanup3 ] + br label %for.body4 + + +for.body4: ; preds = %for.body4, %for.body + %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next.3, %for.body4 ] + %sum.119 = phi i32 [ %sum.021, %for.body ], [ %add7, %for.body4 ] + %arrayidx = getelementptr inbounds i32*, i32** %Arr, i64 %indvars.iv + %0 = load i32*, i32** %arrayidx, align 8 + %arrayidx6 = getelementptr inbounds i32, i32* %0, i64 %indvars.iv23 + %1 = load i32, i32* %arrayidx6, align 4 + %add = add i32 %sum.119, %k + %add7 = add i32 %add, %1 + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 1024 + br i1 %exitcond.3, label %for.cond.cleanup3, label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %add7.lcssa = phi i32 [ %add7, %for.body4 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exitcond25 = icmp eq i64 %indvars.iv.next24, 1024 + br i1 %exitcond25, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + %add7.lcssa.lcssa = phi i32 [ %add7.lcssa, %for.cond.cleanup3 ] + ret i32 %add7.lcssa.lcssa + + +} + +; CHECK-LABEL: @test +; CHECK-LABEL: entry: +; CHECK-NEXT: br label %for.body4 + +; CHECK-LABEL: for.body: ; preds = %for.body4, %for.body +; CHECK: %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.body ], [ 0, %for.body4 ] +; CHECK: %sum.119 = phi i32 [ %add7, %for.body ], [ %sum.021, %for.body4 ] +; CHECK: br i1 %exitcond25, label %for.body4.split, label %for.body + +; CHECK-LABEL: for.body4: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next.3, %for.body4.split ], [ 0, %entry ] +; CHECK: %sum.021 = phi i32 [ %add7, %for.body4.split ], [ 0, %entry ] +; CHECK: br label %for.body + +; CHECK-LABEL: for.body4.split: ; preds = %for.body +; CHECK: %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %exitcond.3 = icmp eq i64 %indvars.iv.next.3, 1024 +; CHECK: br i1 %exitcond.3, label %for.cond.cleanup, label %for.body4 + +; CHECK-LABEL: for.cond.cleanup: ; preds = %for.body4.split +; CHECK: %add7.lcssa.lcssa = phi i32 [ %add7, %for.body4.split ] +; CHECK: ret i32 %add7.lcssa.lcssa