Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -348,7 +348,7 @@ bool containsUnsafeInstructionsInLatch(BasicBlock *BB); bool findInductionAndReductions(Loop *L, SmallVector &Inductions, - SmallVector &Reductions); + SmallPtrSet &Reductions); Loop *OuterLoop; Loop *InnerLoop; @@ -675,7 +675,7 @@ bool LoopInterchangeLegality::findInductionAndReductions( Loop *L, SmallVector &Inductions, - SmallVector &Reductions) { + SmallPtrSet &Reductions) { if (!L->getLoopLatch() || !L->getLoopPredecessor()) return false; for (PHINode &PHI : L->getHeader()->phis()) { @@ -683,9 +683,46 @@ InductionDescriptor ID; if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) Inductions.push_back(&PHI); - else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD)) - Reductions.push_back(&PHI); - else { + else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD)) { + // Detect inner loop reductions across promoted values. We allow reduction + // PHIs in the inner loop, if they have an incoming value, that is loaded + // in the preheader and the result of the reduction is stored to the same + // location in the exit block. Other uses of the reduction result outside + // the outer loop are fine. We do not allow other stores in the inner + // exit block, so there is no need to check for stores overwriting the + // reduction result. + int PHInd = PHI.getBasicBlockIndex(L->getLoopPreheader()); + int IncInd = PHI.getBasicBlockIndex(L->getLoopLatch()); + if (PHInd == -1 || IncInd == -1) + return false; + + auto *LI = dyn_cast(PHI.getIncomingValue(PHInd)); + Value *RedV = PHI.getIncomingValue(IncInd); + StoreInst *SI = nullptr; + + // Check users of the reduction value. + for (auto *U : RedV->users()) { + if (&*U == &PHI) + continue; + if (isa(&*U)) { + for (auto *U2 : U->users()) { + if (!OuterLoop->contains(cast(&*U)->getParent())) + continue; + + if ((SI = dyn_cast(&*U2)) && + SI->getParent() == L->getExitBlock()) + continue; + SI = nullptr; + } + } + } + + // Need both a load and a store to the same address. + if (!LI || !SI || SI->getOperand(1) != LI->getOperand(0)) + return false; + + Reductions.insert(&PHI); + } else { LLVM_DEBUG( dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); return false; @@ -737,11 +774,11 @@ PHINode *InnerInductionVar; SmallVector Inductions; - SmallVector Reductions; + SmallPtrSet Reductions; if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { LLVM_DEBUG( - dbgs() << "Only inner loops with induction or reduction PHI nodes " - << "are supported currently.\n"); + dbgs() << "Only inner loops with induction or reduction PHI nodes on" + " promoted values are supported currently.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", InnerLoop->getStartLoc(), Index: test/Transforms/LoopInterchange/inner-only-reductions.ll =================================================================== --- /dev/null +++ test/Transforms/LoopInterchange/inner-only-reductions.ll @@ -0,0 +1,438 @@ +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ +; RUN: -verify-dom-info -verify-loop-info 2>&1 | FileCheck -check-prefix=IR %s +; RUN: FileCheck --input-file=%t %s + +@A = common global [500 x [500 x i32]] zeroinitializer +@X = common global i32 0 +@B = common global [500 x [500 x i32]] zeroinitializer +@Y = common global i32 0 + +;; for( int i=1;i&1 | FileCheck %s - -@A = common global [500 x [500 x i32]] zeroinitializer -@X = common global i32 0 -@B = common global [500 x [500 x i32]] zeroinitializer -@Y = common global i32 0 - -;; for( int i=1;i