Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -241,8 +242,8 @@ class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - OptimizationRemarkEmitter *ORE) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} + AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), AA(AA), ORE(ORE) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -266,7 +267,59 @@ return InnerLoopInductions; } + bool containsSinkablePHIs() const { return !SinkablePHIs.empty(); } + + void removeSinkablePHIs(); + private: + /// Structure to track sinkable PHIs. These PHIs can be removed by + /// sinking the loads and hoisting the stores from the outer to the + /// inner loops. For eg, consider the loop structure below: + /// + /// inner_loop.preheader: + /// %X.promoted = load i32, i32* @X ---> (1) + /// br label %inner_loop.header + /// + /// inner_loop.header: + /// %phi_val = phi i32 [ %X.promoted, %inner_loop.preheader ], + /// [ %var, %inner_loop.header ] ---> (2) + /// .... + /// %var = .... + /// .... + /// br i1 %exitcond, label %inner_loop.exit, label %inner_loop.header + /// + /// inner_loop.exit: + /// %add.lcssa = phi i32 [ %var, %inner_loop.header ] ---> (3) + /// store i32 %add.lcssa, i32* @X ---> (4) + /// + /// (1) Load - Sinkable Load + /// (2) LoopHeaderPHI - Inner Loop Header PHI with one value coming from the + /// Load Instruction & other from the LICM promoted var + /// (3) LoopExitPHI - Inner Loop Exit LCSSA PHI for LICM promoted value + /// (4) Store - Hoistable store of the LCSSA PHI value + struct PHIUseChain { + LoadInst *Load; + PHINode *LoopHeaderPHI; + PHINode *LoopExitPHI; + StoreInst *Store; + }; + + /// Check if SinkablePHIs contain \p I. The removable instruction can be + /// any of: + /// 1. Sinkable Load. + /// 2. Inner Loop Header PHI with one value coming from the Load Instruction + /// and other from the LICM promoted variable. + /// 3. Inner Loop Exit LCSSA PHI for LICM promoted value if it has just one + /// user (hoistable store). + /// 4. Hoistable store of the LCSSA PHI value. + bool isRemovableInst(const Instruction *I) const { + return any_of(SinkablePHIs, [I](const auto PHIChain) { + return PHIChain.Load == I || PHIChain.LoopHeaderPHI == I || + PHIChain.Store == I || + (PHIChain.LoopExitPHI == I && PHIChain.LoopExitPHI->hasOneUser()); + }); + } + bool tightlyNested(Loop *Outer, Loop *Inner); bool containsUnsafeInstructions(BasicBlock *BB); @@ -278,11 +331,46 @@ SmallVector &Inductions, Loop *InnerLoop); + bool areInnerLoopExitPHIsSupported(); + + /// Populate SinkablePHIs with all sinkable PHIs in the header of + /// \p InnerLoop. Currently we try to match LICM promoted variables in + /// the InnerLoop. We look for below structure: + /// + /// inner_loop.preheader: + /// %X.promoted = load i32, i32* @X + /// br label %inner_loop.header + /// + /// inner_loop.header: + /// %phi_val = phi i32 [ %X.promoted, %inner_loop.preheader ], + /// [ %var, %inner_loop.header ] + /// .... + /// %var = .... + /// .... + /// br i1 %exitcond, label %inner_loop.exit, label %inner_loop.header + /// + /// inner_loop.exit: + /// %add.lcssa = phi i32 [ %var, %inner_loop.header ] + /// store i32 %add.lcssa, i32* @X + /// + /// It's legal to move the load of X into the inner loop if there is a + /// correspoding store to the same location in the inner loop exit block + /// and the store value is same as the PHI value coming from the loop (%var). + /// By sinking the load and hoisting the store we can remove %add.lcssa & + /// %phi_val. + void findSinkablePHIs(); + + // Look for conflicts between \p Store and Instructions in the set: + // {\p L, from Load to \p L's preheader terminator, \p L's Exit till \p Store} + bool hasMemoryConflicts(Loop *L, LoadInst *Load, StoreInst *Store); + Loop *OuterLoop; Loop *InnerLoop; ScalarEvolution *SE; + AliasAnalysis *AA; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; @@ -292,6 +380,9 @@ /// Set of inner loop induction PHIs SmallVector InnerLoopInductions; + + // Set of Sinkable PHI chains + SmallVector SinkablePHIs; }; /// LoopInterchangeProfitability checks if it is profitable to interchange the @@ -333,7 +424,7 @@ public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, - const LoopInterchangeLegality &LIL) + LoopInterchangeLegality &LIL) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {} /// Interchange OuterLoop and InnerLoop. @@ -356,7 +447,7 @@ LoopInfo *LI; DominatorTree *DT; - const LoopInterchangeLegality &LIL; + LoopInterchangeLegality &LIL; }; struct LoopInterchange { @@ -364,15 +455,17 @@ LoopInfo *LI = nullptr; DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; + AliasAnalysis *AA = nullptr; std::unique_ptr CC = nullptr; /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, - DominatorTree *DT, std::unique_ptr &CC, + DominatorTree *DT, AliasAnalysis *AA, + std::unique_ptr &CC, OptimizationRemarkEmitter *ORE) - : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} + : SE(SE), LI(LI), DI(DI), DT(DT), AA(AA), CC(std::move(CC)), ORE(ORE) {} bool run(Loop *L) { if (L->getParentLoop()) @@ -506,7 +599,7 @@ const DenseMap &CostMap) { LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); - LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, AA, ORE); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); return false; @@ -539,8 +632,9 @@ } // end anonymous namespace bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) { - return any_of(*BB, [](const Instruction &I) { - return I.mayHaveSideEffects() || I.mayReadFromMemory(); + return any_of(*BB, [this](const Instruction &I) { + return !isRemovableInst(&I) && + (I.mayHaveSideEffects() || I.mayReadFromMemory()); }); } @@ -722,7 +816,6 @@ return nullptr; } } - return nullptr; } @@ -738,7 +831,7 @@ // PHIs in inner loops need to be part of a reduction in the outer loop, // discovered when checking the PHIs of the outer loop earlier. if (!InnerLoop) { - if (!OuterInnerReductions.count(&PHI)) { + if (!OuterInnerReductions.count(&PHI) && !isRemovableInst(&PHI)) { LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions " "across the outer loop.\n"); return false; @@ -765,6 +858,126 @@ return true; } +bool LoopInterchangeLegality::hasMemoryConflicts(Loop *L, LoadInst *Load, + StoreInst *Store) { + assert(Load->getParent() == L->getLoopPreheader() && + "Expecting Load to be in the loop preheader."); + assert(Store->getParent() == L->getExitBlock() && + "Expecting Store to be in the loop exit block."); + + auto conflicts = [this](BasicBlock::iterator Begin, BasicBlock::iterator End, + StoreInst *SI) { + for (auto I = Begin; I != End; I++) + // We need to check only for Load/Store instructions, as we abort + // LoopInterchange in the presence of other non load/store instructions + // that may read/write to memory as the loops don't remain tightly nested. + if ((isa(I) || isa(I)) && + !AA->isNoAlias(MemoryLocation::get(SI), MemoryLocation::get(&*I))) + return true; + return false; + }; + + // Check for conflicts between: + // 1. Store and instructions from Load to L's preheader terminator. + // 2. Store and instructions from L's ExitBlock to Store. + if (conflicts(std::next(Load->getIterator()), + L->getLoopPreheader()->getTerminator()->getIterator(), Store) || + conflicts(L->getExitBlock()->getFirstNonPHI()->getIterator(), + Store->getIterator(), Store)) + return true; + // Check for conflicts between store and all instructions in L. + for (BasicBlock *BB : L->blocks()) { + if (conflicts(BB->getFirstNonPHI()->getIterator(), + BB->getTerminator()->getIterator(), Store)) + return true; + } + return false; +} + +void LoopInterchangeLegality::findSinkablePHIs() { + // Check if the PHI is part of a LICM promoted variable. + auto isSinkablePHI = [this](Loop *L, PHINode *PHI) { + assert(PHI->getParent() == L->getHeader() && + "Expecting PHI to be part of the Loop Header"); + assert(PHI->getNumIncomingValues() == 2 && + "Phis in loop header should have exactly 2 incoming values"); + // Loop simplify guarantees that the PHIs 2 incoming values will be + // from LoopPreheader and LoopLatch. + Value *PromotedVarValue = PHI->getIncomingValueForBlock(L->getLoopLatch()); + Value *LoadValue = PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (!isa(LoadValue)) + return false; + LoadInst *Load = dyn_cast(LoadValue); + // Load should only be used by the PHI node. + assert(Load->isSimple() && "Expecting a simple load!"); + if (!Load->hasOneUser()) + return false; + + for (Value *PromotedVarUser : PromotedVarValue->users()) { + if (PromotedVarUser == PHI) + continue; + // If the variable was LICM promoted then the PHI should only + // have one incoming value from the loop. + PHINode *LCSSAPHI = dyn_cast(PromotedVarUser); + if (!LCSSAPHI) + continue; + + // Check for store in the LoopExitBlock which stores the LCSSA Phi value + // to the same address as the Load instruction. + StoreInst *Store = nullptr; + for (Value *LCSSAPHIUser : LCSSAPHI->users()) { + // Ignore uses of LCSSAPHIUser in other BasicBlocks + if (isa(LCSSAPHIUser)) + continue; + if (!isa(LCSSAPHIUser)) + return false; + Store = dyn_cast(LCSSAPHIUser); + assert(Store->isSimple() && "Expecting a simple store!"); + if (Load->getPointerOperand() == Store->getPointerOperand()) { + LLVM_DEBUG( + dbgs() << "Can demote PHI by sinking load and hoisting store\n"; + PHI->dump()); + // Make sure no other store instruction exists between the Load and + // the store that write to the same address: + // For eg: hoisting the 2nd store to inside the inner loop would be + // incorrect. + // Load X + // Innerloop + // Store X 4 + // Store X PromotedVar + if (hasMemoryConflicts(L, Load, Store)) + return false; + SinkablePHIs.push_back({Load, PHI, LCSSAPHI, Store}); + return true; + } + } + } + return false; + }; + + for (PHINode &PHI : InnerLoop->getHeader()->phis()) + isSinkablePHI(InnerLoop, &PHI); +} + +void LoopInterchangeLegality::removeSinkablePHIs() { + RecurrenceDescriptor RD; + for (const PHIUseChain &LICMChain : SinkablePHIs) { + LoadInst *Load = LICMChain.Load; + PHINode *LoopHeaderPHI = LICMChain.LoopHeaderPHI; + PHINode *LoopExitPHI = LICMChain.LoopExitPHI; + StoreInst *Store = LICMChain.Store; + + Load->moveBefore(&*InnerLoop->getHeader()->getFirstInsertionPt()); + LoopHeaderPHI->replaceAllUsesWith(Load); + Store->setOperand(0, LoopExitPHI->getIncomingValue(0)); + Store->moveBefore(&*InnerLoop->getLoopLatch()->getTerminator()); + LoopHeaderPHI->eraseFromParent(); + if (LoopExitPHI->user_empty()) + LoopExitPHI->eraseFromParent(); + } + SinkablePHIs.clear(); +} + // This function indicates the current limitations in the transform as a result // of which we do not proceed. bool LoopInterchangeLegality::currentLimitations() { @@ -789,6 +1002,10 @@ return true; } + // Find all the sinkable PHIs in the inner-loop header by finding LICM + // promoted variables. + findSinkablePHIs(); + SmallVector Inductions; if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { LLVM_DEBUG( @@ -853,20 +1070,23 @@ } // We currently only support LCSSA PHI nodes in the inner loop exit, if their -// users are either reduction PHIs or PHIs outside the outer loop (which means -// the we are only interested in the final value after the loop). -static bool -areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, - SmallPtrSetImpl &Reductions) { - BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); +// users are either: +// 1. Reduction PHIs. +// 2. PHIs outside the outer loop (which means that we are only interested in +// the final value after the loop). +// 3. One of the removable instructions from the SinkablePHIs structure. +bool LoopInterchangeLegality::areInnerLoopExitPHIsSupported() { + BasicBlock *InnerExit = InnerLoop->getUniqueExitBlock(); for (PHINode &PHI : InnerExit->phis()) { // Reduction lcssa phi will have only 1 incoming block that from loop latch. if (PHI.getNumIncomingValues() > 1) return false; - if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { - PHINode *PN = dyn_cast(U); - return !PN || - (!Reductions.count(PN) && OuterL->contains(PN->getParent())); + if (any_of(PHI.users(), [this](User *U) { + PHINode *PN = dyn_cast(U); // false + Instruction *UI = dyn_cast(U); + return (!PN || (!OuterInnerReductions.count(PN) && + OuterLoop->contains(PN->getParent()))) && + !isRemovableInst(UI); })) { return false; } @@ -1012,8 +1232,7 @@ return false; } - if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, - OuterInnerReductions)) { + if (!areInnerLoopExitPHIsSupported()) { LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", @@ -1285,6 +1504,11 @@ bool LoopInterchangeTransform::transform() { bool Transformed = false; + if (LIL.containsSinkablePHIs()) { + LLVM_DEBUG(dbgs() << "Removing sinkable PHIs.\n"); + LIL.removeSinkablePHIs(); + } + if (InnerLoop->getSubLoops().empty()) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); @@ -1718,7 +1942,7 @@ std::unique_ptr CC = CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); OptimizationRemarkEmitter ORE(&F); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR.AA, CC, &ORE).run(LN)) return PreservedAnalyses::all(); U.markLoopNestChanged(true); return getLoopPassPreservedAnalyses(); Index: llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll +++ llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll @@ -2,27 +2,23 @@ ; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s ; RUN: FileCheck --input-file=%t %s -; Inner loop only reductions are not supported currently. See discussion at -; D53027 for more information on the required checks. - @A = common global [500 x [500 x i32]] zeroinitializer @X = common global i32 0 @B = common global [500 x [500 x i32]] zeroinitializer @Y = common global i32 0 -;; global X - ;; for( int i=1;i