diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -110,7 +110,7 @@ Loop **EpilogueLoop = nullptr); bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, - DependenceInfo &DI); + DependenceInfo &DI, LoopInfo &LI); bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -282,21 +282,6 @@ ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, DependenceInfo &DI, OptimizationRemarkEmitter &ORE, int OptLevel) { - // Quick checks of the correct loop form - if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1) - return LoopUnrollResult::Unmodified; - Loop *SubLoop = L->getSubLoops()[0]; - if (!SubLoop->isLoopSimplifyForm()) - return LoopUnrollResult::Unmodified; - - BasicBlock *Latch = L->getLoopLatch(); - BasicBlock *Exit = L->getExitingBlock(); - BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); - BasicBlock *SubLoopExit = SubLoop->getExitingBlock(); - - if (Latch != Exit || SubLoopLatch != SubLoopExit) - return LoopUnrollResult::Unmodified; - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None, None, None, None, None, None, None, None); @@ -326,7 +311,7 @@ return LoopUnrollResult::Unmodified; } - if (!isSafeToUnrollAndJam(L, SE, DT, DI)) { + if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) { LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n"); return LoopUnrollResult::Unmodified; } @@ -337,6 +322,7 @@ bool Convergent; SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + Loop *SubLoop = L->getSubLoops()[0]; unsigned InnerLoopSize = ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable, Convergent, TTI, EphValues, UP.BEInsns); @@ -374,6 +360,8 @@ SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue()); // Find trip count and trip multiple + BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch); unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch); unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch); @@ -438,11 +426,11 @@ int OptLevel) { bool DidSomething = false; - // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA. - // Since simplification may add new inner loops, it has to run before the - // legality and profitability checks. This means running the loop unroll and jam pass - // will simplify all loops, regardless of whether anything end up being - // unroll and jammed. + // The loop unroll and jam pass requires loops to be in simplified form, and + // also needs LCSSA. Since simplification may add new inner loops, it has to + // run before the legality and profitability checks. This means running the + // loop unroll and jam pass will simplify all loops, regardless of whether + // anything end up being unroll and jammed. for (auto &L : LI) { DidSomething |= simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -69,17 +69,14 @@ // Partition blocks in an outer/inner loop pair into blocks before and after // the loop -static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, - BasicBlockSet &ForeBlocks, - BasicBlockSet &SubLoopBlocks, - BasicBlockSet &AftBlocks, - DominatorTree *DT) { +static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks, + BasicBlockSet &AftBlocks, DominatorTree &DT) { + Loop *SubLoop = L.getSubLoops()[0]; BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); - SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); - for (BasicBlock *BB : L->blocks()) { + for (BasicBlock *BB : L.blocks()) { if (!SubLoop->contains(BB)) { - if (DT->dominates(SubLoopLatch, BB)) + if (DT.dominates(SubLoopLatch, BB)) AftBlocks.insert(BB); else ForeBlocks.insert(BB); @@ -93,14 +90,42 @@ if (BB == SubLoopPreHeader) continue; Instruction *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (!ForeBlocks.count(TI->getSuccessor(i))) + for (BasicBlock *Succ : successors(TI)) + if (!ForeBlocks.count(Succ)) return false; } return true; } +static bool partitionOuterLoopBlocks( + Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks, + DenseMap &ForeBlocksMap, + DenseMap &AftBlocksMap, DominatorTree &DT) { + JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end()); + + for (Loop *L : Root.getLoopsInPreorder()) { + if (L == &JamLoop) + break; + + if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT)) + return false; + } + + return true; +} + +/// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more +/// than 2 levels loop. +static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, + BasicBlockSet &ForeBlocks, + BasicBlockSet &SubLoopBlocks, + BasicBlockSet &AftBlocks, + DominatorTree *DT) { + SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); + return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT); +} + // Looks at the phi nodes in Header for values coming from Latch. For these // instructions and all their operands calls Visit on them, keeping going for // all the operands in AftBlocks. Returns false if Visit returns false, @@ -642,7 +667,7 @@ static bool checkDependencies(SmallVector &Earlier, SmallVector &Later, - unsigned LoopDepth, bool InnerLoop, + unsigned LoopDepth, Loop *CurLoop, DependenceInfo &DI) { // Use DA to check for dependencies between loads and stores that make unroll // and jam invalid @@ -670,7 +695,7 @@ << " " << *Dst << "\n"); return false; } - if (!InnerLoop) { + if (!CurLoop) { if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) { LLVM_DEBUG(dbgs() << " > dependency between:\n" << " " << *Src << "\n" @@ -678,13 +703,19 @@ return false; } } else { - assert(LoopDepth + 1 <= D->getLevels()); - if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT && - D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) { - LLVM_DEBUG(dbgs() << " < > dependency between:\n" - << " " << *Src << "\n" - << " " << *Dst << "\n"); - return false; + assert(CurLoop->getLoopDepth() <= D->getLevels()); + if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) { + for (unsigned CurLoopDepth = LoopDepth + 1; + CurLoopDepth <= CurLoop->getLoopDepth(); ++CurLoopDepth) { + if (D->getDirection(CurLoopDepth) & Dependence::DVEntry::LT) { + LLVM_DEBUG(dbgs() << " < > dependency between:\n" + << " " << *Src << "\n" + << " " << *Dst << "\n"); + return false; + } + if (D->getDirection(CurLoopDepth) & Dependence::DVEntry::GT) + break; + } } } } @@ -693,31 +724,89 @@ return true; } -static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks, - BasicBlockSet &SubLoopBlocks, - BasicBlockSet &AftBlocks, DependenceInfo &DI) { - // Get all loads/store pairs for each blocks - SmallVector ForeMemInstr; - SmallVector SubLoopMemInstr; - SmallVector AftMemInstr; - if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) || - !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) || - !getLoadsAndStores(AftBlocks, AftMemInstr)) +static bool +checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks, + const DenseMap &ForeBlocksMap, + const DenseMap &AftBlocksMap, + DependenceInfo &DI, LoopInfo &LI) { + SmallVector AllBlocks; + for (Loop *L : Root.getLoopsInPreorder()) + if (ForeBlocksMap.find(L) != ForeBlocksMap.end()) + AllBlocks.push_back(ForeBlocksMap.lookup(L)); + AllBlocks.push_back(SubLoopBlocks); + for (Loop *L : Root.getLoopsInPreorder()) + if (AftBlocksMap.find(L) != AftBlocksMap.end()) + AllBlocks.push_back(AftBlocksMap.lookup(L)); + + unsigned LoopDepth = Root.getLoopDepth(); + auto *LastIterator = AllBlocks.end(); + --LastIterator; + for (auto *I = AllBlocks.begin(); I != LastIterator; ++I) { + for (auto *J = I; J != AllBlocks.end(); ++J) { + if (I == J && (*I == ForeBlocksMap.lookup(&Root) || + *I == AftBlocksMap.lookup(&Root))) + continue; + + SmallVector SrcLoadsAndStores; + SmallVector DstLoadsAndStores; + if (!getLoadsAndStores(*I, SrcLoadsAndStores) || + !getLoadsAndStores(*J, DstLoadsAndStores)) + return false; + + if (!checkDependencies(SrcLoadsAndStores, DstLoadsAndStores, LoopDepth, + (I == J) ? LI.getLoopFor(*(*I).begin()) : nullptr, + DI)) + return false; + } + } + + return true; +} + +static bool isEligibleLoopForm(const Loop &Root) { + // Root must have a child. + if (Root.getSubLoops().size() != 1) return false; - // Check for dependencies between any blocks that may change order - unsigned LoopDepth = L->getLoopDepth(); - return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false, - DI) && - checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) && - checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false, - DI) && - checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true, - DI); + const Loop *L = &Root; + do { + // All loops in Root need to be in simplify and rotated form. + if (!L->isLoopSimplifyForm()) + return false; + + if (!L->isRotatedForm()) + return false; + + if (L->getHeader()->hasAddressTaken()) { + LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); + return false; + } + + unsigned SubLoopsSize = L->getSubLoops().size(); + if (SubLoopsSize == 0) + return true; + + // Only one child is allowed. + if (SubLoopsSize != 1) + return false; + + L = L->getSubLoops()[0]; + } while (L); + + return true; +} + +static Loop *getInnerMostLoop(Loop *L) { + while (L->getSubLoops().size() != 0) + L = L->getSubLoops()[0]; + return L; } bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, - DependenceInfo &DI) { + DependenceInfo &DI, LoopInfo &LI) { + if (!isEligibleLoopForm(*L)) + return false; + /* We currently handle outer loops like this: | ForeFirst <----\ } @@ -755,35 +844,13 @@ UnrollAndJamLoop if the trip count cannot be easily calculated. */ - if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1) - return false; - Loop *SubLoop = L->getSubLoops()[0]; - if (!SubLoop->isLoopSimplifyForm()) - return false; - - BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - BasicBlock *Exit = L->getExitingBlock(); - BasicBlock *SubLoopHeader = SubLoop->getHeader(); - BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); - BasicBlock *SubLoopExit = SubLoop->getExitingBlock(); - - if (Latch != Exit) - return false; - if (SubLoopLatch != SubLoopExit) - return false; - - if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) { - LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); - return false; - } - // Split blocks into Fore/SubLoop/Aft based on dominators + Loop *JamLoop = getInnerMostLoop(L); BasicBlockSet SubLoopBlocks; - BasicBlockSet ForeBlocks; - BasicBlockSet AftBlocks; - if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, - AftBlocks, &DT)) { + DenseMap ForeBlocksMap; + DenseMap AftBlocksMap; + if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap, + AftBlocksMap, DT)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n"); return false; } @@ -791,7 +858,7 @@ // Aft blocks may need to move instructions to fore blocks, which becomes more // difficult if there are multiple (potentially conditionally executed) // blocks. For now we just exclude loops with multiple aft blocks. - if (AftBlocks.size() != 1) { + if (AftBlocksMap[L].size() != 1) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle " "multiple blocks after the loop\n"); return false; @@ -799,7 +866,9 @@ // Check inner loop backedge count is consistent on all iterations of the // outer loop - if (!hasIterationCountInvariantInParent(SubLoop, SE)) { + if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) { + return !hasIterationCountInvariantInParent(SubLoop, SE); + })) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is " "not consistent on each iteration\n"); return false; @@ -820,6 +889,10 @@ // ForeBlock phi operands before the subloop // Make sure we can move all instructions we need to before the subloop + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + BasicBlockSet AftBlocks = AftBlocksMap[L]; + Loop *SubLoop = L->getSubLoops()[0]; if (!processHeaderPhiOperands( Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) { if (SubLoop->contains(I->getParent())) @@ -845,7 +918,8 @@ // Check for memory dependencies which prohibit the unrolling we are doing. // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub. - if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) { + if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI, + LI)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n"); return false; }