diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h @@ -0,0 +1,40 @@ +//===- llvm/Transforms/Utils/LoopPeel.h ----- Peeling utilities -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines some loop peeling utilities. It does not define any +// actual pass or policy. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_LOOPPEEL_H +#define LLVM_TRANSFORMS_UTILS_LOOPPEEL_H + +#include "llvm/Analysis/TargetTransformInfo.h" + +namespace llvm { + +bool canPeel(Loop *L); + +bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA); + +TargetTransformInfo::PeelingPreferences +gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, + const TargetTransformInfo &TTI, + Optional UserAllowPeeling, + Optional UserAllowProfileBasedPeeling, + bool UnrollingSpecficValues = false); + +void computePeelCount(Loop *L, unsigned LoopSize, + TargetTransformInfo::PeelingPreferences &PP, + unsigned &TripCount, ScalarEvolution &SE, + unsigned Threshold = UINT_MAX); + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_LOOPPEEL_H diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -92,16 +92,6 @@ const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop = nullptr); -void computePeelCount(Loop *L, unsigned LoopSize, - TargetTransformInfo::UnrollingPreferences &UP, - TargetTransformInfo::PeelingPreferences &PP, - unsigned &TripCount, ScalarEvolution &SE); - -bool canPeel(Loop *L); - -bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA); - LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, @@ -121,7 +111,6 @@ unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, - bool &UseUpperBound); void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, @@ -138,12 +127,6 @@ Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, Optional UserFullUnrollMaxCount); -TargetTransformInfo::PeelingPreferences -gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - Optional UserAllowPeeling, - Optional UserAllowProfileBasedPeeling); - unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/User.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/Transforms/Utils/UnrollLoop.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -46,6 +46,7 @@ #include "llvm/Transforms/Scalar/LoopFuse.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopInfo.h" @@ -53,6 +54,7 @@ #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" @@ -64,6 +66,7 @@ #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CodeMoverUtils.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; @@ -114,6 +117,14 @@ "Use all available analyses")), cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); +static cl::opt + FusionPeel("loop-fusion-peel", cl::init(false), cl::Hidden, + cl::desc("Peel loops if needed so that fusion can take place")); + +static cl::opt FusionPeelMaxCount( + "loop-fusion-peel-max-count", cl::init(3), cl::Hidden, + cl::desc("Max number of iterations to be peeled from a loop")); + #ifndef NDEBUG static cl::opt VerboseFusionDebugging("loop-fusion-verbose-debug", @@ -157,6 +168,12 @@ bool Valid; /// Guard branch of the loop, if it exists BranchInst *GuardBranch; + /// Peeling Paramaters of the Loop. + TTI::PeelingPreferences PP; + /// Can you Peel this Loop? + bool AbleToPeel; + /// Has this loop been Peeled + bool Peeled; /// Dominator and PostDominator trees are needed for the /// FusionCandidateCompare function, required by FusionCandidateSet to @@ -168,11 +185,13 @@ OptimizationRemarkEmitter &ORE; FusionCandidate(Loop *L, const DominatorTree *DT, - const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) + const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, + TTI::PeelingPreferences PP) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), Latch(L->getLoopLatch()), L(L), Valid(true), - GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) { + GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), + Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { // Walk over all blocks in the loop and check for conditions that may // prevent fusion. For each block, walk over all instructions and collect @@ -243,6 +262,17 @@ return Preheader; } + /// After Peeling the loop is modified quite a bit, hence all of the Blocks + /// need to be updated accordingly. + void updateAfterPeeling() { + Preheader = L->getLoopPreheader(); + Header = L->getHeader(); + ExitingBlock = L->getExitingBlock(); + ExitBlock = L->getExitBlock(); + Latch = L->getLoopLatch(); + verify(); + } + /// Given a guarded loop, get the successor of the guard that is not in the /// loop. /// @@ -254,6 +284,8 @@ assert(GuardBranch && "Only valid on guarded loops."); assert(GuardBranch->isConditional() && "Expecting guard to be a conditional branch."); + if (Peeled) + return GuardBranch->getSuccessor(1); return (GuardBranch->getSuccessor(0) == Preheader) ? GuardBranch->getSuccessor(1) : GuardBranch->getSuccessor(0); @@ -358,25 +390,26 @@ bool operator()(const FusionCandidate &LHS, const FusionCandidate &RHS) const { const DominatorTree *DT = LHS.DT; + const PostDominatorTree *PDT = LHS.PDT; BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); // Do not save PDT to local variable as it is only used in asserts and thus // will trigger an unused variable warning if building without asserts. - assert(DT && LHS.PDT && "Expecting valid dominator tree"); + assert(DT && PDT && "Expecting valid dominator tree"); // Do this compare first so if LHS == RHS, function returns false. if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { // RHS dominates LHS // Verify LHS post-dominates RHS - assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); + assert(PDT->dominates(LHSEntryBlock, RHSEntryBlock)); return false; } if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { // Verify RHS Postdominates LHS - assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); + assert(PDT->dominates(RHSEntryBlock, LHSEntryBlock)); return true; } @@ -515,13 +548,17 @@ ScalarEvolution &SE; PostDominatorTree &PDT; OptimizationRemarkEmitter &ORE; + AssumptionCache &AC; + + const TargetTransformInfo &TTI; public: LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, ScalarEvolution &SE, PostDominatorTree &PDT, - OptimizationRemarkEmitter &ORE, const DataLayout &DL) + OptimizationRemarkEmitter &ORE, const DataLayout &DL, + AssumptionCache &AC, const TargetTransformInfo &TTI) : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), - DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {} + DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} /// This is the main entry point for loop fusion. It will traverse the /// specified function and collect candidate loops to fuse, starting at the @@ -606,7 +643,9 @@ /// Flow Equivalent sets, sorted by dominance. void collectFusionCandidates(const LoopVector &LV) { for (Loop *L : LV) { - FusionCandidate CurrCand(L, &DT, &PDT, ORE); + TTI::PeelingPreferences PP = + gatherPeelingPreferences(L, SE, TTI, None, None); + FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP); if (!CurrCand.isEligibleForFusion(SE)) continue; @@ -656,33 +695,126 @@ /// Determine if two fusion candidates have the same trip count (i.e., they /// execute the same number of iterations). /// - /// Note that for now this method simply returns a boolean value because there - /// are no mechanisms in loop fusion to handle different trip counts. In the - /// future, this behaviour can be extended to adjust one of the loops to make - /// the trip counts equal (e.g., loop peeling). When this is added, this - /// interface may need to change to return more information than just a - /// boolean value. - bool identicalTripCounts(const FusionCandidate &FC0, - const FusionCandidate &FC1) const { + /// This function will return a pair of values. The first is a boolean, + /// stating whether or not the two candidates are known at compile time to + /// have the same TripCount. The second is the difference in the two + /// TripCounts. This information can be used later to determine whether or not + /// peeling can be performed on either one of the candiates. + std::pair> + HaveIdenticalTripCounts(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + std::pair> Res = std::make_pair(false, None); + const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); if (isa(TripCount0)) { UncomputableTripCount++; LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); - return false; + return Res; } const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); if (isa(TripCount1)) { UncomputableTripCount++; LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); - return false; + return Res; } + LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " << *TripCount1 << " are " << (TripCount0 == TripCount1 ? "identical" : "different") << "\n"); - return (TripCount0 == TripCount1); + Res.first = (TripCount0 == TripCount1); + + if (Res.first) + return Res; + + LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " + "determining the difference between trip counts\n"); + + // Currently only considering loops with a single exit point + // and a non-constant trip count. + unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); + unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); + + // If any of the tripcounts are zero that means that loop(s) do not have + // a single exit or a constant tripcount. + if (TC0 == 0 || TC1 == 0) { + LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " + "have a constant number of iterations. Peeling " + "is not benefical\n"); + return Res; + } + + int diff = TC0 - TC1; + + if (diff > 0) + Res.second = diff; + else { + LLVM_DEBUG( + dbgs() + << "Difference is less than 0. FC1 (second loop) has more " + "iterations than the first one. Currently not supported.\n"); + } + + LLVM_DEBUG(dbgs() << "Difference in Loop trip count is: " << Res.second + << "\n"); + + return Res; + } + + void PeelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, + unsigned PeelCount) { + assert(FC0.AbleToPeel && "Should be able to Peel Loop"); + + LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount + << " iterations of the first loop. \n"); + + FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true); + if (FC0.Peeled) { + LLVM_DEBUG(dbgs() << "Done Peeling\n"); + + auto IdenticalTripCount = HaveIdenticalTripCounts(FC0, FC1); + + assert(IdenticalTripCount.first && IdenticalTripCount.second == None && + "Loops should have identical trip counts after peeling"); + + FC0.PP.PeelCount = PeelCount; + + // Peeling does not update the PDT + PDT.recalculate(*FC0.Preheader->getParent()); + + FC0.updateAfterPeeling(); + + // In this case the iterations of the loop are constant, so the first + // loop will exectue completely (will not jump from one of + // the peeled blocks to the second loop). Here we are updating the + // branch conditions of each of the peeled blocks, such that it will + // branch to its successor which is not the Preheader of the second Loop. + // Doing this update will ensure that the entry block of the first loop + // dominates the entry block of the second loop. + BasicBlock *BB = + FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; + SmallVector TreeUpdates; + for (BasicBlock *Pred : predecessors(BB)) { + if (Pred != FC0.ExitBlock) { + BranchInst *Old = dyn_cast(Pred->getTerminator()); + BasicBlock *Succ = Old->getSuccessor(0); + if (Succ == BB) + Succ = Old->getSuccessor(1); + BranchInst *NewBranch = BranchInst::Create(Succ); + ReplaceInstWithInst(Old, NewBranch); + TreeUpdates.emplace_back( + DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); + } + } + DTU.applyUpdates(TreeUpdates); + DTU.flush(); + LLVM_DEBUG( + dbgs() << "Sucessfully Peeled " << FC0.PP.PeelCount + << " iterations from the first loop.\n" + "Both Loops have the same number of iterations now.\n"); + } } /// Walk each set of control flow equivalent fusion candidates and attempt to @@ -716,7 +848,28 @@ FC0->verify(); FC1->verify(); - if (!identicalTripCounts(*FC0, *FC1)) { + // Check if the candidates have identical tripcounts (first value of + // pair), and if not check the difference in the tripcounts between + // the loops (second value of pair). The difference is not equal to + // None iff the loops iterate a constant number of times, and have a + // single exit. + std::pair> IdenticalTripCountRes = + HaveIdenticalTripCounts(*FC0, *FC1); + + // Check if the user specfices to use Peeling with fusion + // Check FC0 (first loop) is able to be peeled + // Check that both loops have the different tripcounts + if (FusionPeel.getNumOccurrences() > 0 && FusionPeel && + FC0->AbleToPeel && !IdenticalTripCountRes.first && + IdenticalTripCountRes.second) { + assert(*IdenticalTripCountRes.second <= FusionPeelMaxCount && + "Peel Count: is greater than maximum value specificed"); + // Dependent on peeling being performed on the first loop, and + // assuming all other conditions for fusion returning true. + IdenticalTripCountRes.first = true; + } + + if (!IdenticalTripCountRes.first) { LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " "counts. Not fusing.\n"); reportLoopFusion(*FC0, *FC1, @@ -734,7 +887,8 @@ // Ensure that FC0 and FC1 have identical guards. // If one (or both) are not guarded, this check is not necessary. if (FC0->GuardBranch && FC1->GuardBranch && - !haveIdenticalGuards(*FC0, *FC1)) { + !haveIdenticalGuards(*FC0, *FC1) && + !IdenticalTripCountRes.second) { LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " "guards. Not Fusing.\n"); reportLoopFusion(*FC0, *FC1, @@ -796,20 +950,31 @@ FusionNotBeneficial); continue; } - // All analysis has completed and has determined that fusion is legal - // and profitable. At this point, start transforming the code and - // perform fusion. + // All analysis has completed and has determined that fusion is + // legal and profitable. At this point, start transforming the + // code and perform fusion. LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " << *FC1 << "\n"); + FusionCandidate FC0Copy = *FC0; + // Peel the loop after determining that fusion is legal. The Loops + // will still be safe to fuse after the peeling is performed. + if (IdenticalTripCountRes.second) + PeelFusionCandidate(FC0Copy, *FC1, *IdenticalTripCountRes.second); + // Report fusion to the Optimization Remarks. // Note this needs to be done *before* performFusion because // performFusion will change the original loops, making it not // possible to identify them after fusion is complete. - reportLoopFusion(*FC0, *FC1, FuseCounter); - - FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE); + reportLoopFusion( + (IdenticalTripCountRes.second ? FC0Copy : *FC0), *FC1, + FuseCounter); + + FusionCandidate FusedCand( + performFusion((IdenticalTripCountRes.second ? FC0Copy : *FC0), + *FC1), + &DT, &PDT, ORE, FC0Copy.PP); FusedCand.verify(); assert(FusedCand.isEligibleForFusion(SE) && "Fused candidate should be eligible for fusion!"); @@ -1094,8 +1259,9 @@ assert(FCLatchBranch->isConditional() && FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && "Expecting the two successors of FCLatchBranch to be the same"); - FCLatchBranch->setCondition( - llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType())); + BranchInst *NewBranch = + BranchInst::Create(FCLatchBranch->getSuccessor(0)); + ReplaceInstWithInst(FCLatchBranch, NewBranch); } } @@ -1155,7 +1321,8 @@ if (FC0.GuardBranch) return fuseGuardedLoops(FC0, FC1); - assert(FC1.Preheader == FC0.ExitBlock); + assert(FC1.Preheader == + (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); assert(FC1.Preheader->size() == 1 && FC1.Preheader->getSingleSuccessor() == FC1.Header); @@ -1178,7 +1345,7 @@ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); // Then modify the control flow and update DT and PDT. - SmallVector TreeUpdates; + SmallVector TreeUpdates; // The old exiting block of the first loop (FC0) has to jump to the header // of the second as we need to execute the code in the second header block @@ -1197,12 +1364,27 @@ // to FC1.Header? I think this is basically what the three sequences are // trying to accomplish; however, doing this directly in the CFG may mean // the DT/PDT becomes invalid - FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, - FC1.Header); - TreeUpdates.emplace_back(DominatorTree::UpdateType( - DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); - TreeUpdates.emplace_back(DominatorTree::UpdateType( - DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + if (!FC0.Peeled) { + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, + FC1.Header); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + } else { + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); + + // Remove the ExitBlock of the first Loop (also not needed) + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, + FC1.Header); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); + FC0.ExitBlock->getTerminator()->eraseFromParent(); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); + } // The pre-header of L1 is not necessary anymore. assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); @@ -1246,7 +1428,7 @@ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); - // Change the condition of FC0 latch branch to true, as both successors of + // Modify the latch branch of FC0 to be unconditional as both successors of // the branch are the same. simplifyLatchBranch(FC0); @@ -1268,6 +1450,11 @@ LI.removeBlock(FC1.Preheader); DTU.deleteBB(FC1.Preheader); + if (FC0.Peeled) { + LI.removeBlock(FC0.ExitBlock); + DTU.deleteBB(FC0.ExitBlock); + } + DTU.flush(); // Is there a way to keep SE up-to-date so we don't need to forget the loops @@ -1364,10 +1551,18 @@ BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); // Move instructions from the exit block of FC0 to the beginning of the exit - // block of FC1. - moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, DI); + // block of FC1, in the case that the FC0 loop has not been peeled. In the + // case that FC0 loop is peeled, then move the instructions of the successor + // of the FC0 Exit block to the to the beginning of the exit block of FC1. + if (!FC0.Peeled) + moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, + DI); + else + moveInstructionsToTheBeginning(*FC0ExitBlockSuccessor, *FC1.ExitBlock, DT, + PDT, DI); // Move instructions from the guard block of FC1 to the end of the guard // block of FC0. @@ -1387,8 +1582,12 @@ // for FC1 (where FC1 guard would have gone if FC1 was not executed). FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); - FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, - FC1.Header); + if (!FC0.Peeled) + FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, + FC1.Header); + else + FC0ExitBlockSuccessor->getTerminator()->replaceUsesOfWith(FC1GuardBlock, + FC1.Header); // The guard of FC1 is not necessary anymore. FC1.GuardBranch->eraseFromParent(); @@ -1403,6 +1602,15 @@ TreeUpdates.emplace_back(DominatorTree::UpdateType( DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); + if (FC0.Peeled) { + // Remove the Block after the ExitBlock of FC0 + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); + FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); + new UnreachableInst(FC0ExitBlockSuccessor->getContext(), + FC0ExitBlockSuccessor); + } + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && "Expecting guard block to have no predecessors"); assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && @@ -1540,6 +1748,10 @@ LI.removeBlock(FC1GuardBlock); LI.removeBlock(FC1.Preheader); LI.removeBlock(FC0.ExitBlock); + if (FC0.Peeled) { + LI.removeBlock(FC0ExitBlockSuccessor); + DTU.deleteBB(FC0ExitBlockSuccessor); + } DTU.deleteBB(FC1GuardBlock); DTU.deleteBB(FC1.Preheader); DTU.deleteBB(FC0.ExitBlock); @@ -1606,6 +1818,8 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); @@ -1622,9 +1836,12 @@ auto &SE = getAnalysis().getSE(); auto &PDT = getAnalysis().getPostDomTree(); auto &ORE = getAnalysis().getORE(); - + auto &AC = getAnalysis().getAssumptionCache(F); + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); const DataLayout &DL = F.getParent()->getDataLayout(); - LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL); + + LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); return LF.fuseLoops(F); } }; @@ -1637,9 +1854,11 @@ auto &SE = AM.getResult(F); auto &PDT = AM.getResult(F); auto &ORE = AM.getResult(F); - + auto &AC = AM.getResult(F); + const TargetTransformInfo &TTI = AM.getResult(F); const DataLayout &DL = F.getParent()->getDataLayout(); - LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL); + + LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); bool Changed = LF.fuseLoops(F); if (!Changed) return PreservedAnalyses::all(); @@ -1662,6 +1881,8 @@ INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -41,6 +41,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -56,6 +56,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SizeOpts.h" @@ -115,10 +116,6 @@ cl::desc( "Set the max unroll count for full unrolling, for testing purposes")); -static cl::opt UnrollPeelCount( - "unroll-peel-count", cl::Hidden, - cl::desc("Set the unroll peeling count, for testing purposes")); - static cl::opt UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " @@ -149,15 +146,6 @@ "threshold, the loop is considered as flat and will be less " "aggressively unrolled.")); -static cl::opt - UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden, - cl::desc("Allows loops to be peeled when the dynamic " - "trip count is known to be low.")); - -static cl::opt UnrollAllowLoopNestsPeeling( - "unroll-allow-loop-nests-peeling", cl::init(false), cl::Hidden, - cl::desc("Allows loop nests to be peeled.")); - static cl::opt UnrollUnrollRemainder( "unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled.")); @@ -175,6 +163,7 @@ "unroll-threshold-aggressive", cl::init(300), cl::Hidden, cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) " "optimizations")); + static cl::opt UnrollThresholdDefault("unroll-threshold-default", cl::init(150), cl::Hidden, @@ -275,33 +264,6 @@ return UP; } -TargetTransformInfo::PeelingPreferences -llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - Optional UserAllowPeeling, - Optional UserAllowProfileBasedPeeling) { - TargetTransformInfo::PeelingPreferences PP; - - // Get Target Specifc Values - TTI.getPeelingPreferences(L, SE, PP); - - // User Specified Values using cl::opt - if (UnrollPeelCount.getNumOccurrences() > 0) - PP.PeelCount = UnrollPeelCount; - if (UnrollAllowPeeling.getNumOccurrences() > 0) - PP.AllowPeeling = UnrollAllowPeeling; - if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0) - PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling; - - // User Specifed values provided by argument - if (UserAllowPeeling.hasValue()) - PP.AllowPeeling = *UserAllowPeeling; - if (UserAllowProfileBasedPeeling.hasValue()) - PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; - - return PP; -} - namespace { /// A struct to densely store the state of an instruction after unrolling at @@ -875,7 +837,7 @@ } // 4th priority is loop peeling. - computePeelCount(L, LoopSize, UP, PP, TripCount, SE); + computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold); if (PP.PeelCount) { UP.Runtime = false; UP.Count = 1; @@ -1081,7 +1043,7 @@ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedFullUnrollMaxCount); TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences( - L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling); + L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true); // Exit early if unrolling is disabled. For OptForSize, we pick the loop size // as threshold later on. diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -35,11 +35,11 @@ LCSSA.cpp LibCallsShrinkWrap.cpp Local.cpp + LoopPeel.cpp LoopRotationUtils.cpp LoopSimplify.cpp LoopUnroll.cpp LoopUnrollAndJam.cpp - LoopUnrollPeel.cpp LoopUnrollRuntime.cpp LoopUtils.cpp LoopVersioning.cpp diff --git a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp rename from llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp rename to llvm/lib/Transforms/Utils/LoopPeel.cpp --- a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -1,4 +1,4 @@ -//===- UnrollLoopPeel.cpp - Loop peeling utilities ------------------------===// +//===- LoopPeel.cpp -------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,53 +6,65 @@ // //===----------------------------------------------------------------------===// // -// This file implements some loop unrolling utilities for peeling loops -// with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for -// unrolling loops with compile-time constant trip counts. -// +// Loop Peel Utilities. //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/LoopUnrollPass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include -#include -#include -#include using namespace llvm; using namespace llvm::PatternMatch; -#define DEBUG_TYPE "loop-unroll" +#define DEBUG_TYPE "loop-peel" STATISTIC(NumPeeled, "Number of loops peeled"); +static cl::opt UnrollPeelCount( + "unroll-peel-count", cl::Hidden, + cl::desc("Set the unroll peeling count, for testing purposes")); + +static cl::opt + UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden, + cl::desc("Allows loops to be peeled when the dynamic " + "trip count is known to be low.")); + +static cl::opt + UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling", + cl::init(false), cl::Hidden, + cl::desc("Allows loop nests to be peeled.")); + static cl::opt UnrollPeelMaxCount( "unroll-peel-max-count", cl::init(7), cl::Hidden, cl::desc("Max average trip count which will cause loop peeling.")); @@ -278,9 +290,9 @@ // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, - TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, - unsigned &TripCount, ScalarEvolution &SE) { + unsigned &TripCount, ScalarEvolution &SE, + unsigned Threshold) { assert(LoopSize > 0 && "Zero loop size is not allowed!"); // Save the PP.PeelCount value set by the target in // TTI.getPeelingPreferences or by the flag -unroll-peel-count. @@ -322,7 +334,7 @@ // maximum number of iterations among these values, thus turning all those // Phis into invariants. // First, check that we can peel at least one iteration. - if (2 * LoopSize <= UP.Threshold && UnrollPeelMaxCount > 0) { + if (2 * LoopSize <= Threshold && UnrollPeelMaxCount > 0) { // Store the pre-calculated values here. SmallDenseMap IterationsToInvariance; // Now go through all Phis to calculate their the number of iterations they @@ -342,7 +354,7 @@ // Pay respect to limitations implied by loop size and the max peel count. unsigned MaxPeelCount = UnrollPeelMaxCount; - MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1); + MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1); DesiredPeelCount = std::max(DesiredPeelCount, countToEliminateCompares(*L, MaxPeelCount, SE)); @@ -385,7 +397,7 @@ if (*PeelCount) { if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) && - (LoopSize * (*PeelCount + 1) <= UP.Threshold)) { + (LoopSize * (*PeelCount + 1) <= Threshold)) { LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n"); PP.PeelCount = *PeelCount; @@ -396,7 +408,7 @@ LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n"); - LLVM_DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n"); + LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n"); } } } @@ -491,7 +503,7 @@ /// instructions in the last peeled-off iteration. static void cloneLoopBlocks( Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot, - SmallVectorImpl > &ExitEdges, + SmallVectorImpl> &ExitEdges, SmallVectorImpl &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT, LoopInfo *LI) { @@ -599,6 +611,34 @@ LVMap[KV.first] = KV.second; } +TargetTransformInfo::PeelingPreferences llvm::gatherPeelingPreferences( + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + Optional UserAllowPeeling, + Optional UserAllowProfileBasedPeeling, bool UnrollingSpecficValues) { + TargetTransformInfo::PeelingPreferences PP; + + // Get Target Specifc Values + TTI.getPeelingPreferences(L, SE, PP); + + if (UnrollingSpecficValues) { + // User Specified Values using cl::opt + if (UnrollPeelCount.getNumOccurrences() > 0) + PP.PeelCount = UnrollPeelCount; + if (UnrollAllowPeeling.getNumOccurrences() > 0) + PP.AllowPeeling = UnrollAllowPeeling; + if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0) + PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling; + } + + // User Specifed values provided by argument + if (UserAllowPeeling.hasValue()) + PP.AllowPeeling = *UserAllowPeeling; + if (UserAllowProfileBasedPeeling.hasValue()) + PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; + + return PP; +} + /// Peel off the first \p PeelCount iterations of loop \p L. /// /// Note that this does not peel them off as a single straight-line block. @@ -609,8 +649,8 @@ /// for the bulk of dynamic execution, can be further simplified by scalar /// optimizations. bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, bool PreserveLCSSA) { + ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + bool PreserveLCSSA) { assert(PeelCount > 0 && "Attempt to peel out zero iterations?"); assert(canPeel(L) && "Attempt to peel a loop which is not peelable?"); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -59,6 +59,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" diff --git a/llvm/test/Transforms/LoopFusion/guarded_peel.ll b/llvm/test/Transforms/LoopFusion/guarded_peel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/guarded_peel.ll @@ -0,0 +1,84 @@ +; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s + +; This will test if we are able to fuse two guarded loops which have constant +; but different trip counts. The first two iterations of the first loop should +; be peeled off, and then the loops should be fused together. + +@B = common global [1024 x i32] zeroinitializer, align 16 + +; CHECK: void @main +; CHECK-next: entry: +; CHECK: br i1 %cmp4, label %for.first.entry, label %for.end +; CHECK: for.first.entry +; CHECK: br label %for.first.peel.begin +; CHECK: for.first.peel.begin: +; CHECK: br label %for.first.peel +; CHECK: for.first.peel: +; CHECK: br label %for.first.peel.next +; CHECK: for.first.peel.next: +; CHECK: br label %for.first.peel2 +; CHECK: for.first.peel2: +; CHECK: br label %for.first.peel.next1 +; CHECK: for.first.peel.next1: +; CHECK: br label %for.first.peel.next11 +; CHECK: for.first.peel.next11: +; CHECK: br label %for.first.entry.peel.newph +; CHECK: br label %for.first +; CHECK: for.first: +; CHECK: br i1 %cmp3, label %for.first, label %for.second.exit +; CHECK: for.second.exit: +; CHECK: br label %for.end +; CHECK: for.end: +; CHECK: ret void + +define void @main(i32* noalias %A) { +entry: + %cmp4 = icmp slt i64 0, 45 + br i1 %cmp4, label %for.first.entry, label %for.second.guard + +for.first.entry: ; preds = %entry + br label %for.first + +for.first: ; preds = %for.first.entry, %for.first + %i.05 = phi i64 [ %inc, %for.first ], [ 0, %for.first.entry ] + %sub = sub nsw i64 %i.05, 3 + %add = add nsw i64 %i.05, 3 + %mul = mul nsw i64 %sub, %add + %rem = srem i64 %mul, %i.05 + %conv = trunc i64 %rem to i32 + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.05 + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.05, 1 + %cmp = icmp slt i64 %inc, 45 + br i1 %cmp, label %for.first, label %for.first.exit + +for.first.exit: ; preds = %for.first + br label %for.second.guard + +for.second.guard: ; preds = %for.first.exit, %entry + %cmp31 = icmp slt i64 2, 45 + br i1 %cmp31, label %for.second.entry, label %for.end + +for.second.entry: ; preds = %for.second.guard + br label %for.second + +for.second: ; preds = %for.second.entry, %for.second + %i1.02 = phi i64 [ %inc14, %for.second ], [ 2, %for.second.entry ] + %sub7 = sub nsw i64 %i1.02, 3 + %add8 = add nsw i64 %i1.02, 3 + %mul9 = mul nsw i64 %sub7, %add8 + %rem10 = srem i64 %mul9, %i1.02 + %conv11 = trunc i64 %rem10 to i32 + %arrayidx12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %i1.02 + store i32 %conv11, i32* %arrayidx12, align 4 + %inc14 = add nsw i64 %i1.02, 1 + %cmp3 = icmp slt i64 %inc14, 45 + br i1 %cmp3, label %for.second, label %for.second.exit + +for.second.exit: ; preds = %for.second + br label %for.end + +for.end: ; preds = %for.second.exit, %for.second.guard + ret void +} + diff --git a/llvm/test/Transforms/LoopFusion/guarded_unsafeblock_peel.ll b/llvm/test/Transforms/LoopFusion/guarded_unsafeblock_peel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/guarded_unsafeblock_peel.ll @@ -0,0 +1,68 @@ +; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s + +; This will test that we do not fuse two guarded loops together. +; These loops do not have the same trip count, fusing should be possible after +; peeling the loops. However, the exit block of the first loop makes the loops +; unsafe to peel. +; The expected output of this test is the function as below. + +; CHECK: void @unsafe_exitblock +; CHECK: for.first.guard +; CHECK: br i1 %cmp3, label %for.first.preheader, label %for.second.guard +; CHECK: for.first: +; CHECK: br i1 %cmp, label %for.first, label %for.first.exit +; CHECK: for.first.exit: +; CHECK: br label %for.second.guard +; CHECK: for.second.guard: +; CHECK: br i1 %cmp21, label %for.second.preheader, label %for.end +; CHECK: for.second.preheader: +; CHECK: br label %for.second +; CHECK: for.second: +; CHECK: br i1 %cmp2, label %for.second, label %for.second.exit +; CHECK: for.second.exit: +; CHECK: br label %for.end +; CHECK: for.end: +; CHECK: ret void +define void @unsafe_exitblock(i32* noalias %A, i32* noalias %B) { +for.first.guard: + %cmp3 = icmp slt i64 0, 45 + br i1 %cmp3, label %for.first.preheader, label %for.second.guard + +for.first.preheader: + br label %for.first + +for.first: + %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.04, 1 + %cmp = icmp slt i64 %inc, 45 + br i1 %cmp, label %for.first, label %for.first.exit + +for.first.exit: + call void @bar() + br label %for.second.guard + +for.second.guard: + %cmp21 = icmp slt i64 2,45 + br i1 %cmp21, label %for.second.preheader, label %for.end + +for.second.preheader: + br label %for.second + +for.second: + %j.02 = phi i64 [ %inc6, %for.second ], [ 2, %for.second.preheader ] + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02 + store i32 0, i32* %arrayidx4, align 4 + %inc6 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc6, 45 + br i1 %cmp2, label %for.second, label %for.second.exit + +for.second.exit: + br label %for.end + +for.end: + ret void +} + +declare void @bar() diff --git a/llvm/test/Transforms/LoopFusion/nonadjacent_peel.ll b/llvm/test/Transforms/LoopFusion/nonadjacent_peel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/nonadjacent_peel.ll @@ -0,0 +1,82 @@ +; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s + +; This will check that we do not fuse these two loops together. These loops are +; valid cadidates for peeling, however they are not adjacent. +; The expected output of this test is the function below. + +; CHECK: void @function +; CHECK-NEXT: for.first.preheader: +; CHECK-NEXT: br label %for.first +; CHECK: for.first: +; CHECK: br label %for.first.latch +; CHECK: for.first.latch: +; CHECK: br i1 %exitcond4, label %for.first, label %for.first.exit +; CHECK: for.first.exit: +; CHECK: br label %for.next +; CHECK: for.next: +; CHECK: br label %for.second.preheader +; CHECK: for.second.preheader: +; CHECK: br label %for.second +; CHECK: for.second: +; CHECK: br label %for.second.latch +; CHECK: br i1 %exitcond, label %for.second, label %for.end +; CHECK: for.end: +; CHECK-NEXT: ret void + +@B = common global [1024 x i32] zeroinitializer, align 16 + +define void @function(i32* noalias %arg) { +for.first.preheader: + br label %for.first + +for.first: ; preds = %for.first.preheader, %for.first.latch + %.014 = phi i32 [ 0, %for.first.preheader ], [ %tmp15, %for.first.latch ] + %indvars.iv23 = phi i64 [ 0, %for.first.preheader ], [ %indvars.iv.next3, %for.first.latch ] + %tmp = add nsw i32 %.014, -3 + %tmp8 = add nuw nsw i64 %indvars.iv23, 3 + %tmp9 = trunc i64 %tmp8 to i32 + %tmp10 = mul nsw i32 %tmp, %tmp9 + %tmp11 = trunc i64 %indvars.iv23 to i32 + %tmp12 = srem i32 %tmp10, %tmp11 + %tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv23 + store i32 %tmp12, i32* %tmp13, align 4 + br label %for.first.latch + +for.first.latch: ; preds = %for.first + %indvars.iv.next3 = add nuw nsw i64 %indvars.iv23, 1 + %tmp15 = add nuw nsw i32 %.014, 1 + %exitcond4 = icmp ne i64 %indvars.iv.next3, 100 + br i1 %exitcond4, label %for.first, label %for.first.exit + +for.first.exit: ; preds: %for.first.latch + br label %for.next + +for.next: ; preds = %for.first.exit + br label %for.second.preheader + +for.second.preheader: ; preds = %for.next + br label %for.second + +for.second: ; preds = %for.second.preheader, %for.second.latch + %.02 = phi i32 [ 0, %for.second.preheader ], [ %tmp28, %for.second.latch ] + %indvars.iv1 = phi i64 [ 3, %for.second.preheader ], [ %indvars.iv.next, %for.second.latch ] + %tmp20 = add nsw i32 %.02, -3 + %tmp21 = add nuw nsw i64 %indvars.iv1, 3 + %tmp22 = trunc i64 %tmp21 to i32 + %tmp23 = mul nsw i32 %tmp20, %tmp22 + %tmp24 = trunc i64 %indvars.iv1 to i32 + %tmp25 = srem i32 %tmp23, %tmp24 + %tmp26 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv1 + store i32 %tmp25, i32* %tmp26, align 4 + br label %for.second.latch + +for.second.latch: ; preds = %for.second + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %tmp28 = add nuw nsw i32 %.02, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.second, label %for.end + +for.end: ; preds = %for.second.latch + ret void +} + diff --git a/llvm/test/Transforms/LoopFusion/peel.ll b/llvm/test/Transforms/LoopFusion/peel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/peel.ll @@ -0,0 +1,106 @@ +; RUN: opt -S -loop-fusion -loop-fusion-peel < %s | FileCheck %s + +; This will test whether we can fuse two loops together if they have constant +; but a different tripcount. +; The first three iterations of the first loop should be peeled, and then the +; two loops should be fused together in this example. + +; C Code +; +; int B[1024]; +; +; void function(int *arg) { +; for (int i = 0; i != 100; ++i) +; arg[i] = ((i - 3)*(i+3)) % i; +; +; for (int i = 3; i != 100; ++i) +; B[i] = ((i-6)*(i+3)) % i; +; } + +; CHECK: void @function +; CHECK-NEXT: for.first.preheader: +; CHECK-NEXT: br label %for.first.peel.begin +; CHECK: for.first.peel.begin: +; CHECK-NEXT: br label %for.first.peel +; CHECK: for.first.peel +; CHECK: br label %for.first.latch.peel +; CHECK: for.first.latch.peel: +; CHECK: br label %for.first.peel.next +; CHECK: for.first.peel.next: +; CHECK: br label %for.first.peel2 +; CHECK: for.first.peel2: +; CHECK: br label %for.first.latch.peel10 +; CHECK: for.first.latch.peel10: +; CHECK: br label %for.first.peel.next1 +; CHECK: for.first.peel.next1: +; CHECK: br label %for.first.peel15 +; CHECK: for.first.peel15: +; CHECK: br label %for.first.latch.peel23 +; CHECK: for.first.latch.peel23: +; CHECK: br label %for.first.peel.next14 +; CHECK: for.first.peel.next14: +; CHECK-NEXT: br label %for.first.peel.next27 +; CHECK: for.first.peel.next27: +; CHECK-NEXT: br label %for.first.preheader.peel.newph +; CHECK: for.first.preheader.peel.newph: +; CHECK-NEXT: br label %for.first +; CHECK: for.first: +; CHECK: br label %for.first.latch +; CHECK: for.first.latch: +; CHECK: br label %for.second.latch +; CHECK: for.second.latch: +; CHECK: br i1 %exitcond, label %for.first, label %for.end +; CHECK: for.end: +; CHECK-NEXT: ret void + +@B = common global [1024 x i32] zeroinitializer, align 16 + +define void @function(i32* noalias %arg) { +for.first.preheader: + br label %for.first + +for.first: ; preds = %for.first.preheader, %for.first.latch + %.014 = phi i32 [ 0, %for.first.preheader ], [ %tmp15, %for.first.latch ] + %indvars.iv23 = phi i64 [ 0, %for.first.preheader ], [ %indvars.iv.next3, %for.first.latch ] + %tmp = add nsw i32 %.014, -3 + %tmp8 = add nuw nsw i64 %indvars.iv23, 3 + %tmp9 = trunc i64 %tmp8 to i32 + %tmp10 = mul nsw i32 %tmp, %tmp9 + %tmp11 = trunc i64 %indvars.iv23 to i32 + %tmp12 = srem i32 %tmp10, %tmp11 + %tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv23 + store i32 %tmp12, i32* %tmp13, align 4 + br label %for.first.latch + +for.first.latch: ; preds = %for.first + %indvars.iv.next3 = add nuw nsw i64 %indvars.iv23, 1 + %tmp15 = add nuw nsw i32 %.014, 1 + %exitcond4 = icmp ne i64 %indvars.iv.next3, 100 + br i1 %exitcond4, label %for.first, label %for.second.preheader + +for.second.preheader: ; preds = %for.first.latch + br label %for.second + +for.second: ; preds = %for.second.preheader, %for.second.latch + %.02 = phi i32 [ 0, %for.second.preheader ], [ %tmp28, %for.second.latch ] + %indvars.iv1 = phi i64 [ 3, %for.second.preheader ], [ %indvars.iv.next, %for.second.latch ] + %tmp20 = add nsw i32 %.02, -3 + %tmp21 = add nuw nsw i64 %indvars.iv1, 3 + %tmp22 = trunc i64 %tmp21 to i32 + %tmp23 = mul nsw i32 %tmp20, %tmp22 + %tmp24 = trunc i64 %indvars.iv1 to i32 + %tmp25 = srem i32 %tmp23, %tmp24 + %tmp26 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv1 + store i32 %tmp25, i32* %tmp26, align 4 + br label %for.second.latch + +for.second.latch: ; preds = %for.second + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %tmp28 = add nuw nsw i32 %.02, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.second, label %for.end + +for.end: ; preds = %for.second.latch + ret void +} +