Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -310,6 +310,7 @@ void initializeDwarfEHPreparePass(PassRegistry&); void initializeFloat2IntPass(PassRegistry&); void initializeLoopDistributePass(PassRegistry&); +void initializeLoopFusePass(PassRegistry&); void initializeSjLjEHPreparePass(PassRegistry&); void initializeDemandedBitsPass(PassRegistry&); void initializeFuncletLayoutPass(PassRegistry &); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -481,6 +481,12 @@ //===----------------------------------------------------------------------===// // +// LoopFuse - Fuse loops. +// +FunctionPass *createLoopFusePass(); + +//===----------------------------------------------------------------------===// +// // LoopLoadElimination - Perform loop-aware load elimination. // FunctionPass *createLoopLoadEliminationPass(); Index: include/llvm/Transforms/Scalar/LoopFuse.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Scalar/LoopFuse.h @@ -0,0 +1,130 @@ +//===------------- LoopFuse.h - Loop Fusion Utility -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include + +namespace llvm { +/// \brief The pass class. +class LoopFuse : public FunctionPass { + +public: + // Kind of fusion made. + enum Kind { + NO_FUSION = 0, // Fusion was not made even to check dependence legality. + // This is when loops had failed basic structure checks. + REVERTED_FUSION, // Fusion was reverted due to failed dependence legality. + PURE_FUSION, // Fusion succeeded with removal of original loops. + VERSIONED_FUSION // Fusion succeeded with versioning due to runtime checks. + }; + +private: + // Analyses used. + LoopInfo *LI; + LoopAccessAnalysis *LAA; + DominatorTree *DT; + ScalarEvolution *SE; + + // FusionSwitcher - Branch instruction that controls switching between + // original and fused versions. This gets initialized to true when loops are + // multiversioned to check fusion legality. By default, it points to original + // version. + BranchInst *FusionSwitcher; + + Loop *FusedLoop; + + // LAI for FusedLoop. + const LoopAccessInfo *LAI; + + // Kind of fusion that happened. + Kind FusionKind = NO_FUSION; + + // CustomVMap: VMap of BBs for fused loop. The problem about having + // ValueToValueMapTy passed from a client is that it gets updated when the + // loops are removed based on fusion success and this is undesirable. Also + // a ValueToValueMapTy is used when both Values are present. So, only a + // normal llvm::Value* is maintained as map's value in contrast with + // ValueToValueMapTy's WeakVH. Clients can use this mapping as a VMap. + typedef std::map CustomVMap; + CustomVMap VMap; + + // Rewrite IncomingBlocks in PHIs of @Br's successor blocks from Br's parent + // to @To. + void RewritePHI(BranchInst *Br, BasicBlock *To); + + // Fuse loops - @L1 and @L2 and return the fused loop. + Loop *FuseLoops(Loop &L1, Loop &L2); + + // Legality and profitability checks. + bool DependenceLegal(Loop &L1, Loop &L2); + bool DefsUsedAcrossLoops(Loop &L1, Loop &L2); + bool IsLegalAndProfitable(Loop &L1, Loop &L2); + + // Removal routines based on fusion success. + void RemoveLoopCompletelyWithPreheader(Loop &L); + void RemoveFusionSwitcher(Loop &L); + + // Outside use updates. + void UpdateUsesOutsideLoop(Loop &L); + void AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock); + +public: + LoopFuse() : FunctionPass(ID) { + initializeLoopFusePass(*PassRegistry::getPassRegistry()); + } + + // Initialization interface when this pass is used as a utility. + LoopFuse(LoopInfo *_LI, LoopAccessAnalysis *_LAA, DominatorTree *_DT, + ScalarEvolution *_SE) + : FunctionPass(ID), LI(_LI), LAA(_LAA), DT(_DT), SE(_SE) {} + + Loop *getFusedLoop() { return FusedLoop; } + + const CustomVMap &getVMap() { return VMap; } + + unsigned getFusionKind() { return FusionKind; } + + // Interface; when this pass is used as a utility. + bool run(Loop &L1, Loop &L2); + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + } + + static char ID; +}; +} // anonymous namespace Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -98,6 +98,10 @@ "enable-loop-distribute", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopDistribution Pass")); +static cl::opt EnableLoopFuse( + "enable-loop-fuse", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopFusion Pass")); + static cl::opt EnableNonLTOGlobalsModRef( "enable-non-lto-gmr", cl::init(true), cl::Hidden, cl::desc( @@ -468,6 +472,9 @@ if (EnableLoopDistribute) MPM.add(createLoopDistributePass()); + if (EnableLoopFuse) + MPM.add(createLoopFusePass()); + MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // Eliminate loads by forwarding stores from the previous iteration to loads Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -18,6 +18,7 @@ LoadCombine.cpp LoopDeletion.cpp LoopDistribute.cpp + LoopFuse.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp LoopInterchange.cpp Index: lib/Transforms/Scalar/LoopFuse.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/LoopFuse.cpp @@ -0,0 +1,560 @@ +//===------------- LoopFuse.cpp - Loop Fusion Pass ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. Loops are multi-versioned +/// and unconditionally fused along one version to check for dependence +/// legality. Legality decides whether to keep the original version or the fused +/// version or both versions with runtime checks. LoopAccessAnalysis is used to +/// check dependence legality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopFuse.h" + +#define DEBUG_TYPE "loop-fuse" + +using namespace llvm; + +static cl::opt + LFuseVerify("loop-fuse-verify", cl::Hidden, + cl::desc("Turn on DominatorTree and LoopInfo verification " + "after Loop Fusion"), + cl::init(false)); + +STATISTIC(NumLoopsFused, "Number of loops fused"); + +// Replace IncomingBlocks in PHI nodes of @Br successors from Br's parent to +// @To. +void LoopFuse::RewritePHI(BranchInst *Br, BasicBlock *To) { + assert((Br && To)); + for (auto *S : Br->successors()) { + auto I = S->begin(); + while (PHINode *P = dyn_cast(&*I)) { + P->setIncomingBlock(P->getBasicBlockIndex(Br->getParent()), To); + ++I; + } + } +} + +//===----------------------------------------------------------------------===// +// Loop Fusion Implementation. +// The idea to check fusion legality is by first fusing the loops and then look +// for fusion preventing dependences. This is done by versioning the loops +// first. The check is done on versioned loops and one of the version is +// discarded based on legality's success. +//===----------------------------------------------------------------------===// + +/* Fuse loops @L1 and @L2. Remove ConnectingBlock (CB) and connect L1Latch to + L2Header. Loop from L2Latch to L1Header. Make L1's indvar as indvar for the + fused loop. Update LI by moving L2Blocks into L1 and call L1 as FusedLoop. + Return FusedLoop. + L1 + | L1Blocks + CB --> | \ + | L2Blocks/ + L2 |/ +*/ +Loop *LoopFuse::FuseLoops(Loop &L1, Loop &L2) { + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + + BranchInst *Br1 = dyn_cast(L1.getLoopLatch()->getTerminator()); + BranchInst *Br2 = dyn_cast(L2.getLoopLatch()->getTerminator()); + + // Make Br2 to branch to L1 header based on Br1's condition. + unsigned LoopBack = 0; + if (Br2->getSuccessor(1) == L2.getHeader()) + LoopBack = 1; + assert((Br2->getSuccessor(LoopBack) == L2.getHeader())); + Br2->setSuccessor(LoopBack, L1.getHeader()); + Br2->setCondition(Br1->getCondition()); + RewritePHI(Br1, Br2->getParent()); + + // Zap L2 preheader and unconditionally branch from L1 latch to L2 header. + // L2 preheader is a connecting block and it is known to contain only an + // unconditional branch to L2 header. + BasicBlock *L2PH = L2.getLoopPreheader(), *L2Header = L2.getHeader(); + BranchInst *L2PHBr = dyn_cast(L2PH->getTerminator()); + RewritePHI(L2PHBr, Br1->getParent()); + DT->changeImmediateDominator(L2Header, L1.getLoopLatch()); + + BranchInst::Create(L2Header, Br1); + Br1->eraseFromParent(); + L2PH->dropAllReferences(); + L2PHBr->eraseFromParent(); + L2PH->eraseFromParent(); + DT->eraseNode(L2PH); + LI->removeBlock(L2PH); + + P2->replaceAllUsesWith(P1); + P2->eraseFromParent(); + + // Update LI. + // Move all blocks from L2 to L1. + SmallVector L2BBs; + for (auto bb = L2.block_begin(), bbe = L2.block_end(); bb != bbe; ++bb) + L2BBs.push_back(*bb); + for (auto *bb : L2BBs) { + LI->removeBlock(bb); + L1.addBasicBlockToLoop(bb, *LI); + } + // Remove L2. + SE->forgetLoop(&L2); + LI->markAsRemoved(&L2); + + // Update DT: DT changed only at L2PH zap and was updated during zapping. + + return &L1; +} + +/* Version the given loops along a parallel path and fuse the cloned loops. + Check the dependence legality of the fused loop. + + L1PH BooleanBB BooleanBB + | /\ /\ + L1 L1PH L1PH.clone L1PH FusedPH + | version | | Fuse along | | + CB (L1Exit/L2PH) ----> L1 L1.clone --------> L1 L1Blocks + | | | versioned | | \ + L2 CB CB.clone path CB L2Blocks | + | | | | | |/ + L2Exit L2 L2.clone L2 | + \ / \ / + L2Exit CommonExit + CB is ConnectingBlock. +*/ +bool LoopFuse::DependenceLegal(Loop &L1, Loop &L2) { + + // Version to fuse. LoopVersioning is not used here because: + // a. Runtime checks are inserted later. + // b. Intermediate VMap updates are required. + // Moreover it is convenient for now to just clone and remap. + BasicBlock *BooleanBB = L1.getLoopPreheader(); + BasicBlock *L1PH = SplitEdge(BooleanBB, L1.getHeader(), DT, LI); + + ValueToValueMapTy VMap1; + SmallVector ClonedBBs1; + Loop *ClonedLoop1 = + cloneLoopWithPreheader(L1.getExitBlock(), BooleanBB, &L1, VMap1, + Twine(".L1clone"), LI, DT, ClonedBBs1); + + ValueToValueMapTy VMap2; + SmallVector ClonedBBs2; + Loop *ClonedLoop2 = + cloneLoopWithPreheader(L2.getExitBlock(), L1.getExitBlock(), &L2, VMap2, + Twine(".L2clone"), LI, DT, ClonedBBs2); + remapInstructionsInBlocks(ClonedBBs2, VMap2); + VMap1[L1.getExitBlock()] = ClonedLoop2->getLoopPreheader(); + remapInstructionsInBlocks(ClonedBBs1, VMap1); + + // Build the custom VMap by concatenating VMap1 and VMap2. + for (auto V : VMap1) + VMap[V->first] = V->second; + for (auto V : VMap2) + VMap[V->first] = V->second; + + // VMap.size() != VMap1.size() + VMap2.size() because of redundants and + // L1Exit update in VMap1 above. + + // Branch to either of the versions - using a boolean flag. + Instruction *Term = BooleanBB->getTerminator(); + FusionSwitcher = + BranchInst::Create(L1PH, ClonedLoop1->getLoopPreheader(), + ConstantInt::getTrue(L1PH->getContext()), Term); + Term->eraseFromParent(); + + // The two versions join back at L2 exit. Update DT. + if (DT->dominates(L2.getLoopLatch(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), BooleanBB); + + DEBUG(dbgs() << "ClonedLoop1: " << *ClonedLoop1 << "\n"); + DEBUG(dbgs() << "ClonedLoop2: " << *ClonedLoop2 << "\n"); + + FusedLoop = FuseLoops(*ClonedLoop1, *ClonedLoop2); + DEBUG(dbgs() << "FusedLoop: " << *FusedLoop << "\n"); + + // Check dependences. + DEBUG(dbgs() << "Loop fused on versioned path. Checking dependences...\n"); + LAI = &LAA->getInfo(FusedLoop, ValueToValueMap()); + DEBUG(LAI->print(dbgs())); + + auto Dependences = LAI->getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) { + DEBUG(dbgs() << "Failed to get dependences to check fusion legality!" + << " Skipping...\n"); + return false; + } + + // Fusion is illegal if there is a backward dependence between memory accesses + // whose source was in L1 and sink was in L2. ClonedBBs1 and ClonedBBs2 + // contain cloned BBs from L1 and L2 respectively. They are used to check the + // containment of srouce and sink. + for (auto &Dep : *Dependences) { + if (Dep.isPossiblyBackward()) { + Instruction *Source = Dep.getSource(*LAI); + Instruction *Sink = Dep.getDestination(*LAI); + if (std::find(ClonedBBs1.begin(), ClonedBBs1.end(), + Source->getParent()) == ClonedBBs1.end()) + continue; + if (std::find(ClonedBBs2.begin(), ClonedBBs2.end(), Sink->getParent()) == + ClonedBBs2.end()) + continue; + DEBUG(dbgs() << "Loop carried backward dependence prevents fusion!\n"); + return false; + } + } + DEBUG(dbgs() << "Loops are dependence legal to fuse!\n"); + return true; +} + +// Return true if any of the defs made in @L1 is used inside @L2. +bool LoopFuse::DefsUsedAcrossLoops(Loop &L1, Loop &L2) { + auto DefsUsedOutsideL1 = findDefsUsedOutsideOfLoop(&L1); + for (auto *D : DefsUsedOutsideL1) { + for (auto *U : D->users()) { + if (L2.contains(dyn_cast(U)->getParent())) + return true; + } + } + return false; +} + +bool LoopFuse::IsLegalAndProfitable(Loop &L1, Loop &L2) { + // Basic legality. + if (!L1.empty() || !L2.empty()) { + // TODO: Update cloneLoopWithPreheader() to update LoopInfo for subloops + // too and LoopFusion can be done for loops at any depth. + DEBUG(dbgs() << "Not innermost loops! Skipping...\n"); + return false; + } + + if (L1.getLoopDepth() != L2.getLoopDepth()) { + DEBUG(dbgs() << "Loops not at same depth! Skipping...\n"); + return false; + } + + if (!L1.getLoopPreheader() || !L2.getLoopPreheader()) { + DEBUG(dbgs() << "No preheader! Skipping...\n"); + return false; + } + + if (!L1.getExitBlock() || !L2.getExitBlock()) { + DEBUG(dbgs() << "Single exit block not found! Skipping...\n"); + return false; + } + + // Can fuse only bottom-tested loops and loops with latch being the single + // exiting block. + if ((L1.getExitingBlock() != L1.getLoopLatch()) || + (L2.getExitingBlock() != L2.getLoopLatch())) { + DEBUG(dbgs() << "Not a bottom-tested loop! Skipping...\n"); + return false; + } + + // Can fuse only adjacent loops. Adjacency is defined by: + // a. L1Exit has single entry only from L1Latch. + // b. L1Exit and L2Preheader are same i.e the block forms the ConnectingBlock. + // c. ConnectingBlock just branches unconditionally to L2Header. + auto *Br = dyn_cast(L1.getExitBlock()->begin()); + if ((L1.getExitBlock()->getSinglePredecessor() != L1.getLoopLatch()) || + (L1.getExitBlock() != L2.getLoopPreheader()) || + (!Br || Br->isConditional())) { + DEBUG(dbgs() << "Loops not adjacent! Skipping...\n"); + return false; + } + + // Indvars of both loops is known and canonicalized. + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + if (!P1 || !P2) { + DEBUG(dbgs() << "Unknown induction variables! Skipping...\n"); + return false; + } + + // P1 and P2 are canonical indvars. Backedge taken count check is enough to + // ascertain both loops have same iteration space. + if (SE->getBackedgeTakenCount(&L1) != SE->getBackedgeTakenCount(&L2)) + return false; + + // Cannot fuse if there are uses of L1 defs in L2. + if (DefsUsedAcrossLoops(L1, L2)) + return false; + + // Dependene based legality. + if (!DependenceLegal(L1, L2)) + return false; + + // TODO: Add profitability measures. + + return true; +} + +// Remove Loop @L completely by deleting the BBs and also from @LI, @DT and @SE +// including preheader. Finally connect the single predecessor (the BooleanBB +// that contains FusionSwitcher) of preheader to loop exit. +void LoopFuse::RemoveLoopCompletelyWithPreheader(Loop &L) { + DEBUG(dbgs() << "Removing loop: " << L << "\n"); + BasicBlock *PH = L.getLoopPreheader(); + BasicBlock *Exit = L.getExitBlock(); + assert(Exit && "Expected Exit bb and single pred to preheader!"); + + // No need to RewritePHIs of Exit block given the Loop is deleted because the + // uses remain same if FusedLoop is removed OR uses are already replaced if + // original loops are deleted. + + // Branch to Exit block from FusionSwitcher. + unsigned SuccNum = 0; + if (FusionSwitcher->getSuccessor(1) == PH) + SuccNum = 1; + assert((FusionSwitcher->getSuccessor(SuccNum) == PH)); + FusionSwitcher->setSuccessor(SuccNum, Exit); + if (DT->dominates(L.getLoopLatch(), Exit)) // L1 removal case. + // Exit blocks iDom is FusionSwitcher's block due to versioning. + DT->changeImmediateDominator(Exit, FusionSwitcher->getParent()); + + // Erase each of the loop blocks. Update SE, DT and LI. + SE->forgetLoop(&L); + PH->dropAllReferences(); + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) { + DT->changeImmediateDominator(*bb, PH); + (*bb)->dropAllReferences(); + } + + PH->eraseFromParent(); + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) { + // Now nuke bb and its DT. + (*bb)->eraseFromParent(); + DT->eraseNode(*bb); + } + DT->eraseNode(PH); + + SmallVector LBBs; + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) + LBBs.push_back(*bb); + for (auto *bb : LBBs) + LI->removeBlock(bb); + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); + + LI->markAsRemoved(&L); +} + +// Remove FusionSwitcher and branch directly to given loop @L's header. This +// removes loop's preheader and make FusionSwitcher's block as preheader. +void LoopFuse::RemoveFusionSwitcher(Loop &L) { + assert(FusionSwitcher->isConditional()); + DEBUG(dbgs() << "Removing FusionSwitcher: " << *FusionSwitcher << "\n"); + + BasicBlock *PH = L.getLoopPreheader(); + assert((PH->size() == 1)); + + BranchInst *PHBr = dyn_cast(PH->getTerminator()); + assert(PHBr->isUnconditional()); + + RewritePHI(PHBr, FusionSwitcher->getParent()); + + PHBr->removeFromParent(); + PHBr->insertBefore(FusionSwitcher); + DT->changeImmediateDominator(L.getHeader(), FusionSwitcher->getParent()); + + FusionSwitcher->eraseFromParent(); + PH->eraseFromParent(); + DT->eraseNode(PH); + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); +} + +// Update the uses of defs that reach outside original loop with the defs made +// made in fused loop. +void LoopFuse::UpdateUsesOutsideLoop(Loop &L) { + for (auto *D : findDefsUsedOutsideOfLoop(&L)) { + auto VI = VMap.find(D); + if (VI == VMap.end()) + continue; + + for (auto *U : D->users()) { + if (!L.contains(dyn_cast(U)->getParent())) { + if (auto *P = dyn_cast(U)) { + // Replace U in PHI with + for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) { + if (P->getIncomingValue(i) == U) { + P->removeIncomingValue(i); + P->addIncoming(VI->second, FusedLoop->getLoopLatch()); + } + } + } else + U->replaceUsesOfWith(D, VI->second); + } + } + } +} + +// Add/update phi for defs that reach uses outside the loop from original loop +// @L and from fused loop. Insert the phis into fused loop's exit block, which +// is also the exit block of original L2 loop. @OrigIncomingBlock refers to the +// block from where a def is reached outside of loop - L2 latch. +// TODO: This routine is similar to LoopVersioning's addPHINodes(), but +// rewritten here as access to internal data structures differ. +void LoopFuse::AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock) { + BasicBlock *PHIBlock = FusedLoop->getExitBlock(); + assert(PHIBlock && "Unable to find FusedLoop's ExitBlock!"); + + for (auto *Inst : findDefsUsedOutsideOfLoop(&L)) { + PHINode *PN = nullptr; + auto FusedInst = VMap.find(Inst); + assert((FusedInst != VMap.end()) && + "Expected an equivalent instruction in fused loop!"); + // Update/add phi node for this Inst. + bool FoundInst = false; + for (auto I = PHIBlock->begin(); !FoundInst && (PN = dyn_cast(I)); + ++I) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); !FoundInst && i != e; + ++i) + if (PN->getIncomingValue(i) == Inst) + FoundInst = true; + } + if (!PN) { + PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lfuse", + &PHIBlock->front()); + + for (auto *U : Inst->users()) + if (!L.contains(dyn_cast(U)->getParent())) + U->replaceUsesOfWith(Inst, PN); + + PN->addIncoming(Inst, OrigIncomingBlock); + } + // Add incoming value from fused loop. + PN->addIncoming(FusedInst->second, FusedLoop->getLoopLatch()); + } +} + +bool LoopFuse::run(Loop &L1, Loop &L2) { + assert((LI && LAA && DT && SE)); + DEBUG(dbgs() << "\nTrying to fuse:\n" << L1 << "AND\n" << L2 << "\n"); + + FusionSwitcher = nullptr; + FusedLoop = nullptr; + VMap.clear(); + bool Changed = false; + if (IsLegalAndProfitable(L1, L2)) { + assert((FusedLoop && FusionSwitcher)); + auto *RuntimePtrChecks = LAI->getRuntimePointerChecking(); + if (RuntimePtrChecks->Need) { + // Add runtime checks and add/update phis in exit block for the defs + // reaching from two versions. + Instruction *FirstCheck, *LastCheck; + std::tie(FirstCheck, LastCheck) = LAI->addRuntimeChecks(FusionSwitcher); + // TODO: Add SCEVRuntime checks? + FusionSwitcher->setCondition(LastCheck); + + AddPHIsOutsideLoop(L1, L2.getLoopLatch()); + AddPHIsOutsideLoop(L2, L2.getLoopLatch()); + FusionKind = VERSIONED_FUSION; + + } else { + // Remove original loops and retain FusedLoop. Also update the uses of + // defs from original loops with the defs from fused loop. + UpdateUsesOutsideLoop(L1); + UpdateUsesOutsideLoop(L2); + RemoveLoopCompletelyWithPreheader(L1); + RemoveLoopCompletelyWithPreheader(L2); + + // Remove FusionSwitcher and directly point to FusedLoop header. + if (DT->dominates(FusionSwitcher->getParent(), FusedLoop->getExitBlock())) + DT->changeImmediateDominator(FusedLoop->getExitBlock(), + FusedLoop->getLoopLatch()); + RemoveFusionSwitcher(*FusedLoop); + FusionKind = PURE_FUSION; + } + ++NumLoopsFused; + Changed = true; + + } else { + if (FusedLoop) { + // Loops were versioned to check legality. Rollback to original state. + RemoveLoopCompletelyWithPreheader(*FusedLoop); + + // Remove FusionSwitcher and directly point to L1 header. + if (DT->dominates(FusionSwitcher->getParent(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), L2.getLoopLatch()); + RemoveFusionSwitcher(L1); + FusionKind = REVERTED_FUSION; + } + } + + if (LFuseVerify) { + LI->verify(); + DT->verifyDomTree(); + } + + return Changed; +} + +void PopulateInnermostLoopsOf(Loop &L, SmallVectorImpl &Loops) { + if (L.empty()) + Loops.push_back(&L); + for (auto I = L.begin(), E = L.end(); I != E; ++I) + PopulateInnermostLoopsOf(**I, Loops); +} + +bool LoopFuse::runOnFunction(Function &F) { + LI = &getAnalysis().getLoopInfo(); + LAA = &getAnalysis(); + DT = &getAnalysis().getDomTree(); + SE = &getAnalysis().getSE(); + + // Populate innermost loops and try a n^2 combination of loop fusion. + bool Changed = false; + SmallVector Loops; + for (auto L = LI->begin(), Le = LI->end(); L != Le; ++L) + PopulateInnermostLoopsOf(**L, Loops); + + auto L1 = Loops.begin(), L1e = Loops.end(); + while (L1 != L1e) { + auto L2 = Loops.begin(), L2e = Loops.end(); + while (L2 != L2e) { + if (L1 == L2) { + ++L2; + continue; + } + if (run(**L1, **L2)) { + // Remove L1 and L2 from Loops and add FusedLoop. + Loops.erase(L1); + Loops.erase(L2); + Loops.push_back(FusedLoop); + L1 = L2 = Loops.begin(); + L1e = L2e = Loops.end(); + Changed = true; + } else + ++L2; + } + ++L1; + } + + if (LFuseVerify) { + LI->verify(); + DT->verifyDomTree(); + assert((!verifyFunction(F, &dbgs())) && "Function verification failed!"); + } + + return Changed; +} + +char LoopFuse::ID; + +INITIALIZE_PASS_BEGIN(LoopFuse, "loop-fuse", "Loop Fusion", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopFuse, "loop-fuse", "Loop Fusion", false, false) + +namespace llvm { +FunctionPass *createLoopFusePass() { return new LoopFuse(); } +} Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -84,6 +84,7 @@ initializePlaceSafepointsPass(Registry); initializeFloat2IntPass(Registry); initializeLoopDistributePass(Registry); + initializeLoopFusePass(Registry); initializeLoopLoadEliminationPass(Registry); initializeLoopSimplifyCFGPass(Registry); initializeLoopVersioningPassPass(Registry); Index: test/Transforms/LoopFuse/fuse.ll =================================================================== --- /dev/null +++ test/Transforms/LoopFuse/fuse.ll @@ -0,0 +1,87 @@ +; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s + +; 'C' equivalent: Partially generated and hand modified. +; void fuse(int *a, int *b, int *c) { +; for (i = 0; i < 1000; ++i) // L1 +; c[i] = a[i] + c[i + 1]; +; for (i = 0; i < 1000; ++i) // L2 +; c[i] = a[i] + b[i]; +; } +; There is no backward dependence from L1 to L2. So it is safe to fuse. + +; Test that there are two versions - original loops and fused loop. +; CHECK: br i1 %memcheck.conflict, label %entry.split, label %entry.split.L1clone + +; Test for fusion along fused path. +; CHECK: for.body.L1clone: ; preds = %for.body.1.L2clone, %entry.split.L1clone +; CHECK: for.body.1.L2clone: ; preds = %for.body.L1clone +; CHECK: br i1 %exitcond.L1clone, label %for.end.loopexit.1, label %for.body.L1clone, !llvm.loop !1 + +; Test for merged defs and its uses outside the loops. +; CHECK: for.end.loopexit.1: ; preds = %for.body.1.L2clone, %for.body.1 +; CHECK: %add11.lfuse = phi i32 [ %add11, %for.body.1 ], [ %add11.L2clone, %for.body.1.L2clone ] +; CHECK: %add4.lfuse = phi i32 [ %add4, %for.body.1 ], [ %add4.L1clone, %for.body.1.L2clone ] +; CHECK: %outsideUse = add nsw i32 %add11.lfuse, %add4.lfuse + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label %for.end + +for.end: ; preds = %for.end.loopexit.1 + %outsideUse = add nsw i32 %add11, %add4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3} Index: test/Transforms/LoopFuse/no-fuse.ll =================================================================== --- /dev/null +++ test/Transforms/LoopFuse/no-fuse.ll @@ -0,0 +1,78 @@ +; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s + +; 'C' equivalent: Partially generated and hand modified. +; void noFuse(int *a, int *b, int *c) { +; for (i = 0; i < 1000; ++i) // L1 +; c[i] = a[i] + c[i - 1]; +; for (i = 0; i < 1000; ++i) // L2 +; c[i] = a[i] + b[i]; +; } +; There is a backward dependence from L1 to L2. So it is unsafe to fuse. + +; CHECK: entry: +; CHECK-NEXT: br label %for.body +; CHECK: for.body: ; preds = %for.body, %entry +; CHECK: for.body.1: + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %indvars.iv.next.back = add i64 %indvars.iv, -1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next.back + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label %for.end + +for.end: ; preds = %for.end.loopexit.1 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3}