Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -167,6 +167,7 @@ void initializeLoopDeletionPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopInfoWrapperPassPass(PassRegistry&); +void initializeLoopInterchangePass(PassRegistry &); void initializeLoopInstSimplifyPass(PassRegistry&); void initializeLoopRotatePass(PassRegistry&); void initializeLoopSimplifyPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -96,6 +96,7 @@ (void) llvm::createLICMPass(); (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); + (void)llvm::createLoopInterchangePass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopStrengthReducePass(); (void) llvm::createLoopRerollPass(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -132,6 +132,12 @@ //===----------------------------------------------------------------------===// // +// LoopInterchange - This pass is interchanges loops to give better cache hits. +// +Pass *createLoopInterchangePass(); + +//===----------------------------------------------------------------------===// +// // LoopStrengthReduce - This pass is strength reduces GEP instructions that use // a loop's canonical induction variable as one of their indices. // Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -17,6 +17,7 @@ LoopDeletion.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp + LoopInterchange.cpp LoopRerollPass.cpp LoopRotation.cpp LoopStrengthReduce.cpp Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -0,0 +1,873 @@ +//===- LoopInterchange.cpp - Loop interchange pass------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This Pass handles loop interchange transform. This interchanges the inner +// and outer loop if interchanging can result in better cache hits. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "loop-interchange" + +namespace { + +typedef std::pair LoopPair; + +unsigned getInnerLoopCount(Loop &L, unsigned level) { + int lev = 0; + if (L.empty()) + return level + 1; + for (Loop *InnerL : L) + lev = getInnerLoopCount(*InnerL, level + 1); + return lev; +} + +static void populateWorklist(Loop &L, SmallVector &V) { + // TODO: Currently only handled for loops depth of 2. + // Also handle loop depth of 2 more appropriatly. + if (getInnerLoopCount(L, 0) == 2) { + Loop *Inner; + for (Loop *InnerL : L) + Inner = InnerL; + V.push_back(std::make_pair(&L, Inner)); + } +} + +static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { + PHINode *innerIndexVar = L->getCanonicalInductionVariable(); + if (innerIndexVar) + return innerIndexVar; + for (BasicBlock::iterator I = L->getHeader()->begin(); isa(I); ++I) { + PHINode *phiVar = dyn_cast(I); + if (phiVar->getNumIncomingValues() != 2) + continue; + const SCEVAddRecExpr *AddRec = + dyn_cast(SE->getSCEV(phiVar)); + if (!AddRec) + continue; + const SCEV *Step = AddRec->getStepRecurrence(*SE); + const SCEVConstant *C = dyn_cast(Step); + if (!C) + continue; + // Found the induction variable. + // TODO: What if we have 2 induction variable? Currently legality makes sure + // we have only 1 induction variable + return phiVar; + } + return nullptr; +} + +/// LoopInterchangeLegality checks if it is legal to interchange the loop. +class LoopInterchangeLegality { +public: + LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + DependenceAnalysis *DA) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DA(DA) {} + + /// Check if the loops can be interchanged. + bool canInterchangeLoops(); + + bool currentLimitations(); + +private: + bool checkDependence(Loop *Outer, DependenceAnalysis *DA); + bool tightlyNested(Loop *Outer, Loop *Inner); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; + /// Dependence analysis. + DependenceAnalysis *DA; +}; + +/// LoopInterchangeProfitability checks if it is profitable to interchange the +/// loop. +class LoopInterchangeProfitability { +public: + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + + /// Check if the loop interchange is profitable + bool isProfitable(); + +private: + int getInstrOrderCost(PHINode *IV); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; +}; + +/// LoopInterchangeTransform interchanges the loop +class LoopInterchangeTransform { +public: + LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + LoopInfo *LI, DominatorTree *DT) + : outerLoop(Outer), innerLoop(Inner), SE(SE), LI(LI), DT(DT) { + initialize(); + } + + /// Interchange OuterLoop and InnerLoop. + bool transform(); + void initialize(); + +private: + void splitInnerLoopLatch(Instruction *); + void splitOuterLoopLatch(); + void splitInnerLoopHeader(); + void adjustOuterLoopLatch(); + bool adjustLoopLinks(); + void adjustOuterLoopPreheader(); + bool adjustLoopBranches(); + + Loop *outerLoop; + Loop *innerLoop; + BasicBlock *innerLoopHeader; + BasicBlock *outerLoopHeader; + BasicBlock *innerLoopLatch; + BasicBlock *outerLoopLatch; + BasicBlock *outerLatchLCSSAPHIBlock; + BasicBlock *outerLoopPreHeader; + BasicBlock *innerLoopPreHeader; + PHINode *outerIndexVar; + BasicBlock *outerLoopSuccessor; + // Instruction* innerIndexVarInc; + BasicBlock *innerLoopLatchPred; + BasicBlock *innerLoopHeaderSucc; + std::vector> interchangedLoops; + /// Scev analysis. + ScalarEvolution *SE; + LoopInfo *LI; + DominatorTree *DT; +}; + +// Main LoopInterchange Pass +struct LoopInterchange : public FunctionPass { + static char ID; + ScalarEvolution *SE; + LoopInfo *LI; + DependenceAnalysis *DA; + DominatorTree *DT; + LoopInterchange() + : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { + initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + } + + bool runOnFunction(Function &F) override { + SE = &getAnalysis(); + LI = &getAnalysis().getLoopInfo(); + DA = &getAnalysis(); + auto *DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + // Build up a worklist of loop pairs to analyze. + // [TODO] Currently only supports loop with level 2. + // Handle for loops greater than level 2. + SmallVector Worklist; + + for (Loop *L : *LI) + populateWorklist(*L, Worklist); + + DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); + + bool Changed = false; + while (!Worklist.empty()) + Changed |= processLoop(Worklist.pop_back_val()); + + return Changed; + } + + bool processLoop(LoopPair P) { + + // Check if it is legal to interchange loop + LoopInterchangeLegality LIL(P.first, P.second, SE, DA); + if (!LIL.canInterchangeLoops()) { + DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); + return false; + } + DEBUG(dbgs() << "Loops are legal to interchange\n"); + + LoopInterchangeProfitability LIP(P.first, P.second, SE); + if (!LIP.isProfitable()) { + DEBUG(dbgs() << "Interchanging Loops not profitable\n"); + return false; + } + LoopInterchangeTransform LIT(P.first, P.second, SE, LI, DT); + LIT.transform(); + DEBUG(dbgs() << "Loops interchanged\n"); + return true; + } +}; + +} // end of namespace + +bool LoopInterchangeLegality::tightlyNested(Loop *outerLoop, Loop *innerLoop) { + BasicBlock *outerLoopHeader = outerLoop->getHeader(); + BasicBlock *innerLoopPreHeader = innerLoop->getLoopPreheader(); + BasicBlock *outerLoopLatch = outerLoop->getLoopLatch(); + unsigned PHIcount = 0; + // A perfectly nested loop will not have any branch in between the outer and + // inner block i.e. outer header will branch to either inner preheader or + // outerloop latch. + BranchInst *outerLoopHeaderBI = + dyn_cast(outerLoopHeader->getTerminator()); + unsigned num = outerLoopHeaderBI->getNumSuccessors(); + for (unsigned i = 0; i < num; i++) { + if (outerLoopHeaderBI->getSuccessor(i) != innerLoopPreHeader && + outerLoopHeaderBI->getSuccessor(i) != outerLoopLatch) + return false; + } + + // We do not have any branch in between now make sure the outer header and + // outer loop latch doesnt contain any extra instructions. + + // The idea here is that we only need to check for usage of induction + // variables in header/latch to find extra instructions in the loop.Any loop + // independent instructions would anyhow have been sinked/hoised out of the + // loop by licm. + PHINode *indVar = getInductionVariable(outerLoop, SE); + unsigned numUsageinHeader = 0; + unsigned numUsageinLatch = 0; + for (auto I = indVar->user_begin(), E = indVar->user_end(); I != E; ++I) { + User *Use = *I; + Instruction *Inst = dyn_cast(Use); + if (Inst->getParent() == outerLoopHeader) + numUsageinHeader++; + if (Inst->getParent() == outerLoopLatch) + numUsageinLatch++; + } + + // We check total number of usages in latch and header as it is observed in + // some cases the induction usage is moved to header and in some cases it is + // in the loop latch. + if (numUsageinLatch + numUsageinHeader != 1) + return false; + + // Finally check there are no call/reductions in the header/latch block. + for (auto I = outerLoopHeader->begin(), E = outerLoopHeader->end(); I != E; + ++I) { + if (isa(I)) + return false; + if (isa(I)) { + PHIcount++; + if (PHIcount > 1) + return false; + } + } + + PHIcount = 0; + for (auto I = outerLoopLatch->begin(), E = outerLoopLatch->end(); I != E; + ++I) { + if (isa(I)) + return false; + if (isa(I)) { + PHIcount++; + if (PHIcount > 1) + return false; + } + } + + // We have a perfect loop nest. + return true; +} + +bool LoopInterchangeLegality::checkDependence(Loop *Outer, + DependenceAnalysis *DA) { + + typedef SmallVector ValueVector; + // Holds Load and Store *instructions*. + ValueVector MemInstr; + // For each block. + for (Loop::block_iterator bb = Outer->block_begin(), be = Outer->block_end(); + bb != be; ++bb) { + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Instruction *I = dyn_cast(it); + if (!I) + return false; + LoadInst *Ld = dyn_cast(it); + StoreInst *St = dyn_cast(it); + if (!St && !Ld) + continue; + if (Ld && !Ld->isSimple()) + return false; + if (St && !St->isSimple()) + return false; + MemInstr.push_back(I); + } + } + + DEBUG(dbgs() << "Found " << MemInstr.size() + << " Loads and stores to analyze\n"); + + ValueVector::iterator I, IE, J, JE; + for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { + for (J = I, JE = MemInstr.end(); J != JE; ++J) { + Instruction *Src = dyn_cast(*I); + Instruction *Des = dyn_cast(*J); + if (Src == Des) + continue; + if (auto D = DA->depends(Src, Des, true)) { + // TODO: Fix his handle only anti/output dep for now. + if (D->isFlow()) { + // TODO: Flow dependency can be interchanged?? + DEBUG(dbgs() << "Flow dependence not handled"); + return false; + } + if (D->isAnti()) { + DEBUG(dbgs() << "Found Anti dependence \n"); + unsigned Levels = D->getLevels(); + + // If the two memory instructions have an anti dependence check + // the distance or the direction by which they vary. + // Interchanging two loops with anti dependence is valid if the + // dependence distance is *not* positive in each level. + for (unsigned II = 1; II <= Levels; ++II) { + const SCEV *Distance = D->getDistance(II); + const SCEVConstant *SCEVConst = + dyn_cast_or_null(Distance); + if (SCEVConst) { + const ConstantInt *CI = SCEVConst->getValue(); + if (!CI || (!CI->isNegative() && !CI->isZeroValue())) + return false; + } else if (D->isScalar(II)) { + DEBUG(dbgs() + << "TODO:Scalars dependence are currently not handled\n"); + return false; + } else { + unsigned Direction = D->getDirection(II); + if (Direction == Dependence::DVEntry::LT || + Direction == Dependence::DVEntry::LE || + Direction == Dependence::DVEntry::EQ) + continue; + return false; + } + } + } + } + } + } + + return true; +} + +// This function indicates the current limitations in the transform as a result +// of which we do not proceed. +bool LoopInterchangeLegality::currentLimitations() { + + BasicBlock *innerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *innerLoopHeader = InnerLoop->getHeader(); + BasicBlock *innerLoopLatch = InnerLoop->getLoopLatch(); + unsigned userCount = 0; + int PhiCount = 0; + PHINode *PHI; + + // We currently handle only 1 induction variable inside the loop. We also do + // not handle reductions as of now. + for (auto I = innerLoopHeader->begin(), E = innerLoopHeader->end(); I != E; + ++I) { + if (isa(I)) { + PHI = dyn_cast(I); + PhiCount++; + } + if (PhiCount > 1) + return true; + } + + // TODO: Current limitation: Since we split the inner loop latch at the point + // were induction variable is incremented (induction.next); We cannot have + // more than 1 user of induction.next since it would result in broken code + // after split. + // e.g. + // for(i=0;igetIncomingBlock(0) == innerLoopPreHeader) + innerIndexVarInc = dyn_cast(PHI->getIncomingValue(1)); + else + innerIndexVarInc = dyn_cast(PHI->getIncomingValue(0)); + for (auto UI = innerIndexVarInc->user_begin(), + UE = innerIndexVarInc->user_end(); + UI != UE; ++UI) { + Instruction *II = dyn_cast(*UI); + BasicBlock *BB = II->getParent(); + if (BB == innerLoopLatch) + userCount++; + if (userCount > 1) + return true; + } + return false; +} + +bool LoopInterchangeLegality::canInterchangeLoops() { + + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!OuterLoop->getLoopPreheader() || !InnerLoop->getLoopPreheader()) { + DEBUG(dbgs() << "loop control flow is not understood"); + return false; + } + // ScalarEvolution needs to be able to find the exit count. + const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(OuterLoop); + const SCEV *ExitCountInner = SE->getBackedgeTakenCount(InnerLoop); + if (ExitCountOuter == SE->getCouldNotCompute() || + ExitCountInner == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "Could not determine number of loop iterations\n"); + return false; + } + // We must have a single backedge. + if (OuterLoop->getNumBackEdges() != 1 || InnerLoop->getNumBackEdges() != 1) { + DEBUG(dbgs() << "loop control flow is not understood"); + return false; + } + // We must have a single exiting block. + if (!OuterLoop->getExitingBlock() || !InnerLoop->getExitingBlock()) { + DEBUG(dbgs() << "loop control flow is not understood"); + return false; + } + // Check if the loops are tightly nested. + if (!tightlyNested(OuterLoop, InnerLoop)) { + DEBUG(dbgs() << "Loops not tightly nested\n"); + return false; + } + // TODO: The loops could not be interchanged due to current limitations in the + // transform module. + if (currentLimitations()) { + DEBUG(dbgs() << "Not legal because of current transform limitation\n"); + return false; + } + + return checkDependence(OuterLoop, DA); +} + +int LoopInterchangeProfitability::getInstrOrderCost(PHINode *IV) { + unsigned goodOrder, badOrder; + badOrder = goodOrder = 0; + for (auto IB = IV->user_begin(), IE = IV->user_end(); IB != IE; ++IB) { + Instruction *UseInstr = cast(*IB); + if (isa(UseInstr)) { + GetElementPtrInst *GEP = dyn_cast(UseInstr); + if (GEP->getOperand(GEP->getNumOperands() - 1) == IV) + goodOrder += 1; + else + badOrder += 1; + } + } + return goodOrder - badOrder; +} + +bool LoopInterchangeProfitability::isProfitable() { + // TODO: Add Better Profitibility checks. + // e.g + // 1) Construct dependency matrix and move the one with no loop carried dep + // inside to enable vectorization. + // 2) If reordering results in inner loop having stride of 1 etc. + + // This is rough cost estimation algorithm. It counts the good and bad order + // of induction variables in the instruction and allows reordering if number + // of bad orders is more than good. + int Cost = 0; + for (BasicBlock::iterator I = InnerLoop->getHeader()->begin(); + isa(I); ++I) { + Cost += getInstrOrderCost(cast(I)); + } + DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < 0) + return true; + return false; +} + +bool LoopInterchangeTransform::transform() { + + DEBUG(dbgs() << "transform\n"); + bool transformed = false; + Instruction *innerIndexVar; + PHINode *inductionPHI = getInductionVariable(innerLoop, SE); + if (!inductionPHI) { + DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); + return false; + } + + if (inductionPHI->getIncomingBlock(0) == innerLoopPreHeader) + innerIndexVar = dyn_cast(inductionPHI->getIncomingValue(1)); + else + innerIndexVar = dyn_cast(inductionPHI->getIncomingValue(0)); + + // Split at the place were the induction variable is incremented/decremented. + // TODO: This splitting logic may not work always. Fix this. + splitInnerLoopLatch(innerIndexVar); + DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); + + // Splits the inner loops phi nodes out into a seperate basic block. + splitInnerLoopHeader(); + DEBUG(dbgs() << "splitInnerLoopHeader Done\n"); + + // Splits the LCSSA PHI nodes into a seperate block. + splitOuterLoopLatch(); + DEBUG(dbgs() << "splitOuterLoopLatch Done\n"); + + adjustOuterLoopLatch(); + + transformed |= adjustLoopLinks(); + if (!transformed) + DEBUG(dbgs() << "adjustLoopLinks Failed\n"); + return true; +} + +void LoopInterchangeTransform::initialize() { + innerLoopHeader = innerLoop->getHeader(); + outerLoopHeader = outerLoop->getHeader(); + innerLoopLatch = innerLoop->getLoopLatch(); + outerLoopLatch = outerLoop->getLoopLatch(); + outerLoopPreHeader = outerLoop->getLoopPreheader(); + innerLoopPreHeader = innerLoop->getLoopPreheader(); + outerIndexVar = outerLoop->getCanonicalInductionVariable(); +} + +void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *inc) { + BasicBlock::iterator I = innerLoopLatch->begin(); + BasicBlock::iterator E = innerLoopLatch->end(); + for (; I != E; ++I) { + if (inc == I) + break; + } + + // Split the inner loop latch out. + innerLoopLatchPred = innerLoopLatch; + innerLoopLatch = innerLoopLatch->splitBasicBlock(I); +} + +void LoopInterchangeTransform::splitOuterLoopLatch() { + outerLatchLCSSAPHIBlock = outerLoopLatch; + outerLoopLatch = + outerLoopLatch->splitBasicBlock(outerLoopLatch->getFirstNonPHI()); +} + +void LoopInterchangeTransform::adjustOuterLoopLatch() { + + for (auto BI = outerLoop->block_begin(), BE = outerLoop->block_end(); + BI != BE; ++BI) { + BranchInst *BInstr = dyn_cast((*BI)->getTerminator()); + unsigned NumOp = BInstr->getNumSuccessors(); + for (unsigned i = 0; i < NumOp; ++i) { + if (BInstr->getSuccessor(i) == outerLatchLCSSAPHIBlock) { + BInstr->setSuccessor(i, outerLoopLatch); + } + } + } + + // Now set outerLatchLCSSAPHIBlock as successor of outerLoopLatch. + BranchInst *BInstr = dyn_cast(outerLoopLatch->getTerminator()); + BasicBlock *LoopNestExit; + if (BInstr->getSuccessor(0) == outerLoopHeader) { + LoopNestExit = BInstr->getSuccessor(1); + BInstr->setSuccessor(1, outerLatchLCSSAPHIBlock); + } else { + LoopNestExit = BInstr->getSuccessor(0); + BInstr->setSuccessor(0, outerLatchLCSSAPHIBlock); + } + + // Incoming block changed adjust PHI nodes in outerLatchLCSSAPHIBlock. + // One block will branch from outer pre header after checking the condition + // for inner loop and another from inner loop latch. + for (auto I = outerLatchLCSSAPHIBlock->begin(), + E = outerLatchLCSSAPHIBlock->end(); + I != E; ++I) { + if (PHINode *LCSSAPHI = dyn_cast(I)) { + unsigned NumOp = LCSSAPHI->getNumIncomingValues(); + for (unsigned i = 0; i < NumOp; ++i) { + if (LCSSAPHI->getIncomingBlock(i) != outerLoopHeader) + LCSSAPHI->setIncomingBlock(i, innerLoopLatch); + else + LCSSAPHI->setIncomingBlock(i, outerLoopPreHeader); + } + } + } + + BInstr = dyn_cast(outerLatchLCSSAPHIBlock->getTerminator()); + BInstr->setSuccessor(0, LoopNestExit); + // Incoming block changed adjust PHI nodes in LoopNestExit + for (auto I = LoopNestExit->begin(), E = LoopNestExit->end(); I != E; ++I) { + if (PHINode *PHI = dyn_cast(I)) { + unsigned NumOp = PHI->getNumIncomingValues(); + for (unsigned i = 0; i < NumOp; ++i) { + if (PHI->getIncomingBlock(i) == outerLoopLatch) + PHI->setIncomingBlock(i, outerLatchLCSSAPHIBlock); + } + } + } +} + +void LoopInterchangeTransform::splitInnerLoopHeader() { + + // Split the inner loop header out. + innerLoopHeaderSucc = + innerLoopHeader->splitBasicBlock(innerLoopHeader->getFirstNonPHI()); + + DEBUG(dbgs() << "Output of splitInnerLoopHeader innerLoopHeaderSucc & " + "innerLoopHeader \n"); +} + +void LoopInterchangeTransform::adjustOuterLoopPreheader() { + // Adjust the outerLoop preheader to jump to inner loop preheader and + // if the + BranchInst *outerLoopHeaderBI = + dyn_cast(outerLoopHeader->getTerminator()); + BranchInst *outerLoopPreHeaderBI = + dyn_cast(outerLoopPreHeader->getTerminator()); + BranchInst *ExitInst = dyn_cast(outerLoopLatch->getTerminator()); + + BasicBlock *TrueBlock; + BasicBlock *FalseBlock; + BasicBlock *LoopNestExitBlock; + if (outerLoopHeaderBI->isUnconditional()) { + BranchInst::Create(innerLoopPreHeader, outerLoopPreHeaderBI); + outerLoopPreHeaderBI->eraseFromParent(); + return; + } + // Find the loop nest exit block. + if (ExitInst->getSuccessor(0) == outerLoopHeader) + LoopNestExitBlock = dyn_cast(ExitInst->getSuccessor(1)); + else + LoopNestExitBlock = dyn_cast(ExitInst->getSuccessor(0)); + // OuterLoopPreheader will branch to inner loop preheader and exit to Loop + // nest exit based on inner loops branch condition. On exit condition we + // should now branch to loop exit. + // Rewrite the Branch instruction to handle this. + if (outerLoopHeaderBI->getSuccessor(0) == innerLoopPreHeader) { + TrueBlock = innerLoopPreHeader; + FalseBlock = LoopNestExitBlock; + } else { + FalseBlock = innerLoopPreHeader; + TrueBlock = LoopNestExitBlock; + } + BranchInst::Create(TrueBlock, FalseBlock, outerLoopHeaderBI->getCondition(), + outerLoopPreHeaderBI); + outerLoopPreHeaderBI->eraseFromParent(); + return; +} + +bool LoopInterchangeTransform::adjustLoopBranches() { + + SmallVector LoopPhis; + SmallVector DepInstr; + BranchInst *innerPreheaderBI = + dyn_cast(innerLoopPreHeader->getTerminator()); + BranchInst *innerHeaderBI = + dyn_cast(innerLoopHeader->getTerminator()); + BranchInst *outerHeaderBI = + dyn_cast(outerLoopHeader->getTerminator()); + BranchInst *innerLoopLatchPredBI = + dyn_cast(innerLoopLatchPred->getTerminator()); + BranchInst *outerLoopLatchBI = + dyn_cast(outerLoopLatch->getTerminator()); + BranchInst *innerLoopLatchBI = + dyn_cast(innerLoopLatch->getTerminator()); + BranchInst *innerLoopHeaderSuccBI = + dyn_cast(innerLoopHeaderSucc->getTerminator()); + + BasicBlock *LoopNestExit; + if (!innerPreheaderBI || !innerHeaderBI || !innerPreheaderBI || + !innerLoopLatchPredBI || !outerLoopLatchBI || !innerLoopLatchBI || + !innerLoopHeaderSuccBI) + assert(0 && "This should not be triggered.We have already modified parts " + "of the loop"); + + // Update outerLoopHeader PHI nodes as incoming block has changed. + // Collect all Phi nodes and instructions in the outer loop header. + for (auto I = outerLoopHeader->begin(), E = outerLoopHeader->end(); I != E; + ++I) { + Instruction *Ins = I; + if (isa(I)) + LoopPhis.push_back(dyn_cast(I)); + for (auto UI = Ins->user_begin(), UE = Ins->user_end(); UI != UE; ++UI) { + User *Use = *UI; + Instruction *U = dyn_cast(Use); + if (!U) + continue; + // find any instuctions in inner loop preheader which may requires + // instructions in outer loop pre header + if (U->getParent() == innerLoopPreHeader) { + DepInstr.push_back(U); + } + } + } + + // Move these dependent instructions to the *new* inner loop header(old outer + // loop header) + // Insert in the same order as it was present use rbegin and rend + for (auto I = DepInstr.rbegin(), E = DepInstr.rend(); I != E; ++I) { + Instruction *Ins = *I; + Ins->moveBefore(outerLoopHeader->getTerminator()); + } + + // Create an unconditional branch to the new splitted innerLoopHeader + // from inner loop preheader. Erase the old branch instruction. + BranchInst::Create(innerLoopHeader, innerPreheaderBI); + innerPreheaderBI->eraseFromParent(); + + // Branch from the old inner loop header to outer loop header. + // Erase old branch instruction. + BranchInst::Create(outerLoopHeader, innerHeaderBI); + innerHeaderBI->eraseFromParent(); + + // Adjust Phi nodes of the outer loop header. The previous incoming + // outerLoopPreHeader + // is now gone and the new incoming block is innerLoopHeader. + while (!LoopPhis.empty()) { + PHINode *CurrIV = LoopPhis.pop_back_val(); + unsigned numIncomingBlocks = CurrIV->getNumIncomingValues(); + for (unsigned i = 0; i < numIncomingBlocks; ++i) { + if (CurrIV->getIncomingBlock(i) == outerLoopPreHeader) { + CurrIV->setIncomingBlock(i, innerLoopHeader); + } + } + } + + // The outer header will now branch to the innerLoopHeaderSucc(which was + // obtained after spiltting PHI nodes of the inner loop header) instead of + // inner loop preheader. + BranchInst::Create(innerLoopHeaderSucc, outerHeaderBI); + outerHeaderBI->eraseFromParent(); + + BasicBlock *InnerLoopExitBlock; + BasicBlock *InnerLoopExitIncomingBlock = nullptr; + // TODO: Need to update phi nodes if any in innerLoopLatch? + if (innerLoopLatchBI->getSuccessor(0) == innerLoopHeader) + InnerLoopExitBlock = innerLoopLatchBI->getSuccessor(1); + else + InnerLoopExitBlock = innerLoopLatchBI->getSuccessor(0); + if (innerLoopHeaderSuccBI->getSuccessor(0) == innerLoopLatch) { + innerLoopHeaderSuccBI->setSuccessor(0, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = innerLoopHeaderSucc; + } else if (!innerLoopHeaderSuccBI->isUnconditional() && + innerLoopHeaderSuccBI->getSuccessor(1) == innerLoopLatch) { + innerLoopHeaderSuccBI->setSuccessor(1, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = innerLoopHeaderSucc; + } + + // If the split did't result in same basic block adjust innerLoopLatchPred + if (innerLoopHeaderSuccBI != innerLoopLatchPredBI) { + if (innerLoopLatchPredBI->getSuccessor(0) == innerLoopLatch) { + innerLoopLatchPredBI->setSuccessor(0, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = innerLoopLatchPred; + } else if (!innerLoopLatchPredBI->isUnconditional() && + innerLoopLatchPredBI->getSuccessor(1) == innerLoopLatch) { + innerLoopLatchPredBI->setSuccessor(1, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = innerLoopLatchPred; + } + } + + // Update lcssa phi's in exitblock of old inner loop as incoming block has + // changed. + for (auto I = InnerLoopExitBlock->begin(), E = InnerLoopExitBlock->end(); + I != E; ++I) { + if (!isa(I)) + continue; + PHINode *PHI = dyn_cast(I); + unsigned numBlocks = PHI->getNumIncomingValues(); + for (unsigned i = 0; i < numBlocks; ++i) { + if (PHI->getIncomingBlock(i) == innerLoopLatch && + InnerLoopExitIncomingBlock != nullptr) + PHI->setIncomingBlock(i, InnerLoopExitIncomingBlock); + } + } + + // exit of outerLoop latch will now branch to inner loop latch + if (outerLoopLatchBI->getSuccessor(0) == outerLoopHeader) { + LoopNestExit = outerLoopLatchBI->getSuccessor(1); + outerLoopLatchBI->setSuccessor(1, innerLoopLatch); + } else { + LoopNestExit = outerLoopLatchBI->getSuccessor(0); + outerLoopLatchBI->setSuccessor(0, innerLoopLatch); + } + + // exit of innerLoop latch will now branch to loop nest exit + if (innerLoopLatchBI->getSuccessor(0) == innerLoopHeader) + innerLoopLatchBI->setSuccessor(1, LoopNestExit); + else + innerLoopLatchBI->setSuccessor(0, LoopNestExit); + + return true; +} + +bool LoopInterchangeTransform::adjustLoopLinks() { + + // Adjust the outerLoop preheader to jump to inner loop preheader and + // to loop nest exit based on inner loops branch condition. + adjustOuterLoopPreheader(); + + // Adjust all branches so that outer loop is moved inside and inner loop is + // moved outside. + adjustLoopBranches(); + + return true; +} + +char LoopInterchange::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) + +INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) + +Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -46,6 +46,7 @@ initializeLICMPass(Registry); initializeLoopDeletionPass(Registry); initializeLoopInstSimplifyPass(Registry); + initializeLoopInterchangePass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); Index: test/Transforms/LoopInterchange/dependence.ll =================================================================== --- test/Transforms/LoopInterchange/dependence.ll +++ test/Transforms/LoopInterchange/dependence.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -basicaa -loop-interchange -gvn -loop-vectorize < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@N = common global i32 0, align 4 +@A = common global [100 x [100 x i32]] zeroinitializer, align 16 +@B = common global [100 x [100 x i32]] zeroinitializer, align 16 + +;CHECK-LABEL: @dependence_01 +;CHECK: load <4 x i32>* +;CHECK: store <4 x i32> + +define void @dependence_01() { +entry: + %0 = load i32* @N + %cmp26 = icmp sgt i32 %0, 0 + br i1 %cmp26, label %for.cond1.preheader.lr.ph, label %for.end16 + +for.cond1.preheader.lr.ph: ; preds = %entry + %1 = load i32* @N + %cmp224 = icmp sgt i32 %1, 1 + %2 = add i32 %1, -1 + %3 = sext i32 %1 to i64 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc14 + %indvars.iv29 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next30, %for.inc14 ] + br i1 %cmp224, label %for.body3, label %for.inc14 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ] + %4 = add nsw i64 %indvars.iv, -1 + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %4, i64 %indvars.iv29 + %5 = load i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv29 + %6 = load i32* %arrayidx9 + %add = add nsw i32 %6, %5 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv29 + store i32 %add, i32* %arrayidx13 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %2 + br i1 %exitcond, label %for.inc14, label %for.body3 + +for.inc14: ; preds = %for.body3, %for.cond1.preheader + %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 + %cmp = icmp slt i64 %indvars.iv.next30, %3 + br i1 %cmp, label %for.cond1.preheader, label %for.end16 + +for.end16: ; preds = %for.inc14, %entry + ret void +}