Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -166,6 +166,7 @@ void initializeLoopDeletionPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopInfoWrapperPassPass(PassRegistry&); +void initializeLoopInterchangePass(PassRegistry &); void initializeLoopInstSimplifyPass(PassRegistry&); void initializeLoopRotatePass(PassRegistry&); void initializeLoopSimplifyPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -95,6 +95,7 @@ (void) llvm::createLICMPass(); (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); + (void)llvm::createLoopInterchangePass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopStrengthReducePass(); (void) llvm::createLoopRerollPass(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -140,6 +140,13 @@ //===----------------------------------------------------------------------===// // +// LoopInterchange - This pass interchanges loops to provide a more +// cache-friendly memory access patterns. +// +Pass *createLoopInterchangePass(); + +//===----------------------------------------------------------------------===// +// // LoopStrengthReduce - This pass is strength reduces GEP instructions that use // a loop's canonical induction variable as one of their indices. // Index: lib/Analysis/DependenceAnalysis.cpp =================================================================== --- lib/Analysis/DependenceAnalysis.cpp +++ lib/Analysis/DependenceAnalysis.cpp @@ -3360,6 +3360,11 @@ DstIdx = DstGEP->idx_begin(); SrcIdx != SrcEnd; ++SrcIdx, ++DstIdx, ++P) { + // If not isSCEVable return dependency as not computable. + if (!SE->isSCEVable((*SrcIdx)->getType()) || + !SE->isSCEVable((*DstIdx)->getType())) { + return make_unique(Src, Dst); + } Pair[P].Src = SE->getSCEV(*SrcIdx); Pair[P].Dst = SE->getSCEV(*DstIdx); unifySubscriptType(&Pair[P]); @@ -3786,6 +3791,10 @@ DstIdx = DstGEP->idx_begin(); SrcIdx != SrcEnd; ++SrcIdx, ++DstIdx, ++P) { + if (!SE->isSCEVable((*SrcIdx)->getType()) || + !SE->isSCEVable((*DstIdx)->getType())) { + return nullptr; + } Pair[P].Src = SE->getSCEV(*SrcIdx); Pair[P].Dst = SE->getSCEV(*DstIdx); } Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -77,6 +77,10 @@ EnableMLSM("mlsm", cl::init(true), cl::Hidden, cl::desc("Enable motion of merged load and store")); +static cl::opt EnableLoopInterchange( + "enable-loopinterchange", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopInterchange Pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -239,6 +243,8 @@ MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops + if (EnableLoopInterchange) + MPM.add(createLoopInterchangePass()); // Interchange loops if (!DisableUnrollLoops) MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops @@ -454,6 +460,9 @@ // More loops are countable; try to optimize them. PM.add(createIndVarSimplifyPass()); PM.add(createLoopDeletionPass()); + if (EnableLoopInterchange) + PM.add(createLoopInterchangePass()); + PM.add(createLoopVectorizePass(true, LoopVectorize)); // More scalar chains could be vectorized due to more alias information Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -18,6 +18,7 @@ LoopDeletion.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp + LoopInterchange.cpp LoopRerollPass.cpp LoopRotation.cpp LoopStrengthReduce.cpp Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -0,0 +1,1288 @@ +//===- LoopInterchange.cpp - Loop interchange pass------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This Pass handles loop interchange transform. +// This pass interchanges loops to provide a more cache-friendly memory access +// patterns. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +using namespace llvm; + +#define DEBUG_TYPE "loop-interchange" + +namespace { + +typedef SmallVector LoopVector; + +// TODO: Check if we can use a sparse matrix here. +typedef std::vector> CharMatrix; + +// Maximum number of dependencies that can be handled in the dependency matrix. +static const unsigned MaxMemInstrCount = 100; + +// Maximum loop depth supported. +static const unsigned MaxLoopNestDepth = 10; + +class LoopInterchange; +// Returns the maximum inner loop depth starting from L this is used to populate +// worklist with loops of depth 2. +unsigned getMaxNestingLevel(Loop &L) { + unsigned Level = 0; + if (L.empty()) + return 1; + for (Loop *InnerL : L) + Level = std::max(Level, getMaxNestingLevel(*InnerL) + 1); + return Level; +} + +void printDepMatrix(CharMatrix &DepMatrix) { +#ifdef ENABLE_DEBUGGING + for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) { + std::vector Vec = *I; + for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II) + DEBUG(dbgs() << *II << " "); + DEBUG(dbgs() << "\n"); + } +#endif +} + +bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, + DependenceAnalysis *DA) { + typedef SmallVector ValueVector; + ValueVector MemInstr; + + if (Level > MaxLoopNestDepth) { + DEBUG(dbgs() << "Cannot handle loops of depth greater than " + << MaxLoopNestDepth << "\n"); + return false; + } + + // For each block. + for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end(); + BB != BE; ++BB) { + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; + ++I) { + Instruction *Ins = dyn_cast(I); + if (!Ins) + return false; + LoadInst *Ld = dyn_cast(I); + StoreInst *St = dyn_cast(I); + if (!St && !Ld) + continue; + if (Ld && !Ld->isSimple()) + return false; + if (St && !St->isSimple()) + return false; + MemInstr.push_back(I); + } + } + + DEBUG(dbgs() << "Found " << MemInstr.size() + << " Loads and Stores to analyze\n"); + + ValueVector::iterator I, IE, J, JE; + + for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { + for (J = I, JE = MemInstr.end(); J != JE; ++J) { + std::vector Dep; + Instruction *Src = dyn_cast(*I); + Instruction *Des = dyn_cast(*J); + if (Src == Des) + continue; + if (isa(Src) && isa(Des)) + continue; + if (auto D = DA->depends(Src, Des, true)) { + DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des + << "\n"); + if (D->isFlow()) { + // TODO: Handle Flow dependence + DEBUG(dbgs() << "Flow dependence not handled"); + return false; + } + if (D->isAnti()) { + DEBUG(dbgs() << "Found Anti dependence \n"); + unsigned Levels = D->getLevels(); + char Direction; + for (unsigned II = 1; II <= Levels; ++II) { + const SCEV *Distance = D->getDistance(II); + const SCEVConstant *SCEVConst = + dyn_cast_or_null(Distance); + if (SCEVConst) { + const ConstantInt *CI = SCEVConst->getValue(); + if (CI->isNegative()) + Direction = '<'; + else if (CI->isZero()) + Direction = '='; + else + Direction = '>'; + Dep.push_back(Direction); + } else if (D->isScalar(II)) { + Direction = 'S'; + Dep.push_back(Direction); + } else { + unsigned Dir = D->getDirection(II); + if (Dir == Dependence::DVEntry::LT || + Dir == Dependence::DVEntry::LE) + Direction = '<'; + else if (Dir == Dependence::DVEntry::GT || + Dir == Dependence::DVEntry::GE) + Direction = '>'; + else if (Dir == Dependence::DVEntry::EQ) + Direction = '='; + else + Direction = '*'; + Dep.push_back(Direction); + } + } + while (Dep.size() != Level) { + Dep.push_back('I'); + } + + DepMatrix.push_back(Dep); + if (DepMatrix.size() > MaxMemInstrCount) { + DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount + << " dependencies inside loop\n"); + return false; + } + } + } + } + } + + // We don't have a DepMatrix to check legality return false + if (DepMatrix.size() == 0) + return false; + return true; +} + +// A loop is moved from index 'from' to an index 'to'. Update the Dependence +// matrix by exchanging the two columns. +void interChangeDepedencies(CharMatrix &DepMatrix, unsigned FromIndx, + unsigned ToIndx) { + unsigned numRows = DepMatrix.size(); + for (unsigned i = 0; i < numRows; ++i) { + char TmpVal = DepMatrix[i][ToIndx]; + DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx]; + DepMatrix[i][FromIndx] = TmpVal; + } +} + +// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is +// '>' +bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, + unsigned Column) { + for (unsigned i = 0; i <= Column; ++i) { + if (DepMatrix[Row][i] == '<') + return false; + if (DepMatrix[Row][i] == '>') + return true; + } + // All dependencies were '=','S' or 'I' + return false; +} + +// Checks if no dependence exist in the dependency matrix in Row before Column. +bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, + unsigned Column) { + for (unsigned i = 0; i < Column; ++i) { + if (DepMatrix[Row][i] != '=' || DepMatrix[Row][i] != 'S' || + DepMatrix[Row][i] != 'I') + return false; + } + return true; +} + +bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, + unsigned OuterLoopId, char InnerDep, char OuterDep) { + + if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) + return false; + + if (InnerDep == OuterDep) + return true; + + // It is legal to interchange if and only if after interchange no row has a + // '>' direction as the leftmost non-'='. + + if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') + return true; + + if (InnerDep == '<') + return true; + + if (InnerDep == '>') { + // If OuterLoopId represents outermost loop then interchanging will make the + // 1st dependency as '>' + if (OuterLoopId == 0) + return false; + + // If all dependencies before OuterloopId are '=','S'or 'I'. Then + // interchanging will result in this row having an outermost non '=' + // dependency of '>' + if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) + return true; + } + + return false; +} + +// Checks if it is legal to interchange 2 loops. +// [Theorm] A permutation of the loops in a perfect nest is legal if and only if +// the direction matrix, after the same permutation is applied to its columns, +// has no ">" direction as the leftmost non-"=" direction in any row. +bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, + unsigned OuterLoopId) { + + unsigned NumRows = DepMatrix.size(); + // For each row check if it is valid to interchange. + for (unsigned Row = 0; Row < NumRows; ++Row) { + char InnerDep = DepMatrix[Row][InnerLoopId]; + char OuterDep = DepMatrix[Row][OuterLoopId]; + if (InnerDep == '*' || OuterDep == '*') + return false; + else if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, + OuterDep)) + return false; + } + return true; +} + +static void populateWorklist(Loop &L, SmallVector &V) { + + DEBUG(dbgs() << "Calling populateWorklist called\n"); + LoopVector LoopList; + Loop *CurrentLoop = &L; + std::vector vec = CurrentLoop->getSubLoopsVector(); + while (vec.size() != 0) { + // The current loop has multiple subloops in it hence it is not tightly + // nested. + // Discard all loops above it added into Worklist. + if (vec.size() != 1) { + LoopList.clear(); + return; + } + LoopList.push_back(CurrentLoop); + CurrentLoop = *(vec.begin()); + vec = CurrentLoop->getSubLoopsVector(); + } + LoopList.push_back(CurrentLoop); + V.push_back(LoopList); +} + +static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { + PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); + if (InnerIndexVar) + return InnerIndexVar; + if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) + return nullptr; + for (BasicBlock::iterator I = L->getHeader()->begin(); isa(I); ++I) { + PHINode *PhiVar = cast(I); + Type *PhiTy = PhiVar->getType(); + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) + return nullptr; + const SCEVAddRecExpr *AddRec = + dyn_cast(SE->getSCEV(PhiVar)); + if (!AddRec || !AddRec->isAffine()) + continue; + const SCEV *Step = AddRec->getStepRecurrence(*SE); + const SCEVConstant *C = dyn_cast(Step); + if (!C) + continue; + // Found the induction variable. + // FIXME: Handle loops with more than one induction variable. Note that, + // currently, legality makes sure we have only one induction variable. + return PhiVar; + } + return nullptr; +} + +/// LoopInterchangeLegality checks if it is legal to interchange the loop. +class LoopInterchangeLegality { +public: + LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + DependenceAnalysis *DA, LoopInterchange *Pass) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DA(DA), Parent(Pass) {} + + /// Check if the loops can be interchanged. + bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix); + /// Check if the loop structure is understood. We do not handle triangular + /// loops for now. + bool isLoopStructureUnderstood(PHINode *InnerInductionVar); + + bool currentLimitations(); + +private: + bool checkDependence(Loop *Outer, DependenceAnalysis *DA); + bool tightlyNested(Loop *Outer, Loop *Inner); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; + /// Dependence analysis. + DependenceAnalysis *DA; + LoopInterchange *Parent; +}; + +/// LoopInterchangeProfitability checks if it is profitable to interchange the +/// loop. +class LoopInterchangeProfitability { +public: + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + + /// Check if the loop interchange is profitable + bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix); + +private: + int getInstrOrderCost(); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; +}; + +/// LoopInterchangeTransform interchanges the loop +class LoopInterchangeTransform { +public: + LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + LoopInfo *LI, DominatorTree *DT, + LoopInterchange *Pass, BasicBlock *LoopNestExit) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), + Parent(Pass), LoopExit(LoopNestExit) { + initialize(); + } + + /// Interchange OuterLoop and InnerLoop. + bool transform(); + void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); + void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); + void initialize(); + +private: + void splitInnerLoopLatch(Instruction *); + void splitOuterLoopLatch(); + void splitInnerLoopHeader(); + bool adjustLoopLinks(); + void adjustLoopPreheaders(); + void adjustOuterLoopPreheader(); + void adjustInnerLoopPreheader(); + bool adjustLoopBranches(); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; + LoopInfo *LI; + DominatorTree *DT; + LoopInterchange *Parent; + BasicBlock *LoopExit; +}; + +// Main LoopInterchange Pass +struct LoopInterchange : public FunctionPass { + static char ID; + ScalarEvolution *SE; + LoopInfo *LI; + DependenceAnalysis *DA; + DominatorTree *DT; + LoopInterchange() + : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { + initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + } + + bool runOnFunction(Function &F) override { + SE = &getAnalysis(); + LI = &getAnalysis().getLoopInfo(); + DA = &getAnalysis(); + auto *DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + // Build up a worklist of loop pairs to analyze. + SmallVector Worklist; + + for (Loop *L : *LI) + populateWorklist(*L, Worklist); + + DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); + bool Changed = true; + while (!Worklist.empty()) { + LoopVector LoopList = Worklist.pop_back_val(); + Changed = processLoopList(LoopList); + } + return Changed; + } + + bool isComputableLoopNest(LoopVector LoopList) { + for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) { + Loop *L = *I; + const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); + if (ExitCountOuter == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "Couldn't compute Backedge count\n"); + return false; + } + if (L->getNumBackEdges() != 1) { + DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); + return false; + } + if (!L->getExitingBlock()) { + DEBUG(dbgs() << "Loop Doesn't have unique exit block\n"); + return false; + } + } + return true; + } + + unsigned selectLoopForInterchange(LoopVector LoopList) { + // TODO: Add a better heuristic to select the loop to be interchanged based + // on the dependece matrix. Currently we select the innermost loop. + return LoopList.size() - 1; + } + + bool processLoopList(LoopVector LoopList) { + bool Changed = false; + bool containsLCSSAPHI = false; + CharMatrix DependencyMatrix; + if (LoopList.size() < 2) { + DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); + return false; + } + if (!isComputableLoopNest(LoopList)) { + DEBUG(dbgs() << "Not vaild loop candidate for interchange\n"); + return false; + } + Loop *OuterMostLoop = *(LoopList.begin()); + + DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size() + << "\n"); + + if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(), + OuterMostLoop, DA)) { + DEBUG(dbgs() << "Populating Dependency matrix failed\n"); + return false; + } +#ifdef ENABLE_DEBUGGING + DEBUG(dbgs() << "Dependence before inter change \n"); + printDepMatrix(DependencyMatrix); +#endif + + BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch(); + BranchInst *OuterMostLoopLatchBI = + dyn_cast(OuterMostLoopLatch->getTerminator()); + if (!OuterMostLoopLatchBI) + return false; + + // Since we currently do not handle LCSSA PHI's any failure in loop + // condition will now branch to LoopNestExit. + // TODO: This should be removed once we handle LCSSA PHI nodes. + + // Get the Outermost loop exit. + BasicBlock *LoopNestExit; + if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader()) + LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1); + else + LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0); + + for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) { + Loop *L = *I; + BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *Header = L->getHeader(); + if (Latch && Latch != Header && isa(Latch->begin())) { + containsLCSSAPHI = true; + break; + } + } + + // TODO: Handle lcssa PHI's. + if (containsLCSSAPHI) + return false; + + unsigned SelecLoopId = selectLoopForInterchange(LoopList); + // Move the selected loop outwards to the best posible position. + for (unsigned i = SelecLoopId; i > 0; i--) { + bool Interchanged = + processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); + if (!Interchanged) + return Changed; + // Loops interchanged reflect the same in LoopList + Loop *OldOuterLoop = LoopList[i - 1]; + LoopList[i - 1] = LoopList[i]; + LoopList[i] = OldOuterLoop; + + // Update the DependencyMatrix + interChangeDepedencies(DependencyMatrix, i, i - 1); + +#ifdef ENABLE_DEBUGGING + DEBUG(dbgs() << "Dependence after inter change \n"); + printDepMatrix(DependencyMatrix); +#endif + Changed |= Interchanged; + } + return Changed; + } + + bool processLoop(LoopVector LoopList, unsigned InnerLoopId, + unsigned OuterLoopId, BasicBlock *LoopNestExit, + std::vector> &DependencyMatrix) { + + DEBUG(dbgs() << "Processing Innder Loop Id = " << InnerLoopId + << " and OuterLoopId = " << OuterLoopId << "\n"); + Loop *InnerLoop = LoopList[InnerLoopId]; + Loop *OuterLoop = LoopList[OuterLoopId]; + + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, DA, this); + if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { + DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); + return false; + } + DEBUG(dbgs() << "Loops are legal to interchange\n"); + LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE); + if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { + DEBUG(dbgs() << "Interchanging Loops not profitable\n"); + return false; + } + + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this, + LoopNestExit); + LIT.transform(); + DEBUG(dbgs() << "Loops interchanged\n"); + return true; + } +}; + +} // end of namespace + +static bool containsUnsafeInstructions(BasicBlock *BB) { + for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { + if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + return true; + } + return false; +} + +bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { + BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + + DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n"); + + // A perfectly nested loop will not have any branch in between the outer and + // inner block i.e. outer header will branch to either inner preheader and + // outerloop latch. + BranchInst *outerLoopHeaderBI = + dyn_cast(OuterLoopHeader->getTerminator()); + if (!outerLoopHeaderBI) + return false; + unsigned num = outerLoopHeaderBI->getNumSuccessors(); + for (unsigned i = 0; i < num; i++) { + if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader && + outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch) + return false; + } + + DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n"); + // We do not have any basic block in between now make sure the outer header + // and outer loop latch doesnt contain any unsafe instructions. + if (containsUnsafeInstructions(OuterLoopHeader) || + containsUnsafeInstructions(OuterLoopLatch)) + return false; + + DEBUG(dbgs() << "Loops are perfectly nested \n"); + // We have a perfect loop nest. + return true; +} + +bool LoopInterchangeLegality::checkDependence(Loop *Outer, + DependenceAnalysis *DA) { + + typedef SmallVector ValueVector; + // Holds Load and Store *instructions*. + ValueVector MemInstr; + // For each block. + for (Loop::block_iterator BI = Outer->block_begin(), BE = Outer->block_end(); + BI != BE; ++BI) { + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; + ++I) { + Instruction *Ins = dyn_cast(I); + if (!Ins) + return false; + LoadInst *Ld = dyn_cast(I); + StoreInst *St = dyn_cast(I); + if (!St && !Ld) + continue; + if (Ld && !Ld->isSimple()) + return false; + if (St && !St->isSimple()) + return false; + MemInstr.push_back(Ins); + } + } + + DEBUG(dbgs() << "Found " << MemInstr.size() + << " Loads and stores to analyze\n"); + + ValueVector::iterator I, IE, J, JE; + for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { + for (J = I, JE = MemInstr.end(); J != JE; ++J) { + Instruction *Src = cast(*I); + Instruction *Des = cast(*J); + if (Src == Des) + continue; + DEBUG(dbgs() << "Checking Depencency between Src " << Src << " and Des" + << Des << "\n"); + + if (auto D = DA->depends(Src, Des, true)) { + // TODO: Fix his handle only anti/output dep for now. + if (D->isFlow()) { + // TODO: Flow dependency can be interchanged?? + DEBUG(dbgs() << "Flow dependence not handled"); + return false; + } + if (D->isAnti()) { + DEBUG(dbgs() << "Found Anti dependence \n"); + unsigned Levels = D->getLevels(); + + // If the two memory instructions have an anti dependence check + // the distance or the direction by which they vary. + // Interchanging two loops with anti dependence is valid if the + // dependence distance is *not* positive in each level. + for (unsigned II = 1; II <= Levels; ++II) { + const SCEV *Distance = D->getDistance(II); + const SCEVConstant *SCEVConst = + dyn_cast_or_null(Distance); + if (SCEVConst) { + const ConstantInt *CI = SCEVConst->getValue(); + if (!CI || (!CI->isNegative() && !CI->isZeroValue())) + return false; + } else if (D->isScalar(II)) { + DEBUG(dbgs() + << "TODO:Scalars dependence are currently not handled\n"); + return false; + } else { + unsigned Direction = D->getDirection(II); + if (Direction == Dependence::DVEntry::LT || + Direction == Dependence::DVEntry::LE || + Direction == Dependence::DVEntry::EQ) + continue; + return false; + } + } + } + } + } + } + + return true; +} + +static unsigned getPHICount(BasicBlock *BB) { + unsigned PhiCount = 0; + for (auto I = BB->begin(); isa(I); ++I) + PhiCount++; + return PhiCount; +} + +bool LoopInterchangeLegality::isLoopStructureUnderstood( + PHINode *InnerInduction) { + + unsigned Num = InnerInduction->getNumOperands(); + BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); + for (unsigned i = 0; i < Num; ++i) { + Value *Val = InnerInduction->getOperand(i); + if (isa(Val)) + continue; + Instruction *I = dyn_cast(Val); + if (!I) + return false; + // TODO: Handle triangular loops. + // e.g. for(int i=0;igetIncomingBlock(IncomBlockIndx) == + InnerLoopPreheader && + !OuterLoop->isLoopInvariant(I)) { + return false; + } + } + return true; +} + +// This function indicates the current limitations in the transform as a result +// of which we do not proceed. +bool LoopInterchangeLegality::currentLimitations() { + + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + + PHINode *InnerInductionVar; + PHINode *OuterInductionVar; + + // We currently handle only 1 induction variable inside the loop. We also do + // not handle reductions as of now. + if (getPHICount(InnerLoopHeader) > 1) + return true; + + if (getPHICount(OuterLoopHeader) > 1) + return true; + + InnerInductionVar = getInductionVariable(InnerLoop, SE); + OuterInductionVar = getInductionVariable(OuterLoop, SE); + + if (!OuterInductionVar || !InnerInductionVar) { + DEBUG(dbgs() << "Induction variable not found\n"); + return true; + } + + // TODO: Triangular loops are not handled for now. + if (!isLoopStructureUnderstood(InnerInductionVar)) { + DEBUG(dbgs() << "Loop structure not understood by pass\n"); + return true; + } + + // TODO: Loops with LCSSA PHI's are currently not handled. + if (isa(OuterLoopLatch->begin())) { + DEBUG(dbgs() << "Found and LCSSA PHI in outer loop latch\n"); + return true; + } + if (InnerLoopLatch != InnerLoopHeader && + isa(InnerLoopLatch->begin())) { + DEBUG(dbgs() << "Found and LCSSA PHI in inner loop latch\n"); + return true; + } + + // TODO: Current limitation: Since we split the inner loop latch at the point + // were induction variable is incremented (induction.next); We cannot have + // more than 1 user of induction.next since it would result in broken code + // after split. + // e.g. + // for(i=0;igetIncomingBlock(0) == InnerLoopPreHeader) + InnerIndexVarInc = + dyn_cast(InnerInductionVar->getIncomingValue(1)); + else + InnerIndexVarInc = + dyn_cast(InnerInductionVar->getIncomingValue(0)); + + if (!InnerIndexVarInc) + return true; + + // Since we split the inner loop latch on this induction variable. Make sure + // we do not have any instruction between the induction variable and branch + // instruction. + + for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend(); + I != E && !FoundInduction; ++I) { + if (isa(*I) || isa(*I) || isa(*I)) + continue; + const Instruction &Ins = *I; + // We found an instruction. If this is not induction variable then it is not + // safe to split this loop latch. + if (!Ins.isIdenticalTo(InnerIndexVarInc)) + return true; + else + FoundInduction = true; + } + // The loop latch ended and we didnt find the induction variable return as + // current limitation. + if (!FoundInduction) + return true; + + return false; +} + +bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix) { + + if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { + DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId + << "and OuterLoopId = " << OuterLoopId + << "due to dependence\n"); + return false; + } + + // Create unique Preheaders if we already do not have one. + BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + + // Create a unique outer preheader - + // 1) If OuterLoop preheader is not present. + // 2) If OuterLoop Preheader is same as OuterLoop Header + // 3) If OuterLoop Preheader is same as Header of the previous loop. + // 4) If OuterLoop Preheader is Entry node. + if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || + isa(OuterLoopPreHeader->begin()) || + !OuterLoopPreHeader->getUniquePredecessor()) { + OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, Parent); + } + + if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || + InnerLoopPreHeader == OuterLoop->getHeader()) { + InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, Parent); + } + + // Check if the loops are tightly nested. + if (!tightlyNested(OuterLoop, InnerLoop)) { + DEBUG(dbgs() << "Loops not tightly nested\n"); + return false; + } + + // TODO: The loops could not be interchanged due to current limitations in the + // transform module. + if (currentLimitations()) { + DEBUG(dbgs() << "Not legal because of current transform limitation\n"); + return false; + } + + return true; +} + +int LoopInterchangeProfitability::getInstrOrderCost() { + unsigned GoodOrder, BadOrder; + BadOrder = GoodOrder = 0; + for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end(); + BI != BE; ++BI) { + for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) { + const Instruction &Ins = *I; + if (const GetElementPtrInst *GEP = dyn_cast(&Ins)) { + unsigned NumOp = GEP->getNumOperands(); + bool FoundInnerInduction = false; + bool FoundOuterInduction = false; + for (unsigned i = 0; i < NumOp; ++i) { + const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); + const SCEVAddRecExpr *AR = dyn_cast(OperandVal); + if (!AR) + continue; + + // If we find the inner induction after an outer induction e.g. + // for(int i=0;igetLoop() == InnerLoop) { + // We found an InnerLoop induction after OuterLoop induction. It is + // a good order. + FoundInnerInduction = true; + if (FoundOuterInduction) { + GoodOrder++; + break; + } + } + // If we find the outer induction after an inner induction e.g. + // for(int i=0;igetLoop() == OuterLoop) { + // We found an OuterLoop induction after InnerLoop induction. It is + // a bad order. + FoundOuterInduction = true; + if (FoundInnerInduction) { + BadOrder++; + break; + } + } + } + } + } + } + return GoodOrder - BadOrder; +} + +bool isProfitabileForVectorization(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix) { + // TODO: Improve this heuristic to catch more cases. + // If the inner loop is loop independent or doesn't carry any dependency it is + // profitable to move this to outer position. + unsigned Row = DepMatrix.size(); + for (unsigned i = 0; i < Row; ++i) { + if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I') + return false; + // TODO: We need to improve this heuristic. + if (DepMatrix[i][OuterLoopId] != '=') + return false; + } + // If outer loop has dependence and inner loop is loop independent then it is + // profitable to interchange to enable parallelism. + return true; +} + +bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix) { + + // TODO: Add Better Profitibility checks. + // e.g + // 1) Construct dependency matrix and move the one with no loop carried dep + // inside to enable vectorization. + + // This is rough cost estimation algorithm. It counts the good and bad order + // of induction variables in the instruction and allows reordering if number + // of bad orders is more than good. + int Cost = 0; + Cost += getInstrOrderCost(); + DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < 0) + return true; + + // It is not profitable as per current cache profitibility model. But check if + // we can move this loop outside to improve parallelism. + bool ImprovesPar = + isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix); + return ImprovesPar; +} + +void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, + Loop *InnerLoop) { + for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end();; ++I) { + assert(I != E && "Couldn't find loop"); + if (*I == InnerLoop) { + OuterLoop->removeChildLoop(I); + return; + } + } +} +void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, + Loop *OuterLoop) { + Loop *OuterLoopParent = OuterLoop->getParentLoop(); + if (OuterLoopParent) { + // Remove the loop from its parent loop. + removeChildLoop(OuterLoopParent, OuterLoop); + removeChildLoop(OuterLoop, InnerLoop); + OuterLoopParent->addChildLoop(InnerLoop); + } else { + removeChildLoop(OuterLoop, InnerLoop); + LI->changeTopLevelLoop(OuterLoop, InnerLoop); + } + + for (Loop::iterator I = InnerLoop->begin(), E = InnerLoop->end(); I != E; ++I) + OuterLoop->addChildLoop(InnerLoop->removeChildLoop(I)); + + InnerLoop->addChildLoop(OuterLoop); +} + +bool LoopInterchangeTransform::transform() { + + DEBUG(dbgs() << "transform\n"); + bool Transformed = false; + Instruction *InnerIndexVar; + + if (InnerLoop->getSubLoops().size() == 0) { + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + DEBUG(dbgs() << "Calling Split Inner Loop\n"); + PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); + if (!InductionPHI) { + DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); + return false; + } + + if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) + InnerIndexVar = dyn_cast(InductionPHI->getIncomingValue(1)); + else + InnerIndexVar = dyn_cast(InductionPHI->getIncomingValue(0)); + + // + // Split at the place were the induction variable is + // incremented/decremented. + // TODO: This splitting logic may not work always. Fix this. + splitInnerLoopLatch(InnerIndexVar); + DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); + + // Splits the inner loops phi nodes out into a seperate basic block. + splitInnerLoopHeader(); + DEBUG(dbgs() << "splitInnerLoopHeader Done\n"); + } + + Transformed |= adjustLoopLinks(); + if (!Transformed) { + DEBUG(dbgs() << "adjustLoopLinks Failed\n"); + return false; + } + + restructureLoops(InnerLoop, OuterLoop); + return true; +} + +void LoopInterchangeTransform::initialize() {} + +void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *inc) { + + BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); + BasicBlock::iterator I = InnerLoopLatch->begin(); + BasicBlock::iterator E = InnerLoopLatch->end(); + for (; I != E; ++I) { + if (inc == I) + break; + } + + BasicBlock *InnerLoopLatchPred = InnerLoopLatch; + InnerLoopLatch = SplitBlock(InnerLoopLatchPred, I, DT, LI); +} + +void LoopInterchangeTransform::splitOuterLoopLatch() { + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch; + OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock, + OuterLoopLatch->getFirstNonPHI(), DT, LI); +} + +void LoopInterchangeTransform::splitInnerLoopHeader() { + + // Split the inner loop header out. + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); + + DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " + "InnerLoopHeader \n"); +} + +void LoopInterchangeTransform::adjustOuterLoopPreheader() { + BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); + SmallVector Inst; + for (auto I = OuterLoopPreHeader->begin(), E = OuterLoopPreHeader->end(); + I != E; ++I) { + if (isa(*I)) + break; + Inst.push_back(I); + } + + BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader(); + for (auto I = Inst.begin(), E = Inst.end(); I != E; ++I) { + Instruction *Ins = cast(*I); + Ins->moveBefore(InnerPreHeader->getTerminator()); + } +} + +void LoopInterchangeTransform::adjustInnerLoopPreheader() { + + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + SmallVector Inst; + for (auto I = InnerLoopPreHeader->begin(), E = InnerLoopPreHeader->end(); + I != E; ++I) { + if (isa(*I)) + break; + Inst.push_back(I); + } + BasicBlock *OuterHeader = OuterLoop->getHeader(); + for (auto I = Inst.begin(), E = Inst.end(); I != E; ++I) { + Instruction *Ins = cast(*I); + Ins->moveBefore(OuterHeader->getTerminator()); + } +} + +bool LoopInterchangeTransform::adjustLoopBranches() { + + DEBUG(dbgs() << "adjustLoopBranches called\n"); + // Adjust the loop preheader + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); + BasicBlock *InnerLoopLatchPredecessor = + InnerLoopLatch->getUniquePredecessor(); + BasicBlock *InnerLoopLatchSuccessor; + BasicBlock *OuterLoopLatchSuccessor; + + BranchInst *OuterLoopLatchBI = + dyn_cast(OuterLoopLatch->getTerminator()); + BranchInst *InnerLoopLatchBI = + dyn_cast(InnerLoopLatch->getTerminator()); + BranchInst *OuterLoopHeaderBI = + dyn_cast(OuterLoopHeader->getTerminator()); + BranchInst *InnerLoopHeaderBI = + dyn_cast(InnerLoopHeader->getTerminator()); + + if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || + !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || + !InnerLoopHeaderBI) + return false; + + BranchInst *InnerLoopLatchPredecessorBI = + dyn_cast(InnerLoopLatchPredecessor->getTerminator()); + BranchInst *OuterLoopPredecessorBI = + dyn_cast(OuterLoopPredecessor->getTerminator()); + + if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) + return false; + BasicBlock *InnerLoopHeaderSucessor = InnerLoopHeader->getUniqueSuccessor(); + if (!InnerLoopHeaderSucessor) + return false; + + // Adjust Loop Preheader and headers + + unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors(); + for (unsigned i = 0; i < NumSucc; ++i) { + if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader) + OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader); + } + + NumSucc = OuterLoopHeaderBI->getNumSuccessors(); + for (unsigned i = 0; i < NumSucc; ++i) { + if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch) + OuterLoopHeaderBI->setSuccessor(i, LoopExit); + else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader) + OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSucessor); + } + + BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); + InnerLoopHeaderBI->eraseFromParent(); + + // -------------Adjust loop latches----------- + if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) + InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); + else + InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); + + NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors(); + for (unsigned i = 0; i < NumSucc; ++i) { + if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch) + InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor); + } + + if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) + OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); + else + OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); + + if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor) + InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor); + else + InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor); + + if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) { + OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch); + } else { + OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch); + } + + return true; +} +void LoopInterchangeTransform::adjustLoopPreheaders() { + + // We have interchanged the preheaders so we need to interchange the data in + // the preheader as well. + // This is because the content of inner preheader was previously executed + // inside the outer loop. + BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + BranchInst *InnerTermBI = + cast(InnerLoopPreHeader->getTerminator()); + + SmallVector OuterPreheaderInstr; + SmallVector InnerPreheaderInstr; + + for (auto I = OuterLoopPreHeader->begin(); !isa(I); ++I) + OuterPreheaderInstr.push_back(I); + + for (auto I = InnerLoopPreHeader->begin(); !isa(I); ++I) + InnerPreheaderInstr.push_back(I); + + BasicBlock *HeaderSplit = + SplitBlock(OuterLoopHeader, OuterLoopHeader->getTerminator(), DT, LI); + Instruction *InsPoint = HeaderSplit->getFirstNonPHI(); + // These instructions should now be executed inside the loop. + // Move instruction into a new block after outer header. + for (auto I = InnerPreheaderInstr.begin(), E = InnerPreheaderInstr.end(); + I != E; ++I) { + Instruction *Ins = cast(*I); + Ins->moveBefore(InsPoint); + } + // These instructions were not executed previously in the loop so move them to + // the older inner loop preheader. + for (auto I = OuterPreheaderInstr.begin(), E = OuterPreheaderInstr.end(); + I != E; ++I) { + Instruction *Ins = cast(*I); + Ins->moveBefore(InnerTermBI); + } +} + +bool LoopInterchangeTransform::adjustLoopLinks() { + + // Adjust all branches in the inner and outer loop. + bool Changed = adjustLoopBranches(); + if (Changed) + adjustLoopPreheaders(); + return Changed; +} + +char LoopInterchange::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) + +INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) + +Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -48,6 +48,7 @@ initializeLoopDeletionPass(Registry); initializeLoopAccessAnalysisPass(Registry); initializeLoopInstSimplifyPass(Registry); + initializeLoopInterchangePass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); Index: test/Transforms/LoopInterchange/currentLimitation.ll =================================================================== --- test/Transforms/LoopInterchange/currentLimitation.ll +++ test/Transforms/LoopInterchange/currentLimitation.ll @@ -0,0 +1,58 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; These are test that fail to interchange due to current limitation. This will go off once we extend the loop interchange pass. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x [100 x [100 x i32]]] zeroinitializer + +;;--------------------------------------Test case 01------------------------------------ +;; [FIXME] This loop though valid is currently not interchanged due to the limitation that we cannot split the inner loop latch due to multiple use of inner induction +;; variable.(used to increment the loop counter and to access A[j+1][i+1] +;; for(int i=0;i=0;j--) +;; A[j][i] = A[j][i]+k; + +define void @interchange_02(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp2 = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond = icmp eq i64 %indvars.iv.next20, 100 + br i1 %exitcond, label %for.end11, label %for.cond1.preheader + +for.end11: + ret void +} + +; CHECK-LABEL: @interchange_02 +; CHECK: entry: +; CHECK: br label %for.body3.preheader +; CHECK: for.cond1.preheader.preheader: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body3.split1 +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.body3.split1: ; preds = %for.cond1.preheader +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: br label %for.inc10 +; CHECK: for.body3.split: +; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 +; CHECK: br i1 %cmp2, label %for.body3, label %for.end11 +; CHECK: for.inc10: +; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 +; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader +; CHECK: for.end11: +; CHECK: ret void + +;;--------------------------------------Test case 03------------------------------------- +;; Loops should not be interchanged in this case as it is not profitable. +;; for(int i=0;i<100;i++) +;; for(int j=0;j<100;j++) +;; A[i][j] = A[i][j]+k; + +define void @interchange_03(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.inc10, label %for.body3 + +for.inc10: + %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 + %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 + br i1 %exitcond23, label %for.end12, label %for.cond1.preheader + +for.end12: + ret void +} + +; CHECK-LABEL: @interchange_03 +; CHECK: entry: +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.cond1.preheader.preheader: ; preds = %entry +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10 +; CHECK: %indvars.iv21 = phi i64 [ %indvars.iv.next22, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body3.preheader +; CHECK: for.body3.preheader: ; preds = %for.cond1.preheader +; CHECK: br label %for.body3 +; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 100 +; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3 +; CHECK: for.inc10: ; preds = %for.body3 +; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 +; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 +; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader +; CHECK: for.end12: ; preds = %for.inc10 +; CHECK: ret void + + +;;--------------------------------------Test case 04------------------------------------- +;; Loops should not be interchanged in this case as it is not legal due to dependency. +;; for(int j=0;j<99;j++) +;; for(int i=0;i<99;i++) +;; A[j][i+1] = A[j+1][i]+k; + +define void @interchange_04(i32 %k){ +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv + %0 = load i32, i32* %arrayidx5 + %add6 = add nsw i32 %0, %k + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next + store i32 %add6, i32* %arrayidx11 + %exitcond = icmp eq i64 %indvars.iv.next, 99 + br i1 %exitcond, label %for.inc12, label %for.body3 + +for.inc12: + %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 + br i1 %exitcond25, label %for.end14, label %for.cond1.preheader + +for.end14: + ret void +} + +; CHECK-LABEL: @interchange_04 +; CHECK: entry: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: ; preds = %for.inc12, %entry +; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] +; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 +; CHECK: br label %for.body3 +; CHECK: for.body3: ; preds = %for.body3, %for.cond1.preheader +; CHECK: %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add6 = add nsw i32 %0, %k +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next +; CHECK: store i32 %add6, i32* %arrayidx11 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99 +; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3 +; CHECK: for.inc12: ; preds = %for.body3 +; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 +; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader +; CHECK: for.end14: ; preds = %for.inc12 +; CHECK: ret void + + + +;;--------------------------------------Test case 05------------------------------------- +;; Loops not tightly nested are not interchanged +;; for(int j=0;j