Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -168,6 +168,7 @@ void initializeLoopDeletionPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopInfoWrapperPassPass(PassRegistry&); +void initializeLoopInterchangePass(PassRegistry &); void initializeLoopInstSimplifyPass(PassRegistry&); void initializeLoopRotatePass(PassRegistry&); void initializeLoopSimplifyPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -97,6 +97,7 @@ (void) llvm::createLICMPass(); (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); + (void)llvm::createLoopInterchangePass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopStrengthReducePass(); (void) llvm::createLoopRerollPass(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -140,6 +140,13 @@ //===----------------------------------------------------------------------===// // +// LoopInterchange - This pass interchanges loops to provide a more +// cache-friendly memory access patterns. +// +Pass *createLoopInterchangePass(); + +//===----------------------------------------------------------------------===// +// // LoopStrengthReduce - This pass is strength reduces GEP instructions that use // a loop's canonical induction variable as one of their indices. // Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -18,6 +18,7 @@ LoopDeletion.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp + LoopInterchange.cpp LoopRerollPass.cpp LoopRotation.cpp LoopStrengthReduce.cpp Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -0,0 +1,876 @@ +//===- LoopInterchange.cpp - Loop interchange pass------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This Pass handles loop interchange transform. This interchanges the inner +// and outer loop if interchanging can result in better cache hits. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +using namespace llvm; + +#define DEBUG_TYPE "loop-interchange" + +namespace { + +typedef std::pair LoopPair; +class LoopInterchange; +// Returns the maximum inner loop depth starting from L this is used to populate +// worklist with loops of depth 2. +unsigned getMaxNestingLevel(Loop &L) { + unsigned Level = 0; + if (L.empty()) + return 1; + for (Loop *InnerL : L) + Level = std::max(Level, getMaxNestingLevel(*InnerL) + 1); + return Level; +} + +static void populateWorklist(Loop &L, SmallVector &V) { + // TODO: Currently only handled for loops depth of 2. + unsigned SubLoopsize = L.getSubLoops().size(); + unsigned Count = getMaxNestingLevel(L); + if (Count == 2 && SubLoopsize == 1) { + Loop *Inner; + for (Loop *InnerL : L) + Inner = InnerL; + + V.push_back(std::make_pair(&L, Inner)); + } +} + +static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { + PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); + if (InnerIndexVar) + return InnerIndexVar; + if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) + return nullptr; + for (BasicBlock::iterator I = L->getHeader()->begin(); isa(I); ++I) { + PHINode *PhiVar = cast(I); + const SCEVAddRecExpr *AddRec = + dyn_cast(SE->getSCEV(PhiVar)); + if (!AddRec || !AddRec->isAffine()) + continue; + const SCEV *Step = AddRec->getStepRecurrence(*SE); + const SCEVConstant *C = dyn_cast(Step); + if (!C) + continue; + // Found the induction variable. + // FIXME: Handle loops with more than one induction variable. Note that, + // currently, legality makes sure we have only one induction variable. + return PhiVar; + } + return nullptr; +} + +/// LoopInterchangeLegality checks if it is legal to interchange the loop. +class LoopInterchangeLegality { +public: + LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + DependenceAnalysis *DA, LoopInterchange *Pass) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DA(DA), Parent(Pass) {} + + /// Check if the loops can be interchanged. + bool canInterchangeLoops(); + + bool currentLimitations(); + +private: + bool checkDependence(Loop *Outer, DependenceAnalysis *DA); + bool tightlyNested(Loop *Outer, Loop *Inner); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; + /// Dependence analysis. + DependenceAnalysis *DA; + LoopInterchange *Parent; +}; + +/// LoopInterchangeProfitability checks if it is profitable to interchange the +/// loop. +class LoopInterchangeProfitability { +public: + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + + /// Check if the loop interchange is profitable + bool isProfitable(); + +private: + int getInstrOrderCost(); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; +}; + +/// LoopInterchangeTransform interchanges the loop +class LoopInterchangeTransform { +public: + LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + LoopInfo *LI, DominatorTree *DT) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT) { + initialize(); + } + + /// Interchange OuterLoop and InnerLoop. + bool transform(); + void initialize(); + +private: + void splitInnerLoopLatch(Instruction *); + void splitOuterLoopLatch(); + void splitInnerLoopHeader(); + void adjustOuterLoopLatch(); + bool adjustLoopLinks(); + void adjustOuterLoopPreheader(); + bool adjustLoopBranches(); + + Loop *OuterLoop; + Loop *InnerLoop; + BasicBlock *InnerLoopHeader; + BasicBlock *OuterLoopHeader; + BasicBlock *InnerLoopLatch; + BasicBlock *OuterLoopLatch; + BasicBlock *OuterLatchLcssaPhiBlock; + BasicBlock *OuterLoopPreHeader; + BasicBlock *InnerLoopPreHeader; + BasicBlock *InnerLoopLatchPred; + BasicBlock *InnerLoopHeaderSucc; + std::vector> interchangedLoops; + /// Scev analysis. + ScalarEvolution *SE; + LoopInfo *LI; + DominatorTree *DT; +}; + +// Main LoopInterchange Pass +struct LoopInterchange : public FunctionPass { + static char ID; + ScalarEvolution *SE; + LoopInfo *LI; + DependenceAnalysis *DA; + DominatorTree *DT; + LoopInterchange() + : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { + initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + } + + bool runOnFunction(Function &F) override { + SE = &getAnalysis(); + LI = &getAnalysis().getLoopInfo(); + DA = &getAnalysis(); + auto *DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + // Build up a worklist of loop pairs to analyze. + SmallVector Worklist; + + for (Loop *L : *LI) + populateWorklist(*L, Worklist); + + DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); + + bool Changed = false; + while (!Worklist.empty()) + Changed |= processLoop(Worklist.pop_back_val()); + + return Changed; + } + + bool processLoop(LoopPair P) { + // Check if it is legal to interchange loop + LoopInterchangeLegality LIL(P.first, P.second, SE, DA, this); + if (!LIL.canInterchangeLoops()) { + DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); + return false; + } + DEBUG(dbgs() << "Loops are legal to interchange\n"); + LoopInterchangeProfitability LIP(P.first, P.second, SE); + if (!LIP.isProfitable()) { + DEBUG(dbgs() << "Interchanging Loops not profitable\n"); + return false; + } + LoopInterchangeTransform LIT(P.first, P.second, SE, LI, DT); + LIT.transform(); + DEBUG(dbgs() << "Loops interchanged\n"); + return true; + } +}; + +} // end of namespace + +static bool containsUnsafeInstructions(BasicBlock *BB) { + for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { + if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + return true; + } + return false; +} + +bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { + BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); + + // A perfectly nested loop will not have any branch in between the outer and + // inner block i.e. outer header will branch to either inner preheader and + // outerloop latch. + BranchInst *outerLoopHeaderBI = + cast(OuterLoopHeader->getTerminator()); + unsigned num = outerLoopHeaderBI->getNumSuccessors(); + for (unsigned i = 0; i < num; i++) { + if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader && + outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch) + return false; + } + + // We do not have any basic block in between now make sure the outer header + // and outer loop latch doesnt contain any unsafe instructions. + if (containsUnsafeInstructions(OuterLoopHeader) || + containsUnsafeInstructions(OuterLoopLatch)) + return false; + + // We have a perfect loop nest. + return true; +} + +bool LoopInterchangeLegality::checkDependence(Loop *Outer, + DependenceAnalysis *DA) { + + typedef SmallVector ValueVector; + // Holds Load and Store *instructions*. + ValueVector MemInstr; + // For each block. + for (Loop::block_iterator BI = Outer->block_begin(), BE = Outer->block_end(); + BI != BE; ++BI) { + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; + ++I) { + Instruction *Ins = dyn_cast(I); + if (!Ins) + return false; + LoadInst *Ld = dyn_cast(I); + StoreInst *St = dyn_cast(I); + if (!St && !Ld) + continue; + if (Ld && !Ld->isSimple()) + return false; + if (St && !St->isSimple()) + return false; + MemInstr.push_back(Ins); + } + } + + DEBUG(dbgs() << "Found " << MemInstr.size() + << " Loads and stores to analyze\n"); + + ValueVector::iterator I, IE, J, JE; + for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { + for (J = I, JE = MemInstr.end(); J != JE; ++J) { + Instruction *Src = cast(*I); + Instruction *Des = cast(*J); + if (Src == Des) + continue; + DEBUG(dbgs() << "Checking Depencency between Src " << Src << " and Des" + << Des << "\n"); + + if (auto D = DA->depends(Src, Des, true)) { + // TODO: Fix his handle only anti/output dep for now. + if (D->isFlow()) { + // TODO: Flow dependency can be interchanged?? + DEBUG(dbgs() << "Flow dependence not handled"); + return false; + } + if (D->isAnti()) { + DEBUG(dbgs() << "Found Anti dependence \n"); + unsigned Levels = D->getLevels(); + + // If the two memory instructions have an anti dependence check + // the distance or the direction by which they vary. + // Interchanging two loops with anti dependence is valid if the + // dependence distance is *not* positive in each level. + for (unsigned II = 1; II <= Levels; ++II) { + const SCEV *Distance = D->getDistance(II); + const SCEVConstant *SCEVConst = + dyn_cast_or_null(Distance); + if (SCEVConst) { + const ConstantInt *CI = SCEVConst->getValue(); + if (!CI || (!CI->isNegative() && !CI->isZeroValue())) + return false; + } else if (D->isScalar(II)) { + DEBUG(dbgs() + << "TODO:Scalars dependence are currently not handled\n"); + return false; + } else { + unsigned Direction = D->getDirection(II); + if (Direction == Dependence::DVEntry::LT || + Direction == Dependence::DVEntry::LE || + Direction == Dependence::DVEntry::EQ) + continue; + return false; + } + } + } + } + } + } + + return true; +} + +// This function indicates the current limitations in the transform as a result +// of which we do not proceed. +bool LoopInterchangeLegality::currentLimitations() { + + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); + int PhiCount = 0; + PHINode *PHI; + + // We currently handle only 1 induction variable inside the loop. We also do + // not handle reductions as of now. + for (auto I = InnerLoopHeader->begin(); isa(I); ++I) { + PHI = cast(I); + PhiCount++; + if (PhiCount > 1) + return true; + } + + // TODO: Current limitation: Since we split the inner loop latch at the point + // were induction variable is incremented (induction.next); We cannot have + // more than 1 user of induction.next since it would result in broken code + // after split. + // e.g. + // for(i=0;igetIncomingBlock(0) == InnerLoopPreHeader) + InnerIndexVarInc = dyn_cast(PHI->getIncomingValue(1)); + else + InnerIndexVarInc = dyn_cast(PHI->getIncomingValue(0)); + + if (!InnerIndexVarInc) + return true; + + // Since we split the inner loop latch on this induction variable. Make sure + // we do not have any instruction between the induction variable and branch + // instruction. + + for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend(); + I != E && !FoundInduction; ++I) { + if (isa(*I) || isa(*I) || isa(*I)) + continue; + const Instruction &Ins = *I; + // We found an instruction. If this is not induction variable then it is not + // safe to split this loop latch. + if (!Ins.isIdenticalTo(InnerIndexVarInc)) + return true; + else + FoundInduction = true; + } + // The loop latch ended and we didnt find the induction variable return as + // current limitation. + if (!FoundInduction) + return true; + + return false; +} + +bool LoopInterchangeLegality::canInterchangeLoops() { + + BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); + BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); + if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader()) { + OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, Parent); + } + if (!InnerLoopPreHeader || InnerLoopPreHeader == OuterLoop->getHeader() || + InnerLoopPreHeader == InnerLoop->getHeader()) { + InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, Parent); + } + + // ScalarEvolution needs to be able to find the exit count. + const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(OuterLoop); + const SCEV *ExitCountInner = SE->getBackedgeTakenCount(InnerLoop); + if (ExitCountOuter == SE->getCouldNotCompute() || + ExitCountInner == SE->getCouldNotCompute()) { + DEBUG(dbgs() << "Could not determine number of loop iterations\n"); + return false; + } + + // We must have a single backedge. + if (OuterLoop->getNumBackEdges() != 1 || InnerLoop->getNumBackEdges() != 1) { + DEBUG(dbgs() << "loop control flow is not understood"); + return false; + } + // We must have a single exiting block. + if (!OuterLoop->getExitingBlock() || !InnerLoop->getExitingBlock()) { + DEBUG(dbgs() << "loop control flow is not understood"); + return false; + } + // Check if the loops are tightly nested. + if (!tightlyNested(OuterLoop, InnerLoop)) { + DEBUG(dbgs() << "Loops not tightly nested\n"); + return false; + } + + // TODO: The loops could not be interchanged due to current limitations in the + // transform module. + if (currentLimitations()) { + DEBUG(dbgs() << "Not legal because of current transform limitation\n"); + return false; + } + return checkDependence(OuterLoop, DA); +} + +int LoopInterchangeProfitability::getInstrOrderCost() { + unsigned GoodOrder, BadOrder; + BadOrder = GoodOrder = 0; + for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end(); + BI != BE; ++BI) { + for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) { + const Instruction &Ins = *I; + if (const GetElementPtrInst *GEP = dyn_cast(&Ins)) { + unsigned NumOp = GEP->getNumOperands(); + for (unsigned i = 0; i < NumOp; ++i) { + const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); + const SCEVAddRecExpr *AR = dyn_cast(OperandVal); + if (!AR) + continue; + // If leftmost operand comes from inner loop then it is a bad order. + if (AR->getLoop() == InnerLoop) { + BadOrder++; + break; + } + // If leftmost operand comes from outer loop then it is a good order. + if (AR->getLoop() == OuterLoop) { + GoodOrder++; + break; + } + } + } + } + } + + return GoodOrder - BadOrder; +} + +bool LoopInterchangeProfitability::isProfitable() { + // TODO: Add Better Profitibility checks. + // e.g + // 1) Construct dependency matrix and move the one with no loop carried dep + // inside to enable vectorization. + // 2) If reordering results in inner loop having stride of 1 etc. + + // This is rough cost estimation algorithm. It counts the good and bad order + // of induction variables in the instruction and allows reordering if number + // of bad orders is more than good. + int Cost = 0; + Cost += getInstrOrderCost(); + DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < 0) + return true; + return false; +} + +bool LoopInterchangeTransform::transform() { + + DEBUG(dbgs() << "transform\n"); + bool Transformed = false; + Instruction *InnerIndexVar; + PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); + if (!InductionPHI) { + DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); + return false; + } + + if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) + InnerIndexVar = dyn_cast(InductionPHI->getIncomingValue(1)); + else + InnerIndexVar = dyn_cast(InductionPHI->getIncomingValue(0)); + + // + // Split at the place were the induction variable is incremented/decremented. + // TODO: This splitting logic may not work always. Fix this. + splitInnerLoopLatch(InnerIndexVar); + DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); + + // Splits the inner loops phi nodes out into a seperate basic block. + splitInnerLoopHeader(); + DEBUG(dbgs() << "splitInnerLoopHeader Done\n"); + + // Splits the LCSSA PHI nodes into a seperate block. + splitOuterLoopLatch(); + DEBUG(dbgs() << "splitOuterLoopLatch Done\n"); + + adjustOuterLoopLatch(); + + Transformed |= adjustLoopLinks(); + if (!Transformed) { + DEBUG(dbgs() << "adjustLoopLinks Failed\n"); + return false; + } + SE->forgetLoop(OuterLoop); + SE->forgetLoop(InnerLoop); + return true; +} + +void LoopInterchangeTransform::initialize() { + InnerLoopHeader = InnerLoop->getHeader(); + OuterLoopHeader = OuterLoop->getHeader(); + InnerLoopLatch = InnerLoop->getLoopLatch(); + OuterLoopLatch = OuterLoop->getLoopLatch(); + OuterLoopPreHeader = OuterLoop->getLoopPreheader(); + InnerLoopPreHeader = InnerLoop->getLoopPreheader(); +} + +void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *inc) { + + BasicBlock::iterator I = InnerLoopLatch->begin(); + BasicBlock::iterator E = InnerLoopLatch->end(); + for (; I != E; ++I) { + if (inc == I) + break; + } + + // Split the inner loop latch out. + InnerLoopLatchPred = InnerLoopLatch; + InnerLoopLatch = SplitBlock(InnerLoopLatchPred, I, DT, LI); +} + +void LoopInterchangeTransform::splitOuterLoopLatch() { + OuterLatchLcssaPhiBlock = OuterLoopLatch; + OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock, + OuterLoopLatch->getFirstNonPHI(), DT, LI); +} + +void LoopInterchangeTransform::adjustOuterLoopLatch() { + + for (auto BI = OuterLoop->block_begin(), BE = OuterLoop->block_end(); + BI != BE; ++BI) { + BranchInst *BInstr = dyn_cast((*BI)->getTerminator()); + if (!BInstr) + continue; + unsigned NumOp = BInstr->getNumSuccessors(); + for (unsigned i = 0; i < NumOp; ++i) { + if (BInstr->getSuccessor(i) == OuterLatchLcssaPhiBlock) { + BInstr->setSuccessor(i, OuterLoopLatch); + } + } + } + // Now set OuterLatchLcssaPhiBlock as successor of OuterLoopLatch. + BranchInst *BInstr = cast(OuterLoopLatch->getTerminator()); + BasicBlock *LoopNestExit; + if (BInstr->getSuccessor(0) == OuterLoopHeader) { + LoopNestExit = BInstr->getSuccessor(1); + BInstr->setSuccessor(1, OuterLatchLcssaPhiBlock); + } else { + LoopNestExit = BInstr->getSuccessor(0); + BInstr->setSuccessor(0, OuterLatchLcssaPhiBlock); + } + + // Incoming block changed adjust PHI nodes in OuterLatchLcssaPhiBlock. + // One block will branch from outer pre header after checking the condition + // for inner loop and another from inner loop latch. + for (auto I = OuterLatchLcssaPhiBlock->begin(); isa(I); ++I) { + PHINode *LCSSAPHI = cast(I); + unsigned NumOp = LCSSAPHI->getNumIncomingValues(); + for (unsigned i = 0; i < NumOp; ++i) { + if (LCSSAPHI->getIncomingBlock(i) != OuterLoopHeader) + LCSSAPHI->setIncomingBlock(i, InnerLoopLatch); + else + LCSSAPHI->setIncomingBlock(i, OuterLoopPreHeader); + } + } + + BInstr = cast(OuterLatchLcssaPhiBlock->getTerminator()); + BInstr->setSuccessor(0, LoopNestExit); + + // Outer Loop's LCSSA nodes have been now moved outside loop. This is done + // because after interchange we need to have a check for inner loops branch + // condition in the preheader and exit the loop in case the condition fails. + LI->removeBlock(OuterLatchLcssaPhiBlock); + + // Incoming block changed adjust PHI nodes in LoopNestExit + for (auto I = LoopNestExit->begin(); isa(I); ++I) { + PHINode *PHI = cast(I); + unsigned NumOp = PHI->getNumIncomingValues(); + for (unsigned i = 0; i < NumOp; ++i) { + if (PHI->getIncomingBlock(i) == OuterLoopLatch) + PHI->setIncomingBlock(i, OuterLatchLcssaPhiBlock); + } + } +} + +void LoopInterchangeTransform::splitInnerLoopHeader() { + + // Split the inner loop header out. + InnerLoopHeaderSucc = + SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); + + DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " + "InnerLoopHeader \n"); +} + +void LoopInterchangeTransform::adjustOuterLoopPreheader() { + // Adjust the outerLoop preheader to jump to inner loop preheader and + // if the + BranchInst *outerLoopHeaderBI = + cast(OuterLoopHeader->getTerminator()); + BranchInst *outerLoopPreHeaderBI = + cast(OuterLoopPreHeader->getTerminator()); + BranchInst *ExitInst = cast(OuterLoopLatch->getTerminator()); + + BasicBlock *TrueBlock; + BasicBlock *FalseBlock; + BasicBlock *LoopNestExitBlock; + if (outerLoopHeaderBI->isUnconditional()) { + BranchInst::Create(InnerLoopPreHeader, outerLoopPreHeaderBI); + outerLoopPreHeaderBI->eraseFromParent(); + return; + } + // Find the loop nest exit block. + if (ExitInst->getSuccessor(0) == OuterLoopHeader) + LoopNestExitBlock = ExitInst->getSuccessor(1); + else + LoopNestExitBlock = ExitInst->getSuccessor(0); + // OuterLoopPreheader will branch to inner loop preheader and exit to Loop + // nest exit based on inner loops branch condition. On exit condition we + // should now branch to loop exit. + // Rewrite the Branch instruction to handle this. + if (outerLoopHeaderBI->getSuccessor(0) == InnerLoopPreHeader) { + TrueBlock = InnerLoopPreHeader; + FalseBlock = LoopNestExitBlock; + } else { + FalseBlock = InnerLoopPreHeader; + TrueBlock = LoopNestExitBlock; + } + BranchInst::Create(TrueBlock, FalseBlock, outerLoopHeaderBI->getCondition(), + outerLoopPreHeaderBI); + outerLoopPreHeaderBI->eraseFromParent(); + return; +} + +bool LoopInterchangeTransform::adjustLoopBranches() { + + SmallVector LoopPhis; + SmallVector DepInstr; + BranchInst *InnerPreheaderBI = + dyn_cast(InnerLoopPreHeader->getTerminator()); + BranchInst *InnerHeaderBI = + dyn_cast(InnerLoopHeader->getTerminator()); + BranchInst *OuterHeaderBI = + dyn_cast(OuterLoopHeader->getTerminator()); + BranchInst *InnerLoopLatchPredBI = + dyn_cast(InnerLoopLatchPred->getTerminator()); + BranchInst *OuterLoopLatchBI = + dyn_cast(OuterLoopLatch->getTerminator()); + BranchInst *InnerLoopLatchBI = + dyn_cast(InnerLoopLatch->getTerminator()); + BranchInst *InnerLoopHeaderSuccBI = + dyn_cast(InnerLoopHeaderSucc->getTerminator()); + + BasicBlock *LoopNestExit; + if (!InnerPreheaderBI || !InnerHeaderBI || !InnerPreheaderBI || + !InnerLoopLatchPredBI || !OuterLoopLatchBI || !InnerLoopLatchBI || + !InnerLoopHeaderSuccBI) + llvm_unreachable( + "This should not be triggered.We have already modified parts " + "of the loop"); + + // Update OuterLoopHeader PHI nodes as incoming block has changed. + // Collect all Phi nodes and instructions in the outer loop header. + for (auto I = OuterLoopHeader->begin(), E = OuterLoopHeader->end(); I != E; + ++I) { + Instruction *Ins = I; + if (isa(I)) + LoopPhis.push_back(cast(I)); + for (auto UI = Ins->user_begin(), UE = Ins->user_end(); UI != UE; ++UI) { + User *Use = *UI; + Instruction *U = dyn_cast(Use); + if (!U) + continue; + // find any instuctions in inner loop preheader which may requires + // instructions in outer loop pre header + if (U->getParent() == InnerLoopPreHeader) { + DepInstr.push_back(U); + } + } + } + + // Move these dependent instructions to the *new* inner loop header(old outer + // loop header) + // Insert in the same order as it was present use rbegin and rend + for (auto I = DepInstr.rbegin(), E = DepInstr.rend(); I != E; ++I) { + Instruction *Ins = *I; + Ins->moveBefore(OuterLoopHeader->getTerminator()); + } + + // Create an unconditional branch to the new splitted innerLoopHeader + // from inner loop preheader. Erase the old branch instruction. + BranchInst::Create(InnerLoopHeader, InnerPreheaderBI); + InnerPreheaderBI->eraseFromParent(); + + // Branch from the old inner loop header to outer loop header. + // Erase old branch instruction. + BranchInst::Create(OuterLoopHeader, InnerHeaderBI); + InnerHeaderBI->eraseFromParent(); + + // Adjust Phi nodes of the outer loop header. The previous incoming + // OuterLoopPreHeader is now gone and the new incoming block is + // innerLoopHeader. + while (!LoopPhis.empty()) { + PHINode *CurrIV = LoopPhis.pop_back_val(); + unsigned numIncomingBlocks = CurrIV->getNumIncomingValues(); + for (unsigned i = 0; i < numIncomingBlocks; ++i) { + if (CurrIV->getIncomingBlock(i) == OuterLoopPreHeader) { + CurrIV->setIncomingBlock(i, InnerLoopHeader); + } + } + } + + // The outer header will now branch to the InnerLoopHeaderSucc(which was + // obtained after spiltting PHI nodes of the inner loop header) instead of + // inner loop preheader. + BranchInst::Create(InnerLoopHeaderSucc, OuterHeaderBI); + OuterHeaderBI->eraseFromParent(); + + BasicBlock *InnerLoopExitBlock; + BasicBlock *InnerLoopExitIncomingBlock = nullptr; + + if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) + InnerLoopExitBlock = InnerLoopLatchBI->getSuccessor(1); + else + InnerLoopExitBlock = InnerLoopLatchBI->getSuccessor(0); + + if (InnerLoopHeaderSuccBI->getSuccessor(0) == InnerLoopLatch) { + InnerLoopHeaderSuccBI->setSuccessor(0, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = InnerLoopHeaderSucc; + } else if (!InnerLoopHeaderSuccBI->isUnconditional() && + InnerLoopHeaderSuccBI->getSuccessor(1) == InnerLoopLatch) { + InnerLoopHeaderSuccBI->setSuccessor(1, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = InnerLoopHeaderSucc; + } + + // If the split did't result in same basic block adjust InnerLoopLatchPred + if (InnerLoopHeaderSuccBI != InnerLoopLatchPredBI) { + if (InnerLoopLatchPredBI->getSuccessor(0) == InnerLoopLatch) { + InnerLoopLatchPredBI->setSuccessor(0, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = InnerLoopLatchPred; + } else if (!InnerLoopLatchPredBI->isUnconditional() && + InnerLoopLatchPredBI->getSuccessor(1) == InnerLoopLatch) { + InnerLoopLatchPredBI->setSuccessor(1, InnerLoopExitBlock); + InnerLoopExitIncomingBlock = InnerLoopLatchPred; + } + } + + // Update lcssa phi's in exitblock of old inner loop as incoming block has + // changed. + for (auto I = InnerLoopExitBlock->begin(); isa(I); ++I) { + PHINode *PHI = cast(I); + unsigned numBlocks = PHI->getNumIncomingValues(); + for (unsigned i = 0; i < numBlocks; ++i) { + if (PHI->getIncomingBlock(i) == InnerLoopLatch && + InnerLoopExitIncomingBlock != nullptr) + PHI->setIncomingBlock(i, InnerLoopExitIncomingBlock); + } + } + + // exit of outerLoop latch will now branch to inner loop latch + if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) { + LoopNestExit = OuterLoopLatchBI->getSuccessor(1); + OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch); + } else { + LoopNestExit = OuterLoopLatchBI->getSuccessor(0); + OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch); + } + + // exit of innerLoop latch will now branch to loop nest exit + if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) + InnerLoopLatchBI->setSuccessor(1, LoopNestExit); + else + InnerLoopLatchBI->setSuccessor(0, LoopNestExit); + + return true; +} + +bool LoopInterchangeTransform::adjustLoopLinks() { + + // Adjust the outerLoop preheader to jump to inner loop preheader and + // to loop nest exit based on inner loops branch condition. + adjustOuterLoopPreheader(); + + // Adjust all branches so that outer loop is moved inside and inner loop is + // moved outside. + adjustLoopBranches(); + return true; +} + +char LoopInterchange::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) + +INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) + +Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -47,6 +47,7 @@ initializeLICMPass(Registry); initializeLoopDeletionPass(Registry); initializeLoopInstSimplifyPass(Registry); + initializeLoopInterchangePass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); Index: test/Transforms/LoopInterchange/currentLimitation.ll =================================================================== --- test/Transforms/LoopInterchange/currentLimitation.ll +++ test/Transforms/LoopInterchange/currentLimitation.ll @@ -0,0 +1,165 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; These are test that fail to interchange due to current limitation. This will go off once we extend the loop interchange pass. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x [100 x [100 x i32]]] zeroinitializer + +;;--------------------------------------Test case 01------------------------------------ +;; [FIXME] This loop though valid is currently not interchanged due to the limitation that we cannot split the inner loop latch due to multiple use of inner induction +;; variable.(used to increment the loop counter and to access A[j+1][i+1] +;; for(int i=0;i=0;j--) +;; A[j][i] = A[j][i]+k; + +define void @interchange_02(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc10, %entry + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 + %0 = load i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp2 = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: ; preds = %for.body3 + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond = icmp eq i64 %indvars.iv.next20, 100 + br i1 %exitcond, label %for.end11, label %for.cond1.preheader + +for.end11: ; preds = %for.inc10 + ret void +} + +; CHECK-LABEL: @interchange_02 +; CHECK: entry: +; CHECK: br label %for.body3.preheader + +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv19 = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next20, %for.inc10.split ] +; CHECK: br label %for.body3.split1 +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 + +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] +; CHECK: br label %for.cond1.preheader + +; CHECK: for.body3.split1: +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 +; CHECK: %0 = load i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: br label %for.inc10.split + +; CHECK: for.body3.split: +; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 +; CHECK: br i1 %cmp2, label %for.body3, label %for.inc10 + +; CHECK: for.inc10: +; CHECK: br label %for.end11 + +; CHECK: for.inc10.split: +; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 +; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader + +; CHECK: for.end11: +; CHECK: ret void + + +;;--------------------------------------Test case 03------------------------------------- +;; Loops should not be interchanged in this case as it is not profitable. +;; for(int i=0;i<100;i++) +;; for(int j=0;j<100;j++) +;; A[i][j] = A[i][j]+k; + +define void @interchange_03(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv + %0 = load i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.inc10, label %for.body3 + +for.inc10: + %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 + %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 + br i1 %exitcond23, label %for.end12, label %for.cond1.preheader + +for.end12: + ret void +} + +; CHECK-LABEL: @interchange_03 +; CHECK: entry: +; CHECK: br label %for.cond1.preheader + +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] +; CHECK: br label %for.body3 + +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv +; CHECK: %0 = load i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 100 +; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3 + +; CHECK: for.inc10: +; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 +; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 +; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader + +; CHECK: for.end12: +; CHECK: ret void + + +;;--------------------------------------Test case 04------------------------------------- +;; Loops should not be interchanged in this case as it is not legal due to dependency. +;; for(int j=0;j<99;j++) +;; for(int i=0;i<99;i++) +;; A[j][i+1] = A[j+1][i]+k; + +define void @interchange_04(i32 %k){ +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv + %0 = load i32* %arrayidx5 + %add6 = add nsw i32 %0, %k + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next + store i32 %add6, i32* %arrayidx11 + %exitcond = icmp eq i64 %indvars.iv.next, 99 + br i1 %exitcond, label %for.inc12, label %for.body3 + +for.inc12: + %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 + br i1 %exitcond25, label %for.end14, label %for.cond1.preheader + +for.end14: + ret void +} + +; CHECK-LABEL: @interchange_04 +; CHECK: entry: +; CHECK: br label %for.cond1.preheader + +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] +; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 +; CHECK: br label %for.body3.preheader + +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 + +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv +; CHECK: %0 = load i32* %arrayidx5 +; CHECK: %add6 = add nsw i32 %0, %k +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next +; CHECK: store i32 %add6, i32* %arrayidx11 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99 +; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3 + +; CHECK: for.inc12: +; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 +; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader + +; CHECK: for.end14: +; CHECK: ret void + + + +;;--------------------------------------Test case 05------------------------------------- +;; Loops not tightly nested are not interchanged +;; for(int j=0;j