Index: include/llvm-c/Transforms/Scalar.h =================================================================== --- include/llvm-c/Transforms/Scalar.h +++ include/llvm-c/Transforms/Scalar.h @@ -65,6 +65,9 @@ /** See llvm::createLoopRotatePass function. */ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM); +/** See llvm::createLoopRerollPass function. */ +void LLVMAddLoopRerollPass(LLVMPassManagerRef PM); + /** See llvm::createLoopUnrollPass function. */ void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -162,6 +162,7 @@ void initializeLoopSimplifyPass(PassRegistry&); void initializeLoopStrengthReducePass(PassRegistry&); void initializeGlobalMergePass(PassRegistry&); +void initializeLoopRerollPass(PassRegistry&); void initializeLoopUnrollPass(PassRegistry&); void initializeLoopUnswitchPass(PassRegistry&); void initializeLoopIdiomRecognizePass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -91,6 +91,7 @@ (void) llvm::createLoopExtractorPass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopStrengthReducePass(); + (void) llvm::createLoopRerollPass(); (void) llvm::createLoopUnrollPass(); (void) llvm::createLoopUnswitchPass(); (void) llvm::createLoopIdiomPass(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -142,6 +142,12 @@ //===----------------------------------------------------------------------===// // +// LoopReroll - This pass is a simple loop rerolling pass. +// +Pass *createLoopRerollPass(); + +//===----------------------------------------------------------------------===// +// // LoopRotate - This pass is a simple loop rotating pass. // Pass *createLoopRotatePass(); Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -16,6 +16,7 @@ LoopInstSimplify.cpp LoopRotation.cpp LoopStrengthReduce.cpp + LoopRerollPass.cpp LoopUnrollPass.cpp LoopUnswitch.cpp LowerAtomic.cpp Index: lib/Transforms/Scalar/LoopRerollPass.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/LoopRerollPass.cpp @@ -0,0 +1,806 @@ +//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple loop reroller. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-reroll" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +STATISTIC(NumRerolledLoops, "Number of rerolled loops"); + +static cl::opt +MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, + cl::desc("The maximum increment for loop rerolling")); + +namespace { + class LoopReroll : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopReroll() : LoopPass(ID) { + initializeLoopRerollPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... + /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + + AU.addRequired(); + AU.addPreserved(); + + AU.addRequired(); + AU.addPreserved(); + + AU.addRequired(); + + AU.addRequired(); + } + +protected: + AliasAnalysis *AA; + LoopInfo *LI; + ScalarEvolution *SE; + DataLayout *DL; + TargetLibraryInfo *TLI; + DominatorTree *DT; + + typedef std::pair IVInfo; + void collectPossibleIVs(Loop *L, SmallVector &PossibleIVs); + void collectPossibleReductions(Loop *L, + SmallVector, 16> &PossibleReds); + void collectInLoopUserSet(Loop *L, + const SmallVector &Roots, + const SmallSet &Exclude, + const SmallSet &Final, + DenseSet &Users); + void collectInLoopUserSet(Loop *L, + Instruction * Root, + const SmallSet &Exclude, + const SmallSet &Final, + DenseSet &Users); + }; +} + +char LoopReroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) + +Pass *llvm::createLoopRerollPass() { + return new LoopReroll; +} + +// Returns true if the provided instruction is used outside the given loop. +// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in +// non-loop blocks to be outside the loop. +static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast(*UI); + if (!L->contains(User)) + return true; + } + + return false; +} + +void LoopReroll::collectPossibleIVs(Loop *L, + SmallVector &PossibleIVs) { + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + for (BasicBlock::iterator J = (*I)->begin(), + JE = (*I)->getFirstInsertionPt(); J != JE; ++J) { + if (!isa(J)) + continue; + if (!J->getType()->isIntegerTy()) + continue; + + if (const SCEVAddRecExpr *PHISCEV = + dyn_cast(SE->getSCEV(J))) { + if (PHISCEV->getLoop() != L) + continue; + if (!PHISCEV->isAffine()) + continue; + if (const SCEVConstant *IncSCEV = + dyn_cast(PHISCEV->getOperand(1))) { + if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + continue; + if (IncSCEV->getValue()->uge(MaxInc)) + continue; + + DEBUG(dbgs() << "LRR: Possible IV: " << *J << " = " << + *PHISCEV << "\n"); + PossibleIVs.push_back(IVInfo(J, PHISCEV)); + } + } + } + } +} + +void LoopReroll::collectPossibleReductions(Loop *L, + SmallVector, 16> &PossibleReds) { + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + for (BasicBlock::iterator J = (*I)->begin(), + JE = (*I)->getFirstInsertionPt(); J != JE; ++J) { + if (!isa(J)) + continue; + if (!J->getType()->isSingleValueType()) + continue; + + // The reduction variable must be a chain of single-use instructions + // (including the PHI), except for the last value (which is used by the + // PHI and also outside the loop). + bool ChainGood = true; + SmallVector Chain(1, J); + Instruction *C = J; + do { + C = cast(*C->use_begin()); + if (C->hasOneUse()) { + if (!C->isBinaryOp()) { + ChainGood = false; + break; + } + + if (!(isa(Chain.back()) || + C->isSameOperationAs(Chain.back()))) { + ChainGood = false; + break; + } + + Chain.push_back(C); + } + } while (C->hasOneUse() && ChainGood); + + if (!ChainGood || Chain.size() < 2 || + !C->isSameOperationAs(Chain.back()) || C->use_begin() == C->use_end()) + continue; + + // C is now the (potential) last instruction in the reduction chain. + for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end(); + UI != UIE; ++UI) { + // The only in-loop user can be the initial PHI. + if (L->contains(cast(*UI))) + if (cast(*UI ) != Chain.front()) { + ChainGood = false; + break; + } + } + + if (!ChainGood) + continue; + + Chain.push_back(C); + + DEBUG(dbgs() << "LRR: Possible reduction: " << *J << " (with " << + Chain.size() << " chained instructions)\n"); + PossibleReds.push_back(Chain); + } + } +} + +void LoopReroll::collectInLoopUserSet(Loop *L, + const SmallVector &Roots, + const SmallSet &Exclude, + const SmallSet &Final, + DenseSet &Users) { + for (SmallVector::const_iterator I = Roots.begin(), + IE = Roots.end(); I != IE; ++I) + collectInLoopUserSet(L, *I, Exclude, Final, Users); +} + +void LoopReroll::collectInLoopUserSet(Loop *L, + Instruction *Root, const SmallSet &Exclude, + const SmallSet &Final, + DenseSet &Users) { + SmallVector Queue(1, Root); + while (!Queue.empty()) { + Instruction *I = Queue.pop_back_val(); + if (!Users.insert(I).second) + continue; + + if (!Final.count(I)) + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast(*UI); + if (PHINode *PN = dyn_cast(User)) { + // Ignore "wrap-around" uses to PHIs of this loop's header. + if (PN->getIncomingBlock(UI) == L->getHeader()) + continue; + } + + if (L->contains(User) && !Exclude.count(User)) { + Queue.push_back(User); + } + } + + // We also want to collect single-user "feeder" values. + for (User::op_iterator OI = I->op_begin(), + OIE = I->op_end(); OI != OIE; ++OI) { + if (Instruction *Op = dyn_cast(*OI)) + if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && + !Final.count(Op)) + Queue.push_back(Op); + } + } +} + +static bool isSimpleLoadStore(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast(I)) + return !MI->isVolatile(); + return false; +} + +bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { + AA = &getAnalysis(); + LI = &getAnalysis(); + SE = &getAnalysis(); + TLI = &getAnalysis(); + DL = getAnalysisIfAvailable(); + DT = &getAnalysis(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << + "] Loop %" << Header->getName() << " (" << + L->getNumBlocks() << " block(s))\n"); + + bool Changed = false; + + // For now, we'll handle only single BB loops. + if (L->getNumBlocks() > 1) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + const SCEV *LIBETC = SE->getBackedgeTakenCount(L); + const SCEV *IterCount = + SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); + + // First, we need to find the induction variable with respect to which we can + // reroll (there may be several possible options). + SmallVector PossibleIVs; + collectPossibleIVs(L, PossibleIVs); + + if (PossibleIVs.empty()) { + DEBUG(dbgs() << "LRR: No possible IVs found\n"); + return Changed; + } + + SmallVector, 16> PossibleReds; + collectPossibleReductions(L, PossibleReds); + + // For each possible IV, collect the associated possible set of 'root' nodes + // (i+1, i+2, etc.). + for (SmallVector::iterator I = PossibleIVs.begin(), + IE = PossibleIVs.end(); I != IE; ++I) { + uint64_t Inc = cast(I->second->getOperand(1))-> + getValue()->getZExtValue(); + // The collection of loop increment instructions. + SmallVector LoopIncs; + uint64_t Scale = Inc; + + Instruction *IV = I->first; + if (Inc == 1) { + // This is a special case: here we're looking for all uses (except for + // the increment) to be multiplied by a common factor. The increment must + // be by one. + if (I->first->getNumUses() != 2) + continue; + Instruction *User1 = cast(*I->first->use_begin()), + *User2 = cast(*llvm::next( + I->first->use_begin())); + if (User1->getOpcode() != Instruction::Mul) + std::swap(User1, User2); + + if (User1->getOpcode() != Instruction::Mul) + continue; + Value *MulScale = + User1->getOperand(User1->getOperand(1) == I->first ? 0 : 1); + if (ConstantInt *MulScaleCI = dyn_cast(MulScale)) { + if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) + continue; + Scale = MulScaleCI->getZExtValue(); + IV = User1; + } else + continue; + + if (User2->getOpcode() != Instruction::Add) + continue; + Value *AddInc = + User2->getOperand(User2->getOperand(1) == I->first ? 0 : 1); + if (ConstantInt *AddIncCI = dyn_cast(AddInc)) { + if (!AddIncCI->isOne()) + continue; + LoopIncs.push_back(User2); + } else + continue; + + DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); + } + + assert(Scale <= MaxInc && "Scale is too large"); + assert(Scale > 1 && "Scale must be at least 2"); + + // The set of increment instructions for each increment value. + SmallVector, 32> Roots(Scale-1); + SmallSet AllRoots; + + for (Value::use_iterator UI = IV->use_begin(), + UIE = IV->use_end(); UI != UIE; ++UI) { + Instruction *User = cast(*UI); + if (!SE->isSCEVable(User->getType())) + continue; + if (User->getType() != IV->getType()) + continue; + if (!L->contains(User)) + continue; + if (hasUsesOutsideLoop(User, L)) + continue; + + if (const SCEVConstant *Diff = dyn_cast(SE->getMinusSCEV( + SE->getSCEV(User), SE->getSCEV(IV)))) { + uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); + if (Idx > 0 && Idx < Scale) { + Roots[Idx-1].push_back(User); + AllRoots.insert(User); + } else if (Idx == Scale && Inc > 1) { + LoopIncs.push_back(User); + } + } + } + + if (Roots[0].empty()) + continue; + bool AllSame = true; + for (unsigned i = 1; i < Scale-1; ++i) + if (Roots[i].size() != Roots[0].size()) { + AllSame = false; + break; + } + + if (!AllSame) + continue; + + DEBUG(dbgs() << "LRR: Found all root induction increments for: " << + *I->first << "\n"); + + + // An array of just the possible reductions for this scale factor. When we + // collect the set of all users of some root instructions, these reduction + // instructions are treated as 'final' (their uses are not considered). + // This is important because we don't want the root use set to search down + // the reduction chain. + SmallSet PossibleRedSet; + SmallSet PossibleRedLastSet, PossibleRedFirstSet; + DenseMap PossibleRedIdx; + DenseMap PossibleRedIter; + DenseSet Reds; + for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) + if (PossibleReds[i].size() % (Scale + 1) == 0) { + PossibleRedLastSet.insert(PossibleReds[i].back()); + PossibleRedFirstSet.insert(PossibleReds[i].front()); + + for (SmallVector::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + PossibleRedSet.insert(*J); + PossibleRedIdx[*J] = i; + } + } + + // We now need to check for equivalence of the use graph of each root with + // that of the primary induction variable (excluding the roots). Our goal + // here is not to solve the full graph isomorphism problem, but rather to + // catch common cases without a lot of work. As a result, we will assume + // that the relative order of the instructions in each unrolled iteration + // is the same (although we will not make an assumption about how the + // different iterations are intermixed). Note that while the order must be + // the same, the instructions may not be in the same basic block. + SmallSet Exclude(AllRoots); + Exclude.insert(LoopIncs.begin(), LoopIncs.end()); + + DenseSet BaseUseSet; + collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); + + DenseSet AllRootUses; + std::vector > RootUseSets(Scale-1); + + bool MatchFailed = false; + for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { + DenseSet &RootUseSet = RootUseSets[i]; + collectInLoopUserSet(L, Roots[i], SmallSet(), + PossibleRedSet, RootUseSet); + + DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << + " vs. iteration increment " << (i+1) << + " use set size: " << RootUseSet.size() << "\n"); + + if (BaseUseSet.size() != RootUseSet.size()) { + MatchFailed = true; + break; + } + + // In addition to regular aliasing information, we need to look for + // instructions from later (future) iterations that have side effects + // preventing us from reordering them past other instructions with side + // effects. + bool FutureSideEffects = false; + AliasSetTracker AST(*AA); + DenseMap BaseMap; + + assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); + for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), + JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { + if (cast(J1) == I->first) + continue; + if (cast(J1) == IV) + continue; + if (!BaseUseSet.count(J1)) + continue; + if (PossibleRedFirstSet.count(J1)) // Skip reduction PHIs. + continue; + + while (J2 != JE && (!RootUseSet.count(J2) || + std::find(Roots[i].begin(), Roots[i].end(), J2) != + Roots[i].end())) { + // As we iterate through the instructions, instructions that don't + // belong to previous iterations (or the base case), must belong to + // future iterations. We want to track the alias set of writes from + // previous iterations. + if (!isa(J2) && !BaseUseSet.count(J2) && + !AllRootUses.count(J2)) { + if (J2->mayWriteToMemory()) + AST.add(J2); + + // Note: This is specifically guarded by a check on isa, + // which while a valid (somewhat arbitrary) micro-optimization, is + // needed because otherwise isSafeToSpeculativelyExecute returns + // false on PHI nodes. + if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) + FutureSideEffects = true; + } + + ++J2; + } + + if (!J1->isSameOperationAs(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << "\n"); + MatchFailed = true; + break; + } + + // Make sure that this instruction, which is in the use set of this + // root instruction, does not also belong to the base set or the set of + // some previous root instruction. + if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (prev. case overlap)\n"); + MatchFailed = true; + break; + } + + // Make sure that we don't alias with any instruction in the alias set + // tracker. If we do, then we depend on a future iteration, and we + // can't reroll. + if (J2->mayReadFromMemory()) { + for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); + K != KE && !MatchFailed; ++K) { + if (K->aliasesUnknownInst(J2, *AA)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (depends on future store)\n"); + MatchFailed = true; + break; + } + } + } + + // If we've past an instruction from a future iteration that may have + // side effects, and this instruction might also, then we can't reorder + // them, and this matching fails. As an exception, we allow the alias + // set tracker to handle regular (simple) load/store dependencies. + if (FutureSideEffects && + ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || + (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << + " (side effects prevent reordering)\n"); + MatchFailed = true; + break; + } + + // Don't match the operands for a reduction instruction. + bool MatchOps = true; + DenseMap::iterator J1I = PossibleRedIdx.find(J1); + if (J1I != PossibleRedIdx.end()) { + DenseMap::iterator J2I = PossibleRedIdx.find(J2); + if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) + MatchOps = false; + } + + if (MatchOps) + for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { + Value *Op2 = J2->getOperand(j); + + DenseMap::iterator BMI = BaseMap.find(Op2); + if (BMI != BaseMap.end()) + Op2 = BMI->second; + else if (std::find(Roots[i].begin(), Roots[i].end(), + (Instruction*) Op2) != Roots[i].end()) + Op2 = IV; + + if (J1->getOperand(j) != Op2) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (operand " << j << ")\n"); + MatchFailed = true; + break; + } + } + + if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || + (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (uses outside loop)\n"); + MatchFailed = true; + break; + } + + if (!MatchFailed) + BaseMap.insert(std::pair(J2, J1)); + + AllRootUses.insert(J2); + + if (PossibleRedSet.count(J1)) { + PossibleRedIter[J1] = 0; + Reds.insert(PossibleRedIdx[J1]); + } + if (PossibleRedSet.count(J2)) { + PossibleRedIter[J2] = i+1; + Reds.insert(PossibleRedIdx[J2]); + } + + ++J2; + } + } + + if (MatchFailed) + continue; + + DEBUG(dbgs() << "LRR: Matched all iteration increments for " << + *I->first << "\n"); + + DenseSet LoopIncUseSet; + collectInLoopUserSet(L, LoopIncs, SmallSet(), + SmallSet(), LoopIncUseSet); + DEBUG(dbgs() << "LRR: Loop increment set size: " << + LoopIncUseSet.size() << "\n"); + + // Make sure that all instructions in the loop have been included in some + // use set. + for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); + J != JE; ++J) { + if (isa(J)) + continue; + if (cast(J) == I->first) + continue; + if (cast(J) == IV) + continue; + if (BaseUseSet.count(J) || AllRootUses.count(J) || + (LoopIncUseSet.count(J) && (J->isTerminator() || + isSafeToSpeculativelyExecute(J, DL)))) + continue; + + bool Found = false; + for (unsigned i = 0; i < Scale-1; ++i) { + if (std::find(Roots[i].begin(), Roots[i].end(), J) != Roots[i].end()) { + Found = true; + break; + } + } + + if (Found) + continue; + + for (DenseSet::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE && !MatchFailed; ++RI) { + int i = *RI; + if (cast(J) == PossibleReds[i].front()) { + Found = true; + break; + } + } + + if (Found) + continue; + + DEBUG(dbgs() << "LRR: aborting reroll based on " << *I->first << + " unprocessed instruction found: " << *J << "\n"); + MatchFailed = true; + break; + } + + if (MatchFailed) + continue; + + DEBUG(dbgs() << "LRR: all instructions processed from " << + *I->first << "\n"); + + // For a non-associative reduction, the chain entries must appear in order. + for (DenseSet::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE && !MatchFailed; ++RI) { + int i = *RI; + int PrevIter = 0, BaseCount = 0, Count = 0; + for (SmallVector::iterator J = + llvm::next(PossibleReds[i].begin()), JE = PossibleReds[i].end(); + J != JE; ++J) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[*J]; + if (Iter != PrevIter && Iter != PrevIter + 1 && + !PossibleReds[i].back()->isAssociative()) { + DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << + *J << "\n"); + MatchFailed = true; + break; + } + + if (Iter != PrevIter) { + if (Count != BaseCount) { + DEBUG(dbgs() << "LRR: Iteration " << PrevIter << + " reduction use count " << Count << + " is not equal to the base use count " << + BaseCount << "\n"); + MatchFailed = true; + break; + } + + Count = 0; + } + + ++Count; + if (Iter == 0) + ++BaseCount; + + PrevIter = Iter; + } + } + + if (MatchFailed) + continue; + + Changed = true; + + // Fixup reductions to refer to the last instruction associated with the + // first iteration (not the last). + for (DenseSet::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int j = 1; + for (int e = PossibleReds[i].size(); j != e; ++j) + if (PossibleRedIter[PossibleReds[i][j]] != 0) { + --j; + break; + } + + // Replace users with the new end-of-chain value. + SmallVector Users; + for (Value::use_iterator UI = PossibleReds[i].back()->use_begin(), + UIE = PossibleReds[i].back()->use_end(); UI != UIE; ++UI) + Users.push_back(cast(*UI)); + + for (SmallVector::iterator J = Users.begin(), + JE = Users.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(PossibleReds[i].back(), PossibleReds[i][j]); + } + + // Remove instructions associated with non-base iterations. + for (BasicBlock::reverse_iterator J = Header->rbegin(); + J != Header->rend();) { + if (AllRootUses.count(&*J)) { + Instruction *D = &*J; + DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); + D->eraseFromParent(); + continue; + } + + ++J; + } + + // Insert the new induction variable. + const SCEV *Start = I->second->getOperand(0); + if (Inc == 1) + Start = SE->getMulExpr(Start, + SE->getConstant(Start->getType(), Scale)); + const SCEVAddRecExpr *H = + cast(SE->getAddRecExpr(Start, + SE->getConstant(I->second->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. + SCEVExpander Expander(*SE, "reroll"); + PHINode *NewIV = + cast(Expander.expandCodeFor(H, IV->getType(), + Header->begin())); + for (DenseSet::iterator J = BaseUseSet.begin(), + JE = BaseUseSet.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(IV, NewIV); + + if (BranchInst *BI = dyn_cast(Header->getTerminator())) { + if (LoopIncUseSet.count(BI)) { + const SCEV *ICSCEV = I->second->evaluateAtIteration(IterCount, *SE); + if (Inc == 1) + ICSCEV = + SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); + Value *IC; + if (isa(ICSCEV)) { + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, this); + + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); + Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC, + "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } + } + } + + SimplifyInstructionsInBlock(Header, DL, TLI); + DeleteDeadPHIs(Header, TLI); + ++NumRerolledLoops; + break; + } + + return Changed; +} + Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -43,6 +43,7 @@ initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); + initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnswitchPass(Registry); initializeLoopIdiomRecognizePass(Registry); @@ -111,6 +112,10 @@ unwrap(PM)->add(createLoopRotatePass()); } +void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopRerollPass()); +} + void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollPass()); } Index: test/Transforms/LoopReroll/basic.ll =================================================================== --- /dev/null +++ test/Transforms/LoopReroll/basic.ll @@ -0,0 +1,327 @@ +; RUN: opt < %s -loop-reroll -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; int foo(int a); +; void bar(int *x) { +; for (int i = 0; i < 500; i += 3) { +; foo(i); +; foo(i+1); +; foo(i+2); +; } +; } + +; Function Attrs: nounwind uwtable +define void @bar(i32* nocapture readnone %x) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %call = tail call i32 @foo(i32 %i.08) #1 + %add = add nsw i32 %i.08, 1 + %call1 = tail call i32 @foo(i32 %add) #1 + %add2 = add nsw i32 %i.08, 2 + %call3 = tail call i32 @foo(i32 %add2) #1 + %add3 = add nsw i32 %i.08, 3 + %exitcond = icmp eq i32 %add3, 500 + br i1 %exitcond, label %for.end, label %for.body + +; CHECK-LABEL: @bar + +; CHECK: for.body: +; CHECK: %indvar = phi i32 [ %indvar.next, %for.body ], [ 0, %entry ] +; CHECK: %call = tail call i32 @foo(i32 %indvar) #1 +; CHECK: %indvar.next = add i32 %indvar, 1 +; CHECK: %exitcond1 = icmp eq i32 %indvar.next, 498 +; CHECK: br i1 %exitcond1, label %for.end, label %for.body + +; CHECK: ret + +for.end: ; preds = %for.body + ret void +} + +declare i32 @foo(i32) + +; void hi1(int *x) { +; for (int i = 0; i < 1500; i += 3) { +; x[i] = foo(0); +; x[i+1] = foo(0); +; x[i+2] = foo(0); +; } +; } + +; Function Attrs: nounwind uwtable +define void @hi1(i32* nocapture %x) #0 { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %call = tail call i32 @foo(i32 0) #1 + %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv + store i32 %call, i32* %arrayidx, align 4 + %call1 = tail call i32 @foo(i32 0) #1 + %0 = add nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32* %x, i64 %0 + store i32 %call1, i32* %arrayidx3, align 4 + %call4 = tail call i32 @foo(i32 0) #1 + %1 = add nsw i64 %indvars.iv, 2 + %arrayidx7 = getelementptr inbounds i32* %x, i64 %1 + store i32 %call4, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3 + %2 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %2, 1500 + br i1 %cmp, label %for.body, label %for.end + +; CHECK-LABEL: @hi1 + +; CHECK: for.body: +; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] +; CHECK: %call = tail call i32 @foo(i32 0) #1 +; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvar +; CHECK: store i32 %call, i32* %arrayidx, align 4 +; CHECK: %indvar.next = add i64 %indvar, 1 +; CHECK: %exitcond = icmp eq i64 %indvar.next, 1500 +; CHECK: br i1 %exitcond, label %for.end, label %for.body + +; CHECK: ret + +for.end: ; preds = %for.body + ret void +} + +; void hi2(int *x) { +; for (int i = 0; i < 500; ++i) { +; x[3*i] = foo(0); +; x[3*i+1] = foo(0); +; x[3*i+2] = foo(0); +; } +; } + +; Function Attrs: nounwind uwtable +define void @hi2(i32* nocapture %x) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %call = tail call i32 @foo(i32 0) #1 + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds i32* %x, i64 %0 + store i32 %call, i32* %arrayidx, align 4 + %call1 = tail call i32 @foo(i32 0) #1 + %1 = add nsw i64 %0, 1 + %arrayidx4 = getelementptr inbounds i32* %x, i64 %1 + store i32 %call1, i32* %arrayidx4, align 4 + %call5 = tail call i32 @foo(i32 0) #1 + %2 = add nsw i64 %0, 2 + %arrayidx9 = getelementptr inbounds i32* %x, i64 %2 + store i32 %call5, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 500 + br i1 %exitcond, label %for.end, label %for.body + +; CHECK-LABEL: @hi2 + +; CHECK: for.body: +; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: %call = tail call i32 @foo(i32 0) #1 +; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv +; CHECK: store i32 %call, i32* %arrayidx, align 4 +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %exitcond1 = icmp eq i64 %indvars.iv.next, 1500 +; CHECK: br i1 %exitcond1, label %for.end, label %for.body + +; CHECK: ret + +for.end: ; preds = %for.body + ret void +} + +; void goo(float alpha, float *a, float *b) { +; for (int i = 0; i < 3200; i += 5) { +; a[i] += alpha * b[i]; +; a[i + 1] += alpha * b[i + 1]; +; a[i + 2] += alpha * b[i + 2]; +; a[i + 3] += alpha * b[i + 3]; +; a[i + 4] += alpha * b[i + 4]; +; } +; } + +; Function Attrs: nounwind uwtable +define void @goo(float %alpha, float* nocapture %a, float* nocapture readonly %b) #0 { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + %0 = load float* %arrayidx, align 4 + %mul = fmul float %0, %alpha + %arrayidx2 = getelementptr inbounds float* %a, i64 %indvars.iv + %1 = load float* %arrayidx2, align 4 + %add = fadd float %1, %mul + store float %add, float* %arrayidx2, align 4 + %2 = add nsw i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds float* %b, i64 %2 + %3 = load float* %arrayidx5, align 4 + %mul6 = fmul float %3, %alpha + %arrayidx9 = getelementptr inbounds float* %a, i64 %2 + %4 = load float* %arrayidx9, align 4 + %add10 = fadd float %4, %mul6 + store float %add10, float* %arrayidx9, align 4 + %5 = add nsw i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds float* %b, i64 %5 + %6 = load float* %arrayidx13, align 4 + %mul14 = fmul float %6, %alpha + %arrayidx17 = getelementptr inbounds float* %a, i64 %5 + %7 = load float* %arrayidx17, align 4 + %add18 = fadd float %7, %mul14 + store float %add18, float* %arrayidx17, align 4 + %8 = add nsw i64 %indvars.iv, 3 + %arrayidx21 = getelementptr inbounds float* %b, i64 %8 + %9 = load float* %arrayidx21, align 4 + %mul22 = fmul float %9, %alpha + %arrayidx25 = getelementptr inbounds float* %a, i64 %8 + %10 = load float* %arrayidx25, align 4 + %add26 = fadd float %10, %mul22 + store float %add26, float* %arrayidx25, align 4 + %11 = add nsw i64 %indvars.iv, 4 + %arrayidx29 = getelementptr inbounds float* %b, i64 %11 + %12 = load float* %arrayidx29, align 4 + %mul30 = fmul float %12, %alpha + %arrayidx33 = getelementptr inbounds float* %a, i64 %11 + %13 = load float* %arrayidx33, align 4 + %add34 = fadd float %13, %mul30 + store float %add34, float* %arrayidx33, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %14 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %14, 3200 + br i1 %cmp, label %for.body, label %for.end + +; CHECK-LABEL: @goo + +; CHECK: for.body: +; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] +; CHECK: %arrayidx = getelementptr inbounds float* %b, i64 %indvar +; CHECK: %0 = load float* %arrayidx, align 4 +; CHECK: %mul = fmul float %0, %alpha +; CHECK: %arrayidx2 = getelementptr inbounds float* %a, i64 %indvar +; CHECK: %1 = load float* %arrayidx2, align 4 +; CHECK: %add = fadd float %1, %mul +; CHECK: store float %add, float* %arrayidx2, align 4 +; CHECK: %indvar.next = add i64 %indvar, 1 +; CHECK: %exitcond = icmp eq i64 %indvar.next, 3200 +; CHECK: br i1 %exitcond, label %for.end, label %for.body + +; CHECK: ret + +for.end: ; preds = %for.body + ret void +} + +; void hoo(float alpha, float *a, float *b, int *ip) { +; for (int i = 0; i < 3200; i += 5) { +; a[i] += alpha * b[ip[i]]; +; a[i + 1] += alpha * b[ip[i + 1]]; +; a[i + 2] += alpha * b[ip[i + 2]]; +; a[i + 3] += alpha * b[ip[i + 3]]; +; a[i + 4] += alpha * b[ip[i + 4]]; +; } +; } + +; Function Attrs: nounwind uwtable +define void @hoo(float %alpha, float* nocapture %a, float* nocapture readonly %b, i32* nocapture readonly %ip) #0 { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %ip, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom1 + %1 = load float* %arrayidx2, align 4 + %mul = fmul float %1, %alpha + %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv + %2 = load float* %arrayidx4, align 4 + %add = fadd float %2, %mul + store float %add, float* %arrayidx4, align 4 + %3 = add nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds i32* %ip, i64 %3 + %4 = load i32* %arrayidx7, align 4 + %idxprom8 = sext i32 %4 to i64 + %arrayidx9 = getelementptr inbounds float* %b, i64 %idxprom8 + %5 = load float* %arrayidx9, align 4 + %mul10 = fmul float %5, %alpha + %arrayidx13 = getelementptr inbounds float* %a, i64 %3 + %6 = load float* %arrayidx13, align 4 + %add14 = fadd float %6, %mul10 + store float %add14, float* %arrayidx13, align 4 + %7 = add nsw i64 %indvars.iv, 2 + %arrayidx17 = getelementptr inbounds i32* %ip, i64 %7 + %8 = load i32* %arrayidx17, align 4 + %idxprom18 = sext i32 %8 to i64 + %arrayidx19 = getelementptr inbounds float* %b, i64 %idxprom18 + %9 = load float* %arrayidx19, align 4 + %mul20 = fmul float %9, %alpha + %arrayidx23 = getelementptr inbounds float* %a, i64 %7 + %10 = load float* %arrayidx23, align 4 + %add24 = fadd float %10, %mul20 + store float %add24, float* %arrayidx23, align 4 + %11 = add nsw i64 %indvars.iv, 3 + %arrayidx27 = getelementptr inbounds i32* %ip, i64 %11 + %12 = load i32* %arrayidx27, align 4 + %idxprom28 = sext i32 %12 to i64 + %arrayidx29 = getelementptr inbounds float* %b, i64 %idxprom28 + %13 = load float* %arrayidx29, align 4 + %mul30 = fmul float %13, %alpha + %arrayidx33 = getelementptr inbounds float* %a, i64 %11 + %14 = load float* %arrayidx33, align 4 + %add34 = fadd float %14, %mul30 + store float %add34, float* %arrayidx33, align 4 + %15 = add nsw i64 %indvars.iv, 4 + %arrayidx37 = getelementptr inbounds i32* %ip, i64 %15 + %16 = load i32* %arrayidx37, align 4 + %idxprom38 = sext i32 %16 to i64 + %arrayidx39 = getelementptr inbounds float* %b, i64 %idxprom38 + %17 = load float* %arrayidx39, align 4 + %mul40 = fmul float %17, %alpha + %arrayidx43 = getelementptr inbounds float* %a, i64 %15 + %18 = load float* %arrayidx43, align 4 + %add44 = fadd float %18, %mul40 + store float %add44, float* %arrayidx43, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %19 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %19, 3200 + br i1 %cmp, label %for.body, label %for.end + +; CHECK-LABEL: @hoo + +; CHECK: for.body: +; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] +; CHECK: %arrayidx = getelementptr inbounds i32* %ip, i64 %indvar +; CHECK: %0 = load i32* %arrayidx, align 4 +; CHECK: %idxprom1 = sext i32 %0 to i64 +; CHECK: %arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom1 +; CHECK: %1 = load float* %arrayidx2, align 4 +; CHECK: %mul = fmul float %1, %alpha +; CHECK: %arrayidx4 = getelementptr inbounds float* %a, i64 %indvar +; CHECK: %2 = load float* %arrayidx4, align 4 +; CHECK: %add = fadd float %2, %mul +; CHECK: store float %add, float* %arrayidx4, align 4 +; CHECK: %indvar.next = add i64 %indvar, 1 +; CHECK: %exitcond = icmp eq i64 %indvar.next, 3200 +; CHECK: br i1 %exitcond, label %for.end, label %for.body + +; CHECK: ret + +for.end: ; preds = %for.body + ret void +} + +attributes #0 = { nounwind uwtable } +attributes #1 = { nounwind } + Index: test/Transforms/LoopReroll/reduction.ll =================================================================== --- /dev/null +++ test/Transforms/LoopReroll/reduction.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -loop-reroll -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo(i32* nocapture readonly %x) #0 { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ] + %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %r.029 + %1 = or i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32* %x, i64 %1 + %2 = load i32* %arrayidx3, align 4 + %add4 = add nsw i32 %add, %2 + %3 = or i64 %indvars.iv, 2 + %arrayidx7 = getelementptr inbounds i32* %x, i64 %3 + %4 = load i32* %arrayidx7, align 4 + %add8 = add nsw i32 %add4, %4 + %5 = or i64 %indvars.iv, 3 + %arrayidx11 = getelementptr inbounds i32* %x, i64 %5 + %6 = load i32* %arrayidx11, align 4 + %add12 = add nsw i32 %add8, %6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %7 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %7, 400 + br i1 %cmp, label %for.body, label %for.end + +; CHECK-LABEL: @foo + +; CHECK: for.body: +; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ] +; CHECK: %r.029 = phi i32 [ 0, %entry ], [ %add, %for.body ] +; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvar +; CHECK: %0 = load i32* %arrayidx, align 4 +; CHECK: %add = add nsw i32 %0, %r.029 +; CHECK: %indvar.next = add i64 %indvar, 1 +; CHECK: %exitcond = icmp eq i64 %indvar.next, 400 +; CHECK: br i1 %exitcond, label %for.end, label %for.body + +; CHECK: ret + +for.end: ; preds = %for.body + ret i32 %add12 +} + +attributes #0 = { nounwind readonly uwtable } +