Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -304,6 +304,7 @@ void initializeFloat2IntPass(PassRegistry&); void initializeLoopDistributePass(PassRegistry&); void initializeSjLjEHPreparePass(PassRegistry&); +void initializeLoopEditorTestPass(PassRegistry&); } #endif Index: include/llvm/Transforms/Utils/LoopEditor.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Utils/LoopEditor.h @@ -0,0 +1,413 @@ +//===-- LoopEditor.h - High-level loop transformations --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The LoopEditor provides a toolkit for performing high-level transforms on +// loops. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_UTILS_LOOPEDITOR_H +#define LLVM_TRANSFORMS_SCALAR_UTILS_LOOPEDITOR_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { + +class BasicBlock; +class DataLayout; +class DominatorTree; +class Loop; +class LoopInfo; +class PHINode; +class SCEV; +class ScalarEvolution; +class Value; + +/// \brief This class provides a toolkit to create high-level operations on +/// loops. +/// +/// The functionality consists of a set of core functions which are designed +/// to be composable, and a set of helper routines that stitch together +/// the core functions to provide higher-level constructions such as loop +/// versioning, or loop peeling. +/// +/// Many functions can take a Delegate object, which allows the user to be +/// notified when different events happen (such as for example "an instruction +/// has been interleaved"). This allows the core functionality to remain +/// simple and not need too much configuration, while allowing the user +/// the flexibility to hook into the internals. Some delegate methods offer +/// the user the chance to influence the edit operation. +/// +/// All of the core functions are designed to keep LoopInfo, ScalarEvolution +/// and DominatorTree up to date. +class LoopEditor { +public: + /// A recurrence is any PHI node in the header block of a loop. + class Recurrence { + protected: + enum RType { + tySimple, tyReduction, tyInduction + }; + /// Default constructor purely for operator[] on maps. + Recurrence() {} + Recurrence(PHINode *BO, Value *StartV, PHINode *Val) : + BasedOn(BO), StartValue(StartV), Val(Val), Ty(tySimple) { + } + /// If a loop is cloned, its clone will have this member of each of its + /// recurrences and inductions set to the original recurrence or induction. + /// + /// This allows LoopEditor to reason about sets of PHIs and how to merge + /// them together when stitching the outputs of loops to the input of other + /// loops (such as in LoopEditor::addIncoming). + PHINode *BasedOn; + /// For an induction or a reduction, the "start value" of the PHI (the value + /// incoming from the preheader). + Value *StartValue; + /// The PHInode itself. + PHINode *Val; + /// The type of the recurrence. + RType Ty; + // This class is deliberately opaque to users, but allow LoopEditor full + // access. + friend class LoopEditor; + public: + /// Return the start value of this recurrence + Value *getStartValue() const { return StartValue; } + /// Return the PHI in the loop header that identifies this recurrence. + PHINode *getPHI() const { return Val; } + bool operator < (const Recurrence &R) const { + return Val < R.Val; + } + }; + + /// A reduction is a type of recurrence that has one user outside the loop, + /// and is inductive: R[i] = f(i) OP R[i-1], where OP is an associative + /// binary operator or min()/max(). + class Reduction : public Recurrence { + Reduction(Value *StartV, PHINode *Val) : + Recurrence(nullptr, StartV, Val) { + Ty = tyReduction; + } + /// Only available for operator[] on maps. + Reduction() { assert(0 && "Unreachable!"); } + public: + /// Emit code to reduce Op1 and Op2 to one value. + Value *createOp(IRBuilder<> &IRB, Value *Op1, Value *Op2) const; + /// Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Recurrence *R) { + return R->Ty == tyReduction; + } + // Open up to LoopEditor. + friend class LoopEditor; + }; + /// A semi-opaque descriptor for an induction. An induction is a recurrence + /// with a constant inductive step: I[i] = C + I[i-1]. + class Induction : private Recurrence { + Induction(Value *StartV, PHINode *Val) : + Recurrence(nullptr, StartV, Val) { + Ty = tyInduction; + } + /// Only available for operator[] on maps. + Induction() { assert(0 && "Unreachable!"); } + public: + /// Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Recurrence *R) { + return R->Ty == tyInduction; + } + + // Open up to LoopEditor. + friend class LoopEditor; + }; + +public: + /// Delegates should be subclassed by users. They can optionally chain + /// to other delegates for composability. + /// + /// Delegates provide many callback functions, but only a subset (if any) + /// will be invoked for any one core function call. + /// + /// It is forbidden to modify the content of the loop when servicing one + /// of the "notify*" callbacks. + struct Delegate { + Delegate *Next; + + Delegate() : Next(nullptr) {} + Delegate(Delegate *Next) : Next(Next) {} + virtual void anchor(); // Provide a home for the vtable. + + /// A call to interleave() is about to happen, for the given iteration. + /// Iteration starts at 1 (the zero'th iteration would correspond to + /// the original loop content). + /// + /// Called by: widenAndInterleave() + virtual void notifyInterleaveIterationStarting(unsigned Iteration) { + if (Next) Next->notifyInterleaveIterationStarting(Iteration); + } + /// \c OldInst has been cloned to \c NewInst, during a call to + /// \c interleave() + /// + /// Called by: interleave() + virtual void notifyInstructionInterleaved(Instruction *OldInst, + Instruction *NewInst) { + if (Next) Next->notifyInstructionInterleaved(OldInst, NewInst); + } + /// \c OldInst has been peeled out of the loop as \c NewInst. + /// The peel iteration was \c Iteration, which starts from zero. + /// + /// Called by: peelBefore(),peelAfter() + virtual void notifyInstructionPeeled(Instruction *OldInst, + Instruction *NewInst, + unsigned Iteration) { + if (Next) Next->notifyInstructionPeeled(OldInst, NewInst, Iteration); + } + /// During a clone() operation, \c V needs to be cloned. VM contains the + /// values that have already been cloned (apart from PHI operands which + /// will be remapped later). + /// \return The value that V should have in the cloned loop. + /// + /// Called by: clone() + virtual Value *hookCloneValue(Value *V, ValueToValueMapTy &VM, + IRBuilder<> &IRB) { + if (Next) return Next->hookCloneValue(V, VM, IRB); + return nullptr; + } + /// When interleaving a loop, each interleaved iteration will require an + /// offset to be applied to induction variables. This hook defines how + /// that offset is computed. For example, for an UF=4 interleave: + /// iteration 0 uses IndVar+0 + /// iteration 1 uses IndVar+S + /// iteration 2 uses IndVar+S*2 + /// iteration 2 uses IndVar+S*3 + /// + /// Where S is the return value of this hook. + /// + /// Called by: widenAndInterleave() + virtual unsigned hookInterleaveInductionStep() { + if (Next) return Next->hookInterleaveInductionStep(); + return 1; + } + /// The reduction \c R, made up of \c Values, needs to be reduced to one + /// value at the exit of the loop. + /// + /// Called by: widenAndInterleave() + virtual Value *hookReduceReductions(Reduction *R, ArrayRef Values, + IRBuilder<> &IRB) { + if (Next) return Next->hookReduceReductions(R, Values, IRB); + return nullptr; + } + }; + + /// Different transforms require the loop to be analyzable to a different + /// degree. For example, loop cloning doesn't need to know anything about + /// the loop, whereas loop interleaving needs to know that all recurrences + /// are either inductions or reductions. + /// + /// The AnalysisLevel describes the extent to which the loop was analyzable; + /// some functions may have minimum analysis requirements. + enum AnalysisLevel { + /// The loop was not able to be analyzed at all. + AL_None, + /// The loop trip count was able to be determined. + AL_TripCount, + /// The loop contains a single backedge and is in LCSSA form. + AL_LCSSA, + /// All recurrences were categorized (into inductions and reductions), + /// as well as the trip count being identified. + AL_AllRecurrences + }; + + /// Contructs a new loop editor, editing L. + LoopEditor(Loop *L, ScalarEvolution *SE, const DataLayout *DL, LoopInfo *LI, + DominatorTree *DT); + + /// + /// Queries + /// + + /// Returns the level to which the loop was able to be analyzed. + AnalysisLevel getAnalysisLevel() const; + + /// Returns true if I is contained within blocks the loop editor owns. + /// This is equivalent to Loop::contains(), but it also checks the optional + /// bypass and dedicated exit blocks that are outside of Loop's knowledge. + bool contains(Instruction *I); + + /// Allow accesses to the underlying Loop using ->. + Loop *operator -> () const { return L; } + + /// Return the recurrence referred to by this PHI, if any. A recurrence is + /// referred to by its PHI in the loop header block. Returns nullptr on + /// failure. + Recurrence *getRecurrenceByPHI(PHINode *V); + + /// Retrieves all known recurrences. + ArrayRef getAllRecurrences(); + + /// Returns the value of the given recurrence on exiting the loop. + Value *getOutgoingRecurrence(const Recurrence &R); + + /// Retrieves a Recurrence valid for this loop, if one is found to be based + /// on or derived from the given recurrence. This will happen if this loop has + /// been cloned from another loop previously. + Recurrence *getMatchingRecurrence(const Recurrence &R); + + /// Retrieves the executed trip count at the end of the loop. This may be + /// zero if the loop was bypassed by a predicate, or less than the normal + /// trip count if the loop was widened. + /// + /// Requires AnalysisLevel AL_TripCount or above. + Value *getExecutedTripCount(); + + /// Retrieves the trip count of the loop as a SCEV. Note that this involves + /// adding one to the backedge count and as such may overflow when expanded. + /// + /// Requires AnalysisLevel AL_TripCount or above. + const SCEV *getTripCount(); + + /// + /// Mutations + /// + + /// Produce a new version of this loop. The new loop is returned in + /// LoopEditor. + /// + /// The new loop is inserted into the CFG on IncomingEdge (IncomingEdge is + /// updated), so if IncomingEdge is A -> B, the result of this function will + /// be A -> LOOP -> B. + LoopEditor clone(Use &IncomingEdge, const Twine &NameSuffix = ""); + + /// A "widened" loop iterates over the same range but with a wider step. + /// For example the AddRec {5,1}<%L> widened by a factor 4 becomes {5,4}<%L>. + /// + /// This is a requirement for loop interleaving and loop vectorization, but + /// importantly this does not actually do any unrolling or vectorization. + /// Instead, all induction variables have their steps altered and the trip + /// count is modified. + /// + /// Reductions are not altered. Requires AnalysisLevel AL_AllRecurrences. + void widen(unsigned Factor); + + /// Set the loop trip count. If MayOverflow is set, then an overflow check + /// will be emitted to ensure that TripCount is not zero. + /// + /// The overflow check may be omitted if the LoopEditor can prove that + /// TripCount is less than the current loop trip count, and there is currently + /// no overflow check. + /// + /// Requires AnalysisLevel AL_TripCount. + void setTripCount(const SCEV *TripCount, bool MayOverflow=true); + + /// Adds a new reduction variable starting at StartValue. The reduction does + /// not do anything until it is connected with connectReduction(). + /// + /// Requires AnalysisLevel AL_LCSSA. + Reduction *addReduction(Value *StartValue); + + /// Connects the backedge of a reduction added with addReduction(). + /// + /// Requires AnalysisLevel AL_LCSSA. + void connectReduction(const Reduction *ID, Value *V); + + /// Creates a new copy of all instructions in the loop body, except control flow + /// instructions. Does not modify the control flow in the loop at all. + /// + /// The inserted instructions are interleaved. Requires AnalysisLevel + /// AL_AllRecurrences. + /// + /// Invokes the following functions on the delegate: + /// hookInductionVariableUse(Induction &Induction, IRBuilder<> IRB) + /// Called when a use of an induction variable is encountered. Expected + /// to return the value to use as an induction variable. + /// + /// hookReductionVariableUse(Reduction &Reduction, IRBuilder<> IRB) + /// Called when a use of a reduction variable is encountered. Expected + /// to return the value to use as a reduction. + /// + /// notifyReductionVariableDefined(Reduction &Reduction, Value *Def) + /// Called when a feed of a reduction variable is seen. The arguments are + /// the newly created Value and the Reduction it fed in the original loop. + void interleave(Delegate *D); + + /// Returns a dedicated exiting block, creating one if it does not exist. A + /// dedicated exiting block is one where the only incoming arcs are from + /// the loop latch or optionally the bypass block (The bypass block is a + /// check to see if the loop would actually be executed at least once, as + /// the trip count check is on the backedge). + BasicBlock *getOrCreateDedicatedExitingBlock(); + + /// Changes the block that is executed after this loop. + void updateExitBlock(BasicBlock *BB); + + /// Add an incoming edge from the loop L, with the given mappings between our + /// and L's recurrences + /// + /// The Recurrences can either reference this loop or L's. + void addIncoming(LoopEditor &L, + const std::map &Recurrences); + + /// Add an incoming edge from the basic block BB. + void addIncoming(BasicBlock *BB); + + /// Remove an incoming edge from BB. The edge must already exist. + void removeIncoming(BasicBlock *BB); + + /// + /// Macro-mutations - These are all helper functions implemented in terms of + /// the mutations above. + /// + + /// Widens the loop by \c Factor and performs interleaved loop unrolling. + /// + /// The reductions in the original loop get duplicated \c Factor times, + /// then reduced at the end of the loop into one value and returned in Ret. + void widenAndInterleaveLoop(unsigned Factor, + std::map &Ret, + Delegate *D=nullptr); + + /// Versions, widens, and interleaves the loop by \c Factor. + /// + /// The original loop is cloned and a widened loop created, which branches + /// to the original scalar loop as a tail. + /// + /// PredBB is created as a block to determine whether to branch to the widened + /// loop or the scalar one (independent of any trip count and overflow checks). + /// It is initially set up with just one branch: + /// br i1 true, label %versioned-loop, label %scalar-loop + LoopEditor versionWidenAndInterleaveLoop(unsigned Factor, BasicBlock *&PredBB, + Delegate *D=nullptr); + + /// Peels \c Iterations iterations out of the loop and places them before the + /// loop. The trip count of the loop is decreased to compensate. + void peelBefore(unsigned Iterations, Delegate *D=nullptr); + + /// Peels \c Iterations iterations out of the loop and places them after the + /// loop. The trip count of the loop is decreased to compensate. + void peelAfter(unsigned Iterations, Delegate *D=nullptr); + +private: + // + // Immutable/analysis state + // + Loop *L; + ScalarEvolution *SE; + const DataLayout *DL; + LoopInfo *LI; + DominatorTree *DT; + bool Analyzed; +}; +} // end namespace llvm + +#endif Index: lib/Target/ARM/Thumb2ITBlockPass.cpp =================================================================== --- lib/Target/ARM/Thumb2ITBlockPass.cpp +++ lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -183,6 +183,7 @@ DebugLoc dl = MI->getDebugLoc(); unsigned PredReg = 0; ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg); + if (CC == ARMCC::AL) { ++MBBI; continue; Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -81,6 +81,7 @@ initializePlaceSafepointsPass(Registry); initializeFloat2IntPass(Registry); initializeLoopDistributePass(Registry); + initializeLoopEditorTestPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -18,6 +18,7 @@ IntegerDivision.cpp LCSSA.cpp Local.cpp + LoopEditor.cpp LoopSimplify.cpp LoopUnroll.cpp LoopUnrollRuntime.cpp Index: lib/Transforms/Utils/LoopEditor.cpp =================================================================== --- /dev/null +++ lib/Transforms/Utils/LoopEditor.cpp @@ -0,0 +1,244 @@ +//===-- LoopEditor.cpp - High-level loop transformations ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The LoopEditor provides a toolkit for performing high-level transforms on +// loops. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/LoopEditor.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/PrettyStackTrace.h" + +#define DEBUG_TYPE "loopeditor" +using namespace llvm; + +void LoopEditor::Delegate::anchor() {} + +// +// Helper functions. +// +Value *LoopEditor::Reduction::createOp(IRBuilder<> &IRB, Value *Op1, Value *Op2) const { + assert(0 && "Implement!"); +} + +// +// Public API +// + +LoopEditor::LoopEditor(Loop *L, ScalarEvolution *SE, const DataLayout *DL, + LoopInfo *LI, DominatorTree *DT) + : L(L), SE(SE), DL(DL), LI(LI), DT(DT) { +} + +LoopEditor::AnalysisLevel LoopEditor::getAnalysisLevel() const { + assert(0 && "Not implemented!"); + return AL_None; +} + +bool LoopEditor::contains(Instruction *I) { + assert(0 && "Not implemented!"); + return false; +} + +LoopEditor::Recurrence *LoopEditor::getRecurrenceByPHI(PHINode *V) { + assert(0 && "Not implemented!"); + return nullptr; +} + +ArrayRef LoopEditor::getAllRecurrences() { + assert(0 && "Not implemented!"); + return ArrayRef(); +} + +Value *LoopEditor::getOutgoingRecurrence(const Recurrence &R) { + assert(0 && "Not implemented!"); + return nullptr; +} + +LoopEditor::Recurrence *LoopEditor::getMatchingRecurrence(const Recurrence &R) { + assert(0 && "Not implemented!"); + return nullptr; +} + +Value *LoopEditor::getExecutedTripCount() { + assert(0 && "Not implemented!"); + return nullptr; +} + +const SCEV *LoopEditor::getTripCount() { + assert(0 && "Not implemented!"); + return nullptr; +} + +void LoopEditor::setTripCount(const SCEV *TC, bool MayOverflow) { + assert(0 && "Not implemented!"); +} + +LoopEditor LoopEditor::clone(Use &IncomingEdge, + const Twine &NameSuffix) { + assert(0 && "Not implemented!"); + return *this; +} + +void LoopEditor::widen(unsigned Factor) { + assert(0 && "Not implemented!"); +} + +LoopEditor::Reduction *LoopEditor::addReduction(Value *StartValue) { + assert(0 && "Not implemented!"); + return nullptr; +} + +void LoopEditor::connectReduction(const Reduction *ID, Value *V) { + assert(0 && "Not implemented!"); +} + +void LoopEditor::interleave(Delegate *D) { + assert(0 && "Not implemented!"); +} + +BasicBlock *LoopEditor::getOrCreateDedicatedExitingBlock() { + assert(0 && "Not implemented!"); + return nullptr; +} + +void LoopEditor::updateExitBlock(BasicBlock *BB) { + assert(0 && "Not implemented!"); +} + +void LoopEditor::addIncoming( + LoopEditor &LE, + const std::map &Recurrences) { + assert(0 && "Not implemented!"); +} + +void LoopEditor::addIncoming(BasicBlock *BB) { + assert(0 && "Not implemented!"); +} + +void LoopEditor::removeIncoming(BasicBlock *BB) { + assert(0 && "Not implemented!"); +} + +// +// Macro-mutators +// + +void LoopEditor::widenAndInterleaveLoop(unsigned Factor, + std::map &Ret, + Delegate *D) { + assert(0 && "Not implemented!"); +} + +LoopEditor LoopEditor::versionWidenAndInterleaveLoop(unsigned Factor, + BasicBlock *&PredBB, + Delegate *D) { + assert(0 && "Not implemented!"); +} + +void LoopEditor::peelBefore(unsigned Iterations, Delegate *D) { + assert(0 && "Not implemented!"); +} + +void LoopEditor::peelAfter(unsigned Iterations, Delegate *D) { + assert(0 && "Not implemented!"); +} + +// +// Dummy test pass +// +enum TestCommand { + TC_Clone +}; +cl::opt TestCommand( + cl::desc("Loop editor test command"), cl::Hidden, cl::values( + clEnumValN(TC_Clone, "clone", "Test the ::clone() function"), + clEnumValEnd)); +cl::opt TestLoop(cl::desc("Loop editor loop to operate on"), + cl::Hidden, cl::init("TestLoop")); +cl::opt TestEdge(cl::desc("Loop editor edge to clone on"), + cl::Hidden, cl::init("TestEdge")); + +struct LoopEditorTest : FunctionPass { + static char ID; + LoopEditorTest() : FunctionPass(ID) { + initializeLoopEditorTestPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } +}; +char LoopEditorTest::ID = 0; +INITIALIZE_PASS_BEGIN(LoopEditorTest, "test-loop-editor", + "Dummy test pass to test LoopEditor", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_END(LoopEditorTest, "test-loop-editor", + "Dummy test pass to test LoopEditor", + false, false) + +bool LoopEditorTest::runOnFunction(Function &F) { + LoopInfo &LI = getAnalysis().getLoopInfo(); + DominatorTree &DT = getAnalysis().getDomTree(); + ScalarEvolution &SE = getAnalysis(); + + auto I = std::find_if(F.begin(), F.end(), [](const BasicBlock &BB) { + return BB.getName() == TestLoop; + }); + if (I == F.end()) + report_fatal_error("Could not find basicblock %" + TestLoop + + " in function @" + F.getName(), false); + + Loop *L = LI.getLoopFor(I); + if (!L) + report_fatal_error("Could not find loop for BB %" + TestLoop + + " in function @" + F.getName(), false); + + LoopEditor LE(L, &SE, &F.getParent()->getDataLayout(), &LI, &DT); + switch (TestCommand) { + case TC_Clone: { + auto I = std::find_if(F.begin(), F.end(), [](const BasicBlock &BB) { + return BB.getName() == TestEdge; + }); + if (I == F.end()) + report_fatal_error("Could not find test edge %" + TestEdge, false); + if (I->size() != 1) + report_fatal_error("Test edge %" + TestEdge + " does not contain solely" + " an unconditional branch!", false); + BranchInst *BI = dyn_cast(I->getTerminator()); + if (!BI || BI->isConditional()) + report_fatal_error("Test edge %" + TestEdge + " was not an unconditional" + " branch!", false); + Use &Edge = *BI->op_begin(); + LE.clone(Edge, ".cloned"); + } + } + + return true; +} Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -98,6 +98,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopEditor.h" #include #include #include @@ -289,7 +290,7 @@ virtual ~InnerLoopVectorizer() {} -protected: +public: /// A small list of PHINodes. typedef SmallVector PhiVector; /// When we unroll loops we have multiple vector values for each scalar. @@ -329,6 +330,7 @@ /// A helper function to vectorize a single BB within the innermost loop. void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); + void vectorizeSingleInstruction(Instruction *I, PhiVector *PV); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and @@ -433,7 +435,8 @@ /// vector elements. unsigned VF; -protected: + friend class VectorizerDelegate; +public: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -2456,404 +2459,7 @@ } void InnerLoopVectorizer::createEmptyLoop() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- Back-edge taken count overflow check. - / | - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). - | / | - | / v - || [ ] <-- vector pre header. - || | - || v - || [ ] \ - || [ ]_| <-- vector loop. - || | - | \ v - | >[ ] <--- middle-block. - | / | - | / v - -|- >[ ] <--- new preheader. - | | - | v - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. - ... - */ - - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); - - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - OldInduction = Legal->getInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - - // Find the loop boundaries. - const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (ExitCount->getType()->getPrimitiveSizeInBits() > - IdxTy->getPrimitiveSizeInBits()) - ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - - const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(BackedgeTakeCount, - SE->getConstant(BackedgeTakeCount->getType(), 1)); - - const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); - - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - Value *BackedgeCount = - Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), - VectorPH->getTerminator()); - if (BackedgeCount->getType()->isPointerTy()) - BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, - "backedge.ptrcnt.to.int", - VectorPH->getTerminator()); - Instruction *CheckBCOverflow = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, - Constant::getAllOnesValue(BackedgeCount->getType()), - "backedge.overflow", VectorPH->getTerminator()); - - // The loop index does not have to start at Zero. Find the original start - // value from the induction PHI node. If we don't have an induction variable - // then we know that it starts at zero. - Builder.SetInsertPoint(VectorPH->getTerminator()); - Value *StartIdx = ExtendedIdx = - OldInduction - ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), - IdxTy) - : ConstantInt::get(IdxTy, 0); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - - LoopBypassBlocks.push_back(VectorPH); - - // Split the single block loop into the two loop structure described above. - BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); - BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); - - // Create and register the new vector loop. - Loop* Lp = new Loop(); - Loop *ParentLoop = OrigLoop->getParentLoop(); - - // Insert the new loop into the loop nest and register the new basic blocks - // before calling any utilities such as SCEV that require valid LoopInfo. - if (ParentLoop) { - ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); - } else { - LI->addTopLevelLoop(Lp); - } - Lp->addBasicBlockToLoop(VecBody, *LI); - - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - - // Generate the induction variable. - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); - Induction = Builder.CreatePHI(IdxTy, 2, "index"); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. - BasicBlock *NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); - VectorPH = NewVectorPH; - - // This is the IR builder that we use to add all of the logic for bypassing - // the new vector loop. - IRBuilder<> BypassBuilder(VectorPH->getTerminator()); - setDebugLocFromInst(BypassBuilder, - getDebugLocFromInstOrOperands(OldInduction)); - - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - if (Count->getType() != IdxTy) { - // The exit count can be of pointer type. Convert it to the correct - // integer type. - if (ExitCount->getType()->isPointerTy()) - Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); - else - Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); - } - - // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); - - // Now we need to generate the expression for N - (N % VF), which is - // the part that the vectorized body will execute. - Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); - Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); - Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down"); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. - Value *Cmp = - BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); - VectorPH = NewVectorPH; - - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(VectorPH->getTerminator()); - if (StrideCheck) { - AddedSafetyChecks = true; - // Create a new block containing the stride check. - VectorPH->setName("vector.stridecheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - - VectorPH = NewVectorPH; - } - - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator()); - if (MemRuntimeCheck) { - AddedSafetyChecks = true; - // Create a new block containing the memory check. - VectorPH->setName("vector.memcheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - - VectorPH = NewVectorPH; - } - - // We are going to resume the execution of the scalar loop. - // Go over all of the induction variables that we found and fix the - // PHIs that are left in the scalar version of the loop. - // The starting values of PHI nodes depend on the counter of the last - // iteration in the vectorized loop. - // If we come from a bypass edge then we need to start from the original - // start value. - - // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = nullptr; - LoopVectorizationLegality::InductionList::iterator I, E; - LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - // Set builder to point to last bypass block. - BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); - for (I = List->begin(), E = List->end(); I != E; ++I) { - PHINode *OrigPhi = I->first; - LoopVectorizationLegality::InductionInfo II = I->second; - - Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); - PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", - MiddleBlock->getTerminator()); - // We might have extended the type of the induction variable but we need a - // truncated version for the scalar loop. - PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? - PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : nullptr; - - // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", - ScalarPH->getTerminator()); - BCResumeVal->addIncoming(ResumeVal, MiddleBlock); - - PHINode *BCTruncResumeVal = nullptr; - if (OrigPhi == OldInduction) { - BCTruncResumeVal = - PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", - ScalarPH->getTerminator()); - BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); - } - - Value *EndValue = nullptr; - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter. - assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - - // We have the canonical induction variable. - if (OrigPhi == OldInduction) { - // Create a truncated version of the resume value for the scalar loop, - // we might have promoted the type to a larger width. - EndValue = - BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - TruncResumeVal->addIncoming(EndValue, VecBody); - - BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; - break; - } - - // Not the canonical induction variable - add the vector loop count to the - // start value. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); - EndValue->setName("ind.end"); - break; - } - case LoopVectorizationLegality::IK_PtrInduction: { - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StepValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); - EndValue->setName("ptr.ind.end"); - break; - } - }// end of case - - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { - if (OrigPhi == OldInduction) - ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); - else - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - } - ResumeVal->addIncoming(EndValue, VecBody); - - // Fix the scalar body counter (PHI node). - unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); - - // The old induction's phi node in the scalar body needs the truncated - // value. - if (OrigPhi == OldInduction) { - BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); - } else { - BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); - } - } - - // If we are generating a new induction variable then we also need to - // generate the code that calculates the exit value. This value is not - // simply the end of the counter because we may skip the vectorized body - // in case of a runtime check. - if (!OldInduction){ - assert(!ResumeIndex && "Unexpected resume value found"); - ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", - MiddleBlock->getTerminator()); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); - } - - // Make sure that we found the index where scalar loop needs to continue. - assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && - "Invalid resume Index"); - - // Add a check in the middle block to see if we have completed - // all of the iterations in the first vector loop. - // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - ResumeIndex, "cmp.n", - MiddleBlock->getTerminator()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), - BranchInst::Create(ExitBlock, ScalarPH, CmpN)); - - // Create i+1 and fill the PHINode. - Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(StartIdx, VectorPH); - Induction->addIncoming(NextIdx, VecBody); - // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); - - // Now we have two terminators. Remove the old one from the block. - VecBody->getTerminator()->eraseFromParent(); - - // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); - - // Save the state. - LoopVectorPreHeader = VectorPH; - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody.push_back(VecBody); - LoopScalarBody = OldBasicBlock; - - LoopVectorizeHints Hints(Lp, true); + LoopVectorizeHints Hints(OrigLoop, true); Hints.setAlreadyVectorized(); } @@ -3021,239 +2627,99 @@ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } -void InnerLoopVectorizer::vectorizeLoop() { - //===------------------------------------------------===// - // - // Notice: any optimization or new instruction that go - // into the code below should be also be implemented in - // the cost-model. - // - //===------------------------------------------------===// - Constant *Zero = Builder.getInt32(0); +class VectorizerDelegate : public LoopEditor::Delegate { + InnerLoopVectorizer &LV; +public: + VectorizerDelegate(InnerLoopVectorizer &LV) : LV(LV) { + }; - // In order to support reduction variables we need to be able to vectorize - // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two - // stages. First, we create a new vector PHI node with no incoming edges. - // We use this value when we vectorize all of the instructions that use the - // PHI. Next, after all of the instructions in the block are complete we - // add the new incoming edges to the PHI. At this point all of the - // instructions in the basic block are vectorized, so we can use them to - // construct the PHI. - PhiVector RdxPHIsToFix; - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); + Value *hookCloneValue(Value *V, ValueToValueMapTy &VM, + IRBuilder<> &IRB) override { + LV.Builder.SetInsertPoint(IRB.GetInsertPoint()); + unsigned UF = LV.UF; + LV.UF = 1; + InnerLoopVectorizer::PhiVector Dummy; + LV.vectorizeSingleInstruction(cast(V), &Dummy); - // Vectorize all of the blocks in the original loop. - for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), - be = DFS.endRPO(); bb != be; ++bb) - vectorizeBlockInLoop(*bb, &RdxPHIsToFix); - - // At this point every instruction in the original loop is widened to - // a vector form. We are almost done. Now, we need to fix the PHI nodes - // that we vectorized. The PHI nodes are currently empty because we did - // not want to introduce cycles. Notice that the remaining PHI nodes - // that we need to fix are reduction variables. - - // Create the 'reduced' values for each of the induction vars. - // The reduced values are the vector values that we scalarize and combine - // after the loop is finished. - for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); - it != e; ++it) { - PHINode *RdxPhi = *it; - assert(RdxPhi && "Unable to recover vectorized PHI"); - - // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && - "Unable to find the reduction variable"); - RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; - - RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); - TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = - RdxDesc.getMinMaxRecurrenceKind(); - setDebugLocFromInst(Builder, ReductionStartValue); - - // We need to generate a reduction vector from the incoming scalar. - // To do so, we need to generate the 'identity' vector and override - // one of the elements with the incoming scalar reduction. We need - // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); - - // This is the vector-clone of the value that leaves the loop. - VectorParts &VectorExit = getVectorValue(LoopExitInst); - Type *VecTy = VectorExit[0]->getType(); - - // Find the reduction identity variable. Zero for addition, or, xor, - // one for multiplication, -1 for And. - Value *Identity; - Value *VectorStart; - if (RK == RecurrenceDescriptor::RK_IntegerMinMax || - RK == RecurrenceDescriptor::RK_FloatMinMax) { - // MinMax reduction have the start value as their identify. - if (VF == 1) { - VectorStart = Identity = ReductionStartValue; - } else { - VectorStart = Identity = - Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); - } - } else { - // Handle other reduction kinds: - Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( - RK, VecTy->getScalarType()); - if (VF == 1) { - Identity = Iden; - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = ReductionStartValue; - } else { - Identity = ConstantVector::getSplat(VF, Iden); + LV.UF = UF; + return LV.WidenMap.get(V)[0]; + } - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - VectorStart = - Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); - } - } + unsigned hookInterleaveInductionStep() override { + return LV.VF; + } - // Fix the vector-loop phi. + Value *hookReduceReductions(LoopEditor::Reduction *R, + ArrayRef Values, + IRBuilder<> &IRB) override { + LV.Builder.SetInsertPoint(IRB.GetInsertPoint()); - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); - BasicBlock *Latch = OrigLoop->getLoopLatch(); - Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); - VectorParts &Val = getVectorValue(LoopVal); - for (unsigned part = 0; part < UF; ++part) { - // Make sure to add the reduction stat value only to the - // first unroll part. - Value *StartVal = (part == 0) ? VectorStart : Identity; - cast(VecRdxPhi[part])->addIncoming(StartVal, - LoopVectorPreHeader); - cast(VecRdxPhi[part])->addIncoming(Val[part], - LoopVectorBody.back()); + Value *ReducedPartRdx = nullptr; + for (auto *V : Values) { + if (!ReducedPartRdx) + ReducedPartRdx = V; + else + ReducedPartRdx = R->createOp(IRB, ReducedPartRdx, V); } - // Before each round, move the insertion point right between - // the PHIs and the values we are going to write. - // This allows us to write both PHINodes and the extractelement - // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); - - VectorParts RdxParts; - setDebugLocFromInst(Builder, LoopExitInst); - for (unsigned part = 0; part < UF; ++part) { - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(LoopExitInst); - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); - NewPhi->addIncoming(RdxExitVal[part], - LoopVectorBody.back()); - RdxParts.push_back(NewPhi); + if (LV.VF == 1) + return ReducedPartRdx; + + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(LV.VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = ReducedPartRdx; + SmallVector ShuffleMask(LV.VF, nullptr); + for (unsigned i = LV.VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = LV.Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(LV.Builder.getInt32Ty())); + + Value *Shuf = + IRB.CreateShuffleVector(TmpVec, + UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), + "rdx.shuf"); + + TmpVec = R->createOp(IRB, TmpVec, Shuf); } - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); - setDebugLocFromInst(Builder, ReducedPartRdx); - for (unsigned part = 1; part < UF; ++part) { - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - ReducedPartRdx = addFastMathFlag( - Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], - ReducedPartRdx, "bin.rdx")); - else - ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( - Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); - } + // The result is in the first element of the vector. + return IRB.CreateExtractElement(TmpVec, IRB.getInt32(0)); + } +}; - if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i/2; ++j) - ShuffleMask[j] = Builder.getInt32(i/2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i/2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = - Builder.CreateShuffleVector(TmpVec, - UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), - "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } +void InnerLoopVectorizer::vectorizeLoop() { + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should be also be implemented in + // the cost-model. + // + //===------------------------------------------------===// - // The result is in the first element of the vector. - ReducedPartRdx = Builder.CreateExtractElement(TmpVec, - Builder.getInt32(0)); - } + BasicBlock *PredBB; + const DataLayout &DL = + OrigLoop->getHeader()->getParent()->getParent()->getDataLayout(); + LoopEditor ScalarLE(OrigLoop, SE, &DL, LI, DT); + VectorizerDelegate D(*this); + ScalarLE.versionWidenAndInterleaveLoop(UF, PredBB, &D); + + Instruction *MemRuntimeCheck, *FirstCheckInst; + std::tie(FirstCheckInst, MemRuntimeCheck) = + Legal->getLAI()->addRuntimeCheck(PredBB->getTerminator()); + if (MemRuntimeCheck) { + AddedSafetyChecks = true; + // Create a new block containing the memory check. + PredBB->setName("vector.memcheck"); + } - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); - BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); - - // Now, we need to fix the users of the reduction variable - // inside and outside of the scalar remainder loop. - // We know that the loop is in LCSSA form. We need to update the - // PHI nodes in the exit blocks. - for (BasicBlock::iterator LEI = LoopExitBlock->begin(), - LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { - PHINode *LCSSAPhi = dyn_cast(LEI); - if (!LCSSAPhi) break; - - // All PHINodes need to have a single entry edge, or two if - // we already fixed them. - assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - - // We found our reduction value exit-PHI. Update it with the - // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { - // Add an edge coming from the bypass. - LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); - break; - } - }// end of the LCSSA phi scan. - - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); - }// end of for each redux variable. - - fixLCSSAPHIs(); - - // Remove redundant induction instructions. - cse(LoopVectorBody); } void InnerLoopVectorizer::fixLCSSAPHIs() { @@ -3458,16 +2924,21 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + vectorizeSingleInstruction(it, PV); + } +} + +void InnerLoopVectorizer::vectorizeSingleInstruction(Instruction *it, PhiVector *PV) { VectorParts &Entry = WidenMap.get(it); switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. - continue; + return; case Instruction::PHI: { // Vectorize PHINodes. widenPHIInstruction(it, Entry, UF, VF, PV); - continue; + return; }// End of PHI. case Instruction::Add: @@ -3611,7 +3082,7 @@ break; setDebugLocFromInst(Builder, it); - Module *M = BB->getParent()->getParent(); + Module *M = it->getParent()->getParent()->getParent(); CallInst *CI = cast(it); StringRef FnName = CI->getCalledFunction()->getName(); @@ -3686,38 +3157,9 @@ scalarizeInstruction(it); break; }// end of switch. - }// end of for_each instr. } void InnerLoopVectorizer::updateAnalysis() { - // Forget the original basic block. - SE->forgetLoop(OrigLoop); - - // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); - - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); - DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); - - // Due to if predication of stores we might create a sequence of "if(pred) - // a[i] = ...; " blocks. - for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { - if (i == 0) - DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - else if (isPredicatedBlock(i)) { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); - } else { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); - } - } - - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); - DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); - DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - DEBUG(DT->verifyDomTree()); }