Index: llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt +++ llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt @@ -3,6 +3,7 @@ LoopVectorize.cpp SLPVectorizer.cpp Vectorize.cpp + VPlan.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -47,6 +47,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopVectorize.h" +#include "VPlan.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" @@ -98,6 +99,7 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Vectorize.h" #include +#include #include #include @@ -250,6 +252,10 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +class VPInterleaveRecipe; +class VPReplicateRecipe; +class VPWidenIntOrFpInductionRecipe; +class VPWidenRecipe; /// Returns true if the given loop body has a cycle, excluding the loop /// itself. @@ -362,6 +368,9 @@ : ConstantFP::get(Ty, C); } +} // end anonymous namespace + +namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -393,10 +402,11 @@ AddedSafetyChecks(false) {} /// Create a new empty loop. Unlink the old loop and connect the new one. - void createVectorizedLoopSkeleton(); + /// Return the pre-header block of the new loop. + BasicBlock *createVectorizedLoopSkeleton(); - /// Vectorize a single instruction within the innermost loop. - void vectorizeInstruction(Instruction &I); + /// Widen a single instruction within the innermost loop. + void widenInstruction(Instruction &I); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -406,15 +416,72 @@ virtual ~InnerLoopVectorizer() {} -protected: - /// A small list of PHINodes. - typedef SmallVector PhiVector; - /// A type for vectorized values in the new loop. Each value from the /// original loop, when vectorized, is represented by UF vector values in the /// new unrolled loop, where UF is the unroll factor. typedef SmallVector VectorParts; + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. + VectorParts createBlockInMask(BasicBlock *BB); + + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + + /// A helper function to scalarize a single Instruction in the innermost loop. + /// Generates a sequence of scalar instances for each lane between \p MinLane + /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, + /// inclusive.. + void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, + bool IfPredicateInstr); + + /// Widen an integer or floating-point induction variable \p IV. 
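
As a concrete illustration (not part of the patch): with VF = 4, UF = 2 and an i32 induction {0,+,1}, the widened induction is one vector per unroll part:

    %vec.ind  = <i32 0, i32 1, i32 2, i32 3>                         ; part 0
    %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> ; part 1
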
If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + + /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a + /// vector or scalar value on-demand if one is not yet available. When + /// vectorizing a loop, we visit the definition of an instruction before its + /// uses. When visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. + /// + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part. If the value has already been vectorized, + /// the corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// a new vector value on-demand by inserting the scalar values into a vector + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into a vector. + Value *getOrCreateVectorValue(Value *V, unsigned Part); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll and vector indices \p Instance. If the value has been + /// vectorized but not scalarized, the necessary extractelement instruction + /// will be generated. + Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); + + /// Construct the vector value of a scalarized value \p V one lane at a time. + void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); + + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + +protected: + /// A small list of PHINodes. + typedef SmallVector PhiVector; + /// A type for scalarized values in the new loop. Each value from the /// original loop, when scalarized, is represented by UF x VF scalar values /// in the new unrolled loop, where UF is the unroll factor and VF is the @@ -457,37 +524,18 @@ /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); - /// Predicate conditional instructions that require predication on their - /// respective conditions. - void predicateInstructions(); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - VectorParts createBlockInMask(BasicBlock *BB); /// A helper function that computes the predicate of the edge between SRC /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// Vectorize a single PHINode in a block. This method handles the induction - /// variable canonicalization. 
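
The on-demand coordination described above boils down to the following, sketched with the members this class declares (illustrative, not the patch's verbatim code):

    // Inside getOrCreateVectorValue(V, Part):
    if (VectorLoopValueMap.hasVectorValue(V, Part))   // Already widened.
      return VectorLoopValueMap.getVectorValue(V, Part);
    if (VectorLoopValueMap.hasAnyScalarValue(V)) {    // Pack the VF lanes.
      Value *Vec = UndefValue::get(VectorType::get(V->getType(), VF));
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        Vec = Builder.CreateInsertElement(
            Vec, getOrCreateScalarValue(V, {Part, Lane}),
            Builder.getInt32(Lane));
      return Vec;
    }
    return getBroadcastInstrs(V); // Loop-invariant: broadcast the scalar.
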
It supports both VF = 1 for unrolled loops and - /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); - /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); - /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each - /// scalarized instruction behind an if block predicated on the control - /// dependence of the instruction. - void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false); - /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -521,11 +569,6 @@ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Instruction *EntryVal); - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); - /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. bool shouldScalarizeInstruction(Instruction *I) const; @@ -533,38 +576,6 @@ /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a - /// vector or scalar value on-demand if one is not yet available. When - /// vectorizing a loop, we visit the definition of an instruction before its - /// uses. When visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. - /// - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part. If the value has already been vectorized, - /// the corresponding vector entry in VectorLoopValueMap is returned. If, - /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// a new vector value on-demand by inserting the scalar values into a vector - /// with an insertelement sequence. If the value has been neither vectorized - /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into a vector. - Value *getOrCreateVectorValue(Value *V, unsigned Part); - - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part and vector index \p Lane. If the value has - /// been vectorized but not scalarized, the necessary extractelement - /// instruction will be generated. - Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane); - - /// Try to vectorize the interleaved access group that \p Instr belongs to. - void vectorizeInterleaveGroup(Instruction *Instr); - /// Generate a shuffle sequence that will reverse the vector Vec. 
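
The reverse mentioned here is a single shufflevector with a descending constant mask, e.g. for VF = 4 (illustrative IR):

    %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
                             <4 x i32> <i32 3, i32 2, i32 1, i32 0>
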
virtual Value *reverseVector(Value *Vec); @@ -605,127 +616,6 @@ /// the instruction. void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); - /// This is a helper class for maintaining vectorization state. It's used for - /// mapping values from the original loop to their corresponding values in - /// the new loop. Two mappings are maintained: one for vectorized values and - /// one for scalarized values. Vectorized values are represented with UF - /// vector values in the new loop, and scalarized values are represented with - /// UF x VF scalar values in the new loop. UF and VF are the unroll and - /// vectorization factors, respectively. - /// - /// Entries can be added to either map with setVectorValue and setScalarValue, - /// which assert that an entry was not already added before. If an entry is to - /// replace an existing one, call resetVectorValue. This is currently needed - /// to modify the mapped values during "fix-up" operations that occur once the - /// first phase of widening is complete. These operations include type - /// truncation and the second phase of recurrence widening. - /// - /// Entries from either map can be retrieved using the getVectorValue and - /// getScalarValue functions, which assert that the desired value exists. - - struct ValueMap { - - /// Construct an empty map with the given unroll and vectorization factors. - ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} - - /// \return True if the map has any vector entry for \p Key. - bool hasAnyVectorValue(Value *Key) const { - return VectorMapStorage.count(Key); - } - - /// \return True if the map has a vector entry for \p Key and \p Part. - bool hasVectorValue(Value *Key, unsigned Part) const { - assert(Part < UF && "Queried Vector Part is too large."); - if (!hasAnyVectorValue(Key)) - return false; - const VectorParts &Entry = VectorMapStorage.find(Key)->second; - assert(Entry.size() == UF && "VectorParts has wrong dimensions."); - return Entry[Part] != nullptr; - } - - /// \return True if the map has any scalar entry for \p Key. - bool hasAnyScalarValue(Value *Key) const { - return ScalarMapStorage.count(Key); - } - - /// \return True if the map has a scalar entry for \p Key, \p Part and - /// \p Part. - bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const { - assert(Part < UF && "Queried Scalar Part is too large."); - assert(Lane < VF && "Queried Scalar Lane is too large."); - if (!hasAnyScalarValue(Key)) - return false; - const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; - assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions."); - return Entry[Part][Lane] != nullptr; - } - - /// Retrieve the existing vector value that corresponds to \p Key and - /// \p Part. - Value *getVectorValue(Value *Key, unsigned Part) { - assert(hasVectorValue(Key, Part) && "Getting non-existent value."); - return VectorMapStorage[Key][Part]; - } - - /// Retrieve the existing scalar value that corresponds to \p Key, \p Part - /// and \p Lane. - Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) { - assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value."); - return ScalarMapStorage[Key][Part][Lane]; - } - - /// Set a vector value associated with \p Key and \p Part. Assumes such a - /// value is not already set. If it is, use resetVectorValue() instead. 
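
A usage sketch of this map for one widened value (names illustrative):

    ValueMap VLVM(UF, VF);                      // UF parts x VF lanes per key.
    VLVM.setVectorValue(V, /*Part=*/0, WideV);  // First write; asserts if set.
    VLVM.resetVectorValue(V, 0, TruncV);        // Fix-ups must reset instead.
    Value *W = VLVM.getVectorValue(V, 0);       // Asserts the entry exists.
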
- void setVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); - if (!VectorMapStorage.count(Key)) { - VectorParts Entry(UF); - VectorMapStorage[Key] = Entry; - } - VectorMapStorage[Key][Part] = Vector; - } - - /// Set a scalar value associated with \p Key for \p Part and \p Lane. - /// Assumes such a value is not already set. - void setScalarValue(Value *Key, unsigned Part, unsigned Lane, - Value *Scalar) { - assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set"); - if (!ScalarMapStorage.count(Key)) { - ScalarParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); - // TODO: Consider storing uniform values only per-part, as they occupy - // lane 0 only, keeping the other VF-1 redundant entries null. - ScalarMapStorage[Key] = Entry; - } - ScalarMapStorage[Key][Part][Lane] = Scalar; - } - - /// Reset the vector value associated with \p Key for the given \p Part. - /// This function can be used to update values that have already been - /// vectorized. This is the case for "fix-up" operations including type - /// truncation and the second phase of recurrence vectorization. - void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(hasVectorValue(Key, Part) && "Vector value not set for part"); - VectorMapStorage[Key][Part] = Vector; - } - - private: - /// The unroll factor. Each entry in the vector map contains UF vector - /// values. - unsigned UF; - - /// The vectorization factor. Each entry in the scalar map contains UF x VF - /// scalar values. - unsigned VF; - - /// The vector and scalar map storage. We use std::map and not DenseMap - /// because insertions to DenseMap invalidate its iterators. - std::map VectorMapStorage; - std::map ScalarMapStorage; - }; - /// The original loop. Loop *OrigLoop; /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies @@ -758,7 +648,6 @@ /// vector elements. unsigned VF; -protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -792,11 +681,10 @@ /// vectorized loop. A key value can map to either vector values, scalar /// values or both kinds of values, depending on whether the key was /// vectorized and scalarized. - ValueMap VectorLoopValueMap; + VectorizerValueMap VectorLoopValueMap; - /// Store instructions that should be predicated, as a pair - /// - SmallVector, 4> PredicatedInstructions; + /// Store instructions that were predicated. + SmallVector PredicatedInstructions; EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; /// Trip count of the original loop. @@ -816,6 +704,8 @@ // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; + + friend class LoopVectorizationPlanner; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -831,7 +721,6 @@ UnrollFactor, LVL, CM) {} private: - void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps Opcode = @@ -908,6 +797,10 @@ } } +} // namespace llvm + +namespace { + /// \brief The group of interleaved loads/stores sharing the same stride and /// close to each other. 
 ///
@@ -1920,6 +1813,7 @@
   /// \returns True if it is more profitable to scalarize instruction \p I for
   /// vectorization factor \p VF.
   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
     auto Scalars = InstsToScalarize.find(VF);
     assert(Scalars != InstsToScalarize.end() &&
            "VF not yet analyzed for scalarization profitability");
@@ -2033,6 +1927,22 @@
     return Legal->isInductionVariable(Op);
   }
 
+  /// Collects the instructions to scalarize for each predicated instruction in
+  /// the loop.
+  void collectInstsToScalarize(unsigned VF);
+
+  /// Collect Uniform and Scalar values for the given \p VF.
+  /// The sets depend on CM decision for Load/Store instructions
+  /// that may be vectorized as interleave, gather-scatter or scalarized.
+  void collectUniformsAndScalars(unsigned VF) {
+    // Do the analysis once.
+    if (VF == 1 || Uniforms.count(VF))
+      return;
+    setCostBasedWideningDecision(VF);
+    collectLoopUniforms(VF);
+    collectLoopScalars(VF);
+  }
+
 private:
   /// \return An upper bound for the vectorization factor, larger than zero.
   /// One is returned if vectorization should best be avoided due to cost.
@@ -2134,10 +2044,6 @@
   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                               unsigned VF);
 
-  /// Collects the instructions to scalarize for each predicated instruction in
-  /// the loop.
-  void collectInstsToScalarize(unsigned VF);
-
   /// Collect the instructions that are uniform after vectorization. An
   /// instruction is uniform if we represent it with a single scalar value in
   /// the vectorized loop corresponding to each vector iteration. Examples of
@@ -2156,18 +2062,6 @@
   /// iteration of the original scalar loop.
   void collectLoopScalars(unsigned VF);
 
-  /// Collect Uniform and Scalar values for the given \p VF.
-  /// The sets depend on CM decision for Load/Store instructions
-  /// that may be vectorized as interleave, gather-scatter or scalarized.
-  void collectUniformsAndScalars(unsigned VF) {
-    // Do the analysis once.
-    if (VF == 1 || Uniforms.count(VF))
-      return;
-    setCostBasedWideningDecision(VF);
-    collectLoopUniforms(VF);
-    collectLoopScalars(VF);
-  }
-
   /// Keeps cost model vectorization decision and cost for instructions.
   /// Right now it is used for memory instructions only.
   typedef DenseMap<std::pair<Instruction *, unsigned>,
                    std::pair<InstWidening, unsigned>> DecisionList;
@@ -2205,23 +2099,71 @@
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
 };
 
+} // end anonymous namespace
+
+namespace llvm {
+
 /// LoopVectorizationPlanner - drives the vectorization process after having
 /// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans, which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleave access groups.
 class LoopVectorizationPlanner {
+  /// The loop that we evaluate.
+  Loop *OrigLoop;
+
+  /// Loop Info analysis.
+  LoopInfo *LI;
+
+  /// Target Library Info.
+  const TargetLibraryInfo *TLI;
+
+  /// Target Transform Info.
+  const TargetTransformInfo *TTI;
+
+  /// The legality analysis.
+  LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+  LoopVectorizationCostModel &CM;
+
+  SmallVector<VPlan *, 4> VPlans;
+
+  unsigned BestVF;
+  unsigned BestUF;
+
 public:
-  LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+  LoopVectorizationPlanner(Loop *L, LoopInfo *LI,
                            const TargetLibraryInfo *TLI,
+                           const TargetTransformInfo *TTI,
                            LoopVectorizationLegality *Legal,
                            LoopVectorizationCostModel &CM)
-      : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
+      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+        BestVF(0), BestUF(0) {}
 
-  ~LoopVectorizationPlanner() {}
+  ~LoopVectorizationPlanner() {
+    while (!VPlans.empty()) {
+      VPlan *Plan = VPlans.back();
+      VPlans.pop_back();
+      delete Plan;
+    }
+  }
 
   /// Plan how to best vectorize, return the best VF and its cost.
   LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
                                                        unsigned UserVF);
 
-  /// Generate the IR code for the vectorized loop.
-  void executePlan(InnerLoopVectorizer &ILV);
+  /// Finalize the best decision and dispose of all other VPlans.
+  void setBestPlan(unsigned VF, unsigned UF);
+
+  /// Generate the IR code for the body of the vectorized loop according to the
+  /// best selected VPlan.
+  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+  void printPlans(raw_ostream &O) {
+    for (VPlan *Plan : VPlans)
+      O << *Plan;
+  }
 
 protected:
   /// Collect the instructions from the original loop that would be trivially
@@ -2229,20 +2171,77 @@
   void collectTriviallyDeadInstructions(
       SmallPtrSetImpl<Instruction *> &DeadInstructions);
 
-private:
-  /// The loop that we evaluate.
-  Loop *OrigLoop;
+  /// A range of powers-of-2 vectorization factors with fixed start and
+  /// adjustable end. The range includes start and excludes end, e.g.:
+  /// [1, 9) = {1, 2, 4, 8}
+  struct VFRange {
+    const unsigned Start; // A power of 2.
+    unsigned End; // Need not be a power of 2. If End <= Start range is empty.
+  };
 
-  /// Loop Info analysis.
-  LoopInfo *LI;
+  /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+  /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+  /// returned value holds for the entire \p Range.
+  bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+                                VFRange &Range);
+
+  /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+  /// according to the information gathered by Legal when it checked if it is
+  /// legal to vectorize the loop.
+  void buildVPlans(unsigned MinVF, unsigned MaxVF);
 
-private:
-  /// The legality analysis.
-  LoopVectorizationLegality *Legal;
-
-  /// The profitablity analysis.
-  LoopVectorizationCostModel &CM;
+private:
+  /// Check if \p I belongs to an Interleave Group within the given VF
+  /// \p Range, \return true in the first returned value if so and false
+  /// otherwise. Build a new VPInterleaveRecipe if \p I is the primary member
+  /// of an IG for \p Range.Start, and provide it as the second returned value.
+  /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
+  /// \return value is <true, nullptr>, as it is handled by another recipe.
+  /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+  /// to \p Range.End.
+  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+  /// Check if an induction recipe should be constructed for \p I within the
+  /// given VF \p Range. If so build and return it. If not, return null.
+  /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+  /// to \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, + VFRange &Range); + + /// Check if \I can be widened within the given VF \p Range. If \I can be + /// widened for Range.Start, extend \p LastWidenRecipe to include \p I if + /// possible or else build a new VPWidenRecipe for it, and return the + /// VPWidenRecipe that includes \p I. If \p I cannot be widened for + /// Range.Start \return null. Range.End may be decreased to ensure same + /// decision from \p Range.Start to \p Range.End. + VPWidenRecipe *tryToWiden(Instruction *I, VPWidenRecipe *LastWidenRecipe, + VFRange &Range); + + /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it + /// is predicated. \return \p VPBB augmented with this new recipe if \p I is + /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new + /// Region. Update the packing decision of predicated instructions if they + /// feed \p I. Range.End may be decreased to ensure same recipe behavior from + /// \p Range.Start to \p Range.End. + VPBasicBlock *handleReplication( + Instruction *I, VFRange &Range, VPBasicBlock *VPBB, + DenseMap &PredInst2Recipe); + + /// Create a replicating region for instruction \p I that requires + /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. + VPRegionBlock *createReplicateRegion(Instruction *I, + VPRecipeBase *PredRecipe); + + /// Build a VPlan according to the information gathered by Legal. \return a + /// VPlan for vectorization factors \p Range.Start and up to \p Range.End + /// exclusive, possibly decreasing \p Range.End. + VPlan *buildVPlan(VFRange &Range); }; +} // namespace llvm + +namespace { + /// \brief This holds vectorization requirements that must be verified late in /// the process. The requirements are set by legalize and costmodel. Once /// vectorization has been determined to be possible and profitable the @@ -2663,15 +2662,15 @@ // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 : VF; - + Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 + : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); + VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); } } } @@ -2709,7 +2708,7 @@ // vector form, we will construct the vector values on demand. if (VectorLoopValueMap.hasAnyScalarValue(V)) { - Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); + Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); // If we've scalarized a value, that value should be an instruction. auto *I = cast(V); @@ -2726,8 +2725,8 @@ // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; - auto *LastInst = - cast(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); + auto *LastInst = cast( + VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); // Set the insert point after the last scalarized instruction. 
This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -2744,14 +2743,15 @@ Value *VectorValue = nullptr; if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(ScalarValue); + VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); + // Initialize packing with insertelements to start from undef. + Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); + VectorLoopValueMap.setVectorValue(V, Part, Undef); for (unsigned Lane = 0; Lane < VF; ++Lane) - VectorValue = Builder.CreateInsertElement( - VectorValue, getOrCreateScalarValue(V, Part, Lane), - Builder.getInt32(Lane)); + packScalarIntoVectorValue(V, {Part, Lane}); + VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } - VectorLoopValueMap.setVectorValue(V, Part, VectorValue); Builder.restoreIP(OldIP); return VectorValue; } @@ -2763,28 +2763,29 @@ return B; } -Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, - unsigned Lane) { - +Value * +InnerLoopVectorizer::getOrCreateScalarValue(Value *V, + const VPIteration &Instance) { // If the value is not an instruction contained in the loop, it should // already be scalar. if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast(V), VF) - : true && "Uniform values only have lane zero"); + assert(Instance.Lane > 0 + ? !Cost->isUniformAfterVectorization(cast(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) - return VectorLoopValueMap.getScalarValue(V, Part, Lane); + if (VectorLoopValueMap.hasScalarValue(V, Instance)) + return VectorLoopValueMap.getScalarValue(V, Instance); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the // vectorization factor is one), there is no need to generate an // extractelement instruction. - auto *U = getOrCreateVectorValue(V, Part); + auto *U = getOrCreateVectorValue(V, Instance.Part); if (!U->getType()->isVectorTy()) { assert(VF == 1 && "Value not scalarized has non-vector type"); return U; @@ -2793,7 +2794,20 @@ // Otherwise, the value from the original loop has been vectorized and is // represented by UF vector values. Extract and return the requested scalar // value from the appropriate vector lane. 
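
VPIteration, introduced in the new VPlan.h, bundles the unroll part and vector lane that previously traveled as two separate arguments; a sketch of its shape as used here (see VPlan.h for the actual definition):

    struct VPIteration {
      unsigned Part; // Unroll part, in [0, UF).
      unsigned Lane; // Vector lane, in [0, VF).
    };
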
- return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); + return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); +} + +void InnerLoopVectorizer::packScalarIntoVectorValue( + Value *V, const VPIteration &Instance) { + assert(V != Induction && "The new induction variable should not be used."); + assert(!V->getType()->isVectorTy() && "Can't pack a vector"); + assert(!V->getType()->isVoidTy() && "Type does not produce a value"); + + Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); + Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); + VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, + Builder.getInt32(Instance.Lane)); + VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2866,7 +2880,7 @@ Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); + Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2992,10 +3006,6 @@ Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = getMemInstAddressSpace(Instr); - // Scalarize the memory instruction if necessary. - if (Decision == LoopVectorizationCostModel::CM_Scalarize) - return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); - // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3010,7 +3020,7 @@ // Handle consecutive loads/stores. if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, 0, 0); + Ptr = getOrCreateScalarValue(Ptr, {0, 0}); VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -3107,71 +3117,41 @@ } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + const VPIteration &Instance, bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - DEBUG(dbgs() << "LV: Scalarizing" - << (IfPredicateInstr ? " and predicating:" : ":") << *Instr - << '\n'); - // Holds vector parameters or scalars, in case of uniform vals. - SmallVector Params; setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - VectorParts Cond; - if (IfPredicateInstr) - Cond = createBlockInMask(Instr->getParent()); - - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; - - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { - // For each scalar that we create: - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); + Cloned->setOperand(op, NewOp); + } + addNewMetadata(Cloned, Instr); + + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // Add the cloned scalar to the scalar map entry. 
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); + + // If we just cloned a new assumption, add it the assumption cache. + if (auto *II = dyn_cast(Cloned)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); - // Start if-block. - Value *Cmp = nullptr; - if (IfPredicateInstr) { - Cmp = Cond[Part]; - if (!Cmp) // Block in mask is all-one. - Cmp = Builder.getTrue(); - else if (Cmp->getType()->isVectorTy()) - Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane)); - } - - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); - - // Replace the operands of the cloned instructions with their scalar - // equivalents in the new loop. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane); - Cloned->setOperand(op, NewOp); - } - addNewMetadata(Cloned, Instr); - - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); - - // Add the cloned scalar to the scalar map entry. - VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned); - - // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); - - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); - } - } + // End if-block. + if (IfPredicateInstr) + PredicatedInstructions.push_back(Cloned); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3375,7 +3355,7 @@ LVer->prepareNoAliasMetadata(); } -void InnerLoopVectorizer::createVectorizedLoopSkeleton() { +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3556,6 +3536,8 @@ LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); + + return LoopVectorPreHeader; } // Fix up external users of the induction variable. At this point, we are @@ -3929,7 +3911,8 @@ IVEndValues[Entry.first], LoopMiddleBlock); fixLCSSAPHIs(); - predicateInstructions(); + for (Instruction *PI : PredicatedInstructions) + sinkScalarOperands(&*PI); // Remove redundant induction instructions. cse(LoopVectorBody); @@ -4384,126 +4367,6 @@ } while (Changed); } -void InnerLoopVectorizer::predicateInstructions() { - - // For each instruction I marked for predication on value C, split I into its - // own basic block to form an if-then construct over C. Since I may be fed by - // an extractelement instruction or other scalar operand, we try to - // iteratively sink its scalar operands into the predicated block. If I feeds - // an insertelement instruction, we try to move this instruction into the - // predicated block as well. For non-void types, a phi node will be created - // for the resulting value (either vector or scalar). - // - // So for some predicated instruction, e.g. the conditional sdiv in: - // - // for.body: - // ... - // %add = add nsw i32 %mul, %0 - // %cmp5 = icmp sgt i32 %2, 7 - // br i1 %cmp5, label %if.then, label %if.end - // - // if.then: - // %div = sdiv i32 %0, %1 - // br label %if.end - // - // if.end: - // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] - // - // the sdiv at this point is scalarized and if-converted using a select. 
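
With VPlan, the part/lane loops that used to live in this function move into VPReplicateRecipe::execute, which drives the single-instance scalarizeInstruction above; roughly (a sketch, eliding the State.Instance case used inside replicate regions):

    unsigned Lanes = IsUniform ? 1 : State.VF;
    for (unsigned Part = 0; Part < State.UF; ++Part)
      for (unsigned Lane = 0; Lane < Lanes; ++Lane)
        State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
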
- // The inactive elements in the vector are not used, but the predicated - // instruction is still executed for all vector elements, essentially: - // - // vector.body: - // ... - // %17 = add nsw <2 x i32> %16, %wide.load - // %29 = extractelement <2 x i32> %wide.load, i32 0 - // %30 = extractelement <2 x i32> %wide.load51, i32 0 - // %31 = sdiv i32 %29, %30 - // %32 = insertelement <2 x i32> undef, i32 %31, i32 0 - // %35 = extractelement <2 x i32> %wide.load, i32 1 - // %36 = extractelement <2 x i32> %wide.load51, i32 1 - // %37 = sdiv i32 %35, %36 - // %38 = insertelement <2 x i32> %32, i32 %37, i32 1 - // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 - // - // Predication will now re-introduce the original control flow to avoid false - // side-effects by the sdiv instructions on the inactive elements, yielding - // (after cleanup): - // - // vector.body: - // ... - // %5 = add nsw <2 x i32> %4, %wide.load - // %8 = icmp sgt <2 x i32> %wide.load52, - // %9 = extractelement <2 x i1> %8, i32 0 - // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue - // - // pred.sdiv.if: - // %10 = extractelement <2 x i32> %wide.load, i32 0 - // %11 = extractelement <2 x i32> %wide.load51, i32 0 - // %12 = sdiv i32 %10, %11 - // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 - // br label %pred.sdiv.continue - // - // pred.sdiv.continue: - // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] - // %15 = extractelement <2 x i1> %8, i32 1 - // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 - // - // pred.sdiv.if54: - // %16 = extractelement <2 x i32> %wide.load, i32 1 - // %17 = extractelement <2 x i32> %wide.load51, i32 1 - // %18 = sdiv i32 %16, %17 - // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 - // br label %pred.sdiv.continue55 - // - // pred.sdiv.continue55: - // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] - // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 - - for (auto KV : PredicatedInstructions) { - BasicBlock::iterator I(KV.first); - BasicBlock *Head = I->getParent(); - auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, - /*BranchWeights=*/nullptr, DT, LI); - I->moveBefore(T); - sinkScalarOperands(&*I); - - BasicBlock *PredicatedBlock = I->getParent(); - Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName(); - PredicatedBlock->setName(BBNamePrefix + ".if"); - PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue"); - - // If the instruction is non-void create a Phi node at reconvergence point. - if (!I->getType()->isVoidTy()) { - Value *IncomingTrue = nullptr; - Value *IncomingFalse = nullptr; - - if (I->hasOneUse() && isa(*I->user_begin())) { - // If the predicated instruction is feeding an insert-element, move it - // into the Then block; Phi node will be created for the vector. - InsertElementInst *IEI = cast(*I->user_begin()); - IEI->moveBefore(T); - IncomingTrue = IEI; // the new vector with the inserted element. - IncomingFalse = IEI->getOperand(0); // the unmodified vector - } else { - // Phi node will be created for the scalar predicated instruction. 
- IncomingTrue = &*I; - IncomingFalse = UndefValue::get(I->getType()); - } - - BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); - assert(PostDom && "Then block has multiple successors"); - PHINode *Phi = - PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); - IncomingTrue->replaceAllUsesWith(Phi); - Phi->addIncoming(IncomingFalse, Head); - Phi->addIncoming(IncomingTrue, I->getParent()); - } - } - - DEBUG(DT->verifyDomTree()); -} - InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); @@ -4647,7 +4510,7 @@ llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: case InductionDescriptor::IK_FpInduction: - return widenIntOrFpInduction(P); + llvm_unreachable("Integer/fp induction is handled elsewhere."); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4666,7 +4529,7 @@ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); + VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); } } return; @@ -4692,25 +4555,11 @@ return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { - // Scalarize instructions that should remain scalar after vectorization. - if (VF > 1 && - !(isa(&I) || isa(&I) || isa(&I)) && - shouldScalarizeInstruction(&I)) { - scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); - return; - } - +void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - break; - case Instruction::PHI: { - // Vectorize PHINodes. - widenPHIInstruction(&I, UF, VF); - break; - } // End of PHI. + case Instruction::PHI: + llvm_unreachable("This instruction is handled by a different recipe."); case Instruction::GetElementPtr: { // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -4783,13 +4632,6 @@ case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: - // Scalarize with predication if this instruction may divide by zero and - // block execution is conditional, otherwise fallthrough. - if (Legal->isScalarWithPredication(&I)) { - scalarizeInstruction(&I, true); - break; - } - LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -4837,7 +4679,7 @@ // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); + auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); for (unsigned Part = 0; Part < UF; ++Part) { Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); @@ -4896,16 +4738,6 @@ auto *CI = dyn_cast(&I); setDebugLocFromInst(Builder, CI); - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. 
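
For instance, the trunc below of a primary i64 induction can be implemented by generating a <VF x i32> induction vector directly, rather than widening the i64 induction and truncating VF lanes (illustrative IR):

    %iv       = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
    %iv.trunc = trunc i64 %iv to i32   ; becomes a <VF x i32> vec.ind
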
- if (Cost->isOptimizableIVTruncate(CI, VF)) { - widenIntOrFpInduction(cast(CI->getOperand(0)), - cast(CI)); - break; - } - /// Vectorize casts. Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4936,11 +4768,7 @@ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(&I); - break; - } + // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? @@ -4948,10 +4776,8 @@ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(&I); - break; - } + assert((UseVectorIntrinsic || !NeedToScalarize) && + "Instruction should be scalarized elsewhere."); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; @@ -5001,9 +4827,9 @@ } default: - // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(&I); - break; + // All other instructions are scalarized. + DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); } // end of switch. } @@ -5015,14 +4841,11 @@ assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); - DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), - LoopVectorPreHeader); DT->addNewBlock(LoopMiddleBlock, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - DEBUG(DT->verifyDomTree()); } @@ -6962,13 +6785,6 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; - // Collect Uniform and Scalar instructions after vectorization with VF. - collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - collectInstsToScalarize(VF); - // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -7626,11 +7442,26 @@ // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); + buildVPlans(UserVF, UserVF); + DEBUG(printPlans(dbgs())); return {UserVF, 0}; } unsigned MaxVF = MaybeMaxVF.getValue(); assert(MaxVF != 0 && "MaxVF is zero."); + + for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { + // Collect Uniform and Scalar instructions after vectorization with VF. + CM.collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. 
+ if (VF > 1) + CM.collectInstsToScalarize(VF); + } + + buildVPlans(1, MaxVF); + DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; @@ -7638,11 +7469,31 @@ return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { +void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { + DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); + BestVF = VF; + BestUF = UF; + + for (auto *VPlanIter = VPlans.begin(); VPlanIter != VPlans.end();) { + VPlan *Plan = *VPlanIter; + if (Plan->hasVF(VF)) + ++VPlanIter; + else { + VPlanIter = VPlans.erase(VPlanIter); + delete Plan; + } + } + assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); +} + +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, + DominatorTree *DT) { // Perform the actual loop transformation. // 1. Create a new empty loop. Unlink the old loop and connect the new one. - ILV.createVectorizedLoopSkeleton(); + VPTransformState State{ + BestVF, BestUF, LI, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV}; + State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); //===------------------------------------------------===// // @@ -7653,36 +7504,9 @@ //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - - // Move instructions to handle first-order recurrences. - DenseMap &SinkAfter = Legal->getSinkAfter(); - for (auto &Entry : SinkAfter) { - Entry.first->removeFromParent(); - Entry.first->insertAfter(Entry.second); - DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second - << " to vectorize a 1st order recurrence.\n"); - } - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - // Vectorize all instructions in the original loop that will not become - // trivially dead when vectorized. - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (Instruction &I : *BB) - if (!DeadInstructions.count(&I)) - ILV.vectorizeInstruction(I); + assert(VPlans.size() == 1 && "Not a single VPlan to execute."); + VPlan *Plan = *VPlans.begin(); + Plan->execute(&State); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7712,14 +7536,6 @@ DeadInstructions.insert(IndUpdate); } } - -void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { - auto *SI = dyn_cast(Instr); - bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); - - return scalarizeInstruction(Instr, IfPredicateInstr); -} - Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } @@ -7775,6 +7591,725 @@ } } +namespace { +/// VPWidenRecipe is a recipe for producing a copy of vector type for each +/// Instruction in its ingredients independently, in order. 
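
Taken together, the planner's entry points are expected to be driven from the caller roughly as follows (a sketch with abbreviated argument lists; IC denotes the chosen interleave count):

    LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
    auto BestVF = LVP.plan(OptForSize, UserVF); // Build VPlans, pick VF+cost.
    LVP.setBestPlan(BestVF.Width, IC);          // Keep only the winning VPlan.
    InnerLoopVectorizer LB(/* loop, analyses, BestVF.Width, IC, &LVL, &CM */);
    LVP.executePlan(LB, DT);                    // Generate IR via the VPlan.
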
This recipe covers +/// most of the traditional vectorization cases where each ingredient transforms +/// into a vectorized version of itself. +class VPWidenRecipe : public VPRecipeBase { +private: + /// Hold the ingredients by pointing to their original BasicBlock location. + BasicBlock::iterator Begin; + BasicBlock::iterator End; + +public: + VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) { + End = I->getIterator(); + Begin = End++; + } + + ~VPWidenRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenSC; + } + + /// Produce widened copies of all Ingredients. + void execute(VPTransformState &State) override { + for (auto &Instr : make_range(Begin, End)) + State.ILV->widenInstruction(Instr); + } + + /// Augment the recipe to include Instr, if it lies at its End. + bool appendInstruction(Instruction *Instr) { + if (End != Instr->getIterator()) + return false; + End++; + return true; + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"WIDEN\\l\""; + for (auto &Instr : make_range(Begin, End)) + O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\""; + } +}; + +/// A recipe for handling phi nodes of integer and floating-point inductions, +/// producing their vector and scalar values. +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +private: + PHINode *IV; + TruncInst *Trunc; + +public: + VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {} + + ~VPWidenIntOrFpInductionRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC; + } + + /// Generate the vectorized and scalarized versions of the phi node as + /// needed by their users. + void execute(VPTransformState &State) override { + assert(!State.Instance && "Int or FP induction being replicated."); + State.ILV->widenIntOrFpInduction(IV, Trunc); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"WIDEN-INDUCTION"; + if (Trunc) { + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\""; + } else + O << " " << VPlanIngredient(IV) << "\\l\""; + } +}; + +/// A recipe for handling all phi nodes except for integer and FP inductions. +class VPWidenPHIRecipe : public VPRecipeBase { +private: + PHINode *Phi; + +public: + VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {} + + ~VPWidenPHIRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override { + State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; + } +}; + +/// VPInterleaveRecipe is a recipe for transforming an interleave group of load +/// or stores into one wide load/store and shuffles. 
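
For example, with an interleave group of factor 2 and VF = 4, one wide load feeds both members through strided shuffles (illustrative IR):

    %wide = load <8 x i32>, <8 x i32>* %ptr
    %even = shufflevector <8 x i32> %wide, <8 x i32> undef,
                          <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    %odd  = shufflevector <8 x i32> %wide, <8 x i32> undef,
                          <4 x i32> <i32 1, i32 3, i32 5, i32 7>
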
+class VPInterleaveRecipe : public VPRecipeBase { +private: + const InterleaveGroup *IG; + +public: + VPInterleaveRecipe(const InterleaveGroup *IG) + : VPRecipeBase(VPInterleaveSC), IG(IG) {} + + ~VPInterleaveRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; + } + + /// Generate the wide load or store, and shuffles. + void execute(VPTransformState &State) override { + assert(!State.Instance && "Interleave group being replicated."); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; + + const InterleaveGroup *getInterleaveGroup() { return IG; } +}; + +/// VPReplicateRecipe replicates a given instruction producing multiple scalar +/// copies of the original scalar type, one per lane, instead of producing a +/// single copy of widened type for all lanes. If the instruction is known to be +/// uniform only one copy, per lane zero, will be generated. +class VPReplicateRecipe : public VPRecipeBase { +private: + /// The instruction being replicated. + Instruction *Ingredient; + + /// Indicator if only a single replica per lane is needed. + bool IsUniform; + + /// Indicator if the replicas are also predicated. + bool IsPredicated; + + /// Indicator if the scalar values should also be packed into a vector. + bool AlsoPack; + +public: + VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false) + : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform), + IsPredicated(IsPredicated) { + // Retain the previous behavior of predicateInstructions(), where an + // insert-element of a predicated instruction got hoisted into the + // predicated basic block iff it was its only user. This is achieved by + // having predicated instructions also pack their values into a vector by + // default unless they have a replicated user which uses their scalar value. + AlsoPack = IsPredicated && !I->use_empty(); + } + + ~VPReplicateRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC; + } + + /// Generate replicas of the desired Ingredient. Replicas will be generated + /// for all parts and lanes unless a specific part and lane are specified in + /// the \p State. + void execute(VPTransformState &State) override; + + void setAlsoPack(bool Pack) { AlsoPack = Pack; } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" + << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") + << VPlanIngredient(Ingredient); + if (AlsoPack) + O << " (S->V)"; + O << "\\l\""; + } +}; + +/// A recipe for generating conditional branches on the bits of a mask. +class VPBranchOnMaskRecipe : public VPRecipeBase { +private: + /// The input IR basic block used to obtain the mask providing the condition + /// bits for the branch. + BasicBlock *MaskedBasicBlock; + +public: + VPBranchOnMaskRecipe(BasicBlock *BB) + : VPRecipeBase(VPBranchOnMaskSC), MaskedBasicBlock(BB) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC; + } + + /// Generate the extraction of the appropriate bit from the block mask and the + /// conditional branch. 
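
Per lane, the emitted control flow looks like the following (illustrative IR; the phi comes from the companion VPPredInstPHIRecipe below):

      %m = extractelement <2 x i1> %mask, i32 0
      br i1 %m, label %pred.if, label %pred.continue
    pred.if:
      %d = sdiv i32 %a, %b
      br label %pred.continue
    pred.continue:
      %r = phi i32 [ undef, %entry ], [ %d, %pred.if ]
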
+ void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" + << Indent << "\"BRANCH-ON-MASK-OF " << MaskedBasicBlock->getName() + << "\\l\""; + } +}; + +/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when +/// control converges back from a Branch-on-Mask. The phi nodes are needed in +/// order to merge values that are set under such a branch and feed their uses. +/// The phi nodes can be scalar or vector depending on the users of the value. +/// This recipe works in concert with VPBranchOnMaskRecipe. +class VPPredInstPHIRecipe : public VPRecipeBase { +private: + Instruction *PredInst; + +public: + /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi + /// nodes after merging back from a Branch-on-Mask. + VPPredInstPHIRecipe(Instruction *PredInst) + : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {} + + ~VPPredInstPHIRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC; + } + + /// Generates phi nodes for live-outs as needed to retain SSA form. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" + << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst) + << "\\l\""; + } +}; +} // end anonymous namespace + +bool LoopVectorizationPlanner::getDecisionAndClampRange( + const std::function<bool(unsigned)> &Predicate, VFRange &Range) { + assert(Range.End > Range.Start && "Trying to test an empty VF range."); + bool PredicateAtRangeStart = Predicate(Range.Start); + + for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) + if (Predicate(TmpVF) != PredicateAtRangeStart) { + Range.End = TmpVF; + break; + } + + return PredicateAtRangeStart; +} + +/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, +/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range +/// of VF's starting at a given VF and extending it as much as possible. Each +/// vectorization decision can potentially shorten this sub-range during +/// buildVPlan(). +void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { + for (unsigned VF = MinVF; VF < MaxVF + 1;) { + VFRange SubRange = {VF, MaxVF + 1}; + VPlan *Plan = buildVPlan(SubRange); + VPlans.push_back(Plan); + VF = SubRange.End; + } +} + +VPInterleaveRecipe * +LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, + VFRange &Range) { + const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I); + if (!IG) + return nullptr; + + // Now check if IG is relevant for VF's in the given range. + auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { + return [=](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(I, VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + }; + if (!getDecisionAndClampRange(isIGMember(I), Range)) + return nullptr; + + // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) + // range. If it's the primary member of the IG, construct a VPInterleaveRecipe. + // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
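// To make the clamping performed by getDecisionAndClampRange above concrete,
// a small sketch with hypothetical values:
//
//   VFRange Range = {4, 32}; // candidate VFs {4, 8, 16}
//   // Suppose the predicate holds for VF = 4 and VF = 8 but not VF = 16:
//   bool Decision = getDecisionAndClampRange(
//       [](unsigned VF) { return VF < 16; }, Range);
//   // Decision == true (the predicate's value at Range.Start), Range.End is
//   // clamped to 16, and buildVPlans() opens a fresh sub-range (and VPlan)
//   // starting at VF = 16.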
+ assert(I == IG->getInsertPos() && + "Generating a recipe for an adjunct member of an interleave group"); + + return new VPInterleaveRecipe(IG); +} + +VPWidenIntOrFpInductionRecipe * +LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, + VFRange &Range) { + if (PHINode *Phi = dyn_cast<PHINode>(I)) { + // Check if this is an integer or fp induction. If so, build the recipe that + // produces its scalar and vector values. + InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); + if (II.getKind() == InductionDescriptor::IK_IntInduction || + II.getKind() == InductionDescriptor::IK_FpInduction) + return new VPWidenIntOrFpInductionRecipe(Phi); + + return nullptr; + } + + // Optimize the special case where the source is a constant integer + // induction variable. Notice that we can only optimize the 'trunc' case + // because (a) FP conversions lose precision, (b) sext/zext may wrap, and + // (c) other casts depend on pointer size. + + // Determine whether \p K is a truncation based on an induction variable that + // can be optimized. + auto isOptimizableIVTruncate = + [&](Instruction *K) -> std::function<bool(unsigned)> { + return + [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; + }; + + if (isa<TruncInst>(I) && + getDecisionAndClampRange(isOptimizableIVTruncate(I), Range)) + return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), + cast<TruncInst>(I)); + return nullptr; +} + +VPWidenRecipe *LoopVectorizationPlanner::tryToWiden( + Instruction *I, VPWidenRecipe *LastWidenRecipe, VFRange &Range) { + + if (Legal->isScalarWithPredication(I)) + return nullptr; + + static DenseSet<unsigned> VectorizableOpcodes = { + Instruction::Br, Instruction::PHI, Instruction::GetElementPtr, + Instruction::UDiv, Instruction::SDiv, Instruction::SRem, + Instruction::URem, Instruction::Add, Instruction::FAdd, + Instruction::Sub, Instruction::FSub, Instruction::Mul, + Instruction::FMul, Instruction::FDiv, Instruction::FRem, + Instruction::Shl, Instruction::LShr, Instruction::AShr, + Instruction::And, Instruction::Or, Instruction::Xor, + Instruction::Select, Instruction::ICmp, Instruction::FCmp, + Instruction::Store, Instruction::Load, Instruction::ZExt, + Instruction::SExt, Instruction::FPToUI, Instruction::FPToSI, + Instruction::FPExt, Instruction::PtrToInt, Instruction::IntToPtr, + Instruction::SIToFP, Instruction::UIToFP, Instruction::Trunc, + Instruction::FPTrunc, Instruction::BitCast, Instruction::Call}; + + if (!VectorizableOpcodes.count(I->getOpcode())) + return nullptr; + + if (CallInst *CI = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) + return nullptr; + } + + auto willWiden = [&](unsigned VF) -> bool { + if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF))) + return false; + if (CallInst *CI = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + // The following case may be scalarized depending on the VF. + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call?
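// (Illustrative: for a math-library call at VF = 4, getVectorCallCost below
// might model a call to a vectorized library routine while
// getVectorIntrinsicCost models e.g. @llvm.sin.v4f32; the cheaper option is
// chosen, and NeedToScalarize is set when no vector-call version exists at
// this VF.)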
+ bool NeedToScalarize; + unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + bool UseVectorIntrinsic = + ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + return UseVectorIntrinsic || !NeedToScalarize; + } + if (isa<LoadInst>(I) || isa<StoreInst>(I)) { + LoopVectorizationCostModel::InstWidening Decision = + CM.getWideningDecision(I, VF); + assert(Decision != LoopVectorizationCostModel::CM_Unknown && + "CM decision should be taken at this point."); + assert(Decision != LoopVectorizationCostModel::CM_Interleave && + "Interleave memory opportunity should be caught earlier."); + return Decision != LoopVectorizationCostModel::CM_Scalarize; + } + return true; + }; + + if (!getDecisionAndClampRange(willWiden, Range)) + return nullptr; + + // Success: widen this instruction. We optimize the common case where + // consecutive instructions can be represented by a single recipe. + if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) + return LastWidenRecipe; + return new VPWidenRecipe(I); +} + +VPBasicBlock *LoopVectorizationPlanner::handleReplication( + Instruction *I, VFRange &Range, VPBasicBlock *VPBB, + DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe) { + + bool IsUniform = getDecisionAndClampRange( + [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, + Range); + + bool IsPredicated = Legal->isScalarWithPredication(I); + auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + + // Find if I uses a predicated instruction. If so, it will use its scalar + // value. Avoid hoisting the insert-element which packs the scalar value into + // a vector value, as that happens iff all users use the vector value. + for (auto &Op : I->operands()) + if (auto *PredInst = dyn_cast<Instruction>(Op)) + if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) + PredInst2Recipe[PredInst]->setAlsoPack(false); + + // Finalize the recipe for Instr, first if it is not predicated. + if (!IsPredicated) { + DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); + VPBB->appendRecipe(Recipe); + return VPBB; + } + DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); + assert(VPBB->getSuccessors().empty() && + "VPBB has successors when handling predicated replication."); + // Record predicated instructions for above packing optimizations. + PredInst2Recipe[I] = Recipe; + VPBlockBase *Region = VPBB->setOneSuccessor(createReplicateRegion(I, Recipe)); + return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock())); +} + +VPRegionBlock * +LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, + VPRecipeBase *PredRecipe) { + // Instructions marked for predication are replicated and placed under an + // if-then construct to prevent side-effects. + + // Build the triangular if-then region. + std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); + assert(Instr->getParent() && "Predicated instruction not in any basic block"); + auto *BOMRecipe = new VPBranchOnMaskRecipe(Instr->getParent()); + auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + auto *PHIRecipe = + Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); + auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); + VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); + + // Note: first set Entry as region entry and then connect successors starting + // from it in order, to propagate the "parent" of each VPBasicBlock.
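// The triangular region wired up below has the following shape (sketch):
//
//        [pred.<opcode>.entry]     VPBranchOnMaskRecipe
//            |          \
//            |    [pred.<opcode>.if]     PredRecipe (replicated instruction)
//            |          /
//        [pred.<opcode>.continue]  VPPredInstPHIRecipe (unless void-typed)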
+ Entry->setTwoSuccessors(Pred, Exit); + Pred->setOneSuccessor(Exit); + + return Region; +} + +VPlan *LoopVectorizationPlanner::buildVPlan(VFRange &Range) { + + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + DenseMap<Instruction *, Instruction *> SinkAfterInverse; + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Hold a mapping from predicated instructions to their recipes, in order to + // fix their AlsoPack behavior if a user is determined to replicate and use a + // scalar instead of vector value. + DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; + + // Create a dummy pre-entry VPBasicBlock to start building the VPlan. + VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); + VPlan *Plan = new VPlan(VPBB); + + // Scan the body of the loop in a topological order to visit each basic block + // after having visited its predecessor basic blocks. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { + // Relevant instructions from basic block BB will be grouped into VPRecipe + // ingredients and fill a new VPBasicBlock. + unsigned VPBBsForBB = 0; + auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); + VPBB->setOneSuccessor(FirstVPBBForBB); + VPBB = FirstVPBBForBB; + VPWidenRecipe *LastWidenRecipe = nullptr; + + std::vector<Instruction *> Ingredients; + + // Organize the ingredients to vectorize from current basic block in the + // right order. + for (Instruction &I : *BB) { + Instruction *Instr = &I; + + // First filter out irrelevant instructions, to ensure no recipes are + // built for them. + if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) || + DeadInstructions.count(Instr)) + continue; + + // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct + // member of the IG, do not construct any Recipe for it. + const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr); + if (IG && Instr != IG->getInsertPos() && + Range.Start >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(Instr, Range.Start) == + LoopVectorizationCostModel::CM_Interleave) + continue; + + // Move instructions to handle first-order recurrences, step 1: avoid + // handling this instruction until after we've handled the instruction it + // should follow. + auto SAIt = SinkAfter.find(Instr); + if (SAIt != SinkAfter.end()) { + DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second + << " to vectorize a 1st order recurrence.\n"); + SinkAfterInverse[SAIt->second] = Instr; + continue; + } + + Ingredients.push_back(Instr); + + // Move instructions to handle first-order recurrences, step 2: push the + // instruction to be sunk at its insertion point. + auto SAInvIt = SinkAfterInverse.find(Instr); + if (SAInvIt != SinkAfterInverse.end()) + Ingredients.push_back(SAInvIt->second); + } + + // Introduce each ingredient into VPlan. + for (Instruction *Instr : Ingredients) { + VPRecipeBase *Recipe = nullptr; + + // Check if Instr should belong to an interleave memory recipe, or already + // does. In the latter case Instr is irrelevant.
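// (The cascade below tries recipes from most to least specialized: interleave
// group first, then widened induction or optimized IV truncation, then a
// generic widened phi, then the general widening recipe, and finally
// replication as the universal fallback.)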
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + VPBB->appendRecipe(Recipe); + continue; + } + + // Check if Instr should form some PHI recipe. + if ((Recipe = tryToOptimizeInduction(Instr, Range))) { + VPBB->appendRecipe(Recipe); + continue; + } + if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { + VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); + continue; + } + + // Check if Instr is to be widened by a general VPWidenRecipe, after + // having first checked for specific widening recipes that deal with + // Interleave Groups, Inductions and Phi nodes. + if ((Recipe = tryToWiden(Instr, LastWidenRecipe, Range))) { + if (Recipe != LastWidenRecipe) + VPBB->appendRecipe(Recipe); + LastWidenRecipe = cast<VPWidenRecipe>(Recipe); + continue; + } + + // Otherwise, if all widening options failed, the instruction is to be + // replicated. This may create a successor for VPBB. + VPBasicBlock *NextVPBB = + handleReplication(Instr, Range, VPBB, PredInst2Recipe); + if (NextVPBB != VPBB) { + VPBB = NextVPBB; + VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) + : ""); + } + } + } + + // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks + // may also be empty, such as the last one, VPBB, reflecting original + // basic-blocks with no recipes. + VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); + assert(PreEntry->empty() && "Expecting empty pre-entry block."); + VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); + PreEntry->disconnectSuccessor(Entry); + delete PreEntry; + + std::string PlanName; + raw_string_ostream RSO(PlanName); + unsigned VF = Range.Start; + Plan->addVF(VF); + RSO << "Initial VPlan for VF={" << VF; + for (VF *= 2; VF < Range.End; VF *= 2) { + Plan->addVF(VF); + RSO << "," << VF; + } + RSO << "},UF>=1"; + RSO.flush(); + Plan->setName(PlanName); + + return Plan; +} + +void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" + << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << "\\l\""; + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *I = IG->getMember(i)) + O << " +\n" + << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; +} + +void VPReplicateRecipe::execute(VPTransformState &State) { + + if (State.Instance) { // Generate a single instance. + State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); + if (AlsoPack) { // Insert scalar instance packing it into a vector. + // If we're constructing lane 0, initialize to start from undef. + if (State.Instance->Lane == 0) { + Value *Undef = + UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); + State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); + } + State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); + } + return; + } + + // Generate scalar instances for all VF lanes of all UF parts, unless the + // instruction is uniform, in which case generate only the first lane for each + // of the UF parts. + unsigned EndLane = IsUniform ? 1 : State.VF; + for (unsigned Part = 0; Part < State.UF; ++Part) + for (unsigned Lane = 0; Lane < EndLane; ++Lane) + State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); + + // Broadcast or group together all instances into a vector.
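// (Concretely: with UF = 2 and VF = 4, a non-uniform replicate recipe emits
// 2 x 4 = 8 scalar clones, while a uniform one emits only 2 - lane 0 of each
// part. When AlsoPack is set, getOrCreateVectorValue then builds one
// insertelement sequence per part so vector users can consume the lanes.)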
+ if (AlsoPack) + for (unsigned Part = 0; Part < State.UF; ++Part) + State.ILV->getOrCreateVectorValue(Ingredient, Part); +} + +void VPBranchOnMaskRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Branch on Mask works only on single instance."); + + unsigned Part = State.Instance->Part; + unsigned Lane = State.Instance->Lane; + + auto Cond = State.ILV->createBlockInMask(MaskedBasicBlock); + + Value *ConditionBit = Cond[Part]; + if (!ConditionBit) // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); + else if (ConditionBit->getType()->isVectorTy()) + ConditionBit = State.Builder.CreateExtractElement( + ConditionBit, State.Builder.getInt32(Lane)); + + // Replace the temporary unreachable terminator with a new conditional branch, + // whose two destinations will be set later when they are created. + auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); + assert(isa<UnreachableInst>(CurrentTerminator) && + "Expected to replace unreachable terminator with conditional branch."); + auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); + + DEBUG(dbgs() << "\nLV: vectorizing BranchOnMask recipe " + << MaskedBasicBlock->getName()); +} + +void VPPredInstPHIRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Predicated instruction PHI works per instance."); + Instruction *ScalarPredInst = cast<Instruction>( + State.ValueMap.getScalarValue(PredInst, *State.Instance)); + BasicBlock *PredicatedBB = ScalarPredInst->getParent(); + BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); + assert(PredicatingBB && "Predicated block has no single predecessor."); + + // By current pack/unpack logic we need to generate only a single phi node: if + // a vector value for the predicated instruction exists at this point it means + // the instruction has vector users only, and a phi for the vector value is + // needed. In this case the recipe of the predicated instruction is marked to + // also do that packing, thereby "hoisting" the insert-element sequence. + // Otherwise, a phi node for the scalar value is needed. + unsigned Part = State.Instance->Part; + if (State.ValueMap.hasVectorValue(PredInst, Part)) { + Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); + InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); + PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); + VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. + VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. + State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. + } else { + Type *PredInstType = PredInst->getType(); + PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); + Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); + Phi->addIncoming(ScalarPredInst, PredicatedBB); + State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); + } +} + bool LoopVectorizePass::processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -7893,7 +8428,7 @@ CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); // Get user vectorization factor.
unsigned UserVF = Hints.getWidth(); @@ -7980,6 +8515,8 @@ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } + LVP.setBestPlan(VF.Width, IC); + using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -7987,7 +8524,7 @@ // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - LVP.executePlan(Unroller); + LVP.executePlan(Unroller, DT); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -7997,7 +8534,7 @@ // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LVP.executePlan(LB); + LVP.executePlan(LB, DT); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/VPlan.h +++ llvm/trunk/lib/Transforms/Vectorize/VPlan.h @@ -0,0 +1,789 @@ +//===- VPlan.h - Represent A Vectorizer Plan ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declarations of the Vectorization Plan base classes: +/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual +/// VPBlockBase, together implementing a Hierarchical CFG; +/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be +/// treated as proper graphs for generic algorithms; +/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained +/// within VPBasicBlocks; +/// 4. The VPlan class holding a candidate for vectorization; +/// 5. The VPlanPrinter class providing a way to print a plan in dot format. +/// These are documented in docs/VectorizationPlan.rst. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H + +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/raw_ostream.h" + +// The (re)use of existing LoopVectorize classes is subject to future VPlan +// refactoring. +namespace { +// Forward declarations. +//class InnerLoopVectorizer; +class LoopVectorizationLegality; +class LoopVectorizationCostModel; +} // namespace + +namespace llvm { + +// Forward declarations. +class BasicBlock; +class InnerLoopVectorizer; +class VPBasicBlock; + +/// In what follows, the term "input IR" refers to code that is fed into the +/// vectorizer whereas the term "output IR" refers to code that is generated by +/// the vectorizer. + +/// VPIteration represents a single point in the iteration space of the output +/// (vectorized and/or unrolled) IR loop. +struct VPIteration { + unsigned Part; ///< in [0..UF) + unsigned Lane; ///< in [0..VF) +}; + +/// This is a helper struct for maintaining vectorization state. It's used for +/// mapping values from the original loop to their corresponding values in +/// the new loop. Two mappings are maintained: one for vectorized values and +/// one for scalarized values. 
Vectorized values are represented with UF +/// vector values in the new loop, and scalarized values are represented with +/// UF x VF scalar values in the new loop. UF and VF are the unroll and +/// vectorization factors, respectively. +/// +/// Entries can be added to either map with setVectorValue and setScalarValue, +/// which assert that an entry was not already added before. If an entry is to +/// replace an existing one, call resetVectorValue and resetScalarValue. This is +/// currently needed to modify the mapped values during "fix-up" operations that +/// occur once the first phase of widening is complete. These operations include +/// type truncation and the second phase of recurrence widening. +/// +/// Entries from either map can be retrieved using the getVectorValue and +/// getScalarValue functions, which assert that the desired value exists. + +struct VectorizerValueMap { +private: + /// The unroll factor. Each entry in the vector map contains UF vector values. + unsigned UF; + + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. We use std::map and not DenseMap + /// because insertions to DenseMap invalidate its iterators. + typedef SmallVector<Value *, 2> VectorParts; + typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts; + std::map<Value *, VectorParts> VectorMapStorage; + std::map<Value *, ScalarParts> ScalarMapStorage; + +public: + /// Construct an empty map with the given unroll and vectorization factors. + VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + + /// \return True if the map has any vector entry for \p Key. + bool hasAnyVectorValue(Value *Key) const { + return VectorMapStorage.count(Key); + } + + /// \return True if the map has a vector entry for \p Key and \p Part. + bool hasVectorValue(Value *Key, unsigned Part) const { + assert(Part < UF && "Queried Vector Part is too large."); + if (!hasAnyVectorValue(Key)) + return false; + const VectorParts &Entry = VectorMapStorage.find(Key)->second; + assert(Entry.size() == UF && "VectorParts has wrong dimensions."); + return Entry[Part] != nullptr; + } + + /// \return True if the map has any scalar entry for \p Key. + bool hasAnyScalarValue(Value *Key) const { + return ScalarMapStorage.count(Key); + } + + /// \return True if the map has a scalar entry for \p Key and \p Instance. + bool hasScalarValue(Value *Key, const VPIteration &Instance) const { + assert(Instance.Part < UF && "Queried Scalar Part is too large."); + assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) + return false; + const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; + assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); + assert(Entry[Instance.Part].size() == VF && + "ScalarParts has wrong dimensions."); + return Entry[Instance.Part][Instance.Lane] != nullptr; + } + + /// Retrieve the existing vector value that corresponds to \p Key and + /// \p Part. + Value *getVectorValue(Value *Key, unsigned Part) { + assert(hasVectorValue(Key, Part) && "Getting non-existent value."); + return VectorMapStorage[Key][Part]; + } + + /// Retrieve the existing scalar value that corresponds to \p Key and + /// \p Instance. + Value *getScalarValue(Value *Key, const VPIteration &Instance) { + assert(hasScalarValue(Key, Instance) && "Getting non-existent value."); + return ScalarMapStorage[Key][Instance.Part][Instance.Lane]; + } + + /// Set a vector value associated with \p Key and \p Part. Assumes such a + /// value is not already set.
If it is, use resetVectorValue() instead. + void setVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); + if (!VectorMapStorage.count(Key)) { + VectorParts Entry(UF); + VectorMapStorage[Key] = Entry; + } + VectorMapStorage[Key][Part] = Vector; + } + + /// Set a scalar value associated with \p Key and \p Instance. Assumes such a + /// value is not already set. + void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) { + assert(!hasScalarValue(Key, Instance) && "Scalar value already set"); + if (!ScalarMapStorage.count(Key)) { + ScalarParts Entry(UF); + // TODO: Consider storing uniform values only per-part, as they occupy + // lane 0 only, keeping the other VF-1 redundant entries null. + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part].resize(VF, nullptr); + ScalarMapStorage[Key] = Entry; + } + ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; + } + + /// Reset the vector value associated with \p Key for the given \p Part. + /// This function can be used to update values that have already been + /// vectorized. This is the case for "fix-up" operations including type + /// truncation and the second phase of recurrence vectorization. + void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(hasVectorValue(Key, Part) && "Vector value not set for part"); + VectorMapStorage[Key][Part] = Vector; + } + + /// Reset the scalar value associated with \p Key for \p Part and \p Lane. + /// This function can be used to update values that have already been + /// scalarized. This is the case for "fix-up" operations including scalar phi + /// nodes for scalarized and predicated instructions. + void resetScalarValue(Value *Key, const VPIteration &Instance, + Value *Scalar) { + assert(hasScalarValue(Key, Instance) && + "Scalar value not set for part and lane"); + ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; + } +}; + +/// VPTransformState holds information passed down when "executing" a VPlan, +/// needed for generating the output IR. +struct VPTransformState { + + VPTransformState(unsigned VF, unsigned UF, class LoopInfo *LI, + class DominatorTree *DT, IRBuilder<> &Builder, + VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV) + : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), + ValueMap(ValueMap), ILV(ILV) {} + + /// The chosen Vectorization and Unroll Factors of the loop being vectorized. + unsigned VF; + unsigned UF; + + /// Hold the indices to generate specific scalar instructions. Null indicates + /// that all instances are to be generated, using either scalar or vector + /// instructions. + Optional Instance; + + /// Hold state information used when constructing the CFG of the output IR, + /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. + struct CFGState { + /// The previous VPBasicBlock visited. Initially set to null. + VPBasicBlock *PrevVPBB; + /// The previous IR BasicBlock created or used. Initially set to the new + /// header BasicBlock. + BasicBlock *PrevBB; + /// The last IR BasicBlock in the output IR. Set to the new latch + /// BasicBlock, used for placing the newly created BasicBlocks. + BasicBlock *LastBB; + /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case + /// of replication, maps the BasicBlock of the last replica created. 
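// A usage sketch of the VectorizerValueMap set/get/reset protocol documented
// earlier (V, V0 and V0Fixed are hypothetical Value pointers):
//
//   VectorizerValueMap VM(/*UF=*/2, /*VF=*/4);
//   VM.setVectorValue(V, /*Part=*/0, V0); // first definition: set
//   Value *X = VM.getVectorValue(V, 0);   // retrieve; asserts presence
//   VM.resetVectorValue(V, 0, V0Fixed);   // fix-ups must reset, not set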
+ SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB; + + CFGState() : PrevVPBB(nullptr), PrevBB(nullptr), LastBB(nullptr) {} + } CFG; + + /// Hold a pointer to LoopInfo to register new basic blocks in the loop. + class LoopInfo *LI; + + /// Hold a pointer to Dominator Tree to register new basic blocks in the loop. + class DominatorTree *DT; + + /// Hold a reference to the IRBuilder used to generate output IR code. + IRBuilder<> &Builder; + + /// Hold a reference to the Value state information used when generating the + /// Values of the output IR. + VectorizerValueMap &ValueMap; + + /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. + class InnerLoopVectorizer *ILV; +}; + +/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. +/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock. +class VPBlockBase { +private: + const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). + + /// An optional name for the block. + std::string Name; + + /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if + /// it is a topmost VPBlockBase. + class VPRegionBlock *Parent; + + /// List of predecessor blocks. + SmallVector<VPBlockBase *, 1> Predecessors; + + /// List of successor blocks. + SmallVector<VPBlockBase *, 1> Successors; + + /// Add \p Successor as the last successor to this block. + void appendSuccessor(VPBlockBase *Successor) { + assert(Successor && "Cannot add nullptr successor!"); + Successors.push_back(Successor); + } + + /// Add \p Predecessor as the last predecessor to this block. + void appendPredecessor(VPBlockBase *Predecessor) { + assert(Predecessor && "Cannot add nullptr predecessor!"); + Predecessors.push_back(Predecessor); + } + + /// Remove \p Predecessor from the predecessors of this block. + void removePredecessor(VPBlockBase *Predecessor) { + auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor); + assert(Pos && "Predecessor does not exist"); + Predecessors.erase(Pos); + } + + /// Remove \p Successor from the successors of this block. + void removeSuccessor(VPBlockBase *Successor) { + auto Pos = std::find(Successors.begin(), Successors.end(), Successor); + assert(Pos && "Successor does not exist"); + Successors.erase(Pos); + } + +protected: + VPBlockBase(const unsigned char SC, const std::string &N) + : SubclassID(SC), Name(N), Parent(nullptr) {} + +public: + /// An enumeration for keeping track of the concrete subclass of VPBlockBase + /// that are actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPBlockBase objects. They are used for concrete + /// type identification. + typedef enum { VPBasicBlockSC, VPRegionBlockSC } VPBlockTy; + + typedef SmallVectorImpl<VPBlockBase *> VPBlocksTy; + + virtual ~VPBlockBase() {} + + const std::string &getName() const { return Name; } + + void setName(const Twine &newName) { Name = newName.str(); } + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPBlockID() const { return SubclassID; } + + const VPRegionBlock *getParent() const { return Parent; } + + void setParent(VPRegionBlock *P) { Parent = P; } + + /// \return the VPBasicBlock that is the entry of this VPBlockBase, + /// recursively, if the latter is a VPRegionBlock. Otherwise, if this + /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getEntryBasicBlock() const; + VPBasicBlock *getEntryBasicBlock(); + + /// \return the VPBasicBlock that is the exit of this VPBlockBase, + /// recursively, if the latter is a VPRegionBlock. Otherwise, if this + /// VPBlockBase is a VPBasicBlock, it is returned. + const VPBasicBlock *getExitBasicBlock() const; + VPBasicBlock *getExitBasicBlock(); + + const VPBlocksTy &getSuccessors() const { return Successors; } + VPBlocksTy &getSuccessors() { return Successors; } + + const VPBlocksTy &getPredecessors() const { return Predecessors; } + VPBlocksTy &getPredecessors() { return Predecessors; } + + /// \return the successor of this VPBlockBase if it has a single successor. + /// Otherwise return a null pointer. + VPBlockBase *getSingleSuccessor() const { + return (Successors.size() == 1 ? *Successors.begin() : nullptr); + } + + /// \return the predecessor of this VPBlockBase if it has a single + /// predecessor. Otherwise return a null pointer. + VPBlockBase *getSinglePredecessor() const { + return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr); + } + + /// An Enclosing Block of a block B is any block containing B, including B + /// itself. \return the closest enclosing block starting from "this", which + /// has successors. \return the root enclosing block if all enclosing blocks + /// have no successors. + VPBlockBase *getEnclosingBlockWithSuccessors(); + + /// \return the closest enclosing block starting from "this", which has + /// predecessors. \return the root enclosing block if all enclosing blocks + /// have no predecessors. + VPBlockBase *getEnclosingBlockWithPredecessors(); + + /// \return the successors either attached directly to this VPBlockBase or, if + /// this VPBlockBase is the exit block of a VPRegionBlock and has no + /// successors of its own, search recursively for the first enclosing + /// VPRegionBlock that has successors and return them. If no such + /// VPRegionBlock exists, return the (empty) successors of the topmost + /// VPBlockBase reached. + const VPBlocksTy &getHierarchicalSuccessors() { + return getEnclosingBlockWithSuccessors()->getSuccessors(); + } + + /// \return the hierarchical successor of this VPBlockBase if it has a single + /// hierarchical successor. Otherwise return a null pointer. + VPBlockBase *getSingleHierarchicalSuccessor() { + return getEnclosingBlockWithSuccessors()->getSingleSuccessor(); + } + + /// \return the predecessors either attached directly to this VPBlockBase or, + /// if this VPBlockBase is the entry block of a VPRegionBlock and has no + /// predecessors of its own, search recursively for the first enclosing + /// VPRegionBlock that has predecessors and return them. If no such + /// VPRegionBlock exists, return the (empty) predecessors of the topmost + /// VPBlockBase reached. + const VPBlocksTy &getHierarchicalPredecessors() { + return getEnclosingBlockWithPredecessors()->getPredecessors(); + } + + /// \return the hierarchical predecessor of this VPBlockBase if it has a + /// single hierarchical predecessor. Otherwise return a null pointer. + VPBlockBase *getSingleHierarchicalPredecessor() { + return getEnclosingBlockWithPredecessors()->getSinglePredecessor(); + } + + /// Sets a given VPBlockBase \p Successor as the single successor and \return + /// \p Successor. The parent of this Block is copied to be the parent of + /// \p Successor. 
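// A small construction sketch using the successor helpers defined below
// (hypothetical blocks forming a diamond):
//
//   VPBasicBlock *A = new VPBasicBlock("A");
//   VPBasicBlock *B = new VPBasicBlock("B");
//   VPBasicBlock *C = new VPBasicBlock("C");
//   VPBasicBlock *D = new VPBasicBlock("D");
//   A->setTwoSuccessors(B, C); // A branches to B (true) and C (false)
//   B->setOneSuccessor(D);
//   C->setOneSuccessor(D);     // D ends up with two predecessors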
+ VPBlockBase *setOneSuccessor(VPBlockBase *Successor) { + assert(Successors.empty() && "Setting one successor when others exist."); + appendSuccessor(Successor); + Successor->appendPredecessor(this); + Successor->Parent = Parent; + return Successor; + } + + /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two + /// successors. The parent of this Block is copied to be the parent of both + /// \p IfTrue and \p IfFalse. + void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) { + assert(Successors.empty() && "Setting two successors when others exist."); + appendSuccessor(IfTrue); + appendSuccessor(IfFalse); + IfTrue->appendPredecessor(this); + IfFalse->appendPredecessor(this); + IfTrue->Parent = Parent; + IfFalse->Parent = Parent; + } + + void disconnectSuccessor(VPBlockBase *Successor) { + assert(Successor && "Successor to disconnect is null."); + removeSuccessor(Successor); + Successor->removePredecessor(this); + } + + /// The method which generates the output IR that corresponds to this + /// VPBlockBase, thereby "executing" the VPlan. + virtual void execute(struct VPTransformState *State) = 0; + + /// Delete all blocks reachable from a given VPBlockBase, inclusive. + static void deleteCFG(VPBlockBase *Entry); +}; + +/// VPRecipeBase is a base class modeling a sequence of one or more output IR +/// instructions. +class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { + friend VPBasicBlock; + +private: + const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). + + /// Each VPRecipe belongs to a single VPBasicBlock. + VPBasicBlock *Parent; + +public: + /// An enumeration for keeping track of the concrete subclass of VPRecipeBase + /// that is actually instantiated. Values of this enumeration are kept in the + /// SubclassID field of the VPRecipeBase objects. They are used for concrete + /// type identification. + typedef enum { + VPBranchOnMaskSC, + VPInterleaveSC, + VPPredInstPHISC, + VPReplicateSC, + VPWidenIntOrFpInductionSC, + VPWidenPHISC, + VPWidenSC, + } VPRecipeTy; + + VPRecipeBase(const unsigned char SC) : SubclassID(SC), Parent(nullptr) {} + + virtual ~VPRecipeBase() {} + + /// \return an ID for the concrete type of this object. + /// This is used to implement the classof checks. This should not be used + /// for any other purpose, as the values may change as LLVM evolves. + unsigned getVPRecipeID() const { return SubclassID; } + + /// \return the VPBasicBlock which this VPRecipe belongs to. + VPBasicBlock *getParent() { return Parent; } + const VPBasicBlock *getParent() const { return Parent; } + + /// The method which generates the output IR instructions that correspond to + /// this VPRecipe, thereby "executing" the VPlan. + virtual void execute(struct VPTransformState &State) = 0; + + /// Each recipe prints itself. + virtual void print(raw_ostream &O, const Twine &Indent) const = 0; +}; + +/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It +/// holds a sequence of zero or more VPRecipe's each representing a sequence of +/// output IR instructions. +class VPBasicBlock : public VPBlockBase { +public: + typedef iplist<VPRecipeBase> RecipeListTy; + +private: + /// The VPRecipes held in the order of output instructions to generate. + RecipeListTy Recipes; + +public: + /// Instruction iterators...
+ typedef RecipeListTy::iterator iterator; + typedef RecipeListTy::const_iterator const_iterator; + typedef RecipeListTy::reverse_iterator reverse_iterator; + typedef RecipeListTy::const_reverse_iterator const_reverse_iterator; + + //===--------------------------------------------------------------------===// + /// Recipe iterator methods + /// + inline iterator begin() { return Recipes.begin(); } + inline const_iterator begin() const { return Recipes.begin(); } + inline iterator end() { return Recipes.end(); } + inline const_iterator end() const { return Recipes.end(); } + + inline reverse_iterator rbegin() { return Recipes.rbegin(); } + inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); } + inline reverse_iterator rend() { return Recipes.rend(); } + inline const_reverse_iterator rend() const { return Recipes.rend(); } + + inline size_t size() const { return Recipes.size(); } + inline bool empty() const { return Recipes.empty(); } + inline const VPRecipeBase &front() const { return Recipes.front(); } + inline VPRecipeBase &front() { return Recipes.front(); } + inline const VPRecipeBase &back() const { return Recipes.back(); } + inline VPRecipeBase &back() { return Recipes.back(); } + + /// \brief Returns a pointer to a member of the recipe list. + static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) { + return &VPBasicBlock::Recipes; + } + + VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr) + : VPBlockBase(VPBasicBlockSC, Name.str()) { + if (Recipe) + appendRecipe(Recipe); + } + + ~VPBasicBlock() { Recipes.clear(); } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPBlockBase *V) { + return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC; + } + + /// Augment the existing recipes of a VPBasicBlock with an additional + /// \p Recipe as the last recipe. + void appendRecipe(VPRecipeBase *Recipe) { + assert(Recipe && "No recipe to append."); + assert(!Recipe->Parent && "Recipe already in VPlan"); + Recipe->Parent = this; + return Recipes.push_back(Recipe); + } + + /// The method which generates the output IR instructions that correspond to + /// this VPBasicBlock, thereby "executing" the VPlan. + void execute(struct VPTransformState *State) override; + +private: + /// Create an IR BasicBlock to hold the output instructions generated by this + /// VPBasicBlock, and return it. Update the CFGState accordingly. + BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG); +}; + +/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks +/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG. +/// A VPRegionBlock may indicate that its contents are to be replicated several +/// times. This is designed to support predicated scalarization, in which a +/// scalar if-then code structure needs to be generated VF * UF times. Having +/// this replication indicator helps to keep a single model for multiple +/// candidate VF's. The actual replication takes place only once the desired VF +/// and UF have been determined. +class VPRegionBlock : public VPBlockBase { +private: + /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock. + VPBlockBase *Entry; + + /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock. + VPBlockBase *Exit; + + /// An indicator whether this region is to generate multiple replicated + /// instances of output IR corresponding to its VPBlockBases. 
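// (A region whose IsReplicator flag is set has its blocks emitted once per
// (Part, Lane) pair - i.e. UF * VF times - by VPRegionBlock::execute in
// VPlan.cpp; a non-replicator region is emitted exactly once.)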
+ bool IsReplicator; + +public: + VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit, + const std::string &Name = "", bool IsReplicator = false) + : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit), + IsReplicator(IsReplicator) { + assert(Entry->getPredecessors().empty() && "Entry block has predecessors."); + assert(Exit->getSuccessors().empty() && "Exit block has successors."); + Entry->setParent(this); + Exit->setParent(this); + } + + ~VPRegionBlock() { + if (Entry) + deleteCFG(Entry); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPBlockBase *V) { + return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC; + } + + const VPBlockBase *getEntry() const { return Entry; } + VPBlockBase *getEntry() { return Entry; } + + const VPBlockBase *getExit() const { return Exit; } + VPBlockBase *getExit() { return Exit; } + + /// An indicator whether this region is to generate multiple replicated + /// instances of output IR corresponding to its VPBlockBases. + bool isReplicator() const { return IsReplicator; } + + /// The method which generates the output IR instructions that correspond to + /// this VPRegionBlock, thereby "executing" the VPlan. + void execute(struct VPTransformState *State) override; +}; + +/// VPlan models a candidate for vectorization, encoding various decisions taken +/// to produce efficient output IR, including which branches, basic-blocks and +/// output IR instructions to generate, and their cost. VPlan holds a +/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry +/// VPBlock. +class VPlan { +private: + /// Hold the single entry to the Hierarchical CFG of the VPlan. + VPBlockBase *Entry; + + /// Holds the VFs applicable to this VPlan. + SmallSet<unsigned, 2> VFs; + + /// Holds the name of the VPlan, for printing. + std::string Name; + +public: + VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {} + + ~VPlan() { + if (Entry) + VPBlockBase::deleteCFG(Entry); + } + + /// Generate the IR code for this VPlan. + void execute(struct VPTransformState *State); + + VPBlockBase *getEntry() { return Entry; } + const VPBlockBase *getEntry() const { return Entry; } + + VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; } + + void addVF(unsigned VF) { VFs.insert(VF); } + + bool hasVF(unsigned VF) { return VFs.count(VF); } + + const std::string &getName() const { return Name; } + + void setName(const Twine &newName) { Name = newName.str(); } + +private: + /// Add to the given dominator tree the header block and every new basic block + /// that was created between it and the latch block, inclusive. + static void updateDominatorTree(class DominatorTree *DT, + BasicBlock *LoopPreHeaderBB, + BasicBlock *LoopLatchBB); +}; + +/// VPlanPrinter prints a given VPlan to a given output stream. The printing is +/// indented and follows the dot format. +class VPlanPrinter { + friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, + const struct VPlanIngredient &I); + +private: + raw_ostream &OS; + VPlan &Plan; + unsigned Depth; + unsigned TabWidth = 2; + std::string Indent; + + unsigned BID = 0; + + SmallDenseMap<const VPBlockBase *, unsigned> BlockID; + + /// Handle indentation. + void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } + + /// Print a given \p Block of the Plan.
+ void dumpBlock(const VPBlockBase *Block); + + /// Print the information related to the CFG edges going out of a given + /// \p Block, followed by printing the successor blocks themselves. + void dumpEdges(const VPBlockBase *Block); + + /// Print a given \p BasicBlock, including its VPRecipes, followed by printing + /// its successor blocks. + void dumpBasicBlock(const VPBasicBlock *BasicBlock); + + /// Print a given \p Region of the Plan. + void dumpRegion(const VPRegionBlock *Region); + + unsigned getOrCreateBID(const VPBlockBase *Block) { + return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++; + } + + const Twine getOrCreateName(const VPBlockBase *Block); + + const Twine getUID(const VPBlockBase *Block); + + /// Print the information related to a CFG edge between two VPBlockBases. + void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, + const Twine &Label); + + VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + + void dump(); + + static void printAsIngredient(raw_ostream &O, Value *V); +}; + +struct VPlanIngredient { + Value *V; + VPlanIngredient(Value *V) : V(V) {} +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { + VPlanPrinter::printAsIngredient(OS, I.V); + return OS; +} + +inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { + VPlanPrinter Printer(OS, Plan); + Printer.dump(); + return OS; +} + +//===--------------------------------------------------------------------===// +// GraphTraits specializations for VPlan/VPRegionBlock Control-Flow Graphs // +//===--------------------------------------------------------------------===// + +// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a +// graph of VPBlockBase nodes... + +template <> struct GraphTraits<VPBlockBase *> { + typedef VPBlockBase *NodeRef; + typedef SmallVectorImpl<VPBlockBase *>::iterator ChildIteratorType; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits<const VPBlockBase *> { + typedef const VPBlockBase *NodeRef; + typedef SmallVectorImpl<VPBlockBase *>::const_iterator ChildIteratorType; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a +// graph of VPBlockBase nodes... and to walk it in inverse order. Inverse order +// for a VPBlockBase is considered to be when traversing the predecessors of a +// VPBlockBase instead of its successors.
+// + +template <> struct GraphTraits<Inverse<VPBlockBase *>> { + typedef VPBlockBase *NodeRef; + typedef SmallVectorImpl<VPBlockBase *>::iterator ChildIteratorType; + + static Inverse<VPBlockBase *> getEntryNode(Inverse<VPBlockBase *> B) { + return B; + } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp @@ -0,0 +1,401 @@ +//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This is the LLVM vectorization plan. It represents a candidate for +/// vectorization, allowing one to plan and optimize how to vectorize a given +/// loop before generating LLVM-IR. +/// The vectorizer uses vectorization plans to estimate the costs of potential +/// candidates and, if profitable, to execute the desired plan, generating +/// vector LLVM-IR code. +/// +//===----------------------------------------------------------------------===// + +#include "VPlan.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "vplan" + +/// \return the VPBasicBlock that is the entry of Block, possibly indirectly. +const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const { + const VPBlockBase *Block = this; + while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getEntry(); + return cast<VPBasicBlock>(Block); +} + +VPBasicBlock *VPBlockBase::getEntryBasicBlock() { + VPBlockBase *Block = this; + while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getEntry(); + return cast<VPBasicBlock>(Block); +} + +/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { + const VPBlockBase *Block = this; + while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getExit(); + return cast<VPBasicBlock>(Block); +} + +VPBasicBlock *VPBlockBase::getExitBasicBlock() { + VPBlockBase *Block = this; + while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) + Block = Region->getExit(); + return cast<VPBasicBlock>(Block); +} + +VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { + if (!Successors.empty() || !Parent) + return this; + assert(Parent->getExit() == this && + "Block w/o successors not the exit of its parent."); + return Parent->getEnclosingBlockWithSuccessors(); +} + +VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { + if (!Predecessors.empty() || !Parent) + return this; + assert(Parent->getEntry() == this && + "Block w/o predecessors not the entry of its parent."); + return Parent->getEnclosingBlockWithPredecessors(); +} + +void VPBlockBase::deleteCFG(VPBlockBase *Entry) { + SmallVector<VPBlockBase *, 8> Blocks; + for (VPBlockBase *Block : depth_first(Entry)) + Blocks.push_back(Block); + + for (VPBlockBase *Block : Blocks) + delete Block; +} + +BasicBlock * +VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { + // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. + // Pred stands for Predecessor. Prev stands for Previous - last visited/created. + BasicBlock *PrevBB = CFG.PrevBB; + BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(), + PrevBB->getParent(), CFG.LastBB); + DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); + + // Hook up the new basic block to its predecessors. + for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { + VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); + auto &PredVPSuccessors = PredVPBB->getSuccessors(); + BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; + assert(PredBB && "Predecessor basic-block not found building successor."); + auto *PredBBTerminator = PredBB->getTerminator(); + DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n'); + if (isa<UnreachableInst>(PredBBTerminator)) { + assert(PredVPSuccessors.size() == 1 && + "Predecessor ending w/o branch must have single successor."); + PredBBTerminator->eraseFromParent(); + BranchInst::Create(NewBB, PredBB); + } else { + assert(PredVPSuccessors.size() == 2 && + "Predecessor ending with branch must have two successors."); + unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; + assert(!PredBBTerminator->getSuccessor(idx) && + "Trying to reset an existing successor block."); + PredBBTerminator->setSuccessor(idx, NewBB); + } + } + return NewBB; +} + +void VPBasicBlock::execute(VPTransformState *State) { + bool Replica = State->Instance && + !(State->Instance->Part == 0 && State->Instance->Lane == 0); + VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB; + VPBlockBase *SingleHPred = nullptr; + BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. + + // 1. Create an IR basic block, or reuse the last one if possible. + // The last IR basic block is reused, as an optimization, in three cases: + // A. the first VPBB reuses the loop header BB - when PrevVPBB is null; + // B. when the current VPBB has a single (hierarchical) predecessor which + // is PrevVPBB and the latter has a single (hierarchical) successor; and + // C. when the current VPBB is an entry of a region replica - where PrevVPBB + // is the exit of this region from a previous instance, or the predecessor + // of this region.
+ if (PrevVPBB && /* A */ + !((SingleHPred = getSingleHierarchicalPredecessor()) && + SingleHPred->getExitBasicBlock() == PrevVPBB && + PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ + !(Replica && getPredecessors().empty())) { /* C */ + + NewBB = createEmptyBasicBlock(State->CFG); + State->Builder.SetInsertPoint(NewBB); + // Temporarily terminate with unreachable until CFG is rewired. + UnreachableInst *Terminator = State->Builder.CreateUnreachable(); + State->Builder.SetInsertPoint(Terminator); + // Register NewBB in its loop. In innermost loops it's the same for all BB's. + Loop *L = State->LI->getLoopFor(State->CFG.LastBB); + L->addBasicBlockToLoop(NewBB, *State->LI); + State->CFG.PrevBB = NewBB; + } + + // 2. Fill the IR basic block with IR instructions. + DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() + << " in BB:" << NewBB->getName() << '\n'); + + State->CFG.VPBB2IRBB[this] = NewBB; + State->CFG.PrevVPBB = this; + + for (VPRecipeBase &Recipe : Recipes) + Recipe.execute(*State); + + DEBUG(dbgs() << "LV: filled BB:" << *NewBB); +} + +void VPRegionBlock::execute(VPTransformState *State) { + ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); + + if (!isReplicator()) { + // Visit the VPBlocks connected to "this", starting from it. + for (VPBlockBase *Block : RPOT) { + DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + Block->execute(State); + } + return; + } + + assert(!State->Instance && "Replicating a Region with non-null instance."); + + // Enter replicating mode. + State->Instance = {0, 0}; + + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { + State->Instance->Part = Part; + for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { + State->Instance->Lane = Lane; + // Visit the VPBlocks connected to \p this, starting from it. + for (VPBlockBase *Block : RPOT) { + DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + Block->execute(State); + } + } + } + + // Exit replicating mode. + State->Instance.reset(); +} + +/// Generate the code inside the body of the vectorized loop. Assumes a single +/// LoopVectorBody basic-block was created for this. Introduce additional +/// basic-blocks as needed, and fill them all. +void VPlan::execute(VPTransformState *State) { + BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; + BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); + assert(VectorHeaderBB && "Loop preheader does not have a single successor."); + BasicBlock *VectorLatchBB = VectorHeaderBB; + + // 1. Make room to generate basic-blocks inside loop body if needed. + VectorLatchBB = VectorHeaderBB->splitBasicBlock( + VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); + Loop *L = State->LI->getLoopFor(VectorHeaderBB); + L->addBasicBlockToLoop(VectorLatchBB, *State->LI); + // Remove the edge between Header and Latch to allow other connections. + // Temporarily terminate with unreachable until CFG is rewired. + // Note: this asserts the generated code's assumption that + // getFirstInsertionPt() can be dereferenced into an Instruction. + VectorHeaderBB->getTerminator()->eraseFromParent(); + State->Builder.SetInsertPoint(VectorHeaderBB); + UnreachableInst *Terminator = State->Builder.CreateUnreachable(); + State->Builder.SetInsertPoint(Terminator); + + // 2. Generate code in loop body. + State->CFG.PrevVPBB = nullptr; + State->CFG.PrevBB = VectorHeaderBB; + State->CFG.LastBB = VectorLatchBB; + + for (VPBlockBase *Block : depth_first(Entry)) + Block->execute(State); + + // 3.
Merge the temporary latch created with the last basic-block filled. + BasicBlock *LastBB = State->CFG.PrevBB; + // Connect LastBB to VectorLatchBB to facilitate their merge. + assert(isa<UnreachableInst>(LastBB->getTerminator()) && + "Expected VPlan CFG to terminate with unreachable"); + LastBB->getTerminator()->eraseFromParent(); + BranchInst::Create(VectorLatchBB, LastBB); + + // Merge LastBB with Latch. + bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); + (void)Merged; + assert(Merged && "Could not merge last basic block with latch."); + VectorLatchBB = LastBB; + + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); +} + +void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, + BasicBlock *LoopLatchBB) { + BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); + assert(LoopHeaderBB && "Loop preheader does not have a single successor."); + DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); + // The vector body may be more than a single basic-block by this point. + // Update the dominator tree information inside the vector body by propagating + // it from header to latch, expecting only triangular control-flow, if any. + BasicBlock *PostDomSucc = nullptr; + for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) { + // Get the list of successors of this block. + std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB)); + assert(Succs.size() <= 2 && + "Basic block in vector loop has more than 2 successors."); + PostDomSucc = Succs[0]; + if (Succs.size() == 1) { + assert(PostDomSucc->getSinglePredecessor() && + "PostDom successor has more than one predecessor."); + DT->addNewBlock(PostDomSucc, BB); + continue; + } + BasicBlock *InterimSucc = Succs[1]; + if (PostDomSucc->getSingleSuccessor() == InterimSucc) { + PostDomSucc = Succs[1]; + InterimSucc = Succs[0]; + } + assert(InterimSucc->getSingleSuccessor() == PostDomSucc && + "One successor of a basic block does not lead to the other."); + assert(InterimSucc->getSinglePredecessor() && + "Interim successor has more than one predecessor."); + assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 && + "PostDom successor has more than two predecessors."); + DT->addNewBlock(InterimSucc, BB); + DT->addNewBlock(PostDomSucc, BB); + } +} + +const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { + return (isa<VPRegionBlock>(Block) ? 
"cluster_N" : "N") + + Twine(getOrCreateBID(Block)); +} + +const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) { + const std::string &Name = Block->getName(); + if (!Name.empty()) + return Name; + return "VPB" + Twine(getOrCreateBID(Block)); +} + +void VPlanPrinter::dump() { + Depth = 1; + bumpIndent(0); + OS << "digraph VPlan {\n"; + OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; + if (!Plan.getName().empty()) + OS << "\\n" << DOT::EscapeString(Plan.getName()); + OS << "\"]\n"; + OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; + OS << "edge [fontname=Courier, fontsize=30]\n"; + OS << "compound=true\n"; + + for (VPBlockBase *Block : depth_first(Plan.getEntry())) + dumpBlock(Block); + + OS << "}\n"; +} + +void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { + if (const VPBasicBlock *BasicBlock = dyn_cast(Block)) + dumpBasicBlock(BasicBlock); + else if (const VPRegionBlock *Region = dyn_cast(Block)) + dumpRegion(Region); + else + llvm_unreachable("Unsupported kind of VPBlock."); +} + +void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, + bool Hidden, const Twine &Label) { + // Due to "dot" we print an edge between two regions as an edge between the + // exit basic block and the entry basic of the respective regions. + const VPBlockBase *Tail = From->getExitBasicBlock(); + const VPBlockBase *Head = To->getEntryBasicBlock(); + OS << Indent << getUID(Tail) << " -> " << getUID(Head); + OS << " [ label=\"" << Label << '\"'; + if (Tail != From) + OS << " ltail=" << getUID(From); + if (Head != To) + OS << " lhead=" << getUID(To); + if (Hidden) + OS << "; splines=none"; + OS << "]\n"; +} + +void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { + auto &Successors = Block->getSuccessors(); + if (Successors.size() == 1) + drawEdge(Block, Successors.front(), false, ""); + else if (Successors.size() == 2) { + drawEdge(Block, Successors.front(), false, "T"); + drawEdge(Block, Successors.back(), false, "F"); + } else { + unsigned SuccessorNumber = 0; + for (auto *Successor : Successors) + drawEdge(Block, Successor, false, Twine(SuccessorNumber++)); + } +} + +void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { + OS << Indent << getUID(BasicBlock) << " [label =\n"; + bumpIndent(1); + OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; + bumpIndent(1); + for (const VPRecipeBase &Recipe : *BasicBlock) + Recipe.print(OS, Indent); + bumpIndent(-2); + OS << "\n" << Indent << "]\n"; + dumpEdges(BasicBlock); +} + +void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { + OS << Indent << "subgraph " << getUID(Region) << " {\n"; + bumpIndent(1); + OS << Indent << "fontname=Courier\n" + << Indent << "label=\"" + << DOT::EscapeString(Region->isReplicator() ? " " : " ") + << DOT::EscapeString(Region->getName()) << "\"\n"; + // Dump the blocks of the region. 
+ assert(Region->getEntry() && "Region contains no inner blocks."); + for (const VPBlockBase *Block : depth_first(Region->getEntry())) + dumpBlock(Block); + bumpIndent(-1); + OS << Indent << "}\n"; + dumpEdges(Region); +} + +void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { + std::string IngredientString; + raw_string_ostream RSO(IngredientString); + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (!Inst->getType()->isVoidTy()) { + Inst->printAsOperand(RSO, false); + RSO << " = "; + } + RSO << Inst->getOpcodeName() << " "; + unsigned E = Inst->getNumOperands(); + if (E > 0) { + Inst->getOperand(0)->printAsOperand(RSO, false); + for (unsigned I = 1; I < E; ++I) + Inst->getOperand(I)->printAsOperand(RSO << ", ", false); + } + } else // !Inst + V->printAsOperand(RSO, false); + RSO.flush(); + O << DOT::EscapeString(IngredientString); +} Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -26,9 +26,9 @@ ; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]] ; CHECK: [[PRED_UDIV_IF]]: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x -; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] ; CHECK: [[PRED_UDIV_CONTINUE]]: @@ -37,9 +37,9 @@ ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x -; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_CONTINUE2]]: Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -18,8 +18,8 @@ ; Cost of udiv: ; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 ; -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { entry: @@ -59,8 +59,8 @@ ; Cost of store: ; (store(4) + extractelement(3)) / 2 = 3 ; -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* 
%tmp0, align 4 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -98,10 +98,10 @@ ; Cost of udiv: ; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4 ; -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -143,10 +143,10 @@ ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -192,16 +192,16 @@ ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 ; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x ; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x ; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 ; define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) { entry: Index: llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll +++ llvm/trunk/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -24,10 +24,10 @@ for.end: ret void -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 
For instruction: store i32 %tmp2, i32* %tmp0, align 4 - ; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 ; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 + +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 } Index: llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ llvm/trunk/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -467,13 +467,6 @@ ; SINK-AFTER: %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32> ; SINK-AFTER: %[[VCONV3:.+]] = sext <4 x i16> %wide.load to <4 x i32> ; SINK-AFTER: mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]] -; Check also that the sext sank after the load in the scalar loop. -; SINK-AFTER: for.body -; SINK-AFTER: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ %[[LOAD:.+]], %for.body ] -; SINK-AFTER: %[[LOAD]] = load i16, i16* %arrayidx2 -; SINK-AFTER: %[[CONV:.+]] = sext i16 %scalar.recur to i32 -; SINK-AFTER: %[[CONV3:.+]] = sext i16 %[[LOAD]] to i32 -; SINK-AFTER: %mul = mul nsw i32 %[[CONV3]], %[[CONV]] ; define void @sink_after(i16* %a, i32* %b, i64 %n) { entry: Index: llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ llvm/trunk/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -209,9 +209,9 @@ ; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]] ; CHECK: [[IF0]]: ; CHECK: %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0 -; CHECK: %[[T01:.+]] = extractelement <2 x i32> %wide.load, i32 0 -; CHECK: %[[T02:.+]] = add nsw i32 %[[T01]], %x -; CHECK: %[[T03:.+]] = udiv i32 %[[T00]], %[[T02]] +; CHECK: %[[T01:.+]] = add nsw i32 %[[T00]], %x +; CHECK: %[[T02:.+]] = extractelement <2 x i32> %wide.load, i32 0 +; CHECK: %[[T03:.+]] = udiv i32 %[[T02]], %[[T01]] ; CHECK: %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0 ; CHECK: br label %[[CONT0]] ; CHECK: [[CONT0]]: @@ -219,9 +219,9 @@ ; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]] ; CHECK: [[IF1]]: ; CHECK: %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1 -; CHECK: %[[T07:.+]] = extractelement <2 x i32> %wide.load, i32 1 -; CHECK: %[[T08:.+]] = add nsw i32 %[[T07]], %x -; CHECK: %[[T09:.+]] = udiv i32 %[[T06]], %[[T08]] +; CHECK: %[[T07:.+]] = add nsw i32 %[[T06]], %x +; CHECK: %[[T08:.+]] = extractelement <2 x i32> %wide.load, i32 1 +; CHECK: %[[T09:.+]] = udiv i32 %[[T08]], %[[T07]] ; CHECK: %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1 ; CHECK: br label %[[CONT1]] ; CHECK: [[CONT1]]:
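Note: VPlan::execute above is only half of the handshake; its driver lives on the LoopVectorize.cpp side of this patch, which creates the loop skeleton, seeds State.CFG.PrevBB with the new preheader, and then runs the plan. The following is a minimal sketch of that driver, not verbatim from the patch: the BestVF/BestUF members, the single stored Plan, and the exact field order of the VPTransformState initializer are assumptions for illustration.

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // 1. Create a new empty loop. Unlink the old loop and connect the new one;
  //    the returned preheader is where VPlan::execute starts generating code
  //    (see createVectorizedLoopSkeleton, which now returns that block).
  //    Note: BestVF/BestUF/Plan are assumed members of the planner.
  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder,
                         ILV.VectorLoopValueMap, &ILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();

  // 2. Copy and widen instructions from the old loop into the new loop:
  //    executing the plan visits every VPBasicBlock in the hierarchical CFG
  //    and runs each of its recipes against State.
  Plan->execute(&State);

  // 3. Fix the vectorized code: header phi's, live-outs, and more.
  ILV.fixVectorizedLoop();
}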