Index: lib/Transforms/Vectorize/CMakeLists.txt =================================================================== --- lib/Transforms/Vectorize/CMakeLists.txt +++ lib/Transforms/Vectorize/CMakeLists.txt @@ -3,6 +3,7 @@ LoopVectorize.cpp SLPVectorizer.cpp Vectorize.cpp + VPlan.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -47,6 +47,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopVectorize.h" +#include "VPlan.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" @@ -98,6 +99,7 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Vectorize.h" #include +#include #include #include @@ -250,6 +252,10 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +class VPInterleaveRecipe; +class VPReplicateRecipe; +class VPWidenIntOrFpInductionRecipe; +class VPWidenRecipe; /// Returns true if the given loop body has a cycle, excluding the loop /// itself. @@ -362,6 +368,9 @@ : ConstantFP::get(Ty, C); } +} // end anonymous namespace + +namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -393,10 +402,11 @@ AddedSafetyChecks(false) {} /// Create a new empty loop. Unlink the old loop and connect the new one. - void createVectorizedLoopSkeleton(); + /// Return the pre-header block of the new loop. + BasicBlock *createVectorizedLoopSkeleton(); - /// Vectorize a single instruction within the innermost loop. - void vectorizeInstruction(Instruction &I); + /// Widen a single instruction within the innermost loop. + void widenInstruction(Instruction &I); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -406,15 +416,72 @@ virtual ~InnerLoopVectorizer() {} -protected: - /// A small list of PHINodes. - typedef SmallVector PhiVector; - /// A type for vectorized values in the new loop. Each value from the /// original loop, when vectorized, is represented by UF vector values in the /// new unrolled loop, where UF is the unroll factor. typedef SmallVector VectorParts; + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. + VectorParts createBlockInMask(BasicBlock *BB); + + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + + /// A helper function to scalarize a single Instruction in the innermost loop. + /// Generates a sequence of scalar instances for each lane between \p MinLane + /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, + /// inclusive.. + void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, + bool IfPredicateInstr); + + /// Widen an integer or floating-point induction variable \p IV. 
If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + + /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a + /// vector or scalar value on-demand if one is not yet available. When + /// vectorizing a loop, we visit the definition of an instruction before its + /// uses. When visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. + /// + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part. If the value has already been vectorized, + /// the corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// a new vector value on-demand by inserting the scalar values into a vector + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into a vector. + Value *getOrCreateVectorValue(Value *V, unsigned Part); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll and vector indices \p Instance. If the value has been + /// vectorized but not scalarized, the necessary extractelement instruction + /// will be generated. + Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); + + /// Construct the vector value of a scalarized value \p V one lane at a time. + void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); + + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + +protected: + /// A small list of PHINodes. + typedef SmallVector PhiVector; + /// A type for scalarized values in the new loop. Each value from the /// original loop, when scalarized, is represented by UF x VF scalar values /// in the new unrolled loop, where UF is the unroll factor and VF is the @@ -457,37 +524,18 @@ /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); - /// Predicate conditional instructions that require predication on their - /// respective conditions. - void predicateInstructions(); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - VectorParts createBlockInMask(BasicBlock *BB); /// A helper function that computes the predicate of the edge between SRC /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// Vectorize a single PHINode in a block. This method handles the induction - /// variable canonicalization. 
It supports both VF = 1 for unrolled loops and - /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); - /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); - /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each - /// scalarized instruction behind an if block predicated on the control - /// dependence of the instruction. - void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false); - /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -521,11 +569,6 @@ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Instruction *EntryVal); - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); - /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. bool shouldScalarizeInstruction(Instruction *I) const; @@ -533,38 +576,6 @@ /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a - /// vector or scalar value on-demand if one is not yet available. When - /// vectorizing a loop, we visit the definition of an instruction before its - /// uses. When visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. - /// - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part. If the value has already been vectorized, - /// the corresponding vector entry in VectorLoopValueMap is returned. If, - /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// a new vector value on-demand by inserting the scalar values into a vector - /// with an insertelement sequence. If the value has been neither vectorized - /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into a vector. - Value *getOrCreateVectorValue(Value *V, unsigned Part); - - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part and vector index \p Lane. If the value has - /// been vectorized but not scalarized, the necessary extractelement - /// instruction will be generated. - Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane); - - /// Try to vectorize the interleaved access group that \p Instr belongs to. - void vectorizeInterleaveGroup(Instruction *Instr); - /// Generate a shuffle sequence that will reverse the vector Vec. 
virtual Value *reverseVector(Value *Vec); @@ -605,127 +616,6 @@ /// the instruction. void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); - /// This is a helper class for maintaining vectorization state. It's used for - /// mapping values from the original loop to their corresponding values in - /// the new loop. Two mappings are maintained: one for vectorized values and - /// one for scalarized values. Vectorized values are represented with UF - /// vector values in the new loop, and scalarized values are represented with - /// UF x VF scalar values in the new loop. UF and VF are the unroll and - /// vectorization factors, respectively. - /// - /// Entries can be added to either map with setVectorValue and setScalarValue, - /// which assert that an entry was not already added before. If an entry is to - /// replace an existing one, call resetVectorValue. This is currently needed - /// to modify the mapped values during "fix-up" operations that occur once the - /// first phase of widening is complete. These operations include type - /// truncation and the second phase of recurrence widening. - /// - /// Entries from either map can be retrieved using the getVectorValue and - /// getScalarValue functions, which assert that the desired value exists. - - struct ValueMap { - - /// Construct an empty map with the given unroll and vectorization factors. - ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} - - /// \return True if the map has any vector entry for \p Key. - bool hasAnyVectorValue(Value *Key) const { - return VectorMapStorage.count(Key); - } - - /// \return True if the map has a vector entry for \p Key and \p Part. - bool hasVectorValue(Value *Key, unsigned Part) const { - assert(Part < UF && "Queried Vector Part is too large."); - if (!hasAnyVectorValue(Key)) - return false; - const VectorParts &Entry = VectorMapStorage.find(Key)->second; - assert(Entry.size() == UF && "VectorParts has wrong dimensions."); - return Entry[Part] != nullptr; - } - - /// \return True if the map has any scalar entry for \p Key. - bool hasAnyScalarValue(Value *Key) const { - return ScalarMapStorage.count(Key); - } - - /// \return True if the map has a scalar entry for \p Key, \p Part and - /// \p Part. - bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const { - assert(Part < UF && "Queried Scalar Part is too large."); - assert(Lane < VF && "Queried Scalar Lane is too large."); - if (!hasAnyScalarValue(Key)) - return false; - const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; - assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions."); - return Entry[Part][Lane] != nullptr; - } - - /// Retrieve the existing vector value that corresponds to \p Key and - /// \p Part. - Value *getVectorValue(Value *Key, unsigned Part) { - assert(hasVectorValue(Key, Part) && "Getting non-existent value."); - return VectorMapStorage[Key][Part]; - } - - /// Retrieve the existing scalar value that corresponds to \p Key, \p Part - /// and \p Lane. - Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) { - assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value."); - return ScalarMapStorage[Key][Part][Lane]; - } - - /// Set a vector value associated with \p Key and \p Part. Assumes such a - /// value is not already set. If it is, use resetVectorValue() instead. 
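// Stand-alone sketch of the two-level mapping described above: each original
// value is cached as UF "vector" entries and UF x VF "scalar" entries, and an
// already-set vector entry may only be overwritten through an explicit reset.
// Simplified model with hypothetical names and std types, not the
// VectorizerValueMap this patch actually uses.
#include <cassert>
#include <map>
#include <vector>

struct SimpleValueMap {
  unsigned UF, VF;
  // Key -> one entry per unroll part.
  std::map<const void *, std::vector<const void *>> VectorEntries;
  // Key -> one entry per (part, lane).
  std::map<const void *, std::vector<std::vector<const void *>>> ScalarEntries;

  SimpleValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}

  bool hasVector(const void *Key, unsigned Part) const {
    auto It = VectorEntries.find(Key);
    return It != VectorEntries.end() && It->second[Part] != nullptr;
  }

  void setVector(const void *Key, unsigned Part, const void *V) {
    assert(!hasVector(Key, Part) && "use resetVector to overwrite");
    auto &Entry = VectorEntries[Key];
    Entry.resize(UF, nullptr);
    Entry[Part] = V;
  }

  // Mirrors resetVectorValue: the only way to replace an existing entry,
  // e.g. during fix-up operations after the first widening phase.
  void resetVector(const void *Key, unsigned Part, const void *V) {
    assert(hasVector(Key, Part) && "nothing to reset");
    VectorEntries[Key][Part] = V;
  }

  void setScalar(const void *Key, unsigned Part, unsigned Lane, const void *V) {
    auto &Entry = ScalarEntries[Key];
    if (Entry.empty())
      Entry.assign(UF, std::vector<const void *>(VF, nullptr));
    Entry[Part][Lane] = V;
  }
};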
- void setVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); - if (!VectorMapStorage.count(Key)) { - VectorParts Entry(UF); - VectorMapStorage[Key] = Entry; - } - VectorMapStorage[Key][Part] = Vector; - } - - /// Set a scalar value associated with \p Key for \p Part and \p Lane. - /// Assumes such a value is not already set. - void setScalarValue(Value *Key, unsigned Part, unsigned Lane, - Value *Scalar) { - assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set"); - if (!ScalarMapStorage.count(Key)) { - ScalarParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); - // TODO: Consider storing uniform values only per-part, as they occupy - // lane 0 only, keeping the other VF-1 redundant entries null. - ScalarMapStorage[Key] = Entry; - } - ScalarMapStorage[Key][Part][Lane] = Scalar; - } - - /// Reset the vector value associated with \p Key for the given \p Part. - /// This function can be used to update values that have already been - /// vectorized. This is the case for "fix-up" operations including type - /// truncation and the second phase of recurrence vectorization. - void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(hasVectorValue(Key, Part) && "Vector value not set for part"); - VectorMapStorage[Key][Part] = Vector; - } - - private: - /// The unroll factor. Each entry in the vector map contains UF vector - /// values. - unsigned UF; - - /// The vectorization factor. Each entry in the scalar map contains UF x VF - /// scalar values. - unsigned VF; - - /// The vector and scalar map storage. We use std::map and not DenseMap - /// because insertions to DenseMap invalidate its iterators. - std::map VectorMapStorage; - std::map ScalarMapStorage; - }; - /// The original loop. Loop *OrigLoop; /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies @@ -758,7 +648,6 @@ /// vector elements. unsigned VF; -protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -792,11 +681,10 @@ /// vectorized loop. A key value can map to either vector values, scalar /// values or both kinds of values, depending on whether the key was /// vectorized and scalarized. - ValueMap VectorLoopValueMap; + VectorizerValueMap VectorLoopValueMap; - /// Store instructions that should be predicated, as a pair - /// - SmallVector, 4> PredicatedInstructions; + /// Store instructions that were predicated. + SmallVector PredicatedInstructions; EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; /// Trip count of the original loop. @@ -816,6 +704,8 @@ // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap IVEndValues; + + friend class LoopVectorizationPlanner; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -831,7 +721,6 @@ UnrollFactor, LVL, CM) {} private: - void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps Opcode = @@ -908,6 +797,10 @@ } } +} // namespace llvm + +namespace { + /// \brief The group of interleaved loads/stores sharing the same stride and /// close to each other. 
/// @@ -1920,6 +1813,7 @@ /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. bool isProfitableToScalarize(Instruction *I, unsigned VF) const { + assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); @@ -2033,6 +1927,22 @@ return Legal->isInductionVariable(Op); } + /// Collects the instructions to scalarize for each predicated instruction in + /// the loop. + void collectInstsToScalarize(unsigned VF); + + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. + if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); + } + private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. @@ -2134,10 +2044,6 @@ int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, unsigned VF); - /// Collects the instructions to scalarize for each predicated instruction in - /// the loop. - void collectInstsToScalarize(unsigned VF); - /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in /// the vectorized loop corresponding to each vector iteration. Examples of @@ -2156,18 +2062,6 @@ /// iteration of the original scalar loop. void collectLoopScalars(unsigned VF); - /// Collect Uniform and Scalar values for the given \p VF. - /// The sets depend on CM decision for Load/Store instructions - /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { - // Do the analysis once. - if (VF == 1 || Uniforms.count(VF)) - return; - setCostBasedWideningDecision(VF); - collectLoopUniforms(VF); - collectLoopScalars(VF); - } - /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. typedef DenseMap, @@ -2205,23 +2099,71 @@ SmallPtrSet VecValuesToIgnore; }; +} // end anonymous namespace + +namespace llvm { +/// InnerLoopVectorizer vectorizes loops which contain only one basic /// LoopVectorizationPlanner - drives the vectorization process after having /// passed Legality checks. +/// The planner builds and optimizes the Vectorization Plans which record the +/// decisions how to vectorize the given loop. In particular, represent the +/// control-flow of the vectorized version, the replication of instructions that +/// are to be scalarized, and interleave access groups. class LoopVectorizationPlanner { + /// The loop that we evaluate. + Loop *OrigLoop; + + /// Loop Info analysis. + LoopInfo *LI; + + /// Target Library Info. + const TargetLibraryInfo *TLI; + + /// Target Transform Info. + const TargetTransformInfo *TTI; + + /// The legality analysis. + LoopVectorizationLegality *Legal; + + /// The profitablity analysis. 
+ LoopVectorizationCostModel &CM; + + SmallVector VPlans; + + unsigned BestVF; + unsigned BestUF; + public: - LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, + LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM) - : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + BestVF(0), BestUF(0) {} - ~LoopVectorizationPlanner() {} + ~LoopVectorizationPlanner() { + while (!VPlans.empty()) { + VPlan *Plan = VPlans.back(); + VPlans.pop_back(); + delete Plan; + } + } /// Plan how to best vectorize, return the best VF and its cost. LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); - /// Generate the IR code for the vectorized loop. - void executePlan(InnerLoopVectorizer &ILV); + /// Finalize the best decision and dispose of all other VPlans. + void setBestPlan(unsigned VF, unsigned UF); + + /// Generate the IR code for the body of the vectorized loop according to the + /// best selected VPlan. + void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); + + void printPlans(raw_ostream &O) { + for (VPlan *Plan : VPlans) + O << *Plan; + } protected: /// Collect the instructions from the original loop that would be trivially @@ -2229,20 +2171,77 @@ void collectTriviallyDeadInstructions( SmallPtrSetImpl &DeadInstructions); -private: - /// The loop that we evaluate. - Loop *OrigLoop; + /// A range of powers-of-2 vectorization factors with fixed start and + /// adjustable end. The range includes start and excludes end, e.g.,: + /// [1, 9) = {1, 2, 4, 8} + struct VFRange { + const unsigned Start; // A power of 2. + unsigned End; // Need not be a power of 2. If End <= Start range is empty. + }; - /// Loop Info analysis. - LoopInfo *LI; + /// Test a \p Predicate on a \p Range of VF's. Return the value of applying + /// \p Predicate on Range.Start, possibly decreasing Range.End such that the + /// returned value holds for the entire \p Range. + bool getDecisionAndClampRange(const std::function &Predicate, + VFRange &Range); + + /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, + /// according to the information gathered by Legal when it checked if it is + /// legal to vectorize the loop. + void buildVPlans(unsigned MinVF, unsigned MaxVF); - /// The legality analysis. - LoopVectorizationLegality *Legal; - - /// The profitablity analysis. - LoopVectorizationCostModel &CM; +private: + /// Check if \I belongs to an Interleave Group within the given VF \p Range, + /// \return true in the first returned value if so and false otherwise. + /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG + /// for \p Range.Start, and provide it as the second returned value. + /// Note that if \I is an adjunct member of an IG for \p Range.Start, the + /// \return value is , as it is handled by another recipe. + /// \p Range.End may be decreased to ensure same decision from \p Range.Start + /// to \p Range.End. + VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); + + /// Check if an induction recipe should be constructed for \I within the given + /// VF \p Range. If so build and return it. If not, return null. \p Range.End + /// may be decreased to ensure same decision from \p Range.Start to + /// \p Range.End. 
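// Rough usage sketch of the planner interface declared above. This is an
// assumption about how the vectorizer pass is expected to drive the planner;
// the actual call site is not part of this hunk, and OptForSize, UserVF and
// the interleave count IC are placeholders.
//
//   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
//   // Build candidate VPlans and pick the best vectorization factor.
//   LoopVectorizationCostModel::VectorizationFactor VF =
//       LVP.plan(OptForSize, UserVF);
//   // Commit to one (VF, UF) pair; all other VPlans are discarded.
//   LVP.setBestPlan(VF.Width, IC);
//   // Emit the vector loop body by executing the single remaining VPlan.
//   LVP.executePlan(LB, DT);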
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, + VFRange &Range); + + /// Check if \I can be widened within the given VF \p Range. If \I can be + /// widened for Range.Start, extend \p LastWidenRecipe to include \p I if + /// possible or else build a new VPWidenRecipe for it, and return the + /// VPWidenRecipe that includes \p I. If \p I cannot be widened for + /// Range.Start \return null. Range.End may be decreased to ensure same + /// decision from \p Range.Start to \p Range.End. + VPWidenRecipe *tryToWiden(Instruction *I, VPWidenRecipe *LastWidenRecipe, + VFRange &Range); + + /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it + /// is predicated. \return \p VPBB augmented with this new recipe if \p I is + /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new + /// Region. Update the packing decision of predicated instructions if they + /// feed \p I. Range.End may be decreased to ensure same recipe behavior from + /// \p Range.Start to \p Range.End. + VPBasicBlock *handleReplication( + Instruction *I, VFRange &Range, VPBasicBlock *VPBB, + DenseMap &PredInst2Recipe); + + /// Create a replicating region for instruction \p I that requires + /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. + VPRegionBlock *createReplicateRegion(Instruction *I, + VPRecipeBase *PredRecipe); + + /// Build a VPlan according to the information gathered by Legal. \return a + /// VPlan for vectorization factors \p Range.Start and up to \p Range.End + /// exclusive, possibly decreasing \p Range.End. + VPlan *buildVPlan(VFRange &Range); }; +} // namespace llvm + +namespace { + /// \brief This holds vectorization requirements that must be verified late in /// the process. The requirements are set by legalize and costmodel. Once /// vectorization has been determined to be possible and profitable the @@ -2663,15 +2662,15 @@ // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 : VF; - + Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 + : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); + VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); } } } @@ -2709,7 +2708,7 @@ // vector form, we will construct the vector values on demand. if (VectorLoopValueMap.hasAnyScalarValue(V)) { - Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); + Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); // If we've scalarized a value, that value should be an instruction. auto *I = cast(V); @@ -2726,8 +2725,8 @@ // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; - auto *LastInst = - cast(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); + auto *LastInst = cast( + VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); // Set the insert point after the last scalarized instruction. 
This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -2744,14 +2743,15 @@ Value *VectorValue = nullptr; if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(ScalarValue); + VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); + // Initialize packing with insertelements to start from undef. + Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); + VectorLoopValueMap.setVectorValue(V, Part, Undef); for (unsigned Lane = 0; Lane < VF; ++Lane) - VectorValue = Builder.CreateInsertElement( - VectorValue, getOrCreateScalarValue(V, Part, Lane), - Builder.getInt32(Lane)); + packScalarIntoVectorValue(V, {Part, Lane}); + VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } - VectorLoopValueMap.setVectorValue(V, Part, VectorValue); Builder.restoreIP(OldIP); return VectorValue; } @@ -2763,28 +2763,29 @@ return B; } -Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, - unsigned Lane) { - +Value * +InnerLoopVectorizer::getOrCreateScalarValue(Value *V, + const VPIteration &Instance) { // If the value is not an instruction contained in the loop, it should // already be scalar. if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast(V), VF) - : true && "Uniform values only have lane zero"); + assert(Instance.Lane > 0 + ? !Cost->isUniformAfterVectorization(cast(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) - return VectorLoopValueMap.getScalarValue(V, Part, Lane); + if (VectorLoopValueMap.hasScalarValue(V, Instance)) + return VectorLoopValueMap.getScalarValue(V, Instance); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the // vectorization factor is one), there is no need to generate an // extractelement instruction. - auto *U = getOrCreateVectorValue(V, Part); + auto *U = getOrCreateVectorValue(V, Instance.Part); if (!U->getType()->isVectorTy()) { assert(VF == 1 && "Value not scalarized has non-vector type"); return U; @@ -2793,7 +2794,20 @@ // Otherwise, the value from the original loop has been vectorized and is // represented by UF vector values. Extract and return the requested scalar // value from the appropriate vector lane. 
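// Stand-alone sketch of the on-demand conversions described above, using
// std::vector in place of IR vectors (illustrative only, not LLVM API): a
// value that exists only as per-lane scalars is packed lane by lane when a
// vector use appears, and a vectorized value is unpacked with a per-lane
// extract when a scalar use appears.
#include <vector>

using Lane = double;
using Vec = std::vector<Lane>;

// Scalar definitions exist per lane -> build the vector use on demand,
// analogous to an insertelement sequence starting from undef.
Vec packScalars(const std::vector<Lane> &ScalarPerLane) {
  Vec V(ScalarPerLane.size());
  for (unsigned L = 0; L < V.size(); ++L)
    V[L] = ScalarPerLane[L];
  return V;
}

// Vector definition exists -> derive the scalar use with an extract.
Lane extractLane(const Vec &V, unsigned L) { return V[L]; }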
- return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); + return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); +} + +void InnerLoopVectorizer::packScalarIntoVectorValue( + Value *V, const VPIteration &Instance) { + assert(V != Induction && "The new induction variable should not be used."); + assert(!V->getType()->isVectorTy() && "Can't pack a vector"); + assert(!V->getType()->isVoidTy() && "Type does not produce a value"); + + Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); + Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); + VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, + Builder.getInt32(Instance.Lane)); + VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2866,7 +2880,7 @@ Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); + Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2992,10 +3006,6 @@ Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = getMemInstAddressSpace(Instr); - // Scalarize the memory instruction if necessary. - if (Decision == LoopVectorizationCostModel::CM_Scalarize) - return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); - // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3010,7 +3020,7 @@ // Handle consecutive loads/stores. if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, 0, 0); + Ptr = getOrCreateScalarValue(Ptr, {0, 0}); VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -3107,71 +3117,41 @@ } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + const VPIteration &Instance, bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - DEBUG(dbgs() << "LV: Scalarizing" - << (IfPredicateInstr ? " and predicating:" : ":") << *Instr - << '\n'); - // Holds vector parameters or scalars, in case of uniform vals. - SmallVector Params; setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - VectorParts Cond; - if (IfPredicateInstr) - Cond = createBlockInMask(Instr->getParent()); - - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; - - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { - // For each scalar that we create: - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); + + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); + Cloned->setOperand(op, NewOp); + } + addNewMetadata(Cloned, Instr); + + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // Add the cloned scalar to the scalar map entry. 
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); + + // If we just cloned a new assumption, add it the assumption cache. + if (auto *II = dyn_cast(Cloned)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); - // Start if-block. - Value *Cmp = nullptr; - if (IfPredicateInstr) { - Cmp = Cond[Part]; - if (!Cmp) // Block in mask is all-one. - Cmp = Builder.getTrue(); - else if (Cmp->getType()->isVectorTy()) - Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane)); - } - - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); - - // Replace the operands of the cloned instructions with their scalar - // equivalents in the new loop. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane); - Cloned->setOperand(op, NewOp); - } - addNewMetadata(Cloned, Instr); - - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); - - // Add the cloned scalar to the scalar map entry. - VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned); - - // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); - - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); - } - } + // End if-block. + if (IfPredicateInstr) + PredicatedInstructions.push_back(Cloned); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3375,7 +3355,7 @@ LVer->prepareNoAliasMetadata(); } -void InnerLoopVectorizer::createVectorizedLoopSkeleton() { +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3556,6 +3536,8 @@ LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); + + return LoopVectorPreHeader; } // Fix up external users of the induction variable. At this point, we are @@ -3929,7 +3911,8 @@ IVEndValues[Entry.first], LoopMiddleBlock); fixLCSSAPHIs(); - predicateInstructions(); + for (Instruction *PI : PredicatedInstructions) + sinkScalarOperands(&*PI); // Remove redundant induction instructions. cse(LoopVectorBody); @@ -4384,126 +4367,6 @@ } while (Changed); } -void InnerLoopVectorizer::predicateInstructions() { - - // For each instruction I marked for predication on value C, split I into its - // own basic block to form an if-then construct over C. Since I may be fed by - // an extractelement instruction or other scalar operand, we try to - // iteratively sink its scalar operands into the predicated block. If I feeds - // an insertelement instruction, we try to move this instruction into the - // predicated block as well. For non-void types, a phi node will be created - // for the resulting value (either vector or scalar). - // - // So for some predicated instruction, e.g. the conditional sdiv in: - // - // for.body: - // ... - // %add = add nsw i32 %mul, %0 - // %cmp5 = icmp sgt i32 %2, 7 - // br i1 %cmp5, label %if.then, label %if.end - // - // if.then: - // %div = sdiv i32 %0, %1 - // br label %if.end - // - // if.end: - // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] - // - // the sdiv at this point is scalarized and if-converted using a select. 
- // The inactive elements in the vector are not used, but the predicated - // instruction is still executed for all vector elements, essentially: - // - // vector.body: - // ... - // %17 = add nsw <2 x i32> %16, %wide.load - // %29 = extractelement <2 x i32> %wide.load, i32 0 - // %30 = extractelement <2 x i32> %wide.load51, i32 0 - // %31 = sdiv i32 %29, %30 - // %32 = insertelement <2 x i32> undef, i32 %31, i32 0 - // %35 = extractelement <2 x i32> %wide.load, i32 1 - // %36 = extractelement <2 x i32> %wide.load51, i32 1 - // %37 = sdiv i32 %35, %36 - // %38 = insertelement <2 x i32> %32, i32 %37, i32 1 - // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 - // - // Predication will now re-introduce the original control flow to avoid false - // side-effects by the sdiv instructions on the inactive elements, yielding - // (after cleanup): - // - // vector.body: - // ... - // %5 = add nsw <2 x i32> %4, %wide.load - // %8 = icmp sgt <2 x i32> %wide.load52, - // %9 = extractelement <2 x i1> %8, i32 0 - // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue - // - // pred.sdiv.if: - // %10 = extractelement <2 x i32> %wide.load, i32 0 - // %11 = extractelement <2 x i32> %wide.load51, i32 0 - // %12 = sdiv i32 %10, %11 - // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 - // br label %pred.sdiv.continue - // - // pred.sdiv.continue: - // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] - // %15 = extractelement <2 x i1> %8, i32 1 - // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 - // - // pred.sdiv.if54: - // %16 = extractelement <2 x i32> %wide.load, i32 1 - // %17 = extractelement <2 x i32> %wide.load51, i32 1 - // %18 = sdiv i32 %16, %17 - // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 - // br label %pred.sdiv.continue55 - // - // pred.sdiv.continue55: - // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] - // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 - - for (auto KV : PredicatedInstructions) { - BasicBlock::iterator I(KV.first); - BasicBlock *Head = I->getParent(); - auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, - /*BranchWeights=*/nullptr, DT, LI); - I->moveBefore(T); - sinkScalarOperands(&*I); - - BasicBlock *PredicatedBlock = I->getParent(); - Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName(); - PredicatedBlock->setName(BBNamePrefix + ".if"); - PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue"); - - // If the instruction is non-void create a Phi node at reconvergence point. - if (!I->getType()->isVoidTy()) { - Value *IncomingTrue = nullptr; - Value *IncomingFalse = nullptr; - - if (I->hasOneUse() && isa(*I->user_begin())) { - // If the predicated instruction is feeding an insert-element, move it - // into the Then block; Phi node will be created for the vector. - InsertElementInst *IEI = cast(*I->user_begin()); - IEI->moveBefore(T); - IncomingTrue = IEI; // the new vector with the inserted element. - IncomingFalse = IEI->getOperand(0); // the unmodified vector - } else { - // Phi node will be created for the scalar predicated instruction. 
- IncomingTrue = &*I; - IncomingFalse = UndefValue::get(I->getType()); - } - - BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); - assert(PostDom && "Then block has multiple successors"); - PHINode *Phi = - PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); - IncomingTrue->replaceAllUsesWith(Phi); - Phi->addIncoming(IncomingFalse, Head); - Phi->addIncoming(IncomingTrue, I->getParent()); - } - } - - DEBUG(DT->verifyDomTree()); -} - InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); @@ -4647,7 +4510,7 @@ llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: case InductionDescriptor::IK_FpInduction: - return widenIntOrFpInduction(P); + llvm_unreachable("Integer/fp induction is handled elsewhere."); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4666,7 +4529,7 @@ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); + VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); } } return; @@ -4692,25 +4555,11 @@ return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { - // Scalarize instructions that should remain scalar after vectorization. - if (VF > 1 && - !(isa(&I) || isa(&I) || isa(&I)) && - shouldScalarizeInstruction(&I)) { - scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); - return; - } - +void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - break; - case Instruction::PHI: { - // Vectorize PHINodes. - widenPHIInstruction(&I, UF, VF); - break; - } // End of PHI. + case Instruction::PHI: + llvm_unreachable("This instruction is handled by a different recipe."); case Instruction::GetElementPtr: { // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -4783,13 +4632,6 @@ case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: - // Scalarize with predication if this instruction may divide by zero and - // block execution is conditional, otherwise fallthrough. - if (Legal->isScalarWithPredication(&I)) { - scalarizeInstruction(&I, true); - break; - } - LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -4837,7 +4679,7 @@ // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); + auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); for (unsigned Part = 0; Part < UF; ++Part) { Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); @@ -4896,16 +4738,6 @@ auto *CI = dyn_cast(&I); setDebugLocFromInst(Builder, CI); - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. 
- if (Cost->isOptimizableIVTruncate(CI, VF)) { - widenIntOrFpInduction(cast(CI->getOperand(0)), - cast(CI)); - break; - } - /// Vectorize casts. Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4936,11 +4768,7 @@ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(&I); - break; - } + // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? @@ -4948,10 +4776,8 @@ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(&I); - break; - } + assert((UseVectorIntrinsic || !NeedToScalarize) && + "Instruction should be scalarized elsewhere."); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; @@ -5001,9 +4827,9 @@ } default: - // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(&I); - break; + // All other instructions are scalarized. + DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); } // end of switch. } @@ -5015,14 +4841,11 @@ assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); - DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), - LoopVectorPreHeader); DT->addNewBlock(LoopMiddleBlock, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - DEBUG(DT->verifyDomTree()); } @@ -6962,13 +6785,6 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; - // Collect Uniform and Scalar instructions after vectorization with VF. - collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - collectInstsToScalarize(VF); - // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -7626,11 +7442,26 @@ // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); + buildVPlans(UserVF, UserVF); + DEBUG(printPlans(dbgs())); return {UserVF, 0}; } unsigned MaxVF = MaybeMaxVF.getValue(); assert(MaxVF != 0 && "MaxVF is zero."); + + for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { + // Collect Uniform and Scalar instructions after vectorization with VF. + CM.collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. 
+ if (VF > 1) + CM.collectInstsToScalarize(VF); + } + + buildVPlans(1, MaxVF); + DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; @@ -7638,11 +7469,31 @@ return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { +void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { + DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); + BestVF = VF; + BestUF = UF; + + for (auto *VPlanIter = VPlans.begin(); VPlanIter != VPlans.end();) { + VPlan *Plan = *VPlanIter; + if (Plan->hasVF(VF)) + ++VPlanIter; + else { + VPlanIter = VPlans.erase(VPlanIter); + delete Plan; + } + } + assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); +} + +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, + DominatorTree *DT) { // Perform the actual loop transformation. // 1. Create a new empty loop. Unlink the old loop and connect the new one. - ILV.createVectorizedLoopSkeleton(); + VPTransformState State{ + BestVF, BestUF, LI, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV}; + State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); //===------------------------------------------------===// // @@ -7653,36 +7504,9 @@ //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - - // Move instructions to handle first-order recurrences. - DenseMap &SinkAfter = Legal->getSinkAfter(); - for (auto &Entry : SinkAfter) { - Entry.first->removeFromParent(); - Entry.first->insertAfter(Entry.second); - DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second - << " to vectorize a 1st order recurrence.\n"); - } - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - // Vectorize all instructions in the original loop that will not become - // trivially dead when vectorized. - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (Instruction &I : *BB) - if (!DeadInstructions.count(&I)) - ILV.vectorizeInstruction(I); + assert(VPlans.size() == 1 && "Not a single VPlan to execute."); + VPlan *Plan = *VPlans.begin(); + Plan->execute(&State); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7712,14 +7536,6 @@ DeadInstructions.insert(IndUpdate); } } - -void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { - auto *SI = dyn_cast(Instr); - bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); - - return scalarizeInstruction(Instr, IfPredicateInstr); -} - Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } @@ -7775,6 +7591,725 @@ } } +namespace { +/// VPWidenRecipe is a recipe for producing a copy of vector type for each +/// Instruction in its ingredients independently, in order. 
This recipe covers +/// most of the traditional vectorization cases where each ingredient transforms +/// into a vectorized version of itself. +class VPWidenRecipe : public VPRecipeBase { +private: + /// Hold the ingredients by pointing to their original BasicBlock location. + BasicBlock::iterator Begin; + BasicBlock::iterator End; + +public: + VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) { + End = I->getIterator(); + Begin = End++; + } + + ~VPWidenRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenSC; + } + + /// Produce widened copies of all Ingredients. + void execute(VPTransformState &State) override { + for (auto &Instr : make_range(Begin, End)) + State.ILV->widenInstruction(Instr); + } + + /// Augment the recipe to include Instr, if it lies at its End. + bool appendInstruction(Instruction *Instr) { + if (End != Instr->getIterator()) + return false; + End++; + return true; + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"WIDEN\\l\""; + for (auto &Instr : make_range(Begin, End)) + O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\""; + } +}; + +/// A recipe for handling phi nodes of integer and floating-point inductions, +/// producing their vector and scalar values. +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +private: + PHINode *IV; + TruncInst *Trunc; + +public: + VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {} + + ~VPWidenIntOrFpInductionRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC; + } + + /// Generate the vectorized and scalarized versions of the phi node as + /// needed by their users. + void execute(VPTransformState &State) override { + assert(!State.Instance && "Int or FP induction being replicated."); + State.ILV->widenIntOrFpInduction(IV, Trunc); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"WIDEN-INDUCTION"; + if (Trunc) { + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\""; + } else + O << " " << VPlanIngredient(IV) << "\\l\""; + } +}; + +/// A recipe for handling all phi nodes except for integer and FP inductions. +class VPWidenPHIRecipe : public VPRecipeBase { +private: + PHINode *Phi; + +public: + VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {} + + ~VPWidenPHIRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override { + State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; + } +}; + +/// VPInterleaveRecipe is a recipe for transforming an interleave group of load +/// or stores into one wide load/store and shuffles. 
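// Stand-alone sketch of the ingredient-range idea used by VPWidenRecipe above:
// a recipe records a half-open iterator range [Begin, End) over the original
// block, and appendInstruction only grows the range when the candidate is the
// instruction immediately at End. Simplified model over std::list, not the
// LLVM iterator types.
#include <iterator>
#include <list>
#include <string>

struct WidenRange {
  std::list<std::string>::iterator Begin, End;

  explicit WidenRange(std::list<std::string>::iterator I)
      : Begin(I), End(std::next(I)) {}

  // Returns false if I does not directly follow the current range, in which
  // case the caller starts a new recipe instead.
  bool append(std::list<std::string>::iterator I) {
    if (End != I)
      return false;
    ++End;
    return true;
  }
};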
+class VPInterleaveRecipe : public VPRecipeBase { +private: + const InterleaveGroup *IG; + +public: + VPInterleaveRecipe(const InterleaveGroup *IG) + : VPRecipeBase(VPInterleaveSC), IG(IG) {} + + ~VPInterleaveRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; + } + + /// Generate the wide load or store, and shuffles. + void execute(VPTransformState &State) override { + assert(!State.Instance && "Interleave group being replicated."); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; + + const InterleaveGroup *getInterleaveGroup() { return IG; } +}; + +/// VPReplicateRecipe replicates a given instruction producing multiple scalar +/// copies of the original scalar type, one per lane, instead of producing a +/// single copy of widened type for all lanes. If the instruction is known to be +/// uniform only one copy, per lane zero, will be generated. +class VPReplicateRecipe : public VPRecipeBase { +private: + /// The instruction being replicated. + Instruction *Ingredient; + + /// Indicator if only a single replica per lane is needed. + bool IsUniform; + + /// Indicator if the replicas are also predicated. + bool IsPredicated; + + /// Indicator if the scalar values should also be packed into a vector. + bool AlsoPack; + +public: + VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false) + : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform), + IsPredicated(IsPredicated) { + // Retain the previous behavior of predicateInstructions(), where an + // insert-element of a predicated instruction got hoisted into the + // predicated basic block iff it was its only user. This is achieved by + // having predicated instructions also pack their values into a vector by + // default unless they have a replicated user which uses their scalar value. + AlsoPack = IsPredicated && !I->use_empty(); + } + + ~VPReplicateRecipe() {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC; + } + + /// Generate replicas of the desired Ingredient. Replicas will be generated + /// for all parts and lanes unless a specific part and lane are specified in + /// the \p State. + void execute(VPTransformState &State) override; + + void setAlsoPack(bool Pack) { AlsoPack = Pack; } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" + << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") + << VPlanIngredient(Ingredient); + if (AlsoPack) + O << " (S->V)"; + O << "\\l\""; + } +}; + +/// A recipe for generating conditional branches on the bits of a mask. +class VPBranchOnMaskRecipe : public VPRecipeBase { +private: + /// The input IR basic block used to obtain the mask providing the condition + /// bits for the branch. + BasicBlock *MaskedBasicBlock; + +public: + VPBranchOnMaskRecipe(BasicBlock *BB) + : VPRecipeBase(VPBranchOnMaskSC), MaskedBasicBlock(BB) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC; + } + + /// Generate the extraction of the appropriate bit from the block mask and the + /// conditional branch. 
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override {
+    O << " +\n"
+      << Indent << "\"BRANCH-ON-MASK-OF " << MaskedBasicBlock->getName()
+      << "\\l\"";
+  }
+};
+
+/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a Branch-on-Mask. The phi nodes are needed in
+/// order to merge values that are set under such a branch and feed their uses.
+/// The phi nodes can be scalar or vector depending on the users of the value.
+/// This recipe works in concert with VPBranchOnMaskRecipe.
+class VPPredInstPHIRecipe : public VPRecipeBase {
+private:
+  Instruction *PredInst;
+
+public:
+  /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi
+  /// node after merging back from a Branch-on-Mask.
+  VPPredInstPHIRecipe(Instruction *PredInst)
+      : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {}
+
+  ~VPPredInstPHIRecipe() {}
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC;
+  }
+
+  /// Generates phi nodes for live-outs as needed to retain SSA form.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override {
+    O << " +\n"
+      << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst)
+      << "\\l\"";
+  }
+};
+} // end anonymous namespace
+
+bool LoopVectorizationPlanner::getDecisionAndClampRange(
+    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
+  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
+  bool PredicateAtRangeStart = Predicate(Range.Start);
+
+  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
+    if (Predicate(TmpVF) != PredicateAtRangeStart) {
+      Range.End = TmpVF;
+      break;
+    }
+
+  return PredicateAtRangeStart;
+}
+
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
+void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
+  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+    VFRange SubRange = {VF, MaxVF + 1};
+    VPlan *Plan = buildVPlan(SubRange);
+    VPlans.push_back(Plan);
+    VF = SubRange.End;
+  }
+}
+
+VPInterleaveRecipe *
+LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
+                                                VFRange &Range) {
+  const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
+  if (!IG)
+    return nullptr;
+
+  // Now check if IG is relevant for VF's in the given range.
+  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
+    return [=](unsigned VF) -> bool {
+      return (VF >= 2 && // Query is illegal for VF == 1
+              CM.getWideningDecision(I, VF) ==
+                  LoopVectorizationCostModel::CM_Interleave);
+    };
+  };
+  if (!getDecisionAndClampRange(isIGMember(I), Range))
+    return nullptr;
+
+  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
+  // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
+  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
+  assert(I == IG->getInsertPos() &&
+         "Generating a recipe for an adjunct member of an interleave group");
+
+  return new VPInterleaveRecipe(IG);
+}
+
+VPWidenIntOrFpInductionRecipe *
+LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
+                                                 VFRange &Range) {
+  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+    // Check if this is an integer or fp induction. If so, build the recipe that
+    // produces its scalar and vector values.
+    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
+    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+        II.getKind() == InductionDescriptor::IK_FpInduction)
+      return new VPWidenIntOrFpInductionRecipe(Phi);
+
+    return nullptr;
+  }
+
+  // Optimize the special case where the source is a constant integer
+  // induction variable. Notice that we can only optimize the 'trunc' case
+  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+  // (c) other casts depend on pointer size.
+
+  // Determine whether \p K is a truncation based on an induction variable that
+  // can be optimized.
+  auto isOptimizableIVTruncate =
+      [&](Instruction *K) -> std::function<bool(unsigned)> {
+    return
+        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
+  };
+
+  if (isa<TruncInst>(I) &&
+      getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
+    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
+                                             cast<TruncInst>(I));
+  return nullptr;
+}
+
+VPWidenRecipe *LoopVectorizationPlanner::tryToWiden(
+    Instruction *I, VPWidenRecipe *LastWidenRecipe, VFRange &Range) {
+
+  if (Legal->isScalarWithPredication(I))
+    return nullptr;
+
+  static DenseSet<unsigned> VectorizableOpcodes = {
+      Instruction::Br, Instruction::PHI, Instruction::GetElementPtr,
+      Instruction::UDiv, Instruction::SDiv, Instruction::SRem,
+      Instruction::URem, Instruction::Add, Instruction::FAdd,
+      Instruction::Sub, Instruction::FSub, Instruction::Mul,
+      Instruction::FMul, Instruction::FDiv, Instruction::FRem,
+      Instruction::Shl, Instruction::LShr, Instruction::AShr,
+      Instruction::And, Instruction::Or, Instruction::Xor,
+      Instruction::Select, Instruction::ICmp, Instruction::FCmp,
+      Instruction::Store, Instruction::Load, Instruction::ZExt,
+      Instruction::SExt, Instruction::FPToUI, Instruction::FPToSI,
+      Instruction::FPExt, Instruction::PtrToInt, Instruction::IntToPtr,
+      Instruction::SIToFP, Instruction::UIToFP, Instruction::Trunc,
+      Instruction::FPTrunc, Instruction::BitCast, Instruction::Call};
+
+  if (!VectorizableOpcodes.count(I->getOpcode()))
+    return nullptr;
+
+  if (CallInst *CI = dyn_cast<CallInst>(I)) {
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+               ID == Intrinsic::lifetime_start))
+      return nullptr;
+  }
+
+  auto willWiden = [&](unsigned VF) -> bool {
+    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
+                             CM.isProfitableToScalarize(I, VF)))
+      return false;
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+      // The following case may be scalarized depending on the VF.
+      // The flag shows whether we use Intrinsic or a usual Call for vectorized
+      // version of the instruction.
+      // Is it beneficial to perform intrinsic call compared to lib call?
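+      // In short: prefer the vector intrinsic whenever its cost does not
+      // exceed the cost of a vectorized call; otherwise widen the call form,
+      // unless the cost model requires the call to be scalarized.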
+      bool NeedToScalarize;
+      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+      bool UseVectorIntrinsic =
+          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+      return UseVectorIntrinsic || !NeedToScalarize;
+    }
+    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+      LoopVectorizationCostModel::InstWidening Decision =
+          CM.getWideningDecision(I, VF);
+      assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+             "CM decision should be taken at this point.");
+      assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
+             "Interleave memory opportunity should be caught earlier.");
+      return Decision != LoopVectorizationCostModel::CM_Scalarize;
+    }
+    return true;
+  };
+
+  if (!getDecisionAndClampRange(willWiden, Range))
+    return nullptr;
+
+  // Success: widen this instruction. We optimize the common case where
+  // consecutive instructions can be represented by a single recipe.
+  if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
+    return LastWidenRecipe;
+  return new VPWidenRecipe(I);
+}
+
+VPBasicBlock *LoopVectorizationPlanner::handleReplication(
+    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe) {
+
+  bool IsUniform = getDecisionAndClampRange(
+      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
+      Range);
+
+  bool IsPredicated = Legal->isScalarWithPredication(I);
+  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+
+  // Find if I uses a predicated instruction. If so, it will use its scalar
+  // value. Avoid hoisting the insert-element which packs the scalar value into
+  // a vector value, as that happens iff all users use the vector value.
+  for (auto &Op : I->operands())
+    if (auto *PredInst = dyn_cast<Instruction>(Op))
+      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
+        PredInst2Recipe[PredInst]->setAlsoPack(false);
+
+  // Finalize the recipe for Instr, first if it is not predicated.
+  if (!IsPredicated) {
+    DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+    VPBB->appendRecipe(Recipe);
+    return VPBB;
+  }
+  DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+  assert(VPBB->getSuccessors().empty() &&
+         "VPBB has successors when handling predicated replication.");
+  // Record predicated instructions for above packing optimizations.
+  PredInst2Recipe[I] = Recipe;
+  VPBlockBase *Region = VPBB->setOneSuccessor(createReplicateRegion(I, Recipe));
+  return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
+}
+
+VPRegionBlock *
+LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
+                                                VPRecipeBase *PredRecipe) {
+  // Instructions marked for predication are replicated and placed under an
+  // if-then construct to prevent side-effects.
+
+  // Build the triangular if-then region.
+  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+  assert(Instr->getParent() && "Predicated instruction not in any basic block");
+  auto *BOMRecipe = new VPBranchOnMaskRecipe(Instr->getParent());
+  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+  auto *PHIRecipe =
+      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
+  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+
+  // Note: first set Entry as region entry and then connect successors starting
+  // from it in order, to propagate the "parent" of each VPBasicBlock.
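+  // The resulting region forms a triangle (illustration of the CFG built
+  // below, where <opc> stands for the opcode name of Instr):
+  //
+  //       pred.<opc>.entry
+  //         /           \
+  //   pred.<opc>.if      |
+  //         \           /
+  //       pred.<opc>.continue
+  //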
+  Entry->setTwoSuccessors(Pred, Exit);
+  Pred->setOneSuccessor(Exit);
+
+  return Region;
+}
+
+VPlan *LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+
+  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+  DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
+  // Collect instructions from the original loop that will become trivially dead
+  // in the vectorized loop. We don't need to vectorize these instructions. For
+  // example, original induction update instructions can become dead because we
+  // separately emit induction "steps" when generating code for the new loop.
+  // Similarly, we create a new latch condition when setting up the structure
+  // of the new loop, so the old one can become dead.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
+  collectTriviallyDeadInstructions(DeadInstructions);
+
+  // Hold a mapping from predicated instructions to their recipes, in order to
+  // fix their AlsoPack behavior if a user is determined to replicate and use a
+  // scalar instead of vector value.
+  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+
+  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
+  VPlan *Plan = new VPlan(VPBB);
+
+  // Scan the body of the loop in a topological order to visit each basic block
+  // after having visited its predecessor basic blocks.
+  LoopBlocksDFS DFS(OrigLoop);
+  DFS.perform(LI);
+
+  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+    // Relevant instructions from basic block BB will be grouped into VPRecipe
+    // ingredients and fill a new VPBasicBlock.
+    unsigned VPBBsForBB = 0;
+    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+    VPBB->setOneSuccessor(FirstVPBBForBB);
+    VPBB = FirstVPBBForBB;
+    VPWidenRecipe *LastWidenRecipe = nullptr;
+
+    std::vector<Instruction *> Ingredients;
+
+    // Organize the ingredients to vectorize from current basic block in the
+    // right order.
+    for (Instruction &I : *BB) {
+      Instruction *Instr = &I;
+
+      // First filter out irrelevant instructions, to ensure no recipes are
+      // built for them.
+      if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
+          DeadInstructions.count(Instr))
+        continue;
+
+      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
+      // member of the IG, do not construct any Recipe for it.
+      const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
+      if (IG && Instr != IG->getInsertPos() &&
+          Range.Start >= 2 && // Query is illegal for VF == 1
+          CM.getWideningDecision(Instr, Range.Start) ==
+              LoopVectorizationCostModel::CM_Interleave)
+        continue;
+
+      // Move instructions to handle first-order recurrences, step 1: avoid
+      // handling this instruction until after we've handled the instruction it
+      // should follow.
+      auto SAIt = SinkAfter.find(Instr);
+      if (SAIt != SinkAfter.end()) {
+        DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
+                     << " to vectorize a 1st order recurrence.\n");
+        SinkAfterInverse[SAIt->second] = Instr;
+        continue;
+      }
+
+      Ingredients.push_back(Instr);
+
+      // Move instructions to handle first-order recurrences, step 2: push the
+      // instruction to be sunk at its insertion point.
+      auto SAInvIt = SinkAfterInverse.find(Instr);
+      if (SAInvIt != SinkAfterInverse.end())
+        Ingredients.push_back(SAInvIt->second);
+    }
+
+    // Introduce each ingredient into VPlan.
+    for (Instruction *Instr : Ingredients) {
+      VPRecipeBase *Recipe = nullptr;
+
+      // Check if Instr should belong to an interleave memory recipe, or already
+      // does. In the latter case Instr is irrelevant.
+      if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+        VPBB->appendRecipe(Recipe);
+        continue;
+      }
+
+      // Check if Instr should form some PHI recipe.
+      if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+        VPBB->appendRecipe(Recipe);
+        continue;
+      }
+      if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+        VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+        continue;
+      }
+
+      // Check if Instr is to be widened by a general VPWidenRecipe, after
+      // having first checked for specific widening recipes that deal with
+      // Interleave Groups, Inductions and Phi nodes.
+      if ((Recipe = tryToWiden(Instr, LastWidenRecipe, Range))) {
+        if (Recipe != LastWidenRecipe)
+          VPBB->appendRecipe(Recipe);
+        LastWidenRecipe = cast<VPWidenRecipe>(Recipe);
+        continue;
+      }
+
+      // Otherwise, if all widening options failed, Instruction is to be
+      // replicated. This may create a successor for VPBB.
+      VPBasicBlock *NextVPBB =
+          handleReplication(Instr, Range, VPBB, PredInst2Recipe);
+      if (NextVPBB != VPBB) {
+        VPBB = NextVPBB;
+        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
+                                    : "");
+      }
+    }
+  }
+
+  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
+  // may also be empty, such as the last one, VPBB, reflecting original
+  // basic-blocks with no recipes.
+  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+  PreEntry->disconnectSuccessor(Entry);
+  delete PreEntry;
+
+  std::string PlanName;
+  raw_string_ostream RSO(PlanName);
+  unsigned VF = Range.Start;
+  Plan->addVF(VF);
+  RSO << "Initial VPlan for VF={" << VF;
+  for (VF *= 2; VF < Range.End; VF *= 2) {
+    Plan->addVF(VF);
+    RSO << "," << VF;
+  }
+  RSO << "},UF>=1";
+  RSO.flush();
+  Plan->setName(PlanName);
+
+  return Plan;
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
+  O << " +\n"
+    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+  IG->getInsertPos()->printAsOperand(O, false);
+  O << "\\l\"";
+  for (unsigned i = 0; i < IG->getFactor(); ++i)
+    if (Instruction *I = IG->getMember(i))
+      O << " +\n"
+        << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
+}
+
+void VPReplicateRecipe::execute(VPTransformState &State) {
+
+  if (State.Instance) { // Generate a single instance.
+    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+    if (AlsoPack) { // Insert scalar instance packing it into a vector.
+      // If we're constructing lane 0, initialize to start from undef.
+      if (State.Instance->Lane == 0) {
+        Value *Undef =
+            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+      }
+      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+    }
+    return;
+  }
+
+  // Generate scalar instances for all VF lanes of all UF parts, unless the
+  // instruction is uniform, in which case generate only the first lane for each
+  // of the UF parts.
+  unsigned EndLane = IsUniform ? 1 : State.VF;
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+
+  // Broadcast or group together all instances into a vector.
+  if (AlsoPack)
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      State.ILV->getOrCreateVectorValue(Ingredient, Part);
+}
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+  assert(State.Instance && "Branch on Mask works only on single instance.");
+
+  unsigned Part = State.Instance->Part;
+  unsigned Lane = State.Instance->Lane;
+
+  auto Cond = State.ILV->createBlockInMask(MaskedBasicBlock);
+
+  Value *ConditionBit = Cond[Part];
+  if (!ConditionBit) // Block in mask is all-one.
+    ConditionBit = State.Builder.getTrue();
+  else if (ConditionBit->getType()->isVectorTy())
+    ConditionBit = State.Builder.CreateExtractElement(
+        ConditionBit, State.Builder.getInt32(Lane));
+
+  // Replace the temporary unreachable terminator with a new conditional branch,
+  // whose two destinations will be set later when they are created.
+  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+  assert(isa<UnreachableInst>(CurrentTerminator) &&
+         "Expected to replace unreachable terminator with conditional branch.");
+  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+  CondBr->setSuccessor(0, nullptr);
+  ReplaceInstWithInst(CurrentTerminator, CondBr);
+
+  DEBUG(dbgs() << "\nLV: vectorizing BranchOnMask recipe "
+               << MaskedBasicBlock->getName());
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+  assert(State.Instance && "Predicated instruction PHI works per instance.");
+  Instruction *ScalarPredInst = cast<Instruction>(
+      State.ValueMap.getScalarValue(PredInst, *State.Instance));
+  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+  assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+  // By current pack/unpack logic we need to generate only a single phi node: if
+  // a vector value for the predicated instruction exists at this point it means
+  // the instruction has vector users only, and a phi for the vector value is
+  // needed. In this case the recipe of the predicated instruction is marked to
+  // also do that packing, thereby "hoisting" the insert-element sequence.
+  // Otherwise, a phi node for the scalar value is needed.
+  unsigned Part = State.Instance->Part;
+  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+  } else {
+    Type *PredInstType = PredInst->getType();
+    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+    Phi->addIncoming(ScalarPredInst, PredicatedBB);
+    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+  }
+}
+
 bool LoopVectorizePass::processLoop(Loop *L) {
   assert(L->empty() && "Only process inner loops.");
@@ -7893,7 +8428,7 @@
   CM.collectValuesToIgnore();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
 
   // Get user vectorization factor.
unsigned UserVF = Hints.getWidth(); @@ -7980,6 +8515,8 @@ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } + LVP.setBestPlan(VF.Width, IC); + using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -7987,7 +8524,7 @@ // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - LVP.executePlan(Unroller); + LVP.executePlan(Unroller, DT); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -7997,7 +8534,7 @@ // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LVP.executePlan(LB); + LVP.executePlan(LB, DT); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are Index: test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -26,9 +26,9 @@ ; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]] ; CHECK: [[PRED_UDIV_IF]]: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x -; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] ; CHECK: [[PRED_UDIV_CONTINUE]]: @@ -37,9 +37,9 @@ ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x -; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] ; CHECK: [[PRED_UDIV_CONTINUE2]]: Index: test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -18,8 +18,8 @@ ; Cost of udiv: ; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 ; -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { entry: @@ -59,8 +59,8 @@ ; Cost of store: ; (store(4) + extractelement(3)) / 2 = 3 ; -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* 
%tmp0, align 4 +; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -98,10 +98,10 @@ ; Cost of udiv: ; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4 ; -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -143,10 +143,10 @@ ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -192,16 +192,16 @@ ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 ; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x ; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x ; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 +; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x +; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 ; define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) { entry: Index: test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll =================================================================== --- test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll +++ test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -24,10 +24,10 @@ for.end: ret void -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 - ; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* 
%tmp0, align 4 ; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 + +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 } Index: test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- test/Transforms/LoopVectorize/first-order-recurrence.ll +++ test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -467,13 +467,6 @@ ; SINK-AFTER: %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32> ; SINK-AFTER: %[[VCONV3:.+]] = sext <4 x i16> %wide.load to <4 x i32> ; SINK-AFTER: mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]] -; Check also that the sext sank after the load in the scalar loop. -; SINK-AFTER: for.body -; SINK-AFTER: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ %[[LOAD:.+]], %for.body ] -; SINK-AFTER: %[[LOAD]] = load i16, i16* %arrayidx2 -; SINK-AFTER: %[[CONV:.+]] = sext i16 %scalar.recur to i32 -; SINK-AFTER: %[[CONV3:.+]] = sext i16 %[[LOAD]] to i32 -; SINK-AFTER: %mul = mul nsw i32 %[[CONV3]], %[[CONV]] ; define void @sink_after(i16* %a, i32* %b, i64 %n) { entry: Index: test/Transforms/LoopVectorize/if-pred-non-void.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-non-void.ll +++ test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -209,9 +209,9 @@ ; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]] ; CHECK: [[IF0]]: ; CHECK: %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0 -; CHECK: %[[T01:.+]] = extractelement <2 x i32> %wide.load, i32 0 -; CHECK: %[[T02:.+]] = add nsw i32 %[[T01]], %x -; CHECK: %[[T03:.+]] = udiv i32 %[[T00]], %[[T02]] +; CHECK: %[[T01:.+]] = add nsw i32 %[[T00]], %x +; CHECK: %[[T02:.+]] = extractelement <2 x i32> %wide.load, i32 0 +; CHECK: %[[T03:.+]] = udiv i32 %[[T02]], %[[T01]] ; CHECK: %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0 ; CHECK: br label %[[CONT0]] ; CHECK: [[CONT0]]: @@ -219,9 +219,9 @@ ; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]] ; CHECK: [[IF1]]: ; CHECK: %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1 -; CHECK: %[[T07:.+]] = extractelement <2 x i32> %wide.load, i32 1 -; CHECK: %[[T08:.+]] = add nsw i32 %[[T07]], %x -; CHECK: %[[T09:.+]] = udiv i32 %[[T06]], %[[T08]] +; CHECK: %[[T07:.+]] = add nsw i32 %[[T06]], %x +; CHECK: %[[T08:.+]] = extractelement <2 x i32> %wide.load, i32 1 +; CHECK: %[[T09:.+]] = udiv i32 %[[T08]], %[[T07]] ; CHECK: %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1 ; CHECK: br label %[[CONT1]] ; CHECK: [[CONT1]]: