diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -128,6 +128,11 @@ IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, unsigned Factor); + IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, + ElementCount Factor) + : IntrinsicCostAttributes(Id, CI, Factor.Min) { + assert(!Factor.Scalable && "Factor must not be scalable"); + } IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, unsigned Factor, unsigned ScalarCost); diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -299,13 +299,18 @@ typedef unsigned ID; } -/// A helper function for converting Scalar types to vector types. -/// If the incoming type is void, we return void. If the VF is 1, we return -/// the scalar type. -inline Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) { - if (Scalar->isVoidTy() || VF == 1) +/// A helper function for converting Scalar types to vector types. If +/// the incoming type is void, we return void. If the EC represents a +/// scalar, we return the scalar type. +inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { + if (Scalar->isVoidTy() || EC.isScalar()) return Scalar; - return VectorType::get(Scalar, {VF, isScalable}); + return VectorType::get(Scalar, EC); +} + +/// Fixed-width overload: builds a non-scalable ElementCount from \p VF. +inline Type *ToVectorTy(Type *Scalar, unsigned VF) { + return ToVectorTy(Scalar, {VF, /*Scalable=*/false}); } /// Identify if the intrinsic is trivially vectorizable. 
diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -21,6 +21,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Support/YAMLTraits.h" #include <algorithm> #include <cstdint> @@ -434,6 +435,7 @@ Argument(StringRef Key, unsigned N); Argument(StringRef Key, unsigned long N); Argument(StringRef Key, unsigned long long N); + Argument(StringRef Key, ElementCount EC); Argument(StringRef Key, bool B) : Key(Key), Val(B ? "true" : "false") {} Argument(StringRef Key, DebugLoc dl); }; diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -56,8 +56,46 @@ ElementCount NextPowerOf2() const { return ElementCount(llvm::NextPowerOf2(Min), Scalable); } + + /// Ordering function for ElementCount, needed for `set` like containers. + bool operator<(const ElementCount &EC) const { + if (this->Scalable == EC.Scalable) + return this->Min < EC.Min; + if (!this->Scalable && EC.Scalable) + return true; + return false; + } + + /// Printing function. + void print(raw_ostream &OS) const { + // TODO: this should use sstream + if (Scalable) + OS << "vscale x "; + OS << Min; + } + /// Counting predicates. + /// + /// Notice that Min = 1 and Scalable = true is considered more than + /// one element. + /// + ///@{ No elements. + bool isZero() const { return Min == 0; } + /// Exactly one element. + bool isScalar() const { return !Scalable && Min == 1; } + /// More than one element; any non-zero scalable count qualifies. + bool isVector() const { return (Scalable && Min != 0) || Min > 1; } + ///@} + + /// Return the ElementCount instance representing a scalar. + static ElementCount getScalar() { return {1, false}; } }; +/// Stream operator function for `ElementCount`. 
+inline raw_ostream &operator<<(raw_ostream &OS, const ElementCount &EC) { + EC.print(OS); + return OS; +} + // This class is used to represent the size of types. If the type is of fixed // size, it will represent the exact size. If the type is a scalable vector, // it will represent the known minimum size. diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -213,6 +213,13 @@ unsigned long long N) : Key(std::string(Key)), Val(utostr(N)) {} +DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, + ElementCount EC) + : Key(std::string(Key)) { + raw_string_ostream OS(Val); + EC.print(OS); +} + DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc) : Key(std::string(Key)), Loc(Loc) { if (Loc) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -172,12 +172,14 @@ /// Information about vectorization costs struct VectorizationFactor { // Vector width with best cost - unsigned Width; + ElementCount Width; // Cost of the loop with that width unsigned Cost; // Width 1 means no vectorization, cost 0 means uncomputed cost. - static VectorizationFactor Disabled() { return {1, 0}; } + static VectorizationFactor Disabled() { + return {ElementCount::getScalar(), 0}; + } bool operator==(const VectorizationFactor &rhs) const { return Width == rhs.Width && Cost == rhs.Cost; @@ -227,7 +229,7 @@ /// A builder used to construct the current plan. VPBuilder Builder; - unsigned BestVF = 0; + ElementCount BestVF = ElementCount(0, false); unsigned BestUF = 0; public: @@ -242,14 +244,14 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. 
- Optional plan(unsigned UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(unsigned UserVF); + VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(unsigned VF, unsigned UF); + void setBestPlan(ElementCount VF, unsigned UF); /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. @@ -264,7 +266,7 @@ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the /// returned value holds for the entire \p Range. static bool - getDecisionAndClampRange(const std::function &Predicate, + getDecisionAndClampRange(const std::function &Predicate, VFRange &Range); protected: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -313,11 +313,12 @@ /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. -static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { +static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { + assert(!VF.Scalable && "invalid number of elements"); // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a vector. 
- if (VF > 1) { - auto *VectorTy = FixedVectorType::get(Ty, VF); + if (VF.isVector()) { + auto *VectorTy = FixedVectorType::get(Ty, VF.Min); - return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); + return VF.Min * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); } @@ -399,7 +400,7 @@ LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, unsigned VecWidth, + OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) @@ -449,13 +450,13 @@ /// Vectorize a single GetElementPtrInst based on information gathered and /// decisions taken during planning. void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, - unsigned VF, bool IsPtrLoopInvariant, + ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF); /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane @@ -743,7 +744,7 @@ /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. - unsigned VF; + ElementCount VF; /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. 
@@ -832,8 +833,9 @@ LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM, BFI, PSI) {} + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getScalar(), UnrollFactor, LVL, CM, + BFI, PSI) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -869,7 +871,7 @@ const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa(Inst)) { - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); + auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1034,7 +1036,7 @@ VectorizationFactor selectVectorizationFactor(unsigned MaxVF); /// Setup cost-based decisions for user vectorization factor. - void selectUserVectorizationFactor(unsigned UserVF) { + void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); } @@ -1048,7 +1050,7 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1057,7 +1059,7 @@ /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(unsigned VF); + void setCostBasedWideningDecision(ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. 
@@ -1072,7 +1074,8 @@ /// \return Returns information about the register usages of the loop for the /// given vectorization factors. - SmallVector calculateRegisterUsage(ArrayRef VFs); + SmallVector + calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -1090,8 +1093,9 @@ /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, unsigned VF) const { - assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); + bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + assert(VF.isVector() && + "Profitable to scalarize relevant only for VF > 1."); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. @@ -1105,8 +1109,8 @@ } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1121,8 +1125,8 @@ } /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { - if (VF == 1) + bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + if (VF.isScalar()) return true; // Cost model is not run in the VPlan-native path - return conservative @@ -1138,8 +1142,8 @@ /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. 
- bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { - return VF > 1 && MinBWs.find(I) != MinBWs.end() && + bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + return VF.isVector() && MinBWs.find(I) != MinBWs.end() && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1156,17 +1160,17 @@ /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, + void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + assert(VF.isVector() && "Expected VF >=2"); WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. - void setWideningDecision(const InterleaveGroup *Grp, unsigned VF, - InstWidening W, unsigned Cost) { - assert(VF >= 2 && "Expected VF >=2"); + void setWideningDecision(const InterleaveGroup *Grp, + ElementCount VF, InstWidening W, unsigned Cost) { + assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { @@ -1182,15 +1186,16 @@ /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); + InstWidening getWideningDecision(Instruction *I, ElementCount VF) { + assert(!VF.Scalable && "invalid element count"); + assert(VF.isVector() && "Expected VF >=2"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. 
if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1199,9 +1204,9 @@ /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - unsigned getWideningCost(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); + unsigned getWideningCost(Instruction *I, ElementCount VF) { + assert(VF.isVector() && "Expected VF >=2"); + std::pair InstOnVF = std::make_pair(I, VF); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1210,7 +1215,7 @@ /// Return True if instruction \p I is an optimizable truncate whose operand /// is an induction variable. Such a truncate will be removed by adding a new /// induction variable with the destination type. - bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { + bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { // If the instruction is not a truncate, return false. auto *Trunc = dyn_cast(I); if (!Trunc) @@ -1235,14 +1240,14 @@ /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(unsigned VF); + void collectInstsToScalarize(ElementCount VF); /// Collect Uniform and Scalar values for the given \p VF. /// The sets depend on CM decision for Load/Store instructions /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { + void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. 
- if (VF == 1 || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1293,7 +1298,8 @@ /// instructions that may divide by zero. /// If a non-zero VF has been calculated, we check if I will be scalarized /// predication for that VF. - bool isScalarWithPredication(Instruction *I, unsigned VF = 1); + bool isScalarWithPredication(Instruction *I, + ElementCount VF = ElementCount::getScalar()); // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. @@ -1310,12 +1316,16 @@ /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + bool + memoryInstructionCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getScalar()); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + bool + interleavedAccessCanBeWidened(Instruction *I, + ElementCount VF = ElementCount::getScalar()); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1367,14 +1377,15 @@ /// Estimate cost of an intrinsic call instruction CI if it were vectorized /// with factor VF. Return the cost of the instruction, including /// scalarization overhead if it's needed. - unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); + unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); /// Estimate cost of a call instruction CI if it were vectorized with factor /// VF. Return the cost of the instruction, including scalarization overhead /// if it's needed. 
The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + unsigned getVectorCallCost(CallInst *CI, ElementCount VF, + bool &NeedToScalarize); /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { @@ -1404,41 +1415,41 @@ /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(unsigned VF); + VectorizationCostTy expectedCost(ElementCount VF); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); + unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); /// Calculate vectorization cost of memory instruction \p I. - unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); + unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); /// The cost computation for scalarized memory instruction. - unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF); /// The cost computation for interleaving group of memory instructions. - unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF); /// The cost computation for Gather/Scatter instruction. 
- unsigned getGatherScatterCost(Instruction *I, unsigned VF); + unsigned getGatherScatterCost(Instruction *I, ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); + unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. /// Store: scalar store + (loop invariant value stored? 0 : extract of last /// element) - unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + unsigned getUniformMemOpCost(Instruction *I, ElementCount VF); /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - unsigned getScalarizationOverhead(Instruction *I, unsigned VF); + unsigned getScalarizationOverhead(Instruction *I, ElementCount VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -1478,19 +1489,19 @@ /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap InstsToScalarize; + DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap> Uniforms; + DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap> Scalars; + DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. 
- DenseMap> ForcedScalars; + DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top @@ -1503,7 +1514,7 @@ /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - unsigned VF); + ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in @@ -1514,27 +1525,28 @@ /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(unsigned VF); + void collectLoopUniforms(ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized /// during vectorization. Non-uniform scalarized instructions will be /// represented by VF values in the vectorized loop, each corresponding to an /// iteration of the original scalar loop. - void collectLoopScalars(unsigned VF); + void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - using DecisionList = DenseMap, + using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. 
- bool needsExtract(Value *V, unsigned VF) const { + bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast(V); - if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) + if (VF.isScalar() || !I || !TheLoop->contains(I) || + TheLoop->isLoopInvariant(I)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1549,7 +1561,7 @@ /// Returns a range containing only operands needing to be extracted. SmallVector filterExtractingOperands(Instruction::op_range Ops, - unsigned VF) { + ElementCount VF) { return SmallVector(make_filter_range( Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } @@ -1796,7 +1808,7 @@ // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); + Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1804,10 +1816,10 @@ // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - Value *SplatVF = - isa(Mul) - ? ConstantVector::getSplat({VF, false}, cast(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + assert(!VF.Scalable && "invalid number of elements"); + Value *SplatVF = isa(Mul) + ? 
ConstantVector::getSplat(VF, cast(Mul)) + : Builder.CreateVectorSplat(VF.Min, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1941,8 +1953,9 @@ auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = - getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); + assert(!VF.Scalable && "invalid number of elements"); + Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -1952,7 +1965,7 @@ // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (VF <= 1) { + if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2050,8 +2063,9 @@ Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(VF > 1 && "VF should be greater than one"); - + assert(VF.isVector() && "VF should be greater than one"); + assert(!VF.Scalable && + "the code below assumes a fixed number of elements at compile time"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2073,12 +2087,14 @@ // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 - : VF; + Cost->isUniformAfterVectorization(cast(EntryVal), VF) + ? 1 + : VF.Min; // Compute the scalar steps and save the results in VectorLoopValueMap. 
for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); + auto *StartIdx = + getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2121,7 +2137,8 @@ // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; + unsigned LastLane = + Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1; auto *LastInst = cast( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2143,9 +2160,10 @@ VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. 
- Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF)); + Value *Undef = + UndefValue::get(FixedVectorType::get(V->getType(), VF.Min)); VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF; ++Lane) + for (unsigned Lane = 0; Lane < VF.Min; ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2209,9 +2227,10 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); + assert(!VF.Scalable && "Cannot reverse scalable vectors"); SmallVector ShuffleMask; - for (unsigned i = 0; i < VF; ++i) - ShuffleMask.push_back(VF - i - 1); + for (unsigned i = 0; i < VF.Min; ++i) + ShuffleMask.push_back(VF.Min - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), ShuffleMask, "reverse"); @@ -2265,7 +2284,7 @@ // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF); + auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF.Min); // Prepare for the new pointers. SmallVector AddrParts; @@ -2281,8 +2300,10 @@ // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
+ assert(!VF.Scalable && + "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF - 1) * Group->getFactor(); + Index += (VF.Min - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2317,7 +2338,8 @@ Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); + assert(!VF.Scalable && "Invalid number of elements"); + MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2334,9 +2356,11 @@ if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); + assert(!VF.Scalable && "Invalid element count."); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF), "interleaved.mask"); + createReplicatedMask(InterleaveFactor, VF.Min), + "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) @@ -2362,14 +2386,16 @@ if (!Member) continue; - auto StrideMask = createStrideMask(I, InterleaveFactor, VF); + assert(!VF.Scalable && "Invalid element count"); + auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF); + VectorType *OtherVTy = + FixedVectorType::get(Member->getType(), VF.Min); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2383,7 +2409,7 @@ } // The sub vector type for current instruction. 
- auto *SubVT = FixedVectorType::get(ScalarTy, VF); + auto *SubVT = FixedVectorType::get(ScalarTy, VF.Min); // Vectorize the interleaved store group. for (unsigned Part = 0; Part < UF; Part++) { @@ -2411,8 +2437,9 @@ Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. + assert(!VF.Scalable && "invalid element count"); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor), + WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; @@ -2420,8 +2447,8 @@ Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF), - "interleaved.mask"); + BlockInMaskPart, Undefs, + createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } @@ -2454,7 +2481,8 @@ "CM decision is not to widen the memory instruction"); Type *ScalarDataTy = getMemInstValueType(Instr); - auto *DataTy = FixedVectorType::get(ScalarDataTy, VF); + + auto *DataTy = FixedVectorType::get(ScalarDataTy, VF.Min); const Align Alignment = getLoadStoreAlignment(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2488,17 +2516,17 @@ if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. 
- PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min))); PartPtr->setIsInBounds(InBounds); - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - PartPtr = cast( - Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); + PartPtr = cast(Builder.CreateGEP( + ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min))); PartPtr->setIsInBounds(InBounds); } @@ -2694,7 +2722,8 @@ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); - Constant *Step = ConstantInt::get(Ty, VF * UF); + // This is where we can make the step a runtime constant. + Constant *Step = ConstantInt::get(Ty, VF.Min * UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2703,9 +2732,10 @@ // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF * UF) && + assert(isPowerOf2_32(VF.Min * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); + TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1), + "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2722,7 +2752,7 @@ // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations.
Note that the minimum iterations // check ensures that N >= Step. - if (VF > 1 && Cost->requiresScalarEpilogue()) { + if (VF.isVector() && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2782,7 +2812,7 @@ Value *CheckMinIters = Builder.getFalse(); if (!Cost->foldTailByMasking()) CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF * UF), + P, Count, ConstantInt::get(Count->getType(), VF.Min * UF), "min.iters.check"); // Create new preheader for vector loop. @@ -3236,7 +3266,8 @@ Value *StartIdx = ConstantInt::get(IdxTy, 0); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); + // TODO: make this step a runtime value + Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3368,8 +3399,9 @@ } unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, - unsigned VF, + ElementCount VF, bool &NeedToScalarize) { + assert(!VF.Scalable && "invalid number of elements"); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3382,7 +3414,7 @@ // value. unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); - if (VF == 1) + if (VF.isScalar()) return ScalarCallCost; // Compute corresponding vector type for return value and arguments. @@ -3394,12 +3426,12 @@ // packing the return values to a vector. 
unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF + ScalarizationCost; + unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); + VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); if (!TLI || CI->isNoBuiltin() || !VecFunc) @@ -3416,7 +3448,7 @@ } unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, - unsigned VF) { + ElementCount VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -3573,7 +3605,7 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. - if (VF > 1) + if (VF.isVector()) truncateToMinimalBitwidths(); // Fix widened non-induction PHIs by setting up the PHI operands. @@ -3614,9 +3646,11 @@ // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. + assert(!VF.Scalable && + "cannot use scalable ElementCount to determine unroll factor"); setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF * UF); + LI->getLoopFor(LoopScalarBody), VF.Min * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3695,11 +3729,11 @@ // Create a vector from the initial value. 
auto *VectorInit = ScalarInit; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); VectorInit = Builder.CreateInsertElement( - UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)), - VectorInit, Builder.getInt32(VF - 1), "vector.recur.init"); + UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF.Min)), + VectorInit, Builder.getInt32(VF.Min - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3740,10 +3774,11 @@ // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - SmallVector ShuffleMask(VF); - ShuffleMask[0] = VF - 1; - for (unsigned I = 1; I < VF; ++I) - ShuffleMask[I] = I + VF - 1; + assert(!VF.Scalable); + SmallVector ShuffleMask(VF.Min); + ShuffleMask[0] = VF.Min - 1; + for (unsigned I = 1; I < VF.Min; ++I) + ShuffleMask[I] = I + VF.Min - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3753,9 +3788,10 @@ for (unsigned Part = 0; Part < UF; ++Part) { Value *PreviousPart = getOrCreateVectorValue(Previous, Part); Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); - auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, - ShuffleMask) - : Incoming; + auto *Shuffle = + VF.isVector() + ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) + : Incoming; PhiPart->replaceAllUsesWith(Shuffle); cast(PhiPart)->eraseFromParent(); VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); @@ -3768,10 +3804,10 @@ // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. 
auto *ExtractForScalar = Incoming; - if (VF > 1) { + if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -3779,9 +3815,9 @@ // will be the value when jumping to the exit block from the LoopMiddleBlock, // when the scalar loop is not run at all. Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF > 1) + if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -3860,7 +3896,7 @@ // incoming scalar reduction. VectorStart = ReductionStartValue; } else { - Identity = ConstantVector::getSplat({VF, false}, Iden); + Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. @@ -3921,9 +3957,9 @@ // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. 
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { + if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF); + Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF.Min); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); VectorParts RdxParts(UF); @@ -3975,7 +4011,7 @@ // Create the reduction after the loop. Note that inloop reductions create the // target reduction in the loop using a Reduction recipe. - if (VF > 1 && !IsInLoopReductionPhi) { + if (VF.isVector() && !IsInLoopReductionPhi) { bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); @@ -4054,16 +4090,17 @@ } void InnerLoopVectorizer::fixLCSSAPHIs() { + assert(!VF.Scalable && "the code below assumes fixed width vectors"); for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); // Non-instruction incoming values will have only one value. unsigned LastLane = 0; - if (isa(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast(IncomingValue), VF) - ? 0 - : VF - 1; + if (isa(IncomingValue)) + LastLane = Cost->isUniformAfterVectorization( + cast(IncomingValue), VF) + ? 0 + : VF.Min - 1; // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4175,7 +4212,7 @@ } void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, - unsigned UF, unsigned VF, + unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant, VPTransformState &State) { @@ -4185,7 +4222,7 @@ // is vector-typed. 
Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. Thus, to ensure we @@ -4245,15 +4282,16 @@ } void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, - unsigned VF) { + ElementCount VF) { + assert(!VF.Scalable && "invalid number of elements"); PHINode *P = cast(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction // PHIs where all control flow is uniform. We simply widen these PHIs. // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. - Type *VecTy = - (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + Type *VecTy = (VF.isScalar()) ? PN->getType() + : FixedVectorType::get(PN->getType(), VF.Min); Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); VectorLoopValueMap.setVectorValue(P, 0, VecPhi); OrigPHIsToFix.push_back(P); @@ -4271,9 +4309,10 @@ if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. - bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast(PN)); - Type *VecTy = - ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF); + bool ScalarPHI = + (VF.isScalar()) || Cost->isInLoopReduction(cast(PN)); + Type *VecTy = ScalarPHI ? 
PN->getType() + : FixedVectorType::get(PN->getType(), VF.Min); Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); VectorLoopValueMap.setVectorValue(P, Part, EntryPart); @@ -4309,10 +4348,11 @@ // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min; for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); + Constant *Idx = + ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); @@ -4342,7 +4382,8 @@ Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); Value *InductionGEP = GetElementPtrInst::Create( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)), + Builder.CreateMul(ScalarStepValue, + ConstantInt::get(PhiType, VF.Min * UF)), "ptr.ind", InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); @@ -4352,14 +4393,14 @@ for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(PhiType, i + Part * VF)); + for (unsigned i = 0; i < VF.Min; ++i) + Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min)); Constant *StartOffset = ConstantVector::get(Indices); Value *GEP = Builder.CreateGEP( ScStValueType->getPointerElementType(), NewPointerPhi, Builder.CreateMul(StartOffset, - Builder.CreateVectorSplat(VF, ScalarStepValue), + Builder.CreateVectorSplat(VF.Min, ScalarStepValue), "vector.gep")); VectorLoopValueMap.setVectorValue(P, Part, GEP); } @@ -4387,6 +4428,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VPTransformState &State) { + assert(!VF.Scalable && "invalid number of elements"); switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: @@ -4474,8 +4516,9 @@ setDebugLocFromInst(Builder, CI); /// Vectorize casts. - Type *DestTy = - (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF); + Type *DestTy = (VF.isScalar()) + ? CI->getType() + : FixedVectorType::get(CI->getType(), VF.Min); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = State.get(User.getOperand(0), Part); @@ -4503,7 +4546,7 @@ SmallVector Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4534,15 +4577,14 @@ if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) + if (VF.isVector()) TysForDecl[0] = - FixedVectorType::get(CI->getType()->getScalarType(), VF); + FixedVectorType::get(CI->getType()->getScalarType(), VF.Min); VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. 
- const VFShape Shape = - VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); + const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); @@ -4585,11 +4627,11 @@ } } -void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { +void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && "This function should not be visited twice for the same VF"); SmallSetVector Worklist; @@ -4772,7 +4814,9 @@ Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, + ElementCount VF) { + assert(!VF.Scalable && "invalid number of elements"); if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -4786,7 +4830,7 @@ auto *Ty = getMemInstValueType(I); // We have already decided how to vectorize this instruction, get that // result. 
- if (VF > 1) { + if (VF.isVector()) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4807,8 +4851,8 @@ return false; } -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( + Instruction *I, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -4844,8 +4888,8 @@ : TTI.isLegalMaskedStore(Ty, Alignment); } -bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( + Instruction *I, ElementCount VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast(I); StoreInst *SI = dyn_cast(I); @@ -4872,13 +4916,13 @@ return true; } -void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { +void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -4929,7 +4973,7 @@ // Holds pointer operands of instructions that are possibly non-uniform. 
SmallPtrSet PossibleNonUniformPtrs; - auto isUniformDecision = [&](Instruction *I, unsigned VF) { + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -5226,10 +5270,10 @@ (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). - SmallVector VFs; + SmallVector VFs; unsigned NewMaxVectorSize = WidestRegister / SmallestType; for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(VS); + VFs.push_back({VS, false}); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5244,7 +5288,7 @@ Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].Min; break; } } @@ -5261,7 +5305,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(1).first; + float Cost = expectedCost(ElementCount::getScalar()).first; const float ScalarCost = Cost; unsigned Width = 1; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); @@ -5278,7 +5322,7 @@ // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. 
- VectorizationCostTy C = expectedCost(i); + VectorizationCostTy C = expectedCost({i, false}); float VectorCost = C.first / (float)i; LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); @@ -5306,7 +5350,7 @@ << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; + VectorizationFactor Factor = {{Width, false}, (unsigned)(Width * Cost)}; return Factor; } @@ -5366,7 +5410,7 @@ return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5444,7 +5488,7 @@ } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min); // Check if the user has overridden the max. if (VF == 1) { @@ -5458,7 +5502,7 @@ // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF. if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5477,7 +5521,7 @@ // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. 
- if (VF > 1 && !Legal->getReductionVars().empty()) { + if (VF.isVector() && !Legal->getReductionVars().empty()) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5485,7 +5529,7 @@ // Note that if we've already vectorized the loop we will have done the // runtime check and so interleaving won't require further checks. bool InterleavingRequiresRuntimePointerCheck = - (VF == 1 && Legal->getRuntimePointerChecking()->Need); + (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. @@ -5539,7 +5583,7 @@ } SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { +LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5626,11 +5670,11 @@ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. - auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { if (Ty->isTokenTy()) return 0U; unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - return std::max(1, VF * TypeSize / WidestRegister); + return std::max(1, VF.Min * TypeSize / WidestRegister); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5654,7 +5698,7 @@ // Count the number of live intervals. 
SmallMapVector RegUsage; - if (VFs[j] == 1) { + if (VFs[j].isScalar()) { for (auto Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); if (RegUsage.find(ClassID) == RegUsage.end()) @@ -5703,8 +5747,10 @@ SmallMapVector Invariant; for (auto Inst : LoopInvariants) { - unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); - unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + unsigned Usage = + VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = + TTI.getRegisterClassForType(VFs[i].Min > 1, Inst->getType()); if (Invariant.find(ClassID) == Invariant.end()) Invariant[ClassID] = Usage; else @@ -5752,12 +5798,12 @@ NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.Min < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -5787,7 +5833,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap &ScalarCosts, - unsigned VF) { + ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); @@ -5854,16 +5900,19 @@ // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. 
- unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + unsigned ScalarCost = + VF.Min * getInstructionCost(I, ElementCount::getScalar()).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF), true, false); - ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI, - TTI::TCK_RecipThroughput); + APInt::getAllOnesValue(VF.Min), true, false); + assert(!VF.Scalable && "invalid number of elements"); + ScalarCost += + VF.Min * + TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } // Compute the scalarization overhead of needed extractelement @@ -5876,10 +5925,12 @@ "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) + else if (needsExtract(J, VF)) { + assert(!VF.Scalable && "invalid number of elements"); ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF), false, true); + APInt::getAllOnesValue(VF.Min), false, true); + } } // Scale the total scalar cost by block probability. @@ -5895,7 +5946,8 @@ } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(unsigned VF) { +LoopVectorizationCostModel::expectedCost(ElementCount VF) { + assert(!VF.Scalable && "invalid number of elements"); VectorizationCostTy Cost; // For each block. @@ -5905,7 +5957,8 @@ // For each instruction in the old loop. for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. - if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) + if (ValuesToIgnore.count(&I) || + (VF.isVector() && VecValuesToIgnore.count(&I))) continue; VectorizationCostTy C = getInstructionCost(&I, VF); @@ -5927,7 +5980,7 @@ // unconditionally executed. 
For the scalar case, we may not always execute // the predicated block. Thus, scale the block's cost by the probability of // executing it. - if (VF == 1 && blockNeedsPredication(BB)) + if (VF.isScalar() && blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -5972,9 +6025,12 @@ Legal->hasStride(I->getOperand(1)); } -unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - unsigned VF) { - assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); +unsigned +LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + ElementCount VF) { + assert(VF.isVector() && + "Scalarization cost of instruction implies vectorization."); + assert(!VF.Scalable && "invalid number of elements"); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -5987,14 +6043,14 @@ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment, AS, - TTI::TCK_RecipThroughput); + Cost += VF.Min * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS, TTI::TCK_RecipThroughput); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. 
@@ -6016,7 +6072,7 @@ } unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6042,7 +6098,7 @@ } unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6060,14 +6116,13 @@ return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + - (isLoopInvariantStoreValue - ? 0 - : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF - 1)); + (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( + Instruction::ExtractElement, + VectorTy, VF.Min - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6080,7 +6135,7 @@ } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - unsigned VF) { + ElementCount VF) { Type *ValTy = getMemInstValueType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6089,7 +6144,7 @@ assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor); + auto *WideVecTy = FixedVectorType::get(ValTy, VF.Min * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. // An interleaved store group doesn't need this as it doesn't allow gaps. 
@@ -6118,10 +6173,10 @@ } unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - unsigned VF) { + ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. - if (VF == 1) { + if (VF.isScalar()) { Type *ValTy = getMemInstValueType(I); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); @@ -6134,35 +6189,40 @@ } LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { + assert(!VF.Scalable); // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) - VF = 1; + VF = ElementCount::getScalar(); - if (VF > 1 && isProfitableToScalarize(I, VF)) + if (VF.isVector() && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); // Forced scalars do not have any scalarization overhead. 
auto ForcedScalar = ForcedScalars.find(VF); - if (VF > 1 && ForcedScalar != ForcedScalars.end()) { + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) - return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + return VectorizationCostTy( + (getInstructionCost(I, ElementCount::getScalar()).first * VF.Min), + false); } Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - bool TypeNotScalarized = - VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; + bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.Min; return VectorizationCostTy(C, TypeNotScalarized); } unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - unsigned VF) { + ElementCount VF) { - if (VF == 1) + assert(!VF.Scalable); + if (VF.isScalar()) return 0; unsigned Cost = 0; @@ -6170,7 +6230,7 @@ if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnesValue(VF), true, false); + cast(RetTy), APInt::getAllOnesValue(VF.Min), true, false); // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6186,12 +6246,14 @@ // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
- return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF); + return Cost + + TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF), + VF.Min); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { - if (VF == 1) +void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { + assert(!VF.Scalable && "invalid number of elements"); + if (VF.isScalar() && !VF.Scalable) return; NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { @@ -6325,14 +6387,17 @@ InstWidening Decision = getWideningDecision(I, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. - setWideningDecision(I, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(I, 1))); + setWideningDecision( + I, VF, CM_Scalarize, + (VF.Min * getMemoryInstructionCost(I, ElementCount::getScalar()))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) - setWideningDecision(Member, VF, CM_Scalarize, - (VF * getMemoryInstructionCost(Member, 1))); + setWideningDecision( + Member, VF, CM_Scalarize, + (VF.Min * + getMemoryInstructionCost(Member, ElementCount::getScalar()))); } } } else @@ -6343,7 +6408,7 @@ } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, - unsigned VF, + ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) @@ -6366,19 +6431,19 @@ // blocks requires also an extract of its vector compare i1 element. 
bool ScalarPredicatedBB = false; BranchInst *BI = cast(I); - if (VF > 1 && BI->isConditional() && + if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - auto *Vec_i1Ty = - FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); - return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), - false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF)); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) + auto *Vec_i1Ty = FixedVectorType::get( + IntegerType::getInt1Ty(RetTy->getContext()), VF.Min); + return (TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min)); + } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); else @@ -6393,15 +6458,15 @@ // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) + if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast(VectorTy), VF - 1, + cast(VectorTy), VF.Min - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. 
- if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( Instruction::Select, ToVectorTy(Phi->getType(), VF), @@ -6418,17 +6483,18 @@ // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I)) { unsigned Cost = 0; // These instructions have a non-void type, so account for the phi nodes // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += + VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6467,14 +6533,14 @@ Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.Min : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6488,7 +6554,7 @@ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) - CondTy = FixedVectorType::get(CondTy, VF); + CondTy = FixedVectorType::get(CondTy, VF.Min); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, CostKind, I); @@ -6505,13 +6571,13 @@ } case Instruction::Store: case Instruction::Load: { - unsigned Width = VF; - if (Width > 1) { + ElementCount Width = VF; + if (Width.Min > 1) { InstWidening Decision = getWideningDecision(I, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); if (Decision == CM_Scalarize) - Width = 1; + Width = ElementCount::getScalar(); } VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); @@ -6533,7 +6599,7 @@ assert((isa(I) || isa(I)) && "Expected a load or a store!"); - if (VF == 1 || !TheLoop->contains(I)) + if (VF.isScalar() || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { @@ -6599,7 +6665,7 @@ } } - unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; return N * TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } @@ -6614,8 +6680,9 @@ default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF.Min * + TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, + CostKind) + getScalarizationOverhead(I, VF); } // end of switch. 
} @@ -6721,8 +6788,9 @@ } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { - unsigned VF = UserVF; +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + assert(!UserVF.Scalable && "invalid number of lanes"); + unsigned VF = UserVF.Min; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -6730,7 +6798,7 @@ if (!OrigLoop->empty()) { // If the user doesn't provide a vectorization factor, determine a // reasonable one. - if (!UserVF) { + if (!UserVF.Min) { VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); @@ -6743,15 +6811,16 @@ } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF - << " to build VPlans.\n"); + LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF.Min ? "user " : "") << "VF " + << VF << " to build VPlans.\n"); buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0}; + const ElementCount EC = {VF, false}; + return {EC, 0}; } LLVM_DEBUG( @@ -6760,10 +6829,11 @@ return VectorizationFactor::Disabled(); } -Optional LoopVectorizationPlanner::plan(unsigned UserVF, - unsigned UserIC) { +Optional +LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(!UserVF.Scalable && "scalable vectorization not yet handled"); assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); + Optional MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 
return None; @@ -6781,14 +6851,14 @@ CM.invalidateCostModelingDecisions(); } - if (UserVF) { + if (!UserVF.isZero()) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } @@ -6798,12 +6868,12 @@ for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); + CM.collectUniformsAndScalars({VF, /*Scalable*/ false}); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. if (VF > 1) - CM.collectInstsToScalarize(VF); + CM.collectInstsToScalarize({VF, false}); } CM.collectInLoopReductions(); @@ -6817,7 +6887,7 @@ return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; @@ -6952,12 +7022,12 @@ } bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function &Predicate, VFRange &Range) { + const std::function &Predicate, VFRange &Range) { assert(Range.End > Range.Start && "Trying to test an empty VF range."); - bool PredicateAtRangeStart = Predicate(Range.Start); + bool PredicateAtRangeStart = Predicate({Range.Start, false /*Scalable*/}); for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) - if (Predicate(TmpVF) != PredicateAtRangeStart) { + if (Predicate({TmpVF, false /*Scalable*/}) != 
PredicateAtRangeStart) { Range.End = TmpVF; break; } @@ -7068,8 +7138,9 @@ assert((isa(I) || isa(I)) && "Must be called with either a load or store"); - auto willWiden = [&](unsigned VF) -> bool { - if (VF == 1) + auto willWiden = [&](ElementCount VF) -> bool { + assert(!VF.Scalable && "unexpected scalable ElementCount"); + if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); @@ -7122,9 +7193,10 @@ // Determine whether \p K is a truncation based on an induction variable that // can be optimized. auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function { - return - [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; + [&](Instruction *K) -> std::function { + return [=](ElementCount VF) -> bool { + return CM.isOptimizableIVTruncate(K, VF); + }; }; if (LoopVectorizationPlanner::getDecisionAndClampRange( @@ -7159,7 +7231,9 @@ VPlan &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -7170,7 +7244,7 @@ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) return nullptr; - auto willWiden = [&](unsigned VF) -> bool { + auto willWiden = [&](ElementCount VF) -> bool { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -7194,7 +7268,7 @@ !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. 
- auto WillScalarize = [this, I](unsigned VF) -> bool { + auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I, VF); @@ -7257,11 +7331,12 @@ DenseMap &PredInst2Recipe, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated); @@ -7469,8 +7544,8 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 + auto applyIG = [IG, this](ElementCount VF) -> bool { + return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); }; @@ -7595,10 +7670,10 @@ std::string PlanName; raw_string_ostream RSO(PlanName); - unsigned VF = Range.Start; + ElementCount VF = {Range.Start, false}; Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF < Range.End; VF *= 2) { + for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7625,7 +7700,7 @@ HCFGBuilder.buildHierarchicalCFG(); for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) - Plan->addVF(VF); + Plan->addVF({VF, false}); if (EnableVPlanPredication) { VPlanPredicator VPP(*Plan); @@ -7819,11 +7894,11 @@ State.ILV->scalarizeInstruction(Ingredient, User, 
*State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF > 1) { + if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from undef. if (State.Instance->Lane == 0) { Value *Undef = UndefValue::get( - FixedVectorType::get(Ingredient->getType(), State.VF)); + FixedVectorType::get(Ingredient->getType(), State.VF.Min)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); } State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); @@ -7834,7 +7909,7 @@ // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF; + unsigned EndLane = IsUniform ? 1 : State.VF.Min; for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, @@ -7975,7 +8050,7 @@ const unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath({UserVF, false}); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -8141,7 +8216,7 @@ unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(UserVF, UserIC); + Optional MaybeVF = LVP.plan({UserVF, false}, UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -115,7 +115,7 @@ /// The vectorization factor. 
Each entry in the scalar map contains UF x VF /// scalar values. - unsigned VF; + ElementCount VF; /// The vector and scalar map storage. We use std::map and not DenseMap /// because insertions to DenseMap invalidate its iterators. @@ -126,7 +126,7 @@ public: /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} /// \return True if the map has any vector entry for \p Key. bool hasAnyVectorValue(Value *Key) const { @@ -151,12 +151,13 @@ /// \return True if the map has a scalar entry for \p Key and \p Instance. bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); + assert(Instance.Lane < VF.Min && !VF.Scalable && + "Queried Scalar Lane is too large."); if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF && + assert(Entry[Instance.Part].size() == VF.Min && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -195,7 +196,7 @@ // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF, nullptr); + Entry[Part].resize(VF.Min, nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; @@ -234,14 +235,15 @@ /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. 
struct VPTransformState { - VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, - IRBuilder<> &Builder, VectorizerValueMap &ValueMap, - InnerLoopVectorizer *ILV, VPCallback &Callback) + VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, + DominatorTree *DT, IRBuilder<> &Builder, + VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, + VPCallback &Callback) : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. - unsigned VF; + ElementCount VF; unsigned UF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -1583,7 +1585,7 @@ VPBlockBase *Entry; /// Holds the VFs applicable to this VPlan. - SmallSet VFs; + SmallSet VFs; /// Holds the name of the VPlan, for printing. std::string Name; @@ -1647,9 +1649,9 @@ return BackedgeTakenCount; } - void addVF(unsigned VF) { VFs.insert(VF); } + void addVF(ElementCount VF) { VFs.insert(VF); } - bool hasVF(unsigned VF) { return VFs.count(VF); } + bool hasVF(ElementCount VF) { return VFs.count(VF); } const std::string &getName() const { return Name; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -300,7 +300,7 @@ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { + for (unsigned Lane = 0, VF = State->VF.Min; Lane < VF; ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. 
for (VPBlockBase *Block : RPOT) { @@ -387,7 +387,7 @@ Value *ScalarBTC = State.get(getOperand(1), {Part, 0}); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.Min); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); @@ -838,14 +838,14 @@ Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - auto VF = State.VF; - Value *VStart = VF == 1 - ? CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + ElementCount VF = State.VF; + Value *VStart = + VF.Min == 1 ? CanonicalIV + : Builder.CreateVectorSplat(VF.Min, CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector Indices; - for (unsigned Lane = 0; Lane < VF; ++Lane) - Indices.push_back(ConstantInt::get(STy, Part * VF + Lane)); + for (unsigned Lane = 0; Lane < VF.Min; ++Lane) + Indices.push_back(ConstantInt::get(STy, Part * VF.Min + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices);