Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -918,6 +918,10 @@ unsigned ChainSizeInBytes, VectorType *VecTy) const; + /// \returns The minimum vector factor for vectors of elements of type + /// \p ElementTy. + unsigned getMinimumVF(Type *ElementTy) const; + /// Flags describing the kind of vector reduction. struct ReductionFlags { ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {} @@ -1124,6 +1128,7 @@ virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual unsigned getMinimumVF(Type *ElementTy) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; @@ -1501,6 +1506,9 @@ VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + unsigned getMinimumVF(Type *ElementTy) const override { + return Impl.getMinimumVF(ElementTy); + } bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -541,6 +541,10 @@ return VF; } + unsigned getMinimumVF(Type *ElementTy) const { + return 2; + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -593,6 +593,10 @@ return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +unsigned TargetTransformInfo::getMinimumVF(Type *ElementTy) const { + return TTIImpl->getMinimumVF(ElementTy); +} + bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1787,15 +1787,15 @@ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} - /// \return An upper bound for the vectorization factor, or None if - /// vectorization should be avoided up front. - Optional computeMaxVF(bool OptForSize); + /// \return A lower and an upper bounds for the vectorization factor, + /// or None if vectorization should be avoided up front. + Optional> computeMaxVF(bool OptForSize); /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every power of two up to MaxVF. If UserVF is not ZERO - /// then this vectorization factor will be selected if vectorization is - /// possible. - VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + /// This method checks every power of two from MinVF up to MaxVF. If UserVF + /// is not ZERO then this vectorization factor will be selected if + /// vectorization is possible. + VectorizationFactor selectVectorizationFactor(unsigned MinVF, unsigned MaxVF); /// Setup cost-based decisions for user vectorization factor. void selectUserVectorizationFactor(unsigned UserVF) { @@ -1806,7 +1806,7 @@ /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - std::pair getSmallestAndWidestTypes(); + SmallVector getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -2028,9 +2028,11 @@ private: unsigned NumPredStores = 0; - /// \return An upper bound for the vectorization factor, larger than zero. - /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + /// \return A lower and an upper bound for the vectorization factor, + /// larger than zero. One is returned if vectorization should best be + /// avoided due to cost. + std::pair computeFeasibleMaxVF(bool OptForSize, + unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -6017,7 +6019,8 @@ } } -Optional LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { +Optional> +LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -6056,9 +6059,9 @@ return None; } - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); + std::pair Range = computeFeasibleMaxVF(OptForSize, TC); - if (TC % MaxVF != 0) { + if (TC % Range.first != 0 || TC % Range.second != 0) { // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. // FIXME: look for a smaller MaxVF that does divide TC rather than give up. @@ -6074,15 +6077,19 @@ return None; } - return MaxVF; + return Range; } -unsigned +std::pair LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); - unsigned SmallestType, WidestType; - std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); + SmallVector ElemTypes = getSmallestAndWidestTypes(); + if (ElemTypes.empty()) + return { 1, 1 }; + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + unsigned SmallestType = DL.getTypeSizeInBits(ElemTypes.front()); + unsigned WidestType = std::max(8ul, DL.getTypeSizeInBits(ElemTypes.back())); unsigned WidestRegister = TTI.getRegisterBitWidth(true); // Get the maximum safe dependence distance in bits computed by LAA. @@ -6093,7 +6100,11 @@ WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); - unsigned MaxVectorSize = WidestRegister / WidestType; + unsigned MinVF = 1; + for (Type *Ty : ElemTypes) + MinVF = std::max(MinVF, TTI.getMinimumVF(Ty)); + + unsigned MaxVectorSize = std::max(MinVF, WidestRegister / WidestType); DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); @@ -6105,7 +6116,7 @@ if (MaxVectorSize == 0) { DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; - return MaxVectorSize; + return { MaxVectorSize, MaxVectorSize }; } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in @@ -6113,7 +6124,7 @@ DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; - return MaxVectorSize; + return { MaxVectorSize, MaxVectorSize }; } unsigned MaxVF = MaxVectorSize; @@ -6138,11 +6149,12 @@ } } } - return MaxVF; + return { MinVF, MaxVF }; } VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { +LoopVectorizationCostModel::selectVectorizationFactor(unsigned MinVF, + unsigned MaxVF) { float Cost = expectedCost(1).first; const float ScalarCost = Cost; unsigned Width = 1; @@ -6151,11 +6163,11 @@ bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; // Ignore scalar width, because the user explicitly wants vectorization. if (ForceVectorization && MaxVF > 1) { - Width = 2; + Width = MinVF; Cost = expectedCost(Width).first / (float)Width; } - for (unsigned i = 2; i <= MaxVF; i *= 2) { + for (unsigned i = MinVF; i <= MaxVF; i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. @@ -6191,10 +6203,10 @@ return Factor; } -std::pair +SmallVector LoopVectorizationCostModel::getSmallestAndWidestTypes() { - unsigned MinWidth = -1U; - unsigned MaxWidth = 8; + SmallVector SortedTypes; + SetVector ElemTypes; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); // For each block. @@ -6237,14 +6249,16 @@ !Legal->isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) continue; - MinWidth = std::min(MinWidth, - (unsigned)DL.getTypeSizeInBits(T->getScalarType())); - MaxWidth = std::max(MaxWidth, - (unsigned)DL.getTypeSizeInBits(T->getScalarType())); + ElemTypes.insert(T->getScalarType()); } } - return {MinWidth, MaxWidth}; + SortedTypes.assign(ElemTypes.begin(), ElemTypes.end()); + llvm::sort(SortedTypes.begin(), SortedTypes.end(), + [&DL] (Type *T0, Type *T1) { + return DL.getTypeSizeInBits(T0) < DL.getTypeSizeInBits(T1); + }); + return SortedTypes; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -7396,7 +7410,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { // Width 1 means no vectorize, cost 0 means uncomputed cost. const VectorizationFactor NoVectorization = {1U, 0U}; - Optional MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional> MaybeMaxVF = + CM.computeMaxVF(OptForSize); if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize. return NoVectorization; @@ -7411,26 +7426,32 @@ return {UserVF, 0}; } - unsigned MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF != 0 && "MaxVF is zero."); + unsigned MinVF, MaxVF; + std::tie(MinVF, MaxVF) = MaybeMaxVF.getValue(); + assert(MinVF != 0 && MaxVF != 0 && "MinVF or MaxVF is zero."); + + // Collect Uniform and Scalar instructions. + if (MinVF > 1) + CM.collectUniformsAndScalars(1); - for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { + for (unsigned VF = MinVF; VF <= MaxVF; VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF > 1) - CM.collectInstsToScalarize(VF); + CM.collectInstsToScalarize(VF); } - buildVPlans(1, MaxVF); + if (MinVF > 1) + buildVPlans(1, 1); + buildVPlans(MinVF, MaxVF); DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; // Select the optimal vectorization factor. - return CM.selectVectorizationFactor(MaxVF); + return CM.selectVectorizationFactor(MinVF, MaxVF); } void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { Index: test/Transforms/LoopUnroll/runtime-loop4.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop4.ll +++ test/Transforms/LoopUnroll/runtime-loop4.ll @@ -37,7 +37,7 @@ store i32 %iv2, i32* %offset2, align 4 %inc2 = add i32 %iv2, 1 %exitcnd2 = icmp uge i32 %inc2, %iter - br i1 %exitcnd2, label %exit2, label %loop2 + br i1 %exitcnd2, label %exit2, label %loop2, !llvm.loop !0 exit2: br label %loop1.latch @@ -45,8 +45,11 @@ loop1.latch: %inc1 = add i32 %iv1, 1 %exitcnd1 = icmp uge i32 %inc1, 1024 - br i1 %exitcnd1, label %exit, label %loop1 + br i1 %exitcnd1, label %exit, label %loop1, !llvm.loop !0 exit: ret void } + +!0 = !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 0} Index: test/Transforms/LoopVectorize/X86/funclet.ll =================================================================== --- test/Transforms/LoopVectorize/X86/funclet.ll +++ test/Transforms/LoopVectorize/X86/funclet.ll @@ -22,7 +22,7 @@ %call = call double @floor(double 1.0) #1 [ "funclet"(token %1) ] %inc = add nuw nsw i32 %i.07, 1 %exitcond = icmp eq i32 %inc, 1024 - br i1 %exitcond, label %for.cond.cleanup, label %for.body + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0 try.cont: ; preds = %for.cond.cleanup ret void @@ -43,3 +43,6 @@ attributes #0 = { "target-features"="+sse2" } attributes #1 = { nounwind readnone } + +!0 = !{!0, !1} +!1 = !{!"llvm.loop.vectorize.width", i32 16} Index: test/Transforms/LoopVectorize/pr32859.ll =================================================================== --- test/Transforms/LoopVectorize/pr32859.ll +++ test/Transforms/LoopVectorize/pr32859.ll @@ -22,9 +22,12 @@ if.end.2.i: ; preds = %for.cond1.preheader.i %inc5.i = add nsw i32 %c.06.i, 1 %cmp.i = icmp slt i32 %inc5.i, 16 - br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader + br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader, !llvm.loop !0 for.cond.preheader: ; preds = %if.end.2.i %e.0.ph = phi i32 [ 0, %if.end.2.i ] unreachable } + +!0 = !{!0, !1} +!1 = !{!"llvm.loop.vectorize.width", i32 4} Index: test/Transforms/LoopVectorize/unroll.ll =================================================================== --- test/Transforms/LoopVectorize/unroll.ll +++ test/Transforms/LoopVectorize/unroll.ll @@ -27,7 +27,7 @@ store i32 3, i32* %arrayidx, align 4 %inc = add nuw nsw i64 %i.06, 1 %cmp = icmp slt i64 %inc, %conv - br i1 %cmp, label %for.body, label %for.end.loopexit + br i1 %cmp, label %for.body, label %for.end.loopexit, !llvm.loop !0 for.end.loopexit: ; preds = %for.body br label %for.end @@ -35,3 +35,6 @@ for.end: ; preds = %for.end.loopexit, %entry ret void } + +!0 = !{!0, !1} +!1 = !{!"llvm.loop.vectorize.width", i32 1}