Index: llvm/include/llvm/Support/TypeSize.h =================================================================== --- llvm/include/llvm/Support/TypeSize.h +++ llvm/include/llvm/Support/TypeSize.h @@ -349,6 +349,11 @@ isScalable())); } + LeafTy inc(ScalarTy Amount) const { + return static_cast( + LinearPolySize::get(getKnownMinValue() + Amount, isScalable())); + } + /// Printing function. void print(raw_ostream &OS) const { if (isScalable()) Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -281,7 +281,7 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(unsigned MinVF, unsigned MaxVF); + void buildVPlans(ElementCount MinVF, ElementCount MaxVF); private: /// Build a VPlan according to the information gathered by Legal. \return a @@ -299,7 +299,7 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); /// Adjust the recipes for any inloop reductions. The chain of instructions /// leading from the loop exit instr to the phi need to be converted to Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1061,7 +1061,7 @@ /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional computeMaxVF(unsigned UserVF, unsigned UserIC); + Optional computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1071,7 +1071,7 @@ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); /// Setup cost-based decisions for user vectorization factor. void selectUserVectorizationFactor(ElementCount UserVF) { @@ -1438,7 +1438,7 @@ /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - unsigned computeFeasibleMaxVF(unsigned ConstTripCount); + ElementCount computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -2269,7 +2269,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - assert(!VF.isScalable() && "Cannot reverse scalable vectors"); + assert(!VF.isScalable() && "Cannot reverse scalable vectors yet"); SmallVector ShuffleMask; for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); @@ -2766,6 +2766,7 @@ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Type *Ty = TC->getType(); + // This is where we can make the step a runtime constant. assert(!VF.isScalable() && "scalable vectorization is not supported yet"); Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); @@ -5208,8 +5209,8 @@ return false; } -Optional LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, - unsigned UserIC) { +Optional +LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -5267,9 +5268,11 @@ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); - assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; + ElementCount MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); + assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getKnownMinValue())) && + "MaxVF must be a power of 2"); + unsigned MaxVFtimesIC = + UserIC ? MaxVF.getKnownMinValue() * UserIC : MaxVF.getKnownMinValue(); if (TC > 0 && TC % MaxVFtimesIC == 0) { // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); @@ -5315,7 +5318,7 @@ return None; } -unsigned +ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; @@ -5344,7 +5347,7 @@ if (MaxVectorSize == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; - return MaxVectorSize; + return ElementCount::getFixed(MaxVectorSize); } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in @@ -5352,7 +5355,7 @@ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; - return MaxVectorSize; + return ElementCount::getFixed(MaxVectorSize); } unsigned MaxVF = MaxVectorSize; @@ -5390,25 +5393,30 @@ } } } - return MaxVF; + return ElementCount::getFixed(MaxVF); } VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { +LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { + // This can be fixed for scalable vectors later, as we this prototype is only + // able to vectorize a loop when it has the loop-hint to enable vectorization + // for a certain VF. + assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); + float Cost = expectedCost(ElementCount::getFixed(1)).first; const float ScalarCost = Cost; unsigned Width = 1; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF > 1) { + if (ForceVectorization && MaxVF.isVector()) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. Cost = std::numeric_limits::max(); } - for (unsigned i = 2; i <= MaxVF; i *= 2) { + for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. @@ -6948,7 +6956,7 @@ "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); + buildVPlans(VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) @@ -6967,8 +6975,8 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional MaybeMaxVF = - CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); + Optional MaybeMaxVF = + CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6994,30 +7002,33 @@ // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), - UserVF.getKnownMinValue()); + buildVPlansWithVPRecipes(UserVF, + UserVF); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } - unsigned MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF != 0 && "MaxVF is zero."); + ElementCount MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF.isNonZero() && "MaxVF is zero."); + assert(!MaxVF.isScalable() && + "Scalable vectors not yet supported beyond this point"); - for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { + for (ElementCount VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); + CM.collectUniformsAndScalars(VF); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF > 1) - CM.collectInstsToScalarize(ElementCount::getFixed(VF)); + if (VF.isVector()) + CM.collectInstsToScalarize(VF); } CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(1, MaxVF); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF == 1) + if (MaxVF.isScalar()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. @@ -7169,11 +7180,13 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( const std::function &Predicate, VFRange &Range) { - assert(Range.End > Range.Start && "Trying to test an empty VF range."); - bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); + assert(ElementCount::isKnownGT(Range.End, Range.Start) && + "Trying to test an empty VF range."); + bool PredicateAtRangeStart = Predicate(Range.Start); - for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) - if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { + for (ElementCount TmpVF = Range.Start * 2; + ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) + if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; } @@ -7186,9 +7199,10 @@ /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; +void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, + ElementCount MaxVF) { + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVF.inc(1));) { + VFRange SubRange = {VF, MaxVF.inc(1) }; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } @@ -7296,7 +7310,6 @@ "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { - assert(!VF.isScalable() && "unexpected scalable ElementCount"); if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = @@ -7594,8 +7607,8 @@ return tryToWiden(Instr, *Plan); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, - unsigned MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, + ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); // Collect conditions feeding internal conditional branches; they need to be @@ -7645,8 +7658,8 @@ for (Instruction *I : DeadInstructions) SinkAfter.erase(I); - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVF.inc(1));) { + VFRange SubRange = {VF, MaxVF.inc(1)}; VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions, SinkAfter)); VF = SubRange.End; @@ -7808,7 +7821,7 @@ } // Adjust the recipes for any inloop reductions. - if (Range.Start > 1) + if (Range.Start.isVector()) adjustRecipesForInLoopReductions(Plan, RecipeBuilder); // Finally, if tail is folded by masking, introduce selects between the phi @@ -7827,10 +7840,12 @@ std::string PlanName; raw_string_ostream RSO(PlanName); - ElementCount VF = ElementCount::getFixed(Range.Start); + ElementCount VF = Range.Start; Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) { + assert(VF.isScalable() == Range.End.isScalable() && + "Scalable flags don't match"); + for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7856,8 +7871,9 @@ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) - Plan->addVF(ElementCount::getFixed(VF)); + for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); + VF *= 2) + Plan->addVF(VF); if (EnableVPlanPredication) { VPlanPredicator VPP(*Plan); Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -65,10 +65,16 @@ /// [1, 9) = {1, 2, 4, 8} struct VFRange { // A power of 2. - const unsigned Start; + const ElementCount Start; // Need not be a power of 2. If End <= Start range is empty. - unsigned End; + ElementCount End; + + VFRange(const ElementCount &Start, const ElementCount &End) + : Start(Start), End(End) { + assert(Start.isScalable() == End.isScalable() && + "Both Start and End should have the same scalable flag"); + } }; using VPlanPtr = std::unique_ptr; @@ -151,7 +157,6 @@ assert(Instance.Part < UF && "Queried Scalar Part is too large."); assert(Instance.Lane < VF.getKnownMinValue() && "Queried Scalar Lane is too large."); - assert(!VF.isScalable() && "VF is assumed to be non scalable."); if (!hasAnyScalarValue(Key)) return false;