diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -200,6 +200,35 @@ } }; +/// A class that represents two maximum vectorization factors (initialized with +/// 0 by default). One for fixed-width vectorization and one for scalable +/// vectorization. This can be used by the vectorizer to choose from a range of +/// fixed and/or scalable VFs in order to find the most cost-effective VF to +/// vectorize with. +struct MaxVFCandidates { + ElementCount FixedVF; + ElementCount ScalableVF; + + MaxVFCandidates() + : FixedVF(ElementCount::getFixed(0)), + ScalableVF(ElementCount::getScalable(0)) {} + MaxVFCandidates(const ElementCount &Max) : MaxVFCandidates() { + *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max; + } + MaxVFCandidates(const ElementCount &FixedVF, const ElementCount &ScalableVF) + : FixedVF(FixedVF), ScalableVF(ScalableVF) {} + + static MaxVFCandidates None() { return MaxVFCandidates(); } + + /// \return true if either max fixed- or scalable VF is non-zero. + explicit operator bool() const { return FixedVF || ScalableVF; } + + /// \return true if either max fixed- or scalable VF is a valid vector VF. + bool hasVectorCandidates() const { + return FixedVF.isVector() || ScalableVF.isVector(); + } +}; + /// Planner drives the vectorization process after having passed /// Legality checks. class LoopVectorizationPlanner { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1231,7 +1231,7 @@ /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional computeMaxVF(ElementCount UserVF, unsigned UserIC); + MaxVFCandidates computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1630,8 +1630,8 @@ /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + MaxVFCandidates computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. @@ -5652,7 +5652,7 @@ return MaxScalableVF; } -ElementCount +MaxVFCandidates LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); @@ -5723,18 +5723,20 @@ WidestType, MaxSafeFixedVF)) MaxFixedVF = MaxVF; + ElementCount MaxScalableVF = ElementCount::getScalable(0); if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, MaxSafeScalableVF)) - // FIXME: Return scalable VF as well (to be added in future patch). - if (MaxVF.isScalable()) + if (MaxVF.isScalable()) { + MaxScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF << "\n"); + } - return MaxFixedVF; + return {MaxFixedVF, MaxScalableVF}; } -Optional -LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { +MaxVFCandidates LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, + unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -5742,7 +5744,7 @@ "Not inserting runtime ptr check for divergent target", "runtime pointer checks needed. Not enabled for divergent target", "CantVersionLoopWithDivergentTarget", ORE, TheLoop); - return None; + return MaxVFCandidates::None(); } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); @@ -5751,7 +5753,7 @@ reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", "SingleIterationLoop", ORE, TheLoop); - return None; + return MaxVFCandidates::None(); } switch (ScalarEpilogueStatus) { @@ -5778,7 +5780,7 @@ // Bail if runtime checks are required, which are not good when optimising // for size. if (runtimeChecksRequired()) - return None; + return MaxVFCandidates::None(); break; } @@ -5796,7 +5798,7 @@ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return computeFeasibleMaxVF(TC, UserVF); } - return None; + return MaxVFCandidates::None(); } // Now try the tail folding @@ -5811,26 +5813,27 @@ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); - assert(!MaxVF.isScalable() && - "Scalable vectors do not yet support tail folding"); - assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && - "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = - UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); - // Avoid tail folding if the trip count is known to be a multiple of any VF we - // chose. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const SCEV *Rem = SE->getURemExpr( - SE->applyLoopGuards(ExitCount, TheLoop), - SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); - if (Rem->isZero()) { - // Accept MaxVF if we do not have a tail. - LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxVF; + MaxVFCandidates MaxFactors = computeFeasibleMaxVF(TC, UserVF); + if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { + ElementCount MaxFixedVF = MaxFactors.FixedVF; + assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + "MaxFixedVF must be a power of 2"); + unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC + : MaxFixedVF.getFixedValue(); + // Avoid tail folding if the trip count is known to be a multiple of any VF + // we chose. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + SE->applyLoopGuards(ExitCount, TheLoop), + SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { + // Accept MaxFixedVF if we do not have a tail. + LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + return MaxFactors; + } } // If we don't know the precise trip count, or if the trip count that we @@ -5839,7 +5842,7 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; - return MaxVF; + return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by @@ -5848,12 +5851,12 @@ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; + return MaxFactors; } if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); - return None; + return MaxVFCandidates::None(); } if (TC == 0) { @@ -5861,7 +5864,7 @@ "Unable to calculate the loop count due to complex control flow", "unable to calculate the loop count due to complex control flow", "UnknownLoopCountComplexCFG", ORE, TheLoop); - return None; + return MaxVFCandidates::None(); } reportVectorizationFailure( @@ -5870,7 +5873,7 @@ "Enable vectorization of this loop with '#pragma clang loop " "vectorize(enable)' when compiling with -Os/-Oz", "NoTailLoopWithOptForSize", ORE, TheLoop); - return None; + return MaxVFCandidates::None(); } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( @@ -7888,8 +7891,8 @@ Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); - if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. + MaxVFCandidates MaxFactors = CM.computeMaxVF(UserVF, UserIC); + if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return None; // Invalidate interleave groups if all blocks of loop will be predicated. @@ -7906,29 +7909,24 @@ CM.invalidateCostModelingDecisions(); } - ElementCount MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF.isNonZero() && "MaxVF is zero."); - - bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); - if (!UserVF.isZero() && - (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { - // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable - // VFs here, this should be reverted to only use legal UserVFs once the - // loop below supports scalable VFs. - ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; + ElementCount MaxUserVF = + UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); + if (!UserVF.isZero() && UserVFIsLegal) { LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << VF << ".\n"); - assert(isPowerOf2_32(VF.getKnownMinValue()) && + << " VF " << UserVF << ".\n"); + assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(VF); + CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(VF, VF); + buildVPlansWithVPRecipes({UserVF}, {UserVF}); LLVM_DEBUG(printPlans(dbgs())); - return {{VF, 0}}; + return {{UserVF, 0}}; } + ElementCount MaxVF = MaxFactors.FixedVF; assert(!MaxVF.isScalable() && "Scalable vectors not yet supported beyond this point"); @@ -7947,7 +7945,7 @@ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF.isScalar()) + if (!MaxFactors.hasVectorCandidates()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor.