diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -200,6 +200,37 @@ } }; +/// A class that represents two vectorization factors (initialized with 0 by +/// default). One for fixed-width vectorization and one for scalable +/// vectorization. This can be used by the vectorizer to choose from a range of +/// fixed and/or scalable VFs in order to find the most cost-effective VF to +/// vectorize with. +struct FixedScalableVFPair { + ElementCount FixedVF; + ElementCount ScalableVF; + + FixedScalableVFPair() + : FixedVF(ElementCount::getFixed(0)), + ScalableVF(ElementCount::getScalable(0)) {} + FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() { + *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max; + } + FixedScalableVFPair(const ElementCount &FixedVF, + const ElementCount &ScalableVF) + : FixedVF(FixedVF), ScalableVF(ScalableVF) { + assert(!FixedVF.isScalable() && ScalableVF.isScalable() && + "Invalid scalable properties"); + } + + static FixedScalableVFPair getNone() { return FixedScalableVFPair(); } + + /// \return true if either fixed- or scalable VF is non-zero. + explicit operator bool() const { return FixedVF || ScalableVF; } + + /// \return true if either fixed- or scalable VF is a valid vector VF. + bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); } +}; + /// Planner drives the vectorization process after having passed /// Legality checks. class LoopVectorizationPlanner { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1229,9 +1229,10 @@ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} - /// \return An upper bound for the vectorization factor, or None if - /// vectorization and interleaving should be avoided up front. - Optional computeMaxVF(ElementCount UserVF, unsigned UserIC); + /// \return An upper bound for the vectorization factor (both fixed and + /// scalable) or None if vectorization and interleaving should be avoided up + /// front. + FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1627,11 +1628,13 @@ private: unsigned NumPredStores = 0; - /// \return An upper bound for the vectorization factor, a power-of-2 larger - /// than zero. One is returned if vectorization should best be avoided due - /// to cost. - ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + /// \return An upper bound for the vectorization factors for both + /// fixed and scalable vectorization, where the minimum-known number of + /// elements is a power-of-2 larger than zero. If scalable vectorization is + /// disabled or unsupported, then the scalable part will be equal to + /// ElementCount::getScalable(0). + FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. @@ -5652,7 +5655,7 @@ return MaxScalableVF; } -ElementCount +FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); @@ -5718,22 +5721,24 @@ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); - ElementCount MaxFixedVF = ElementCount::getFixed(1); + FixedScalableVFPair Result(ElementCount::getFixed(1), + ElementCount::getScalable(0)); if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, MaxSafeFixedVF)) - MaxFixedVF = MaxVF; + Result.FixedVF = MaxVF; if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, MaxSafeScalableVF)) - // FIXME: Return scalable VF as well (to be added in future patch). - if (MaxVF.isScalable()) + if (MaxVF.isScalable()) { + Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF << "\n"); + } - return MaxFixedVF; + return Result; } -Optional +FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically @@ -5742,7 +5747,7 @@ "Not inserting runtime ptr check for divergent target", "runtime pointer checks needed. Not enabled for divergent target", "CantVersionLoopWithDivergentTarget", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); @@ -5751,7 +5756,7 @@ reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", "SingleIterationLoop", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } switch (ScalarEpilogueStatus) { @@ -5778,7 +5783,7 @@ // Bail if runtime checks are required, which are not good when optimising // for size. if (runtimeChecksRequired()) - return None; + return FixedScalableVFPair::getNone(); break; } @@ -5796,7 +5801,7 @@ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return computeFeasibleMaxVF(TC, UserVF); } - return None; + return FixedScalableVFPair::getNone(); } // Now try the tail folding @@ -5811,26 +5816,29 @@ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); - assert(!MaxVF.isScalable() && - "Scalable vectors do not yet support tail folding"); - assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && - "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = - UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); - // Avoid tail folding if the trip count is known to be a multiple of any VF we - // chose. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const SCEV *Rem = SE->getURemExpr( - SE->applyLoopGuards(ExitCount, TheLoop), - SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); - if (Rem->isZero()) { - // Accept MaxVF if we do not have a tail. - LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxVF; + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + // Avoid tail folding if the trip count is known to be a multiple of any VF + // we chose. + // FIXME: The condition below pessimises the case for fixed-width vectors, + // when scalable VFs are also candidates for vectorization. + if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { + ElementCount MaxFixedVF = MaxFactors.FixedVF; + assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + "MaxFixedVF must be a power of 2"); + unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC + : MaxFixedVF.getFixedValue(); + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + SE->applyLoopGuards(ExitCount, TheLoop), + SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { + // Accept MaxFixedVF if we do not have a tail. + LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + return MaxFactors; + } } // If we don't know the precise trip count, or if the trip count that we @@ -5839,7 +5847,7 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; - return MaxVF; + return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by @@ -5848,12 +5856,12 @@ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; + return MaxFactors; } if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); - return None; + return FixedScalableVFPair::getNone(); } if (TC == 0) { @@ -5861,7 +5869,7 @@ "Unable to calculate the loop count due to complex control flow", "unable to calculate the loop count due to complex control flow", "UnknownLoopCountComplexCFG", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } reportVectorizationFailure( @@ -5870,7 +5878,7 @@ "Enable vectorization of this loop with '#pragma clang loop " "vectorize(enable)' when compiling with -Os/-Oz", "NoTailLoopWithOptForSize", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( @@ -7888,8 +7896,8 @@ Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); - if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. + FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); + if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return None; // Invalidate interleave groups if all blocks of loop will be predicated. @@ -7906,29 +7914,24 @@ CM.invalidateCostModelingDecisions(); } - ElementCount MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF.isNonZero() && "MaxVF is zero."); - - bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); - if (!UserVF.isZero() && - (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { - // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable - // VFs here, this should be reverted to only use legal UserVFs once the - // loop below supports scalable VFs. - ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; + ElementCount MaxUserVF = + UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); + if (!UserVF.isZero() && UserVFIsLegal) { LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << VF << ".\n"); - assert(isPowerOf2_32(VF.getKnownMinValue()) && + << " VF " << UserVF << ".\n"); + assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(VF); + CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(VF, VF); + buildVPlansWithVPRecipes({UserVF}, {UserVF}); LLVM_DEBUG(printPlans(dbgs())); - return {{VF, 0}}; + return {{UserVF, 0}}; } + ElementCount MaxVF = MaxFactors.FixedVF; assert(!MaxVF.isScalable() && "Scalable vectors not yet supported beyond this point"); @@ -7947,7 +7950,7 @@ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF.isScalar()) + if (!MaxFactors.hasVector()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor.