Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -194,6 +194,50 @@ } }; +/// A class that can hold upto two possible vectorization factors, a fixed VF +/// and a scalable VF. This can be used by the vectorizer to choose from a range +/// of fixed and/or scalable VFs in order to find the most cost-effective VF to +/// vectorize with. +class MinMaxVFCandidates { + Optional FixedVF; + Optional ScalableVF; + +public: + /// Initializes both FixedVF/ScalableVF with None + MinMaxVFCandidates() : FixedVF({}), ScalableVF({}) {} + + /// Initializes either FixedVF or ScalableVF with VF, the other with None. + MinMaxVFCandidates(const ElementCount &VF) : MinMaxVFCandidates() { + Optional *LHSPtr = VF.isScalable() ? &ScalableVF : &FixedVF; + *LHSPtr = VF; + } + + /// Initializes FixedVF and ScalableVF + MinMaxVFCandidates(const ElementCount &FixedVF, + const ElementCount &ScalableVF) + : FixedVF(FixedVF), ScalableVF(ScalableVF) {} + + static MinMaxVFCandidates None() { return MinMaxVFCandidates(); } + + /// Assign VF to either FixedVF or ScalableVF, depending on whether VF + /// is fixed/scalable. If the VF value is already set, the value is + /// overwritten with the largest VF. + void addMaxVF(const ElementCount &VF) { + Optional *LHSPtr = + VF.isScalable() ? &ScalableVF : &FixedVF; + if (LHSPtr->hasValue()) + *LHSPtr = ElementCount::isKnownGT(**LHSPtr, VF) ? **LHSPtr : VF; + else + *LHSPtr = VF; + } + + explicit operator bool() const { return FixedVF || ScalableVF; } + bool hasFixedVF() const { return FixedVF.hasValue(); } + bool hasScalableVF() const { return ScalableVF.hasValue(); } + Optional getFixedVF() const { return FixedVF; } + Optional getScalableVF() const { return ScalableVF; } +}; + /// Planner drives the vectorization process after having passed /// Legality checks. class LoopVectorizationPlanner { Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1212,10 +1212,9 @@ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} - /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional computeMaxVF(ElementCount UserVF, unsigned UserIC); + MinMaxVFCandidates computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -5545,8 +5544,8 @@ return false; } -Optional -LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { +MinMaxVFCandidates LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, + unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -5554,7 +5553,7 @@ "Not inserting runtime ptr check for divergent target", "runtime pointer checks needed. Not enabled for divergent target", "CantVersionLoopWithDivergentTarget", ORE, TheLoop); - return None; + return MinMaxVFCandidates::None(); } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); @@ -5563,10 +5562,10 @@ reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", "SingleIterationLoop", ORE, TheLoop); - return None; + return MinMaxVFCandidates::None(); } - auto GetFeasibleMaxVF = [&]() -> ElementCount { + auto GetFeasibleMaxVF = [&]() -> MinMaxVFCandidates { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5578,29 +5577,44 @@ auto MaxSafeElements = ElementCount::getFixed( PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType)); - // Try to find a max scalable VF. - if (Hints->allowScalable() && TTI.supportsScalableVectors()) { - auto MaxSafeVF = clampFeasibleMaxVF(ElementCount::getScalable(1 << 16), - MaxSafeElements); - if (MaxSafeVF.isScalableVector()) { - ElementCount MaxScalableVF = - computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType); - if (MaxScalableVF.isScalable()) - LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " - << MaxScalableVF << "\n"); - else - LLVM_DEBUG(dbgs() << "LV: No feasible scalable VF found.\n"); - } - } + MinMaxVFCandidates Result; + + // Always make sure to add at least a max fixed-VF of 1 (scalar) as a + // fallback when vectorization is not possible. + Result.addMaxVF(ElementCount::getFixed(1)); // First analyze the UserVF, fall back if the UserVF should be ignored. if (auto MaybeMaxVF = getFeasibleUserVF(UserVF, MaxSafeElements)) - return MaybeMaxVF.getValue(); + Result.addMaxVF(*MaybeMaxVF); + else { + // If we don't force the use of scalable vectors or if the UserVF was + // ignored for some reason, we should consider auto-selecting a + // fixed-width VF. + if (!Hints->isForcedScalable() || UserVF.isNonZero()) { + auto MaxSafeVF = clampFeasibleMaxVF(ElementCount::getFixed(1 << 16), + MaxSafeElements); + Result.addMaxVF( + computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType)); + } + + // Try to find a max scalable VF as well if the target supports it. + if (Hints->allowScalable() && TTI.supportsScalableVectors()) { + auto MaxSafeVF = clampFeasibleMaxVF(ElementCount::getScalable(1 << 16), + MaxSafeElements); + if (MaxSafeVF.isScalableVector()) { + ElementCount MaxScalableVF = + computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType); + if (MaxScalableVF.isScalable()) + LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " + << MaxScalableVF << "\n"); + else + LLVM_DEBUG(dbgs() << "LV: No feasible scalable VF found.\n"); + Result.addMaxVF(MaxScalableVF); + } + } + } - // Try to automatically determine a suitable maximum VF. - auto MaxSafeVF = - clampFeasibleMaxVF(ElementCount::getFixed(1 << 16), MaxSafeElements); - return computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType); + return Result; }; switch (ScalarEpilogueStatus) { @@ -5627,7 +5641,7 @@ // Bail if runtime checks are required, which are not good when optimising // for size. if (runtimeChecksRequired()) - return None; + return MinMaxVFCandidates::None(); break; } @@ -5645,7 +5659,7 @@ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return GetFeasibleMaxVF(); } - return None; + return MinMaxVFCandidates::None(); } // Now try the tail folding @@ -5660,26 +5674,27 @@ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - ElementCount MaxVF = GetFeasibleMaxVF(); - assert(!MaxVF.isScalable() && - "Scalable vectors do not yet support tail folding"); - assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && - "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = - UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); - // Avoid tail folding if the trip count is known to be a multiple of any VF we - // chose. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const SCEV *Rem = SE->getURemExpr( - SE->applyLoopGuards(ExitCount, TheLoop), - SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); - if (Rem->isZero()) { - // Accept MaxVF if we do not have a tail. - LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxVF; + MinMaxVFCandidates MaxFactors = GetFeasibleMaxVF(); + if (MaxFactors.hasFixedVF() && !MaxFactors.hasScalableVF()) { + ElementCount MaxFixedVF = *MaxFactors.getFixedVF(); + assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + "MaxFixedVF must be a power of 2"); + unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC + : MaxFixedVF.getFixedValue(); + // Avoid tail folding if the trip count is known to be a multiple of any VF + // we chose. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + SE->applyLoopGuards(ExitCount, TheLoop), + SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { + // Accept MaxFixedVF if we do not have a tail. + LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + return MaxFactors; + } } // If we don't know the precise trip count, or if the trip count that we @@ -5688,7 +5703,7 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; - return MaxVF; + return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by @@ -5697,12 +5712,12 @@ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; + return MaxFactors; } if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); - return None; + return MinMaxVFCandidates::None(); } if (TC == 0) { @@ -5710,7 +5725,7 @@ "Unable to calculate the loop count due to complex control flow", "unable to calculate the loop count due to complex control flow", "UnknownLoopCountComplexCFG", ORE, TheLoop); - return None; + return MinMaxVFCandidates::None(); } reportVectorizationFailure( @@ -5719,7 +5734,7 @@ "Enable vectorization of this loop with '#pragma clang loop " "vectorize(enable)' when compiling with -Os/-Oz", "NoTailLoopWithOptForSize", ORE, TheLoop); - return None; + return MinMaxVFCandidates::None(); } ElementCount LoopVectorizationCostModel::clampFeasibleMaxVF( @@ -7775,8 +7790,8 @@ Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); - if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. + MinMaxVFCandidates MaxFactors = CM.computeMaxVF(UserVF, UserIC); + if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return None; // Invalidate interleave groups if all blocks of loop will be predicated. @@ -7793,7 +7808,10 @@ CM.invalidateCostModelingDecisions(); } - ElementCount MaxVF = MaybeMaxVF.getValue(); + ElementCount MaxVF = + (UserVF && UserVF.isScalable() && MaxFactors.hasScalableVF()) + ? *MaxFactors.getScalableVF() + : *MaxFactors.getFixedVF(); assert(MaxVF.isNonZero() && "MaxVF is zero."); bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);