diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -200,6 +200,37 @@ } }; +/// A class that represents two vectorization factors (initialized with 0 by +/// default). One for fixed-width vectorization and one for scalable +/// vectorization. This can be used by the vectorizer to choose from a range of +/// fixed and/or scalable VFs in order to find the most cost-effective VF to +/// vectorize with. +struct FixedScalableVFPair { + ElementCount FixedVF; + ElementCount ScalableVF; + + FixedScalableVFPair() + : FixedVF(ElementCount::getFixed(0)), + ScalableVF(ElementCount::getScalable(0)) {} + FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() { + *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max; + } + FixedScalableVFPair(const ElementCount &FixedVF, + const ElementCount &ScalableVF) + : FixedVF(FixedVF), ScalableVF(ScalableVF) { + assert(!FixedVF.isScalable() && ScalableVF.isScalable() && + "Invalid scalable properties"); + } + + static FixedScalableVFPair getNone() { return FixedScalableVFPair(); } + + /// \return true if either fixed- or scalable VF is non-zero. + explicit operator bool() const { return FixedVF || ScalableVF; } + + /// \return true if either fixed- or scalable VF is a valid vector VF. + bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); } +}; + /// Planner drives the vectorization process after having passed /// Legality checks. class LoopVectorizationPlanner { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1229,9 +1229,10 @@ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} - /// \return An upper bound for the vectorization factor, or None if - /// vectorization and interleaving should be avoided up front. - Optional computeMaxVF(ElementCount UserVF, unsigned UserIC); + /// \return An upper bound for the vectorization factors (both fixed and + /// scalable). If the factors are 0, vectorization and interleaving should be + /// avoided up front. + FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); /// \return True if runtime checks are required for vectorization, and false /// otherwise. @@ -1625,11 +1626,13 @@ private: unsigned NumPredStores = 0; - /// \return An upper bound for the vectorization factor, a power-of-2 larger - /// than zero. One is returned if vectorization should best be avoided due - /// to cost. - ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + /// \return An upper bound for the vectorization factors for both + /// fixed and scalable vectorization, where the minimum-known number of + /// elements is a power-of-2 larger than zero. If scalable vectorization is + /// disabled or unsupported, then the scalable part will be equal to + /// ElementCount::getScalable(0). + FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. @@ -5676,7 +5679,7 @@ return MaxScalableVF; } -ElementCount +FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); @@ -5742,22 +5745,24 @@ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); - ElementCount MaxFixedVF = ElementCount::getFixed(1); + FixedScalableVFPair Result(ElementCount::getFixed(1), + ElementCount::getScalable(0)); if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, MaxSafeFixedVF)) - MaxFixedVF = MaxVF; + Result.FixedVF = MaxVF; if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, MaxSafeScalableVF)) - // FIXME: Return scalable VF as well (to be added in future patch). - if (MaxVF.isScalable()) + if (MaxVF.isScalable()) { + Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF << "\n"); + } - return MaxFixedVF; + return Result; } -Optional +FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically @@ -5766,7 +5771,7 @@ "Not inserting runtime ptr check for divergent target", "runtime pointer checks needed. Not enabled for divergent target", "CantVersionLoopWithDivergentTarget", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); @@ -5775,7 +5780,7 @@ reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", "SingleIterationLoop", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } switch (ScalarEpilogueStatus) { @@ -5802,7 +5807,7 @@ // Bail if runtime checks are required, which are not good when optimising // for size. if (runtimeChecksRequired()) - return None; + return FixedScalableVFPair::getNone(); break; } @@ -5820,7 +5825,7 @@ ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return computeFeasibleMaxVF(TC, UserVF); } - return None; + return FixedScalableVFPair::getNone(); } // Now try the tail folding @@ -5835,26 +5840,29 @@ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); - assert(!MaxVF.isScalable() && - "Scalable vectors do not yet support tail folding"); - assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && - "MaxVF must be a power of 2"); - unsigned MaxVFtimesIC = - UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); - // Avoid tail folding if the trip count is known to be a multiple of any VF we - // chose. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - const SCEV *Rem = SE->getURemExpr( - SE->applyLoopGuards(ExitCount, TheLoop), - SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); - if (Rem->isZero()) { - // Accept MaxVF if we do not have a tail. - LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxVF; + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + // Avoid tail folding if the trip count is known to be a multiple of any VF + // we chose. + // FIXME: The condition below pessimises the case for fixed-width vectors, + // when scalable VFs are also candidates for vectorization. + if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { + ElementCount MaxFixedVF = MaxFactors.FixedVF; + assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + "MaxFixedVF must be a power of 2"); + unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC + : MaxFixedVF.getFixedValue(); + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *Rem = SE->getURemExpr( + SE->applyLoopGuards(ExitCount, TheLoop), + SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + if (Rem->isZero()) { + // Accept MaxFixedVF if we do not have a tail. + LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + return MaxFactors; + } } // If we don't know the precise trip count, or if the trip count that we @@ -5863,7 +5871,7 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; - return MaxVF; + return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by @@ -5872,12 +5880,12 @@ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; + return MaxFactors; } if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); - return None; + return FixedScalableVFPair::getNone(); } if (TC == 0) { @@ -5885,7 +5893,7 @@ "Unable to calculate the loop count due to complex control flow", "unable to calculate the loop count due to complex control flow", "UnknownLoopCountComplexCFG", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } reportVectorizationFailure( @@ -5894,7 +5902,7 @@ "Enable vectorization of this loop with '#pragma clang loop " "vectorize(enable)' when compiling with -Os/-Oz", "NoTailLoopWithOptForSize", ORE, TheLoop); - return None; + return FixedScalableVFPair::getNone(); } ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( @@ -7928,8 +7936,8 @@ Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); - if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. + FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); + if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return None; // Invalidate interleave groups if all blocks of loop will be predicated. @@ -7946,29 +7954,24 @@ CM.invalidateCostModelingDecisions(); } - ElementCount MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF.isNonZero() && "MaxVF is zero."); - - bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); - if (!UserVF.isZero() && - (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { - // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable - // VFs here, this should be reverted to only use legal UserVFs once the - // loop below supports scalable VFs. - ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; + ElementCount MaxUserVF = + UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); + if (!UserVF.isZero() && UserVFIsLegal) { LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << VF << ".\n"); - assert(isPowerOf2_32(VF.getKnownMinValue()) && + << " VF " << UserVF << ".\n"); + assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(VF); + CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(VF, VF); + buildVPlansWithVPRecipes({UserVF}, {UserVF}); LLVM_DEBUG(printPlans(dbgs())); - return {{VF, 0}}; + return {{UserVF, 0}}; } + ElementCount MaxVF = MaxFactors.FixedVF; assert(!MaxVF.isScalable() && "Scalable vectors not yet supported beyond this point"); @@ -7987,7 +7990,7 @@ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF.isScalar()) + if (!MaxFactors.hasVector()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll @@ -0,0 +1,33 @@ +; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s + +; This test currently fails when the LV calculates a maximums safe +; distance for scalable vectors, because the code to eliminate the tail is +; pessimistic when scalable vectors are considered. This will be addressed +; in a future patch, at which point we should be able to un-XFAIL the +; test. The expected output is to vectorize this loop without predication +; (and thus have unpredicated vector store). +; XFAIL: * + +; CHECK: store <4 x i32> + +target triple = "aarch64" +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + +define void @f1(i32* %A) #0 { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv + store i32 1, i32* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ne i64 %iv.next, 1024 + br i1 %exitcond, label %for.body, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+sve" }