diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -162,7 +162,31 @@ bool skipScalarizationCost() const { return ScalarizationCost.isValid(); } }; -enum class PredicationStyle { None, Data, DataAndControlFlow }; +enum class TailFoldingStyle { + /// Don't use tail folding + None, + /// Use predicate only to mask operations on data in the loop. + /// When the VL is not known to be a power-of-2, this method requires a + /// runtime overflow check for the i + VL in the loop because it compares the + /// scalar induction variable against the tripcount rounded up by VL which may + /// overflow. When the VL is a power-of-2, both the increment and uprounded + /// tripcount will overflow to 0, which does not require a runtime check + /// since the loop is exited when the loop induction variable equals the + /// uprounded trip-count, which are both 0. + Data, + /// Same as Data, but avoids using the get.active.lane.mask intrinsic to + /// calculate the mask and instead implements this with a + /// splat/stepvector/cmp. + /// FIXME: Can this kind be removed now that SelectionDAGBuilder expands the + /// active.lane.mask intrinsic when it is not natively supported? + DataWithoutLaneMask, + /// Use predicate to control both data and control flow. + /// This method always requires a runtime overflow check for the i + VL + /// increment inside the loop, because it uses the result directly in the + /// active.lane.mask to calculate the mask for the next iteration. If the + /// increment overflows, the mask is no longer correct. 
+ DataAndControlFlow, +}; class TargetTransformInfo; typedef TargetTransformInfo TTI; @@ -516,13 +540,8 @@ LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) const; - /// Query the target whether lowering of the llvm.get.active.lane.mask - /// intrinsic is supported and how the mask should be used. A return value - /// of PredicationStyle::Data indicates the mask is used as data only, - /// whereas PredicationStyle::DataAndControlFlow indicates we should also use - /// the mask for control flow in the loop. If unsupported the return value is - /// PredicationStyle::None. - PredicationStyle emitGetActiveLaneMask() const; + /// Query the target what the preferred style of tail folding is. + TailFoldingStyle getPreferredTailFoldingStyle() const; // Parameters that control the loop peeling transformation struct PeelingPreferences { @@ -1616,7 +1635,7 @@ AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) = 0; - virtual PredicationStyle emitGetActiveLaneMask() = 0; + virtual TailFoldingStyle getPreferredTailFoldingStyle() = 0; virtual std::optional instCombineIntrinsic( InstCombiner &IC, IntrinsicInst &II) = 0; virtual std::optional simplifyDemandedUseBitsIntrinsic( @@ -2016,8 +2035,8 @@ InterleavedAccessInfo *IAI) override { return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); } - PredicationStyle emitGetActiveLaneMask() override { - return Impl.emitGetActiveLaneMask(); + TailFoldingStyle getPreferredTailFoldingStyle() override { + return Impl.getPreferredTailFoldingStyle(); } std::optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -171,8 +171,8 @@ return false; } - PredicationStyle 
emitGetActiveLaneMask() const { - return PredicationStyle::None; + TailFoldingStyle getPreferredTailFoldingStyle() const { + return TailFoldingStyle::DataWithoutLaneMask; } std::optional instCombineIntrinsic(InstCombiner &IC, diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -630,8 +630,8 @@ return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); } - PredicationStyle emitGetActiveLaneMask() { - return BaseT::emitGetActiveLaneMask(); + TailFoldingStyle getPreferredTailFoldingStyle() { + return BaseT::getPreferredTailFoldingStyle(); } std::optional instCombineIntrinsic(InstCombiner &IC, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -312,8 +312,8 @@ return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); } -PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const { - return TTIImpl->emitGetActiveLaneMask(); +TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle() const { + return TTIImpl->getPreferredTailFoldingStyle(); } std::optional diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -347,10 +347,10 @@ return ST->hasSVE() ? 
5 : 0; } - PredicationStyle emitGetActiveLaneMask() const { + TailFoldingStyle getPreferredTailFoldingStyle() const { if (ST->hasSVE()) - return PredicationStyle::DataAndControlFlow; - return PredicationStyle::None; + return TailFoldingStyle::DataAndControlFlow; + return TailFoldingStyle::DataWithoutLaneMask; } bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -312,7 +312,7 @@ TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); - PredicationStyle emitGetActiveLaneMask() const; + TailFoldingStyle getPreferredTailFoldingStyle() const; void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2286,15 +2286,15 @@ return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI()); } -PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const { +TailFoldingStyle ARMTTIImpl::getPreferredTailFoldingStyle() const { if (!ST->hasMVEIntegerOps() || !EnableTailPredication) - return PredicationStyle::None; + return TailFoldingStyle::DataWithoutLaneMask; // Intrinsic @llvm.get.active.lane.mask is supported. // It is used in the MVETailPredication pass, which requires the number of // elements processed by this vector loop to setup the tail-predicated // loop. 
- return PredicationStyle::Data; + return TailFoldingStyle::Data; } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -76,9 +76,9 @@ bool shouldExpandReduction(const IntrinsicInst *II) const; bool supportsScalableVectors() const { return ST->hasVInstructions(); } bool enableScalableVectorization() const { return ST->hasVInstructions(); } - PredicationStyle emitGetActiveLaneMask() const { - return ST->hasVInstructions() ? PredicationStyle::Data - : PredicationStyle::None; + TailFoldingStyle getPreferredTailFoldingStyle() const { + return ST->hasVInstructions() ? TailFoldingStyle::Data + : TailFoldingStyle::DataWithoutLaneMask; } std::optional getMaxVScale() const; std::optional getVScaleForTuning() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1538,14 +1538,23 @@ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } + /// Returns the TailFoldingStyle that is best for the current loop. + TailFoldingStyle getTailFoldingStyle() const { + if (!CanFoldTailByMasking) + return TailFoldingStyle::None; + + return TTI.getPreferredTailFoldingStyle(); + } + /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { return FoldTailByMasking; } + bool foldTailByMasking() const { + return getTailFoldingStyle() != TailFoldingStyle::None; + } /// Returns true if were tail-folding and want to use the active lane mask /// for vector loop control flow. 
bool useActiveLaneMaskForControlFlow() const { - return FoldTailByMasking && - TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow; } /// Returns true if the instructions in this block requires predication @@ -1715,7 +1724,7 @@ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool FoldTailByMasking = false; + bool CanFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -5134,7 +5143,7 @@ // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - FoldTailByMasking = true; + CanFoldTailByMasking = true; return MaxFactors; } @@ -5292,7 +5301,7 @@ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && + if (!A.Width.isScalable() && !B.Width.isScalable() && foldTailByMasking() && MaxTripCount) { // If we are folding the tail and the trip count is a known (possibly small) // constant, the trip count will be rounded up to an integer number of @@ -8098,8 +8107,8 @@ // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. - PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); - if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) + TailFoldingStyle Style = CM.getTailFoldingStyle(); + if (Style == TailFoldingStyle::DataAndControlFlow) return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); // Introduce the early-exit compare IV <= BTC to form header block mask. 
@@ -8115,7 +8124,8 @@ VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (EmitGetActiveLaneMask != PredicationStyle::None) { + if (Style != TailFoldingStyle::None && + Style != TailFoldingStyle::DataWithoutLaneMask) { VPValue *TC = Plan->getOrCreateTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); @@ -8712,8 +8722,7 @@ // Add the necessary canonical IV and branch recipes required to control the // loop. static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, - bool UseLaneMaskForLoopControlFlow) { + TailFoldingStyle Style) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); @@ -8725,6 +8734,7 @@ // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. + bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, @@ -8734,7 +8744,7 @@ VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); - if (UseLaneMaskForLoopControlFlow) { + if (Style == TailFoldingStyle::DataAndControlFlow) { // Create the active lane mask instruction in the vplan preheader. VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); @@ -8887,8 +8897,7 @@ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), - CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. 
@@ -9198,7 +9207,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle()); return Plan; }