diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -925,6 +925,11 @@ /// architectural maximum vector length, and None otherwise. Optional getMaxVScale() const; + /// \return The maximum number of bits for a block in a scalable vector + /// register for a scalable vector with a vscale number of blocks. + /// i.e. the maximum number of N x elt bits in <vscale x N x elt>. + unsigned getMaxScalableBitsPerBlock() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -1513,6 +1518,7 @@ virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual Optional getMaxVScale() const = 0; + virtual unsigned getMaxScalableBitsPerBlock() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const = 0; @@ -1941,6 +1947,9 @@ Optional getMaxVScale() const override { return Impl.getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const override { + return Impl.getMaxScalableBitsPerBlock(); + } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -376,6 +376,8 @@ Optional getMaxVScale() const { return None; } + unsigned getMaxScalableBitsPerBlock() const { return 0; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const { 
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -109,8 +109,10 @@ void emitRemarkWithHints() const; ElementCount getWidth() const { - return ElementCount::get(Width.Value, isScalable()); + bool IsForcedScalable = (Scalable.Value == LoopVectorizeHints::FK_Enabled); + return ElementCount::get(Width.Value, IsForcedScalable); } + unsigned getInterleave() const { return Interleave.Value; } unsigned getIsVectorized() const { return IsVectorized.Value; } unsigned getPredicate() const { return Predicate.Value; } @@ -121,7 +123,20 @@ return (ForceKind)Force.Value; } - bool isScalable() const { return Scalable.Value; } + /// \return true if scalable vectorization has been disabled explicitly. + bool disableScalableVectorization() const { + return Scalable.Value == LoopVectorizeHints::FK_Disabled; + } + + /// \return true if fixed-width vectorization has been disabled explicitly. + bool disableFixedWidthVectorization() const { + // If Width is set, this means the 'force scalable' is set through a pragma + // to vectorize for a specific scalable VF. In this case, it should still be + // possible to fall back on fixed-width vectors if the suggested VF is not + // legal. + bool IsForcedScalable = (Scalable.Value == LoopVectorizeHints::FK_Enabled); + return Width.Value == 0 && IsForcedScalable; + } /// If hints are provided that force vectorization, use the AlwaysPrint /// pass name to force the frontend to print the diagnostic. 
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -174,6 +174,13 @@ const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr); +/// Reports a vectorization informative message: print \p Msg for debugging +/// purposes as well as an optimization remark. Uses either \p I as location of +/// the remark, or otherwise \p TheLoop. +void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I = nullptr); + } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -585,6 +585,10 @@ return TTIImpl->getMaxVScale(); } +unsigned TargetTransformInfo::getMaxScalableBitsPerBlock() const { + return TTIImpl->getMaxScalableBitsPerBlock(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -121,6 +121,12 @@ return BaseT::getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const { + if (ST->hasSVE()) + return AArch64::SVEBitsPerBlock; + return BaseT::getMaxScalableBitsPerBlock(); + } + unsigned getMaxInterleaveFactor(unsigned VF); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -51,6 +51,22 @@ cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +// FIXME: When scalable vectorization is stable enough, change the default +// to FK_Undefined. +static cl::opt ScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::FK_Disabled), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::FK_Disabled, "off", + "disable all vectorization with scalable vectors"), + clEnumValN(LoopVectorizeHints::FK_Undefined, "on", + "allow loops to be vectorized with scalable vectors"), + clEnumValN( + LoopVectorizeHints::FK_Enabled, "always", + "allow loops to be vectorized exclusively with scalable vectors"))); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -63,10 +79,10 @@ case HK_UNROLL: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: + case HK_SCALABLE: return (Val <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: - case HK_SCALABLE: return (Val == 0 || Val == 1); } return false; @@ -80,8 +96,8 @@ Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), - Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), - ORE(ORE) { + Scalable("vectorize.scalable.enable", ScalableVectorization, HK_SCALABLE), + TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. 
getHintsFromMetadata(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1062,9 +1062,10 @@ /// output stream. If \p I is passed, it is an instruction that prevents /// vectorization. #ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; +static void debugVectorizationMessage(const StringRef Prefix, + const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: " << Prefix << DebugMsg; if (I != nullptr) dbgs() << " " << *I; else @@ -1093,9 +1094,7 @@ DL = I->getDebugLoc(); } - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; + return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } /// Return a value for Step multiplied by VF. @@ -1116,12 +1115,24 @@ } void reportVectorizationFailure(const StringRef DebugMsg, - const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I) { + LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); - ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); + ORE->emit( + createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) + << "loop not vectorized: " << OREMsg); +} + +void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I) { + LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, 
*ORE); + ORE->emit( + createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) + << Msg); } } // end namespace llvm @@ -1610,6 +1621,18 @@ ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF); + /// \return the maximum vector size based on the target's vector registers, + /// limited to MaxVF. This is a helper function of computeFeasibleMaxVF. + ElementCount getMaxVectorSize(unsigned ConstTripCount, unsigned SmallestType, + unsigned WidestType, ElementCount MaxVF); + + /// \return the maximum legal scalable VF, based on the safe max number + /// of elements. + ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); + /// \return the maximum legal fixed-width VF, based on the safe max number + /// of elements. + ElementCount getMaxLegalFixedVF(unsigned MaxSafeElements); + /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually /// operate on @@ -5518,6 +5541,144 @@ return false; } +ElementCount +LoopVectorizationCostModel::getMaxLegalFixedVF(unsigned MaxSafeElements) { + if (Hints->disableFixedWidthVectorization()) { + reportVectorizationInfo("Fixed-width vectorization is explicitly disabled.", + "FixedWidthVectorizationDisabled", ORE, TheLoop); + return ElementCount::getFixed(0); + } + + return ElementCount::getFixed(MaxSafeElements); +} + +ElementCount +LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { + if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { + reportVectorizationInfo( + "Disabling scalable vectorization, because target does not " + "support scalable vectors.", + "ScalableVectorsUnsupported", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + if (Hints->disableScalableVectorization()) { + reportVectorizationInfo("Scalable vectorization is explicitly disabled", + "ScalableVectorizationDisabled", ORE, TheLoop); + return ElementCount::getScalable(0); + 
} + + auto MaxScalableVF = ElementCount::getScalable(1u << 16); + + // Disable scalable vectorization, if the loop contains unsupported + // reductions. + // Test that the loop-vectorizer can legalize all operations for this MaxVF. + // FIXME: While for scalable vectors this is currently sufficient, this should + // be replaced by a more detailed mechanism that filters out specific VFs, + // instead of invalidating vectorization for a whole set of VFs based on the + // MaxVF. + if (!canVectorizeReductions(MaxScalableVF)) { + reportVectorizationInfo( + "Scalable vectorization not supported for the reduction " + "operations found in this loop.", + "ScalableVFUnfeasible", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + // Limit MaxScalableVF by the maximum safe dependence distance. + if (!Legal->isSafeForAnyVectorWidth()) { + Optional MaxVScale = TTI.getMaxVScale(); + MaxScalableVF = ElementCount::getScalable( + MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + if (!MaxScalableVF) + reportVectorizationInfo( + "Max legal vector width too small, scalable vectorization " + "unfeasible.", + "ScalableVFUnfeasible", ORE, TheLoop); + } + + return MaxScalableVF; +} + +ElementCount +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); + + // Get the maximum safe dependence distance in bits computed by LAA. + // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from + // the memory accesses that is most restrictive (involved in the smallest + // dependence distance). 
+ unsigned MaxSafeElements = + PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + + auto MaxSafeFixedVF = getMaxLegalFixedVF(MaxSafeElements); + auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); + + LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF + << ".\n"); + LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF + << ".\n"); + + // First analyze the UserVF, fall back if the UserVF should be ignored. + if (UserVF) { + auto MaxSafeUserVF = + UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; + + if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) + return UserVF; + + assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); + + // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it + // is better to ignore the hint and let the compiler choose a suitable VF. + if (!UserVF.isScalable()) { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" + << MaxSafeFixedVF << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeFixedVF); + }); + return MaxSafeFixedVF; + } + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe. Ignoring scalable UserVF.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe. 
Ignoring the hint to let the compiler pick a " + "suitable VF."; + }); + } + + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType + << " / " << WidestType << " bits.\n"); + + ElementCount MaxFixedVF = ElementCount::getFixed(1); + if (auto Max = getMaxVectorSize(ConstTripCount, SmallestType, WidestType, + MaxSafeFixedVF)) + MaxFixedVF = Max; + + if (auto Max = getMaxVectorSize(ConstTripCount, SmallestType, WidestType, + MaxSafeScalableVF)) + if (Max.isScalable()) + LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << Max << "\n"); + + return MaxFixedVF; +} + Optional LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { @@ -5658,145 +5819,57 @@ return None; } -ElementCount -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { - bool IgnoreScalableUserVF = UserVF.isScalable() && - !TTI.supportsScalableVectors() && - !ForceTargetSupportsScalableVectors; - if (IgnoreScalableUserVF) { - LLVM_DEBUG( - dbgs() << "LV: Ignoring VF=" << UserVF - << " because target does not support scalable vectors.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Ignoring VF=" << ore::NV("UserVF", UserVF) - << " because target does not support scalable vectors."; - }); - } - - // Beyond this point two scenarios are handled. If UserVF isn't specified - // then a suitable VF is chosen. If UserVF is specified and there are - // dependencies, check if it's legal. However, if a UserVF is specified and - // there are no dependencies, then there's nothing to do. - if (UserVF.isNonZero() && !IgnoreScalableUserVF) { - if (!canVectorizeReductions(UserVF)) { - reportVectorizationFailure( - "LV: Scalable vectorization not supported for the reduction " - "operations found in this loop. 
Using fixed-width " - "vectorization instead.", - "Scalable vectorization not supported for the reduction operations " - "found in this loop. Using fixed-width vectorization instead.", - "ScalableVFUnfeasible", ORE, TheLoop); - return computeFeasibleMaxVF( - ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); - } +ElementCount LoopVectorizationCostModel::getMaxVectorSize( + unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, + ElementCount MaxSafeVF) { + bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); + unsigned WidestRegister = ComputeScalableMaxVF + ? TTI.getMaxScalableBitsPerBlock() + : TTI.getRegisterBitWidth(true); - if (Legal->isSafeForAnyVectorWidth()) - return UserVF; - } - - MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); - unsigned SmallestType, WidestType; - std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); - unsigned WidestRegister = TTI.getRegisterBitWidth(true); - - // Get the maximum safe dependence distance in bits computed by LAA. - // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from - // the memory accesses that is most restrictive (involved in the smallest - // dependence distance). - unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); - - // If the user vectorization factor is legally unsafe, clamp it to a safe - // value. Otherwise, return as is. - if (UserVF.isNonZero() && !IgnoreScalableUserVF) { - unsigned MaxSafeElements = - PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); - - if (UserVF.isScalable()) { - Optional MaxVScale = TTI.getMaxVScale(); - - // Scale VF by vscale before checking if it's safe. - MaxSafeVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); - - if (MaxSafeVF.isZero()) { - // The dependence distance is too small to use scalable vectors, - // fallback on fixed. 
- LLVM_DEBUG( - dbgs() - << "LV: Max legal vector width too small, scalable vectorization " - "unfeasible. Using fixed-width vectorization instead.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Max legal vector width too small, scalable vectorization " - << "unfeasible. Using fixed-width vectorization instead."; - }); - return computeFeasibleMaxVF( - ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); - } - } - - LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); - - if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) - return UserVF; - - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe, clamping to max safe VF=" << MaxSafeVF - << ".\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe, clamping to maximum safe vectorization factor " - << ore::NV("VectorizationFactor", MaxSafeVF); - }); - return MaxSafeVF; - } - - WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); + // Convenience function to return the minimum of two ElementCounts. + auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { + assert((LHS.isScalable() == RHS.isScalable()) && + "Scalable flags must match"); + return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; + }; // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. 
- auto MaxVectorSize = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); - - LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType - << " / " << WidestType << " bits.\n"); + auto MaxVectorSize = ElementCount::get( + PowerOf2Floor(WidestRegister / WidestType), ComputeScalableMaxVF); + MaxVectorSize = MinVF(MaxVectorSize, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " - << WidestRegister << " bits.\n"); + << (MaxVectorSize * WidestType) << " bits.\n"); - assert(MaxVectorSize.getFixedValue() <= WidestRegister && - "Did not expect to pack so many elements" - " into one vector!"); - if (MaxVectorSize.getFixedValue() == 0) { + if (MaxVectorSize.getKnownMinValue() == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); return ElementCount::getFixed(1); - } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && - isPowerOf2_32(ConstTripCount)) { + } + + const auto TripCountEC = ElementCount::getFixed(ConstTripCount); + if (ConstTripCount && ElementCount::isKnownLE(TripCountEC, MaxVectorSize) && + isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in - // choosing a higher viable VF as done in the loop below. + // choosing a higher viable VF as done in the loop below. If MaxVectorSize + // is scalable, we only fall back on a fixed VF when the TC is less than or + // equal to the known number of lanes. 
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); - return ElementCount::getFixed(ConstTripCount); + return TripCountEC; } ElementCount MaxVF = MaxVectorSize; if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || (MaximizeBandwidth && isScalarEpilogueAllowed())) { + auto MaxVectorSizeMaxBW = ElementCount::get( + PowerOf2Floor(WidestRegister / SmallestType), ComputeScalableMaxVF); + if (!Legal->isSafeForAnyVectorWidth()) + MaxVectorSizeMaxBW = MinVF(MaxVectorSizeMaxBW, MaxSafeVF); + // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector VFs; - auto MaxVectorSizeMaxBW = - ElementCount::getFixed(WidestRegister / SmallestType); for (ElementCount VS = MaxVectorSize * 2; ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) VFs.push_back(VS); @@ -5819,7 +5892,7 @@ } } if (ElementCount MinVF = - TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { + TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -221,7 +221,7 @@ ret float %add } -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) { ; CHECK-LABEL: @fadd_fast_bfloat @@ -322,18 +322,18 @@ ; MUL -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. -; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @mul ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <8 x i32> -; CHECK: %[[LOAD2:.*]] = load <8 x i32> -; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]] -; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]] +; CHECK: %[[LOAD1:.*]] = load <4 x i32> +; CHECK: %[[LOAD2:.*]] = load <4 x i32> +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]] +; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) entry: br label %for.body @@ -352,22 +352,22 @@ } ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. -; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <8 x i32> -; CHECK: %[[LOAD2:.*]] = load <8 x i32> -; CHECK: %[[LOAD3:.*]] = load <8 x i32> -; CHECK: %[[LOAD4:.*]] = load <8 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]] +; CHECK: %[[LOAD1:.*]] = load <4 x i32> +; CHECK: %[[LOAD2:.*]] = load <4 x i32> +; CHECK: %[[LOAD3:.*]] = load <4 x i32> +; CHECK: %[[LOAD4:.*]] = load <4 x i32> +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] +; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] +; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll @@ -0,0 +1,124 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_ON +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=always -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK,CHECK_ALWAYSON +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=off -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_DISABLED +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_MAXBW + +; Test that the MaxVF for the following loop, that has no dependence distances, +; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16 +; (maximized bandwidth for i8 in the loop). +define void @test0(i32* %a, i8* %b, i32* %c) { +; CHECK: LV: Checking a loop in "test0" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 16 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} +; Test that the MaxVF for the following loop, with a dependence distance +; of 64 elements, is calculated as (maxvscale = 16) * 4. 
+define void @test1(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test1" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 4 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 64 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 32 elements, is calculated as (maxvscale = 16) * 2. 
+define void @test2(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test2" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 2 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 16 elements, is calculated as (maxvscale = 16) * 1. 
+define void @test3(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test3" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 1 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 16 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -37,9 +37,10 @@ ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check ; fixed-width vectorization is used instead. -; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CHECK-DBG: LV: The max safe VF is: 8. +; CHECK-DBG: LV: Checking a loop in "test1" +; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. +; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. +; CHECK-DBG: LV: The max safe fixed VF is: 8. ; CHECK-DBG: LV: Selecting VF: 4. 
; CHECK-LABEL: @test1 ; CHECK: <4 x i32> @@ -80,9 +81,10 @@ ; } ; } -; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CHECK-DBG: LV: The max safe VF is: 4. -; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4. +; CHECK-DBG: LV: Checking a loop in "test2" +; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. +; CHECK-DBG: LV: The max safe fixed VF is: 4. +; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF. ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test2 ; CHECK: <4 x i32> @@ -129,7 +131,7 @@ ; Max fixed VF=32, Max scalable VF=2, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test3" -; CHECK-DBG: LV: The max safe VF is: vscale x 2. +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. ; CHECK-DBG: LV: Using user VF vscale x 2. ; CHECK-LABEL: @test3 ; CHECK: <vscale x 2 x i32> @@ -161,7 +163,8 @@ ; test4 ; -; Scalable vectorization feasible, but the VF is unsafe. Should clamp. +; Scalable vectorization feasible, but the given VF is unsafe. Should ignore +; the hint and leave it to the vectorizer to pick a more suitable VF. ; ; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4 ; words per 128-bits (packed). @@ -173,15 +176,16 @@ ; } ; } ; -; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2. +; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test4" -; CHECK-DBG: LV: The max safe VF is: vscale x 2. -; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2. -; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2 -; CHECK-DBG: LV: Using max VF vscale x 2 +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. +; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. 
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. +; CHECK-DBG: Found feasible scalable VF = vscale x 2 +; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test4 -; CHECK: <vscale x 2 x i32> +; CHECK: <4 x i32> define void @test4(i32* %a, i32* %b) { entry: br label %loop @@ -225,7 +229,7 @@ ; Max fixed VF=128, Max scalable VF=8, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test5" -; CHECK-DBG: LV: The max safe VF is: vscale x 8. +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. ; CHECK-DBG: LV: Using user VF vscale x 4 ; CHECK-LABEL: @test5 ; CHECK: <vscale x 4 x i32> @@ -257,7 +261,8 @@ ; test6 ; -; Scalable vectorization feasible, but the VF is unsafe. Should clamp. +; Scalable vectorization feasible, but the VF is unsafe. Should ignore +; the hint and leave it to the vectorizer to pick a more suitable VF. ; ; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32. ; @@ -268,15 +273,16 @@ ; } ; } ; -; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8. +; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test6" -; CHECK-DBG: LV: The max safe VF is: vscale x 8. -; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8. -; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8 -; CHECK-DBG: LV: Using max VF vscale x 8 +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. +; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF. +; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. +; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4 +; CHECK-DBG: Selecting VF: 4. 
; CHECK-LABEL: @test6 -; CHECK: <vscale x 8 x i32> +; CHECK: <4 x i32> define void @test6(i32* %a, i32* %b) { entry: br label %loop @@ -304,8 +310,9 @@ !17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} ; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve" -; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors. -; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors. +; CHECK-NO-SVE: LV: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK-NO-SVE: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK-NO-SVE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. ; CHECK-NO-SVE: LV: Selecting VF: 4. ; CHECK-NO-SVE: <4 x i32> ; CHECK-NO-SVE-NOT: <vscale x 4 x i32> @@ -337,8 +344,8 @@ ; supported but max vscale is undefined. ; ; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale" -; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4. +; CHECK-NO-MAX-VSCALE: The max safe fixed VF is: 4. +; CHECK-NO-MAX-VSCALE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. ; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4. ; CHECK-NO-MAX-VSCALE: <4 x i32> define void @test_no_max_vscale(i32* %a, i32* %b) { diff --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll --- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors. 
-; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors. +; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors. ; CHECK: LV: The Widest register safe to use is: 32 bits. define void @test1(i32* %a, i32* %b) { entry: