Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1608,14 +1608,26 @@ /// \return UserVF directly if it is valid. Otherwise clamp UserVF to the /// largest valid value. Optional getFeasibleUserVF(ElementCount UserVF, - unsigned MaxSafeElements); + ElementCount MaxSafeElements); /// \return An upper bound for the vectorization factor, a power-of-2 larger - /// than zero. One is returned if vectorization should best be avoided due - /// to cost. + /// than zero, limited by \p MaxSafeVF. If \p MaxSafeVF is scalable, the + /// computed feasible max VF will be scalable as well. One (scalar) is + /// returned if vectorization should best be avoided due to cost. ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount MaxSafeVF, unsigned SmallestType, unsigned WidestType); + /// \return the \p SuggestedVF if it is less than or equal to \p MaxSafeVF, + /// otherwise the value is clamped to MaxSafeVF. If \p SuggestedVF is + /// scalable, and \p MaxSafeVF is not, then it uses MaxVScale to determine + /// whether it can use a smaller scalable VF. Otherwise it clamps to a + /// fixed-width VF. If \p GetMaxSafeVF is not None, then the calculated + /// maximum safe VF will be written to the passed address. + ElementCount clampFeasibleMaxVF(ElementCount SuggestedVF, + ElementCount MaxSafeVF, + Optional GetMaxSafeVF = None); + /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually /// operate on @@ -5563,13 +5575,17 @@ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). 
- unsigned MaxSafeElements = - PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + auto MaxSafeElements = ElementCount::getFixed( + PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType)); // First analyze the UserVF, fall back if the UserVF should be ignored. if (auto MaybeMaxVF = getFeasibleUserVF(UserVF, MaxSafeElements)) return MaybeMaxVF.getValue(); - return computeFeasibleMaxVF(TC, SmallestType, WidestType); + + // Try to automatically determine a suitable maximum VF. + auto MaxSafeVF = + clampFeasibleMaxVF(ElementCount::getFixed(1 << 16), MaxSafeElements); + return computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType); }; switch (ScalarEpilogueStatus) { @@ -5691,9 +5707,35 @@ return None; } +ElementCount LoopVectorizationCostModel::clampFeasibleMaxVF( + ElementCount SuggestedVF, ElementCount ClampValue, + Optional OutputMaxValidVF) { + assert((SuggestedVF.isScalable() || !ClampValue.isScalable()) && + "Cannot clamp a fixed-width VF to a scalable VF"); + + ElementCount MaxVF = ClampValue; + if (SuggestedVF.isScalable() && !ClampValue.isScalable()) { + Optional MaxVScale = TTI.getMaxVScale(); + + // Scale VF by vscale before checking if it's safe. + MaxVF = ElementCount::getScalable( + MaxVScale ? (ClampValue.getFixedValue() / MaxVScale.getValue()) : 0); + + // Fall back on fixed-width VF. + if (MaxVF.isZero()) + return clampFeasibleMaxVF( + ElementCount::getFixed(SuggestedVF.getKnownMinValue()), ClampValue, + OutputMaxValidVF); + } + + if (OutputMaxValidVF) + **OutputMaxValidVF = MaxVF; + return ElementCount::isKnownLE(SuggestedVF, MaxVF) ? 
SuggestedVF : MaxVF; +} + Optional LoopVectorizationCostModel::getFeasibleUserVF(ElementCount UserVF, - unsigned MaxSafeElements) { + ElementCount MaxSafeElements) { if (UserVF.isZero()) return None; @@ -5725,97 +5767,86 @@ if (Legal->isSafeForAnyVectorWidth()) return UserVF; - ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); - - if (UserVF.isScalable()) { - Optional MaxVScale = TTI.getMaxVScale(); - - // Scale VF by vscale before checking if it's safe. - MaxSafeVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); - - if (MaxSafeVF.isZero()) { - // The dependence distance is too small to use scalable vectors, - // fallback on fixed. - LLVM_DEBUG( - dbgs() - << "LV: Max legal vector width too small, scalable vectorization " - "unfeasible. Using fixed-width vectorization instead.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Max legal vector width too small, scalable vectorization " - << "unfeasible. Using fixed-width vectorization instead."; - }); - return getFeasibleUserVF( - ElementCount::getFixed(UserVF.getKnownMinValue()), MaxSafeElements); - } - } + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. Otherwise, return as is. + ElementCount MaxSafeVF; + ElementCount NewVF = clampFeasibleMaxVF(UserVF, MaxSafeElements, &MaxSafeVF); + // Emit some useful debug output / opt remarks if the user value is clamped. 
LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); - if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) - return UserVF; - - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe, clamping to max safe VF=" << MaxSafeVF - << ".\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe, clamping to maximum safe vectorization factor " - << ore::NV("VectorizationFactor", MaxSafeVF); - }); - return MaxSafeVF; + if (UserVF.isScalable() != NewVF.isScalable()) { + auto Diag = "Max legal vector width too small, scalable vectorization " + "unfeasible. Using fixed-width vectorization instead."; + LLVM_DEBUG(dbgs() << "LV: " << Diag << "\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << Diag; + }); + } + if (NewVF.getKnownMinValue() != UserVF.getKnownMinValue()) { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" << MaxSafeVF + << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeVF); + }); + } + return NewVF; } ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF( - unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType) { - // Get the maximum safe dependence distance in bits computed by LAA. - // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from - // the memory accesses that is most restrictive (involved in the smallest - // dependence distance). 
- unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); - + unsigned ConstTripCount, ElementCount MaxSafeVF, unsigned SmallestType, + unsigned WidestType) { unsigned WidestRegister = TTI.getRegisterBitWidth(true); - WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType + << " / " << WidestType << " bits.\n"); + LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " + << WidestRegister << " bits.\n"); // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. auto MaxVectorSize = ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); - LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType - << " / " << WidestType << " bits.\n"); - LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " - << WidestRegister << " bits.\n"); - - assert(MaxVectorSize.getFixedValue() <= WidestRegister && - "Did not expect to pack so many elements" - " into one vector!"); - if (MaxVectorSize.getFixedValue() == 0) { + if (MaxVectorSize.getKnownMinValue() == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); return ElementCount::getFixed(1); - } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && - isPowerOf2_32(ConstTripCount)) { + } + + if (!Legal->isSafeForAnyVectorWidth()) + MaxVectorSize = clampFeasibleMaxVF(MaxVectorSize, MaxSafeVF); + + if (ConstTripCount && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in // choosing a higher viable VF as done in the loop below. 
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); - return ElementCount::getFixed(ConstTripCount); + ElementCount ClampedVF = clampFeasibleMaxVF( + MaxVectorSize, ElementCount::getFixed(ConstTripCount)); + if (ClampedVF != MaxVectorSize) { + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " + << ConstTripCount << "\n"); + return ClampedVF; + } } ElementCount MaxVF = MaxVectorSize; if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || (MaximizeBandwidth && isScalarEpilogueAllowed())) { + auto MaxVectorSizeMaxBW = + ElementCount::getFixed(PowerOf2Floor(WidestRegister / SmallestType)); + if (!Legal->isSafeForAnyVectorWidth()) + MaxVectorSizeMaxBW = clampFeasibleMaxVF(MaxVectorSizeMaxBW, MaxSafeVF); + // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector VFs; - auto MaxVectorSizeMaxBW = - ElementCount::getFixed(WidestRegister / SmallestType); for (ElementCount VS = MaxVectorSize * 2; ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) VFs.push_back(VS); Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -37,9 +37,9 @@ ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check ; fixed-width vectorization is used instead. +; CHECK-DBG: LV: The max safe VF is: 8. ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. ; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CHECK-DBG: LV: The max safe VF is: 8. ; CHECK-DBG: LV: Selecting VF: 4. 
; CHECK-LABEL: @test1 ; CHECK: <4 x i32> @@ -80,9 +80,9 @@ ; } ; } -; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. ; CHECK-DBG: LV: The max safe VF is: 4. -; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4. +; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. +; CHECK-DBG: LV: User VF=vscale x 8 is unsafe, clamping to max safe VF=4. ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test2 ; CHECK: <4 x i32> @@ -337,8 +337,8 @@ ; supported but max vscale is undefined. ; ; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale" -; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4. +; CHECK-NO-MAX-VSCALE: The max safe VF is: 4. +; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. ; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4. ; CHECK-NO-MAX-VSCALE: <4 x i32> define void @test_no_max_vscale(i32* %a, i32* %b) {