Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1654,6 +1654,13 @@
   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                     unsigned SmallestType, unsigned WidestType);
 
+  /// \return A pair with the clamped VF and the maximum safe VF.
+  /// If MaxSafeElements allows it, ClampedVF is the SuggestedVF, otherwise
+  /// it is clamped to MaxSafeElements. If SuggestedVF is scalable, the returned
+  /// VF may be clamped to a non-scalable VF.
+  std::pair<ElementCount, ElementCount>
+  clampFeasibleMaxVF(ElementCount SuggestedVF, unsigned MaxSafeElements);
+
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -5689,6 +5696,30 @@
   return None;
 }
 
+std::pair<ElementCount, ElementCount>
+LoopVectorizationCostModel::clampFeasibleMaxVF(ElementCount SuggestedVF,
+                                               unsigned MaxSafeElements) {
+  ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+  if (SuggestedVF.isScalable()) {
+    Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+    // Scale VF by vscale before checking if it's safe.
+    MaxSafeVF = ElementCount::getScalable(
+        MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+
+    // Fall back on fixed-width VF.
+    if (MaxSafeVF.isZero())
+      return clampFeasibleMaxVF(
+          ElementCount::getFixed(SuggestedVF.getKnownMinValue()),
+          MaxSafeElements);
+  }
+
+  if (ElementCount::isKnownLE(SuggestedVF, MaxSafeVF))
+    return {SuggestedVF, MaxSafeVF};
+
+  return {MaxSafeVF, MaxSafeVF};
+}
+
 Optional<ElementCount>
 LoopVectorizationCostModel::computeFeasibleUserVF(ElementCount UserVF,
                                                   unsigned WidestType) {
@@ -5730,51 +5761,40 @@
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
   unsigned MaxSafeElements =
       PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
-  ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
-
-  if (UserVF.isScalable()) {
-    Optional<unsigned> MaxVScale = TTI.getMaxVScale();
-
-    // Scale VF by vscale before checking if it's safe.
-    MaxSafeVF = ElementCount::getScalable(
-        MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
-    if (MaxSafeVF.isZero()) {
-      // The dependence distance is too small to use scalable vectors,
-      // fallback on fixed.
-      LLVM_DEBUG(
-          dbgs()
-          << "LV: Max legal vector width too small, scalable vectorization "
-             "unfeasible. Using fixed-width vectorization instead.\n");
-      ORE->emit([&]() {
-        return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
-                                          TheLoop->getStartLoc(),
-                                          TheLoop->getHeader())
-               << "Max legal vector width too small, scalable vectorization "
-               << "unfeasible. Using fixed-width vectorization instead.";
-      });
-      return computeFeasibleUserVF(
-          ElementCount::getFixed(UserVF.getKnownMinValue()), WidestType);
-    }
-  }
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
+  ElementCount NewVF = ElementCount::getNull(), MaxSafeVF = NewVF;
+  std::tie(NewVF, MaxSafeVF) = clampFeasibleMaxVF(UserVF, MaxSafeElements);
 
+  // Emit some useful debug output / opt remarks if the user value is clamped.
   LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
 
-  if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
-    return UserVF;
-
-  LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
-                    << " is unsafe, clamping to max safe VF=" << MaxSafeVF
-                    << ".\n");
-  ORE->emit([&]() {
-    return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
-                                      TheLoop->getStartLoc(),
-                                      TheLoop->getHeader())
-           << "User-specified vectorization factor "
-           << ore::NV("UserVectorizationFactor", UserVF)
-           << " is unsafe, clamping to maximum safe vectorization factor "
-           << ore::NV("VectorizationFactor", MaxSafeVF);
-  });
-  return MaxSafeVF;
+  if (UserVF.isScalable() != NewVF.isScalable()) {
+    auto Diag = "Max legal vector width too small, scalable vectorization "
+                "unfeasible. Using fixed-width vectorization instead.";
+    LLVM_DEBUG(dbgs() << "LV: " << Diag << "\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << Diag;
+    });
+  }
+  if (NewVF.getKnownMinValue() != UserVF.getKnownMinValue()) {
+    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+                      << " is unsafe, clamping to max safe VF=" << NewVF
+                      << ".\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "User-specified vectorization factor "
+             << ore::NV("UserVectorizationFactor", UserVF)
+             << " is unsafe, clamping to maximum safe vectorization factor "
+             << ore::NV("VectorizationFactor", NewVF);
+    });
+  }
+  return NewVF;
 }
 
 ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF(
@@ -5784,43 +5804,50 @@
   // the memory accesses that is most restrictive (involved in the smallest
   // dependence distance).
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
+  unsigned MaxSafeElements =
+      PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
 
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
-
-  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
-  // Note that both WidestRegister and WidestType may not be a powers of 2.
-  auto MaxVectorSize =
-      ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
 
   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                     << " / " << WidestType << " bits.\n");
   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                     << WidestRegister << " bits.\n");
 
-  assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
-         "Did not expect to pack so many elements"
-         " into one vector!");
-  if (MaxVectorSize.getFixedValue() == 0) {
+  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
+  // Note that both WidestRegister and WidestType may not be powers of 2.
+  auto MaxVectorSize =
+      ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
+  std::tie(MaxVectorSize, std::ignore) =
+      clampFeasibleMaxVF(MaxVectorSize, MaxSafeElements);
+
+  if (MaxVectorSize.getKnownMinValue() == 0) {
     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
     return ElementCount::getFixed(1);
-  } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
-             isPowerOf2_32(ConstTripCount)) {
+  } else if (ConstTripCount && isPowerOf2_32(ConstTripCount)) {
     // We need to clamp the VF to be the ConstTripCount. There is no point in
     // choosing a higher viable VF as done in the loop below.
-    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
-                      << ConstTripCount << "\n");
-    return ElementCount::getFixed(ConstTripCount);
+    ElementCount ClampedVF = MaxVectorSize;
+    std::tie(ClampedVF, std::ignore) =
+        clampFeasibleMaxVF(MaxVectorSize, ConstTripCount);
+    if (ClampedVF != MaxVectorSize) {
+      LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+                        << ConstTripCount << "\n");
+      return ClampedVF;
+    }
   }
 
   ElementCount MaxVF = MaxVectorSize;
   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+    auto MaxVectorSizeMaxBW =
+        ElementCount::getFixed(PowerOf2Floor(WidestRegister / SmallestType));
+    std::tie(MaxVectorSizeMaxBW, std::ignore) =
+        clampFeasibleMaxVF(MaxVectorSizeMaxBW, MaxSafeElements);
+
     // Collect all viable vectorization factors larger than the default MaxVF
     // (i.e. MaxVectorSize).
     SmallVector<ElementCount, 8> VFs;
-    auto MaxVectorSizeMaxBW =
-        ElementCount::getFixed(WidestRegister / SmallestType);
     for (ElementCount VS = MaxVectorSize * 2;
          ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
       VFs.push_back(VS);
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -37,9 +37,9 @@
 ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
 ; fixed-width vectorization is used instead.
 
+; CHECK-DBG: LV: The max safe VF is: 8.
 ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
-; CHECK-DBG: LV: The max safe VF is: 8.
 ; CHECK-DBG: LV: Selecting VF: 4.
 ; CHECK-LABEL: @test1
 ; CHECK: <4 x i32>
@@ -80,9 +80,9 @@
 ;   }
 ; }
 
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CHECK-DBG: LV: The max safe VF is: 4.
-; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: User VF=vscale x 8 is unsafe, clamping to max safe VF=4.
 ; CHECK-DBG: LV: Selecting VF: 4.
 ; CHECK-LABEL: @test2
 ; CHECK: <4 x i32>
@@ -337,8 +337,8 @@
 ; supported but max vscale is undefined.
 ;
 ; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
-; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
-; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4.
+; CHECK-NO-MAX-VSCALE: The max safe VF is: 4.
+; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
 ; CHECK-NO-MAX-VSCALE: <4 x i32>
 define void @test_no_max_vscale(i32* %a, i32* %b) {