diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5676,10 +5676,32 @@
 ElementCount
 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                  ElementCount UserVF) {
-  bool IgnoreScalableUserVF = UserVF.isScalable() &&
-                              !TTI.supportsScalableVectors() &&
-                              !ForceTargetSupportsScalableVectors;
-  if (IgnoreScalableUserVF) {
+  // Start with the largest possible fixed and scalable vector factors.
+  // The maximum vectorization factors are gradually constrained as we apply
+  // legality restrictions.
+  ElementCount FixedMaxVF = ElementCount::getFixed(30u << 16);
+  ElementCount ScalableMaxVF = ElementCount::getScalable(30u << 16);
+
+  auto ElementCountMin = [](ElementCount A, ElementCount B) {
+    // Both element counts must be of the same kind (fixed or scalable).
+    assert(A.isScalable() == B.isScalable() &&
+           "can only be used for element counts with matching scalability");
+    if (ElementCount::isKnownLE(A, B))
+      return A;
+    // Is this needed or always guaranteed?
+    assert(ElementCount::isKnownGT(A, B));
+    return B;
+  };
+
+  // If the UserVF is not scalable, disable scalable vectorization.
+  if (!UserVF.isScalable())
+    ScalableMaxVF = ElementCount::getScalable(0);
+
+  // Display remark if scalable vectorization is requested but not supported by
+  // the target.
+  if (!ScalableMaxVF.isZero() && !TTI.supportsScalableVectors() &&
+      !ForceTargetSupportsScalableVectors) {
+    ScalableMaxVF = ElementCount::getScalable(0);
     LLVM_DEBUG(
         dbgs() << "LV: Ignoring VF=" << UserVF
                << " because target does not support scalable vectors.\n");
@@ -5690,14 +5712,14 @@
              << "Ignoring VF=" << ore::NV("UserVF", UserVF)
              << " because target does not support scalable vectors.";
     });
+    UserVF = ElementCount::getScalable(0);
   }
 
-  // Beyond this point two scenarios are handled. If UserVF isn't specified
-  // then a suitable VF is chosen. If UserVF is specified and there are
-  // dependencies, check if it's legal. However, if a UserVF is specified and
-  // there are no dependencies, then there's nothing to do.
-  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
-    if (!canVectorizeReductions(UserVF)) {
+  // Disable scalable vectorization, if the loop contains unsupported
+  // reductions.
+  if (!ScalableMaxVF.isZero() && !canVectorizeReductions(UserVF)) {
+    ScalableMaxVF = ElementCount::getScalable(0);
+    if (UserVF.isNonZero())
       reportVectorizationFailure(
           "LV: Scalable vectorization not supported for the reduction "
           "operations found in this loop. Using fixed-width "
@@ -5705,40 +5727,33 @@
           "Scalable vectorization not supported for the reduction operations "
           "found in this loop. Using fixed-width vectorization instead.",
           "ScalableVFUnfeasible", ORE, TheLoop);
-      return computeFeasibleMaxVF(
-          ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
-    }
-
-    if (Legal->isSafeForAnyVectorWidth())
-      return UserVF;
+    UserVF = ElementCount::getScalable(0);
   }
 
+  // Limit maximum vectorization factors based on memory dependence constraints.
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-
   // Get the maximum safe dependence distance in bits computed by LAA.
   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
   // the memory accesses that is most restrictive (involved in the smallest
   // dependence distance).
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
-  // If the user vectorization factor is legally unsafe, clamp it to a safe
-  // value. Otherwise, return as is.
-  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
     unsigned MaxSafeElements =
         PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
-    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+    FixedMaxVF = ElementCount::getFixed(MaxSafeElements);
 
-    if (UserVF.isScalable()) {
+    // Limit ScalableMaxVF by the maximum safe dependence distance.
+    if (!ScalableMaxVF.isZero() && !Legal->isSafeForAnyVectorWidth()) {
       Optional<unsigned> MaxVScale = TTI.getMaxVScale();
 
       // Scale VF by vscale before checking if it's safe.
-      MaxSafeVF = ElementCount::getScalable(
+      ScalableMaxVF = ElementCount::getScalable(
           MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
 
-      if (MaxSafeVF.isZero()) {
+      if (ScalableMaxVF.isZero()) {
         // The dependence distance is too small to use scalable vectors,
         // fallback on fixed.
         LLVM_DEBUG(
@@ -5752,37 +5767,56 @@
                  << "Max legal vector width too small, scalable vectorization "
                  << "unfeasible. Using fixed-width vectorization instead.";
         });
-        return computeFeasibleMaxVF(
-            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
       }
     }
 
-    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+    // TODO: Just print both max factors? The scalable will be 0 if not
+    // supported by target.
+    auto MaxSafeVFForPrint = UserVF.isScalable() && ScalableMaxVF.isNonZero()
+                                 ? ScalableMaxVF
+                                 : FixedMaxVF;
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVFForPrint
+                      << ".\n");
 
-    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
-      return UserVF;
+    // Check if UserVF can be used. If required, clamp it by the right maximum
+    // VF.
+    if (UserVF.isNonZero()) {
+      auto MaxSafeVF = UserVF.isScalable() ? ScalableMaxVF : FixedMaxVF;
 
-    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
-                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
-                      << ".\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
-                                        TheLoop->getStartLoc(),
-                                        TheLoop->getHeader())
-             << "User-specified vectorization factor "
-             << ore::NV("UserVectorizationFactor", UserVF)
-             << " is unsafe, clamping to maximum safe vectorization factor "
-             << ore::NV("VectorizationFactor", MaxSafeVF);
-    });
-    return MaxSafeVF;
-  }
+      // If the UserVF can be used, return it.
+      if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
+        return UserVF;
 
-  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
+      assert(ElementCount::isKnownGT(UserVF, MaxSafeVF));
+
+      // If no scalable vectorization factor is safe, revert to fixed width
+      // vectorization.
+      bool RevertToFixed = UserVF.isScalable() && MaxSafeVF.isZero();
+      if (RevertToFixed)
+        MaxSafeVF = FixedMaxVF;
+
+      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+                        << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+                        << ".\n");
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                          TheLoop->getStartLoc(),
+                                          TheLoop->getHeader())
+               << "User-specified vectorization factor "
+               << ore::NV("UserVectorizationFactor", UserVF)
+               << " is unsafe, clamping to maximum safe vectorization factor "
+               << ore::NV("VectorizationFactor", MaxSafeVF);
+      });
+      if (!RevertToFixed)
+        return MaxSafeVF;
+    }
 
   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
   // Note that both WidestRegister and WidestType may not be a powers of 2.
   auto MaxVectorSize =
       ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
+  // Clamp to maximum legal VF.
+  MaxVectorSize = ElementCountMin(MaxVectorSize, FixedMaxVF);
 
   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                     << " / " << WidestType << " bits.\n");
@@ -5812,6 +5846,9 @@
     SmallVector<ElementCount, 8> VFs;
     auto MaxVectorSizeMaxBW =
         ElementCount::getFixed(WidestRegister / SmallestType);
+    // Clamp to maximum legal VF.
+    MaxVectorSizeMaxBW = ElementCountMin(MaxVectorSizeMaxBW, FixedMaxVF);
+
     for (ElementCount VS = MaxVectorSize * 2;
          ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
       VFs.push_back(VS);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -323,17 +323,17 @@
 ; MUL
 
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
-; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
 define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @mul
 ; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <8 x i32>
-; CHECK: %[[LOAD2:.*]] = load <8 x i32>
-; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]]
-; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]]
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
 
@@ -353,21 +353,21 @@
 
 ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
-; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
 define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @memory_dependence
 ; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <8 x i32>
-; CHECK: %[[LOAD2:.*]] = load <8 x i32>
-; CHECK: %[[LOAD3:.*]] = load <8 x i32>
-; CHECK: %[[LOAD4:.*]] = load <8 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]]
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[LOAD3:.*]] = load <4 x i32>
+; CHECK: %[[LOAD4:.*]] = load <4 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -82,7 +82,7 @@
 
 ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CHECK-DBG: LV: The max safe VF is: 4.
-; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
+; CHECK-DBG: LV: User VF=vscale x 8 is unsafe, clamping to max safe VF=4.
 ; CHECK-DBG: LV: Selecting VF: 4.
 ; CHECK-LABEL: @test2
 ; CHECK: <4 x i32>