diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -174,6 +174,13 @@ const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr); +/// Reports an informative message: print \p OREMsg for debugging purposes as well +/// as an optimization remark. Uses either \p I as location of the remark, or +/// otherwise \p TheLoop. +void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I = nullptr); + } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1065,13 +1065,13 @@ B.SetCurrentDebugLocation(DebugLoc()); } -/// Write a record \p DebugMsg about vectorization failure to the debug -/// output stream. If \p I is passed, it is an instruction that prevents -/// vectorization. +/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I +/// is passed, the message relates to that particular instruction. 
#ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; +static void debugVectorizationMessage(const StringRef Prefix, + const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: " << Prefix << DebugMsg; if (I != nullptr) dbgs() << " " << *I; else @@ -1100,9 +1100,7 @@ DL = I->getDebugLoc(); } - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; + return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } /// Return a value for Step multiplied by VF. @@ -1123,12 +1121,24 @@ } void reportVectorizationFailure(const StringRef DebugMsg, - const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I) { + LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit( + createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) + << "loop not vectorized: " << OREMsg); +} + +void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I) { + LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); - ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); + ORE->emit( + createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) + << Msg); } } // end namespace llvm @@ -1623,6 +1633,18 @@ ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF); + /// \return the maximized element count based on the targets vector + /// registers and the loop trip-count, but limited to 
a maximum safe VF. + /// This is a helper function of computeFeasibleMaxVF. + ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, + unsigned SmallestType, + unsigned WidestType, + ElementCount MaxSafeVF); + + /// \return the maximum legal scalable VF, based on the safe max number + /// of elements. + ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); + /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually /// operate on @@ -5576,6 +5598,129 @@ return false; } +ElementCount +LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { + if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { + reportVectorizationInfo( + "Disabling scalable vectorization, because target does not " + "support scalable vectors.", + "ScalableVectorsUnsupported", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + auto MaxScalableVF = ElementCount::getScalable(1u << 16); + + // Disable scalable vectorization if the loop contains unsupported reductions. + // Test that the loop-vectorizer can legalize all operations for this MaxVF. + // FIXME: While for scalable vectors this is currently sufficient, this should + // be replaced by a more detailed mechanism that filters out specific VFs, + // instead of invalidating vectorization for a whole set of VFs based on the + // MaxVF. + if (!canVectorizeReductions(MaxScalableVF)) { + reportVectorizationInfo( + "Scalable vectorization not supported for the reduction " + "operations found in this loop.", + "ScalableVFUnfeasible", ORE, TheLoop); + return ElementCount::getScalable(0); + } + + if (Legal->isSafeForAnyVectorWidth()) + return MaxScalableVF; + + // Limit MaxScalableVF by the maximum safe dependence distance. + Optional MaxVScale = TTI.getMaxVScale(); + MaxScalableVF = ElementCount::getScalable( + MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); + if (!MaxScalableVF) + reportVectorizationInfo( + "Max legal vector width too small, scalable vectorization " + "unfeasible.", + "ScalableVFUnfeasible", ORE, TheLoop); + + return MaxScalableVF; +} + +ElementCount +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); + + // Get the maximum safe dependence distance in bits computed by LAA. + // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from + // the memory accesses that is most restrictive (involved in the smallest + // dependence distance). + unsigned MaxSafeElements = + PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + + auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); + auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); + + LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF + << ".\n"); + LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF + << ".\n"); + + // First analyze the UserVF, fall back if the UserVF should be ignored. + if (UserVF) { + auto MaxSafeUserVF = + UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; + + if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) + return UserVF; + + assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); + + // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it + // is better to ignore the hint and let the compiler choose a suitable VF. 
+ if (!UserVF.isScalable()) { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" + << MaxSafeFixedVF << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeFixedVF); + }); + return MaxSafeFixedVF; + } + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe. Ignoring scalable UserVF.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe. Ignoring the hint to let the compiler pick a " + "suitable VF."; + }); + } + + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType + << " / " << WidestType << " bits.\n"); + + ElementCount MaxFixedVF = ElementCount::getFixed(1); + if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, + WidestType, MaxSafeFixedVF)) + MaxFixedVF = MaxVF; + + if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, + WidestType, MaxSafeScalableVF)) + // FIXME: Return scalable VF as well (to be added in future patch). 
+ if (MaxVF.isScalable()) + LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF + << "\n"); + + return MaxFixedVF; +} + Optional LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { @@ -5716,149 +5861,61 @@ return None; } -ElementCount -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { - bool IgnoreScalableUserVF = UserVF.isScalable() && - !TTI.supportsScalableVectors() && - !ForceTargetSupportsScalableVectors; - if (IgnoreScalableUserVF) { - LLVM_DEBUG( - dbgs() << "LV: Ignoring VF=" << UserVF - << " because target does not support scalable vectors.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Ignoring VF=" << ore::NV("UserVF", UserVF) - << " because target does not support scalable vectors."; - }); - } - - // Beyond this point two scenarios are handled. If UserVF isn't specified - // then a suitable VF is chosen. If UserVF is specified and there are - // dependencies, check if it's legal. However, if a UserVF is specified and - // there are no dependencies, then there's nothing to do. - if (UserVF.isNonZero() && !IgnoreScalableUserVF) { - if (!canVectorizeReductions(UserVF)) { - reportVectorizationFailure( - "LV: Scalable vectorization not supported for the reduction " - "operations found in this loop. Using fixed-width " - "vectorization instead.", - "Scalable vectorization not supported for the reduction operations " - "found in this loop. 
Using fixed-width vectorization instead.", - "ScalableVFUnfeasible", ORE, TheLoop); - return computeFeasibleMaxVF( - ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); - } - - if (Legal->isSafeForAnyVectorWidth()) - return UserVF; - } - - MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); - unsigned SmallestType, WidestType; - std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); - unsigned WidestRegister = - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedSize(); +ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( + unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, + ElementCount MaxSafeVF) { + bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); + TypeSize WidestRegister = TTI.getRegisterBitWidth( + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector); - // Get the maximum safe dependence distance in bits computed by LAA. - // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from - // the memory accesses that is most restrictive (involved in the smallest - // dependence distance). - unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); - - // If the user vectorization factor is legally unsafe, clamp it to a safe - // value. Otherwise, return as is. - if (UserVF.isNonZero() && !IgnoreScalableUserVF) { - unsigned MaxSafeElements = - PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); - - if (UserVF.isScalable()) { - Optional MaxVScale = TTI.getMaxVScale(); - - // Scale VF by vscale before checking if it's safe. - MaxSafeVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); - - if (MaxSafeVF.isZero()) { - // The dependence distance is too small to use scalable vectors, - // fallback on fixed. 
- LLVM_DEBUG( - dbgs() - << "LV: Max legal vector width too small, scalable vectorization " - "unfeasible. Using fixed-width vectorization instead.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "Max legal vector width too small, scalable vectorization " - << "unfeasible. Using fixed-width vectorization instead."; - }); - return computeFeasibleMaxVF( - ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); - } - } - - LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); - - if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) - return UserVF; - - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe, clamping to max safe VF=" << MaxSafeVF - << ".\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe, clamping to maximum safe vectorization factor " - << ore::NV("VectorizationFactor", MaxSafeVF); - }); - return MaxSafeVF; - } - - WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); + // Convenience function to return the minimum of two ElementCounts. + auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { + assert((LHS.isScalable() == RHS.isScalable()) && + "Scalable flags must match"); + return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; + }; // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. 
- auto MaxVectorSize = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); - - LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType - << " / " << WidestType << " bits.\n"); + auto MaxVectorElementCount = ElementCount::get( + PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), + ComputeScalableMaxVF); + MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " - << WidestRegister << " bits.\n"); + << (MaxVectorElementCount * WidestType) << " bits.\n"); - assert(MaxVectorSize.getFixedValue() <= WidestRegister && - "Did not expect to pack so many elements" - " into one vector!"); - if (MaxVectorSize.getFixedValue() == 0) { + if (!MaxVectorElementCount) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); return ElementCount::getFixed(1); - } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && - isPowerOf2_32(ConstTripCount)) { + } + + const auto TripCountEC = ElementCount::getFixed(ConstTripCount); + if (ConstTripCount && + ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && + isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in - // choosing a higher viable VF as done in the loop below. + // choosing a higher viable VF as done in the loop below. If + // MaxVectorElementCount is scalable, we only fall back on a fixed VF when + // the TC is less than or equal to the known number of lanes. 
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); - return ElementCount::getFixed(ConstTripCount); + return TripCountEC; } - ElementCount MaxVF = MaxVectorSize; + ElementCount MaxVF = MaxVectorElementCount; if (TTI.shouldMaximizeVectorBandwidth() || (MaximizeBandwidth && isScalarEpilogueAllowed())) { + auto MaxVectorElementCountMaxBW = ElementCount::get( + PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), + ComputeScalableMaxVF); + MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); + // Collect all viable vectorization factors larger than the default MaxVF - // (i.e. MaxVectorSize). + // (i.e. MaxVectorElementCount). SmallVector VFs; - auto MaxVectorSizeMaxBW = - ElementCount::getFixed(WidestRegister / SmallestType); - for (ElementCount VS = MaxVectorSize * 2; - ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) + for (ElementCount VS = MaxVectorElementCount * 2; + ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) VFs.push_back(VS); // For each VF calculate its register usage. @@ -5879,7 +5936,7 @@ } } if (ElementCount MinVF = - TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { + TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -221,7 +221,7 @@ ret float %add } -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. 
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. ; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) { ; CHECK-LABEL: @fadd_fast_bfloat @@ -322,18 +322,18 @@ ; MUL -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. -; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @mul ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <8 x i32> -; CHECK: %[[LOAD2:.*]] = load <8 x i32> -; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]] -; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]] +; CHECK: %[[LOAD1:.*]] = load <4 x i32> +; CHECK: %[[LOAD2:.*]] = load <4 x i32> +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]] +; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) entry: br label %for.body @@ -352,22 +352,22 @@ } ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. 
-; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <8 x i32> -; CHECK: %[[LOAD2:.*]] = load <8 x i32> -; CHECK: %[[LOAD3:.*]] = load <8 x i32> -; CHECK: %[[LOAD4:.*]] = load <8 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]] +; CHECK: %[[LOAD1:.*]] = load <4 x i32> +; CHECK: %[[LOAD2:.*]] = load <4 x i32> +; CHECK: %[[LOAD3:.*]] = load <4 x i32> +; CHECK: %[[LOAD4:.*]] = load <4 x i32> +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] +; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] +; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll @@ -0,0 +1,149 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK,CHECK_SCALABLE_ON +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW + +; Test that the MaxVF for the following loop, that has no dependence distances, +; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16 +; (maximized bandwidth for i8 in the loop). +define void @test0(i32* %a, i8* %b, i32* %c) { +; CHECK: LV: Checking a loop in "test0" +; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 64 elements, is calculated as (maxvscale = 16) * 4. 
+define void @test1(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test1" +; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 64 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 32 elements, is calculated as (maxvscale = 16) * 2. +define void @test2(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test2" +; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 16 elements, is calculated as (maxvscale = 16) * 1. 
+define void @test3(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test3" +; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 16 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +; Test the fallback mechanism when scalable vectors are not feasible due +; to e.g. dependence distance. For the '-scalable-vectorization=exclusive' +; it shouldn't try to vectorize with fixed-width vectors. 
+define void @test4(i32* %a, i32* %b) { +; CHECK: LV: Checking a loop in "test4" +; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 8 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2 + +exit: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = distinct !{!2, !3, !4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -37,9 +37,10 @@ ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check ; fixed-width vectorization is used instead. -; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CHECK-DBG: LV: The max safe VF is: 8. +; CHECK-DBG: LV: Checking a loop in "test1" +; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. +; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. 
+; CHECK-DBG: LV: The max safe fixed VF is: 8. ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test1 ; CHECK: <4 x i32> @@ -80,9 +81,10 @@ ; } ; } -; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CHECK-DBG: LV: The max safe VF is: 4. -; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4. +; CHECK-DBG: LV: Checking a loop in "test2" +; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. +; CHECK-DBG: LV: The max safe fixed VF is: 4. +; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF. ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test2 ; CHECK: <4 x i32> @@ -129,7 +131,7 @@ ; Max fixed VF=32, Max scalable VF=2, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test3" -; CHECK-DBG: LV: The max safe VF is: vscale x 2. +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. ; CHECK-DBG: LV: Using user VF vscale x 2. ; CHECK-LABEL: @test3 ; CHECK: @@ -161,7 +163,8 @@ ; test4 ; -; Scalable vectorization feasible, but the VF is unsafe. Should clamp. +; Scalable vectorization feasible, but the given VF is unsafe. Should ignore +; the hint and leave it to the vectorizer to pick a more suitable VF. ; ; Specifies a vector of , i.e. maximum of 64 x i32 with 4 ; words per 128-bits (packed). @@ -173,15 +176,16 @@ ; } ; } ; -; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2. +; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test4" -; CHECK-DBG: LV: The max safe VF is: vscale x 2. -; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2. -; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2 -; CHECK-DBG: LV: Using max VF vscale x 2 +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. 
+; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. +; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. +; CHECK-DBG: Found feasible scalable VF = vscale x 2 +; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test4 -; CHECK: +; CHECK: <4 x i32> define void @test4(i32* %a, i32* %b) { entry: br label %loop @@ -225,7 +229,7 @@ ; Max fixed VF=128, Max scalable VF=8, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test5" -; CHECK-DBG: LV: The max safe VF is: vscale x 8. +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. ; CHECK-DBG: LV: Using user VF vscale x 4 ; CHECK-LABEL: @test5 ; CHECK: @@ -257,7 +261,8 @@ ; test6 ; -; Scalable vectorization feasible, but the VF is unsafe. Should clamp. +; Scalable vectorization feasible, but the VF is unsafe. Should ignore +; the hint and leave it to the vectorizer to pick a more suitable VF. ; ; Specifies a vector of , i.e. maximum of 256 x i32. ; @@ -268,15 +273,16 @@ ; } ; } ; -; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8. +; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test6" -; CHECK-DBG: LV: The max safe VF is: vscale x 8. -; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8. -; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8 -; CHECK-DBG: LV: Using max VF vscale x 8 +; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. +; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF. +; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. +; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4 +; CHECK-DBG: Selecting VF: 4. 
+; CHECK-LABEL: @test6 -; CHECK: +; CHECK: <4 x i32> define void @test6(i32* %a, i32* %b) { entry: br label %loop @@ -304,8 +310,9 @@ !17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} ; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve" -; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors. -; CHECK-NO-SVE: remark: :0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors. +; CHECK-NO-SVE: LV: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK-NO-SVE: remark: :0:0: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK-NO-SVE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. ; CHECK-NO-SVE: LV: Selecting VF: 4. ; CHECK-NO-SVE: <4 x i32> ; CHECK-NO-SVE-NOT: @@ -337,8 +344,8 @@ ; supported but max vscale is undefined. ; ; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale" -; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. -; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4. +; CHECK-NO-MAX-VSCALE: The max safe fixed VF is: 4. +; CHECK-NO-MAX-VSCALE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. ; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4. ; CHECK-NO-MAX-VSCALE: <4 x i32> define void @test_no_max_vscale(i32* %a, i32* %b) { diff --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll --- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors. 
-; CHECK: remark: :0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors. +; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK: remark: :0:0: Disabling scalable vectorization, because target does not support scalable vectors. ; CHECK: LV: The Widest register safe to use is: 32 bits. define void @test1(i32* %a, i32* %b) { entry: