Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -937,6 +937,11 @@ /// architectural maximum vector length, and None otherwise. Optional getMaxVScale() const; + /// \return The maximum number of bits for a block in a scalable vector + /// register for a scalable vector with a vscale number of blocks. + /// i.e. the maximum number of N x elt bits in <vscale x N x elt>. + unsigned getMaxScalableBitsPerBlock() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -1525,6 +1530,7 @@ virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual Optional getMaxVScale() const = 0; + virtual unsigned getMaxScalableBitsPerBlock() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const = 0; @@ -1953,6 +1959,9 @@ Optional getMaxVScale() const override { return Impl.getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const override { + return Impl.getMaxScalableBitsPerBlock(); + } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -373,6 +373,8 @@ Optional getMaxVScale() const { return None; } + unsigned getMaxScalableBitsPerBlock() const { return 0; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) 
const { Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -109,7 +109,7 @@ void emitRemarkWithHints() const; ElementCount getWidth() const { - return ElementCount::get(Width.Value, isScalable()); + return ElementCount::get(Width.Value, isForcedScalable()); } unsigned getInterleave() const { return Interleave.Value; } unsigned getIsVectorized() const { return IsVectorized.Value; } @@ -121,7 +121,13 @@ return (ForceKind)Force.Value; } - bool isScalable() const { return Scalable.Value; } + bool isForcedScalable() const { + return Scalable.Value == LoopVectorizeHints::FK_Enabled; + } + + bool allowScalable() const { + return Scalable.Value != LoopVectorizeHints::FK_Disabled; + } /// If hints are provided that force vectorization, use the AlwaysPrint /// pass name to force the frontend to print the diagnostic. 
Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -633,6 +633,10 @@ return TTIImpl->getMaxVScale(); } +unsigned TargetTransformInfo::getMaxScalableBitsPerBlock() const { + return TTIImpl->getMaxScalableBitsPerBlock(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -121,6 +121,12 @@ return BaseT::getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const { + if (ST->hasSVE()) + return AArch64::SVEBitsPerBlock; + return BaseT::getMaxScalableBitsPerBlock(); + } + unsigned getMaxInterleaveFactor(unsigned VF); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -51,6 +51,21 @@ cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +// FIXME: When scalable vectorization is stable enough, change the default +// to FK_Undefined. 
+static cl::opt ScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::FK_Disabled), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::FK_Disabled, "off", + "disable all vectorization with scalable vectors"), + clEnumValN(LoopVectorizeHints::FK_Undefined, "on", + "allow loops to be vectorized with scalable vectors"), + clEnumValN(LoopVectorizeHints::FK_Enabled, "always", + "allow loops to be vectorized exclusively with scalable vectors"))); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -63,10 +78,10 @@ case HK_UNROLL: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: + case HK_SCALABLE: return (Val <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: - case HK_SCALABLE: return (Val == 0 || Val == 1); } return false; @@ -80,8 +95,8 @@ Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), - Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), - ORE(ORE) { + Scalable("vectorize.scalable.enable", ScalableVectorization, HK_SCALABLE), + TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1652,7 +1652,8 @@ /// than zero. One is returned if vectorization should best be avoided due /// to cost. ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - unsigned SmallestType, unsigned WidestType); + unsigned SmallestType, unsigned WidestType, + bool ComputeMaxScalableVF); /// \return A pair with the clamped VF and the maximum safe VF. 
/// If MaxSafeElements allows it, ClampedVF is the SuggestedVF, otherwise @@ -5571,10 +5572,22 @@ unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); + // Try to find a max scalable VF. + if (Hints->allowScalable() && TTI.supportsScalableVectors()) { + ElementCount MaxScalableVF = + computeFeasibleMaxVF(TC, SmallestType, WidestType, + /*ComputeMaxScalableVF=*/true); + if(MaxScalableVF.isScalable()) + LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxScalableVF << "\n"); + else + LLVM_DEBUG(dbgs() << "LV: No feasible scalable VF found.\n"); + } + // First analyze the UserVF, fall back if the UserVF should be ignored. if (auto MaybeMaxVF = computeFeasibleUserVF(UserVF, WidestType)) return MaybeMaxVF.getValue(); - return computeFeasibleMaxVF(TC, SmallestType, WidestType); + return computeFeasibleMaxVF(TC, SmallestType, WidestType, + /*ComputeMaxScalableVF=*/false); }; switch (ScalarEpilogueStatus) { @@ -5798,7 +5811,8 @@ } ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF( - unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType) { + unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, + bool ComputeMaxScalableVF) { // Get the maximum safe dependence distance in bits computed by LAA. // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest @@ -5807,7 +5821,9 @@ unsigned MaxSafeElements = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - unsigned WidestRegister = TTI.getRegisterBitWidth(true); + unsigned WidestRegister = ComputeMaxScalableVF + ? TTI.getMaxScalableBitsPerBlock() + : TTI.getRegisterBitWidth(true); LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); @@ -5816,8 +5832,8 @@ // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 
// Note that both WidestRegister and WidestType may not be a powers of 2. - auto MaxVectorSize = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); + auto MaxVectorSize = ElementCount::get( + PowerOf2Floor(WidestRegister / WidestType), ComputeMaxScalableVF); std::tie(MaxVectorSize, std::ignore) = clampFeasibleMaxVF(MaxVectorSize, MaxSafeElements); @@ -5840,8 +5856,8 @@ ElementCount MaxVF = MaxVectorSize; if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || (MaximizeBandwidth && isScalarEpilogueAllowed())) { - auto MaxVectorSizeMaxBW = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / SmallestType)); + auto MaxVectorSizeMaxBW = ElementCount::get( + PowerOf2Floor(WidestRegister / SmallestType), ComputeMaxScalableVF); std::tie(MaxVectorSizeMaxBW, std::ignore) = clampFeasibleMaxVF(MaxVectorSizeMaxBW, MaxSafeElements); @@ -5870,7 +5886,7 @@ } } if (ElementCount MinVF = - TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { + TTI.getMinimumVF(SmallestType, ComputeMaxScalableVF)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll @@ -0,0 +1,124 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_ON +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=always -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_ALWAYSON +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=off -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK,CHECK_DISABLED +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_MAXBW + +; Test that the MaxVF for the following loop, that has no dependence distances, +; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16 +; (maximized bandwidth for i8 in the loop). +define void @test0(i32* %a, i8* %b, i32* %c) { +; CHECK: LV: Checking a loop in "test0" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 16 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} +; Test that the MaxVF for the following loop, with a dependence distance +; of 64 elements, is calculated as (maxvscale = 16) * 4. 
+define void @test1(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test1" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 4 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 64 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 32 elements, is calculated as (maxvscale = 16) * 2. 
+define void @test2(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test2" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 2 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 16 elements, is calculated as (maxvscale = 16) * 1. 
+define void @test3(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test3" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 1 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 16 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +}