Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -925,6 +925,11 @@ /// architectural maximum vector length, and None otherwise. Optional getMaxVScale() const; + /// \return The maximum number of bits for a block in a scalable vector + /// register for a scalable vector with a vscale number of blocks. + /// i.e. the maximum number of N x elt bits in <vscale x N x elt>. + unsigned getMaxScalableBitsPerBlock() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -1513,6 +1518,7 @@ virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual Optional getMaxVScale() const = 0; + virtual unsigned getMaxScalableBitsPerBlock() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const = 0; @@ -1941,6 +1947,9 @@ Optional getMaxVScale() const override { return Impl.getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const override { + return Impl.getMaxScalableBitsPerBlock(); + } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -376,6 +376,8 @@ Optional getMaxVScale() const { return None; } + unsigned getMaxScalableBitsPerBlock() const { return 0; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) 
const { Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -109,7 +109,7 @@ void emitRemarkWithHints() const; ElementCount getWidth() const { - return ElementCount::get(Width.Value, isScalable()); + return ElementCount::get(Width.Value, isForcedScalable()); } unsigned getInterleave() const { return Interleave.Value; } unsigned getIsVectorized() const { return IsVectorized.Value; } @@ -121,7 +121,13 @@ return (ForceKind)Force.Value; } - bool isScalable() const { return Scalable.Value; } + bool isForcedScalable() const { + return Scalable.Value == LoopVectorizeHints::FK_Enabled; + } + + bool allowScalable() const { + return Scalable.Value != LoopVectorizeHints::FK_Disabled; + } /// If hints are provided that force vectorization, use the AlwaysPrint /// pass name to force the frontend to print the diagnostic. 
Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -585,6 +585,10 @@ return TTIImpl->getMaxVScale(); } +unsigned TargetTransformInfo::getMaxScalableBitsPerBlock() const { + return TTIImpl->getMaxScalableBitsPerBlock(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -121,6 +121,12 @@ return BaseT::getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const { + if (ST->hasSVE()) + return AArch64::SVEBitsPerBlock; + return BaseT::getMaxScalableBitsPerBlock(); + } + unsigned getMaxInterleaveFactor(unsigned VF); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -51,6 +51,21 @@ cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +// FIXME: When scalable vectorization is stable enough, change the default +// to FK_Undefined. 
+static cl::opt ScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::FK_Disabled), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::FK_Disabled, "off", + "disable all vectorization with scalable vectors"), + clEnumValN(LoopVectorizeHints::FK_Undefined, "on", + "allow loops to be vectorized with scalable vectors"), + clEnumValN(LoopVectorizeHints::FK_Enabled, "always", + "allow loops to be vectorized exclusively with scalable vectors"))); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -63,10 +78,10 @@ case HK_UNROLL: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: + case HK_SCALABLE: return (Val <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: - case HK_SCALABLE: return (Val == 0 || Val == 1); } return false; @@ -80,8 +95,8 @@ Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), - Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), - ORE(ORE) { + Scalable("vectorize.scalable.enable", ScalableVectorization, HK_SCALABLE), + TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5578,6 +5578,21 @@ auto MaxSafeElements = ElementCount::getFixed( PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType)); + // Try to find a max scalable VF. 
+ if (Hints->allowScalable() && TTI.supportsScalableVectors()) { + auto MaxSafeVF = clampFeasibleMaxVF(ElementCount::getScalable(1 << 16), + MaxSafeElements); + if (MaxSafeVF.isScalableVector()) { + ElementCount MaxScalableVF = + computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType); + if (MaxScalableVF.isScalable()) + LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " + << MaxScalableVF << "\n"); + else + LLVM_DEBUG(dbgs() << "LV: No feasible scalable VF found.\n"); + } + } + // First analyze the UserVF, fall back if the UserVF should be ignored. if (auto MaybeMaxVF = getFeasibleUserVF(UserVF, MaxSafeElements)) return MaybeMaxVF.getValue(); @@ -5805,16 +5820,21 @@ ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF( unsigned ConstTripCount, ElementCount MaxSafeVF, unsigned SmallestType, unsigned WidestType) { - unsigned WidestRegister = TTI.getRegisterBitWidth(true); + bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); + unsigned WidestRegister = ComputeScalableMaxVF + ? TTI.getMaxScalableBitsPerBlock() + : TTI.getRegisterBitWidth(true); + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " + << (ComputeScalableMaxVF ? "vscale x " : "") << WidestRegister << " bits.\n"); // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. 
- auto MaxVectorSize = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); + auto MaxVectorSize = ElementCount::get( + PowerOf2Floor(WidestRegister / WidestType), ComputeScalableMaxVF); if (MaxVectorSize.getKnownMinValue() == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); @@ -5824,6 +5844,14 @@ if (!Legal->isSafeForAnyVectorWidth()) MaxVectorSize = clampFeasibleMaxVF(MaxVectorSize, MaxSafeVF); + // Test that the loop-vectorizer can legalize all operations for this MaxVF. + // FIXME: While for scalable vectors this is currently sufficient, this should + // be replaced by a more detailed mechanism that filters out specific VFs, + // instead of invalidating vectorization for a whole set of VFs based on the + // MaxVF. + if (MaxVectorSize.isScalable() && !canVectorizeReductions(MaxVectorSize)) + return ElementCount::getFixed(1); + if (ConstTripCount && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in // choosing a higher viable VF as done in the loop below. 
@@ -5839,8 +5867,8 @@ ElementCount MaxVF = MaxVectorSize; if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || (MaximizeBandwidth && isScalarEpilogueAllowed())) { - auto MaxVectorSizeMaxBW = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / SmallestType)); + auto MaxVectorSizeMaxBW = ElementCount::get( + PowerOf2Floor(WidestRegister / SmallestType), ComputeScalableMaxVF); if (!Legal->isSafeForAnyVectorWidth()) MaxVectorSizeMaxBW = clampFeasibleMaxVF(MaxVectorSizeMaxBW, MaxSafeVF); @@ -5869,7 +5897,7 @@ } } if (ElementCount MinVF = - TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { + TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll @@ -0,0 +1,124 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_ON +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=always -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_ALWAYSON +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=off -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_DISABLED +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_MAXBW + +; Test that the MaxVF for the following loop, that has no dependence distances, +; is calculated as 
vscale x 4 (max legal SVE vector size) or vscale x 16 +; (maximized bandwidth for i8 in the loop). +define void @test0(i32* %a, i8* %b, i32* %c) { +; CHECK: LV: Checking a loop in "test0" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 16 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} +; Test that the MaxVF for the following loop, with a dependence distance +; of 64 elements, is calculated as vscale x 4, i.e. 64 / (maxvscale = 16). 
+define void @test1(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test1" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 4 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 64 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 32 elements, is calculated as vscale x 2, i.e. 32 / (maxvscale = 16). 
+define void @test2(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test2" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 2 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 16 elements, is calculated as vscale x 1, i.e. 16 / (maxvscale = 16). 
+define void @test3(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test3" +; CHECK_ON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 1 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 16 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +}