Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -934,6 +934,11 @@ /// architectural maximum vector length, and None otherwise. Optional<unsigned> getMaxVScale() const; + /// \return The maximum number of bits for a block in a scalable vector + /// register for a scalable vector with a vscale number of blocks. + /// i.e. the maximum number of N x elt bits in <vscale x N x elt>. + unsigned getMaxScalableBitsPerBlock() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -1523,6 +1528,7 @@ virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual Optional<unsigned> getMaxVScale() const = 0; + virtual unsigned getMaxScalableBitsPerBlock() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalableVF) const = 0; @@ -1950,6 +1956,9 @@ Optional<unsigned> getMaxVScale() const override { return Impl.getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const override { + return Impl.getMaxScalableBitsPerBlock(); + } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -372,6 +372,8 @@ Optional<unsigned> getMaxVScale() const { return None; } + unsigned getMaxScalableBitsPerBlock() const { return 0; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } ElementCount getMinimumVF(unsigned ElemWidth, bool 
IsScalableVF) const { Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -636,6 +636,10 @@ return TTIImpl->getMaxVScale(); } +unsigned TargetTransformInfo::getMaxScalableBitsPerBlock() const { + return TTIImpl->getMaxScalableBitsPerBlock(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -123,6 +123,12 @@ return BaseT::getMaxVScale(); } + unsigned getMaxScalableBitsPerBlock() const { + if (ST->hasSVE()) + return AArch64::SVEBitsPerBlock; + return BaseT::getMaxScalableBitsPerBlock(); + } + unsigned getMaxInterleaveFactor(unsigned VF); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -198,6 +198,10 @@ "value are vectorized only if no scalar iteration overheads " "are incurred.")); +static cl::opt<bool> EnableScalableVectorization( + "enable-scalable-vectorization", cl::init(false), cl::Hidden, + cl::desc("Allow the compiler to use scalable vectors to vectorize a loop")); + // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the // vectorizer will try to fold the tail-loop (epilogue) into the vector body @@ -1635,7 +1639,8 @@ /// than zero. One is returned if vectorization should best be avoided due /// to cost. 
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - unsigned SmallestType, unsigned WidestType); + unsigned SmallestType, unsigned WidestType, + bool ComputeMaxScalableVF); /// \return A pair with the clamped VF and the maximum safe VF. /// If MaxSafeElements allows it, ClampedVF is the SuggestedVF, otherwise @@ -5526,8 +5531,23 @@ // First analyze the UserVF, fall back if the UserVF should be ignored. Optional<ElementCount> MaybeMaxVF = computeFeasibleUserVF(UserVF, WidestType); + + // First try to find a max scalable VF, because the resulting MaxVF may not + // be scalable, depending on dependence distances in the loop, or the + // tripcount. + if (EnableScalableVectorization && TTI.supportsScalableVectors()) { + ElementCount MaxScalableVF = + computeFeasibleMaxVF(TC, SmallestType, WidestType, + /*ComputeMaxScalableVF=*/true); + if(MaxScalableVF.isScalable()) + LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxScalableVF << "\n"); + else + LLVM_DEBUG(dbgs() << "LV: No feasible scalable VF found.\n"); + } + if (!MaybeMaxVF) - MaybeMaxVF = computeFeasibleMaxVF(TC, SmallestType, WidestType); + MaybeMaxVF = computeFeasibleMaxVF(TC, SmallestType, WidestType, + /*ComputeMaxScalableVF=*/false); return MaybeMaxVF.getValue(); }; @@ -5739,7 +5759,8 @@ } ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF( - unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType) { + unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, + bool ComputeMaxScalableVF) { // Get the maximum safe dependence distance in bits computed by LAA. // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest @@ -5748,7 +5769,9 @@ unsigned MaxSafeElements = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - unsigned WidestRegister = ComputeMaxScalableVF + ? 
TTI.getMaxScalableBitsPerBlock() + : TTI.getRegisterBitWidth(true); LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " << WidestType << " bits.\n"); @@ -5757,8 +5780,8 @@ // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. - auto MaxVectorSize = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); + auto MaxVectorSize = ElementCount::get( + PowerOf2Floor(WidestRegister / WidestType), ComputeMaxScalableVF); std::tie(MaxVectorSize, std::ignore) = clampFeasibleMaxVF(MaxVectorSize, MaxSafeElements); @@ -5781,8 +5804,8 @@ ElementCount MaxVF = MaxVectorSize; if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || (MaximizeBandwidth && isScalarEpilogueAllowed())) { - auto MaxVectorSizeMaxBW = - ElementCount::getFixed(PowerOf2Floor(WidestRegister / SmallestType)); + auto MaxVectorSizeMaxBW = ElementCount::get( + PowerOf2Floor(WidestRegister / SmallestType), ComputeMaxScalableVF); std::tie(MaxVectorSizeMaxBW, std::ignore) = clampFeasibleMaxVF(MaxVectorSizeMaxBW, MaxSafeElements); @@ -5810,7 +5833,8 @@ break; } } - if (ElementCount MinVF = TTI.getMinimumVF(SmallestType, false)) { + if (ElementCount MinVF = + TTI.getMinimumVF(SmallestType, ComputeMaxScalableVF)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll @@ -0,0 +1,118 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -enable-scalable-vectorization -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve 
-enable-scalable-vectorization -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefix=CHECKMAXBW + +; Test that the MaxVF for the following loop, that has no dependence distances, +; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16 +; (maximized bandwidth for i8 in the loop). +define void @test0(i32* %a, i8* %b, i32* %c) { +; CHECK: LV: Checking a loop in "test0" +; CHECK: LV: Found feasible scalable VF = vscale x 4 +; CHECKMAXBW: LV: Checking a loop in "test0" +; CHECKMAXBW: LV: Found feasible scalable VF = vscale x 16 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} +; Test that the MaxVF for the following loop, with a dependence distance +; of 64 elements, is calculated as (maxvscale = 16) * 4. 
+define void @test1(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test1" +; CHECK: LV: Found feasible scalable VF = vscale x 4 +; CHECKMAXBW: LV: Checking a loop in "test1" +; CHECKMAXBW: LV: Found feasible scalable VF = vscale x 4 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 64 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 32 elements, is calculated as (maxvscale = 16) * 2. +define void @test2(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test2" +; CHECK: LV: Found feasible scalable VF = vscale x 2 +; CHECKMAXBW: LV: Checking a loop in "test2" +; CHECKMAXBW: LV: Found feasible scalable VF = vscale x 2 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +; Test that the MaxVF for the following loop, with a dependence distance +; of 16 elements, is calculated as (maxvscale = 16) * 1. 
+define void @test3(i32* %a, i8* %b) { +; CHECK: LV: Checking a loop in "test3" +; CHECK: LV: Found feasible scalable VF = vscale x 1 +; CHECKMAXBW: LV: Checking a loop in "test3" +; CHECKMAXBW: LV: Found feasible scalable VF = vscale x 2 +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv + %1 = load i8, i8* %arrayidx2, align 4 + %zext = zext i8 %1 to i32 + %add = add nsw i32 %zext, %0 + %2 = add nuw nsw i64 %iv, 16 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +}