Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -927,6 +927,10 @@ /// \return The width of the smallest vector register type. unsigned getMinVectorRegisterBitWidth() const; + /// \return The maximum value for vscale in scalable vectors such as + /// <vscale x 16 x i8>. + unsigned getMaxVScale() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -1495,6 +1499,7 @@ virtual const char *getRegisterClassName(unsigned ClassID) const = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; + virtual unsigned getMaxVScale() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; virtual bool shouldConsiderAddressTypePromotion( @@ -1910,6 +1915,7 @@ unsigned getMinVectorRegisterBitWidth() override { return Impl.getMinVectorRegisterBitWidth(); } + unsigned getMaxVScale() const override { return Impl.getMaxVScale(); } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -352,6 +352,8 @@ unsigned getMinVectorRegisterBitWidth() { return 128; } + unsigned getMaxVScale() const { return 1; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } unsigned getMinimumVF(unsigned ElemWidth) const { return 0; } Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h 
=================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -567,6 +567,8 @@ unsigned getRegisterBitWidth(bool Vector) const { return 32; } + unsigned getMaxVScale() const { return 1; } + /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -626,6 +626,10 @@ return TTIImpl->getMinVectorRegisterBitWidth(); } +unsigned TargetTransformInfo::getMaxVScale() const { + return TTIImpl->getMaxVScale(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -115,6 +115,12 @@ return ST->getMinVectorRegisterBitWidth(); } + unsigned getMaxVScale() const { + if (ST->hasSVE()) + return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; + return BaseT::getMaxVScale(); + } + unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5394,10 +5394,34 @@ // dependence distance). unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. 
Otherwise, return as is. if (UserVF.isNonZero()) { - // If legally unsafe, clamp the user vectorization factor to a safe value. - unsigned MaxSafeVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType); - if (UserVF.getKnownMinValue() <= MaxSafeVF) + // Nothing to do if there are no dependencies. + if (MaxSafeVectorWidthInBits >= UINT_MAX) + return UserVF; + + unsigned MaxSafeElements = + PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); + // If scalable, scale VF by vscale before checking if it's safe. + ElementCount MaxSafeVF = + UserVF.isScalable() + ? ElementCount::getScalable(MaxSafeElements / TTI.getMaxVScale()) + : ElementCount::getFixed(MaxSafeElements); + + if (UserVF.isScalable() && MaxSafeVF.isZero()) { + // Dependence distance too small to use scalable vectors. Clamp to max + // fixed VF. + LLVM_DEBUG( + dbgs() + << "LV: Max legal vector width too small, scalable vectorization " + "unfeasible. Using fixed-width vectorization instead.\n"); + MaxSafeVF = ElementCount::getFixed(MaxSafeElements); + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) return UserVF; LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF @@ -5412,7 +5436,7 @@ << " is unsafe, clamping to maximum safe vectorization factor " << ore::NV("VectorizationFactor", MaxSafeVF); }); - return ElementCount::getFixed(MaxSafeVF); + return MaxSafeVF; } WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); @@ -7089,18 +7113,20 @@ ElementCount MaxVF = MaybeMaxVF.getValue(); assert(MaxVF.isNonZero() && "MaxVF is zero."); - if (!UserVF.isZero() && - UserVF.getKnownMinValue() <= MaxVF.getKnownMinValue()) { - LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF.getKnownMinValue()) && + if (!UserVF.isZero()) { + bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); + ElementCount VF = UserVFIsLegal ? 
UserVF : MaxVF; + LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") + << " VF " << VF << ".\n"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - CM.selectUserVectorizationFactor(UserVF); + CM.selectUserVectorizationFactor(VF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(VF, VF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + return {{VF, 0}}; } assert(!MaxVF.isScalable() && Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -0,0 +1,241 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -S < %s 2>&1 | FileCheck %s +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-DBG %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +; test1 +; +; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32> +; be used for vectorization. For fixed vectors the MaxVF=4, otherwise there +; would be a dependence between vector lanes for vectors greater than 128 bits. 
+; +; void test1(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i<N; ++i) { +; a[i + 4] = a[i] + b[i]; +; } +; } +define void @test1(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 4 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = !{!"llvm.loop.vectorize.width", !3} +!3 = !{i32 4, i1 true} + +; test2 +; +; Specifies a vector of <vscale x 2 x i32>, i.e. maximum of 32 x i32 with 2 +; words per 128-bits (unpacked). +; +; void test2(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(2, scalable) +; for (int i=0; i<N; ++i) { +; a[i + 32] = a[i] + b[i]; +; } +; } +define void @test2(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !4 + +exit: + ret void +} + +!4 = !{!4, !5, !6} +!5 = !{!"llvm.loop.vectorize.enable", i1 true} +!6 = !{!"llvm.loop.vectorize.width", !7} +!7 = !{i32 2, i1 true} + +; test3 +; +; Specifies a vector of <vscale x 4 x i32>, i.e. 
maximum of 64 x i32 with 4 +; words per 128-bits (packed). +; +; void test3(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i<N; ++i) { +; a[i + 32] = a[i] + b[i]; +; } +; } +; +; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2 +; CHECK-DBG: LV: Using max VF vscale x 2. +; CHECK-LABEL: @test3 +; CHECK: <vscale x 2 x i32> +define void @test3(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !8 + +exit: + ret void +} + +!8 = !{!8, !9, !10} +!9 = !{!"llvm.loop.vectorize.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.width", !11} +!11 = !{i32 4, i1 true} + +; test4 +; +; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4 +; words per 128-bits (packed). 
+; +; void test4(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i<N; ++i) { +; a[i + 128] = a[i] + b[i]; +; } +; } +define void @test4(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 128 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12 + +exit: + ret void +} + +!12 = !{!12, !13, !14} +!13 = !{!"llvm.loop.vectorize.enable", i1 true} +!14 = !{!"llvm.loop.vectorize.width", !15} +!15 = !{i32 4, i1 true} + +; test5 +; +; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32. +; +; void test5(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(16, scalable) +; for (int i=0; i<N; ++i) { +; a[i + 128] = a[i] + b[i]; +; } +; } +; +; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8 +; CHECK-DBG: LV: Using max VF vscale x 8 +; CHECK-LABEL: @test5 +; CHECK: <vscale x 8 x i32> +define void @test5(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 128 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !16 + +exit: + ret void +} + +!16 = !{!16, 
!17, !18} +!17 = !{!"llvm.loop.vectorize.enable", i1 true} +!18 = !{!"llvm.loop.vectorize.width", !19} +!19 = !{i32 16, i1 true} Index: llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll @@ -0,0 +1,52 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; test1 +; +; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32> +; be used for vectorization. For fixed vectors the MaxVF=4, otherwise there +; would be a dependence between vector lanes for vectors greater than 128 bits. +; +; void test1(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i<N; ++i) { +; a[i + 4] = a[i] + b[i]; +; } +; } +define void @test1(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 4 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = !{!"llvm.loop.vectorize.width", !3} +!3 = !{i32 4, i1 true}