Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -922,6 +922,10 @@ /// \return The width of the smallest vector register type. unsigned getMinVectorRegisterBitWidth() const; + /// \return The maximum value for vscale in scalable vectors such as + /// <vscale x 4 x i32>. Default is None. + Optional<unsigned> getMaxVScale() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -1493,6 +1497,7 @@ virtual const char *getRegisterClassName(unsigned ClassID) const = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; + virtual Optional<unsigned> getMaxVScale() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; virtual bool shouldConsiderAddressTypePromotion( @@ -1909,6 +1914,9 @@ unsigned getMinVectorRegisterBitWidth() override { return Impl.getMinVectorRegisterBitWidth(); } + Optional<unsigned> getMaxVScale() const override { + return Impl.getMaxVScale(); + } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -352,6 +352,8 @@ unsigned getMinVectorRegisterBitWidth() { return 128; } + llvm::Optional<unsigned> getMaxVScale() const { return None; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } unsigned getMinimumVF(unsigned ElemWidth) const { return 0; } Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
=================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -567,6 +567,8 @@ unsigned getRegisterBitWidth(bool Vector) const { return 32; } + Optional<unsigned> getMaxVScale() const { return None; } + /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -627,6 +627,10 @@ return TTIImpl->getMinVectorRegisterBitWidth(); } +llvm::Optional<unsigned> TargetTransformInfo::getMaxVScale() const { + return TTIImpl->getMaxVScale(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -115,6 +115,12 @@ return ST->getMinVectorRegisterBitWidth(); } + Optional<unsigned> getMaxVScale() const { + if (ST->hasSVE()) + return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; + return BaseT::getMaxVScale(); + } + unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1606,8 +1606,8 @@ /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost.
- ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + Optional<ElementCount> computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -5458,7 +5458,11 @@ return None; } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); + Optional<ElementCount> MaybeMaxVF = computeFeasibleMaxVF(TC, UserVF); + if (!MaybeMaxVF) + return None; + + ElementCount MaxVF = MaybeMaxVF.getValue(); switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: @@ -5549,7 +5553,7 @@ return None; } -ElementCount +Optional<ElementCount> LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); @@ -5563,15 +5567,51 @@ // dependence distance). unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. Otherwise, return as is. if (UserVF.isNonZero()) { - // For now, don't verify legality of scalable vectors. - // This will be addressed properly in https://reviews.llvm.org/D91718. - if (UserVF.isScalable()) + // Nothing to do if there are no dependencies. + if (MaxSafeVectorWidthInBits >= UINT_MAX) return UserVF; - // If legally unsafe, clamp the user vectorization factor to a safe value.
- unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); - if (UserVF.getFixedValue() <= MaxSafeVF) + unsigned MaxVScale; + if (UserVF.isScalable()) { + if (!TTI.supportsScalableVectors()) { + reportVectorizationFailure( + "Ignoring scalable VF because target does not support scalable vectors", + "Ignoring scalable VF because target does not support scalable vectors.", + "NoScalableVectorSupport", ORE, TheLoop); + return None; + } + + Optional<unsigned> MaybeMaxVScale = TTI.getMaxVScale(); + assert(MaybeMaxVScale && + "max vscale undefined for target that supports scalable vectors!"); + MaxVScale = MaybeMaxVScale.getValue(); + } + + unsigned MaxSafeElements = + PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); + // If scalable, scale VF by vscale before checking if it's safe. + ElementCount MaxSafeVF = + UserVF.isScalable() + ? ElementCount::getScalable(MaxSafeElements / MaxVScale) + : ElementCount::getFixed(MaxSafeElements); + + if (UserVF.isScalable() && MaxSafeVF.isZero()) { + // Dependence distance too small to use scalable vectors, use fixed VF + // instead. + LLVM_DEBUG( + dbgs() + << "LV: Max legal vector width too small, scalable vectorization " "unfeasible. 
Using fixed-width vectorization instead.\n"); + return computeFeasibleMaxVF( + ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) return UserVF; LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF @@ -5586,7 +5626,7 @@ << " is unsafe, clamping to maximum safe vectorization factor " << ore::NV("VectorizationFactor", MaxSafeVF); }); - return ElementCount::getFixed(MaxSafeVF); + return MaxSafeVF; } WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); @@ -7390,8 +7430,12 @@ return {{UserVF, 0}}; } - assert(!MaxVF.isScalable() && - "Scalable vectors not yet supported beyond this point"); + if (MaxVF.isScalable()) { + // TODO: Use scalable MaxVF once we've added support for scalable + // vectorization beyond this point. For now we convert it to fixed width, + // but this will be removed in a later patch. + MaxVF = ElementCount::getFixed(MaxVF.getKnownMinValue()); + } for (ElementCount VF = ElementCount::getFixed(1); ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -0,0 +1,329 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -S < %s 2>&1 | FileCheck %s +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-DBG %s +; RUN: opt -mtriple=aarch64-none-linux-gnu -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SVE %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +; These tests validate the behaviour of scalable 
vectorization factor hints, +; where the following applies: +; +; * If the backend does not support scalable vectors, ignore the hint. +; * If there are no dependencies and assuming the VF is a power of 2 the VF +; should be accepted. This applies to both fixed and scalable VFs. +; * If the dependency is too small to use scalable vectors, change the VF to +; fixed, where existing behavior applies (clamping). +; * If scalable vectorization is feasible given the dependency and the VF is +; valid, accept it. Otherwise, clamp to the max scalable VF. + +; test1 +; +; Scalable vectorization unfeasible, clamp VF from (4, scalable) -> (4, fixed). +; +; The pragma applied to this loop implies a scalable vector +; be used for vectorization. For fixed vectors the MaxVF=8, otherwise there +; would be a dependence between vector lanes for vectors greater than 256 bits. +; +; void test1(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i +define void @test1(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 8 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.width", i32 4} +!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} + +; test2 +; +; Scalable vectorization unfeasible, clamp VF from (8, scalable) -> (4, fixed). 
+; +; void test2(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(8, scalable) +; for (int i=0; i +define void @test2(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 4 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !3 + +exit: + ret void +} + +!3 = !{!3, !4, !5} +!4 = !{!"llvm.loop.vectorize.width", i32 8} +!5 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} + +; test3 +; +; Scalable vectorization feasible and the VF is valid. +; +; Specifies a vector of , i.e. maximum of 32 x i32 with 2 +; words per 128-bits (unpacked). 
+; +; void test3(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(2, scalable) +; for (int i=0; i +define void @test3(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !6 + +exit: + ret void +} + +!6 = !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 2} +!8 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} + +; test4 +; +; Scalable vectorization feasible, but the VF is unsafe. Should clamp. +; +; Specifies a vector of , i.e. maximum of 64 x i32 with 4 +; words per 128-bits (packed). +; +; void test4(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2 +; CHECK-DBG: LV: Selecting VF: 2. 
+; CHECK-LABEL: @test4 +; CHECK: <2 x i32> +define void @test4(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !9 + +exit: + ret void +} + +!9 = !{!9, !10, !11} +!10 = !{!"llvm.loop.vectorize.width", i32 4} +!11 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} + +; test5 +; +; Scalable vectorization feasible and the VF is valid. +; +; Specifies a vector of , i.e. maximum of 64 x i32 with 4 +; words per 128-bits (packed). +; +; void test5(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i +define void @test5(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 128 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12 + +exit: + ret void +} + +!12 = !{!12, !13, !14} +!13 = !{!"llvm.loop.vectorize.width", i32 4} +!14 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} + +; test6 +; +; Scalable vectorization feasible, but the VF is unsafe. Should clamp. 
+; +; Specifies a vector of , i.e. maximum of 256 x i32. +; +; void test6(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(16, scalable) +; for (int i=0; i:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8 +; CHECK-DBG: LV: Selecting VF: 8. +; CHECK-LABEL: @test6 +; CHECK: <8 x i32> +define void @test6(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 128 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !15 + +exit: + ret void +} + +!15 = !{!15, !16, !17} +!16 = !{!"llvm.loop.vectorize.width", i32 16} +!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} + +; CHECK-NO-SVE: LV: Not vectorizing: Ignoring scalable VF because target does not support scalable vectors. +; CHECK-NO-SVE: remark: :0:0: loop not vectorized: Ignoring scalable VF because target does not support scalable vectors. 
+define void @test_no_sve(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 4 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !18 + +exit: + ret void +} + +!18 = !{!18, !19, !20} +!19 = !{!"llvm.loop.vectorize.width", i32 4} +!20 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} Index: llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll @@ -0,0 +1,32 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; CHECK: LV: Not vectorizing: Ignoring scalable VF because target does not support scalable vectors. +; CHECK: remark: :0:0: loop not vectorized: Ignoring scalable VF because target does not support scalable vectors. 
+define void @test1(i32* %a, i32* %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %iv, 4 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.width", i32 4} +!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}