Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -927,6 +927,10 @@ /// \return The width of the smallest vector register type. unsigned getMinVectorRegisterBitWidth() const; + /// \return The maximum value for vscale in scalable vectors such as + /// <vscale x 16 x i8>. + unsigned getMaxVScale() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. For wider element types, this could result in @@ -1495,6 +1499,7 @@ virtual const char *getRegisterClassName(unsigned ClassID) const = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; + virtual unsigned getMaxVScale() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; virtual bool shouldConsiderAddressTypePromotion( @@ -1910,6 +1915,7 @@ unsigned getMinVectorRegisterBitWidth() override { return Impl.getMinVectorRegisterBitWidth(); } + unsigned getMaxVScale() const override { return Impl.getMaxVScale(); } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -352,6 +352,8 @@ unsigned getMinVectorRegisterBitWidth() { return 128; } + unsigned getMaxVScale() const { return 1; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } unsigned getMinimumVF(unsigned ElemWidth) const { return 0; } Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h 
=================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -567,6 +567,8 @@ unsigned getRegisterBitWidth(bool Vector) const { return 32; } + unsigned getMaxVScale() const { return 1; } + /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -622,6 +622,10 @@ return TTIImpl->getMinVectorRegisterBitWidth(); } +unsigned TargetTransformInfo::getMaxVScale() const { + return TTIImpl->getMaxVScale(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -115,6 +115,12 @@ return ST->getMinVectorRegisterBitWidth(); } + unsigned getMaxVScale() const { + if (ST->hasSVE()) + return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; + return BaseT::getMaxVScale(); + } + unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1448,8 +1448,8 @@ /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. 
- ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + Optional<ElementCount> computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -5256,7 +5256,11 @@ return None; } - ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); + Optional<ElementCount> MaybeMaxVF = computeFeasibleMaxVF(TC, UserVF); + if (!MaybeMaxVF) + return None; + + ElementCount MaxVF = MaybeMaxVF.getValue(); switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: @@ -5347,7 +5351,7 @@ return None; } -ElementCount +Optional<ElementCount> LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); @@ -5361,10 +5365,34 @@ // dependence distance). unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value if possible. Otherwise, return as is. if (UserVF.isNonZero()) { - // If legally unsafe, clamp the user vectorization factor to a safe value. - unsigned MaxSafeVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType); - if (UserVF.getKnownMinValue() <= MaxSafeVF) + // Nothing to do if there are no dependencies. + if (MaxSafeRegisterWidth >= UINT_MAX) + return UserVF; + + unsigned FixedVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType); + // If scalable, scale VF by vscale before checking if it's safe. + ElementCount MaxSafeVF = + UserVF.isScalable() + ? ElementCount::getScalable(FixedVF / TTI.getMaxVScale()) + : ElementCount::getFixed(FixedVF); + + if (UserVF.isScalable() && MaxSafeVF.isZero()) { + // Dependence distance too small to use scalable vectors. Bail out. + // TODO: Fixed-width vectorization may still be beneficial. 
+ reportVectorizationFailure("Max legal vector width too small, scalable " + "vectorization unfeasible.", + "max legal vector width too small, scalable " + "vectorization unfeasible.", + "UnfeasibleScalableVF", ORE, TheLoop); + return None; + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (UserVF.getKnownMinValue() <= MaxSafeVF.getKnownMinValue()) return UserVF; LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF @@ -5379,7 +5407,7 @@ << " is unsafe, clamping to maximum safe vectorization factor " << ore::NV("VectorizationFactor", MaxSafeVF); }); - return ElementCount::getFixed(MaxSafeVF); + return MaxSafeVF; } WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -0,0 +1,267 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +; test1 +; +; The pragma applied to this loop implies a scalable vector +; be used for vectorization. For fixed vectors the MaxVF=4, otherwise there +; would be a dependence between vector lanes for vectors greater than 128 bits. +; +; void test1(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i:0:0: loop not vectorized: max legal vector width too small, scalable vectorization unfeasible. 
+define void @test1(i32* %a, i32* %b, i32 %N) { +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0 +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = !{!"llvm.loop.vectorize.width", !3} +!3 = !{i32 4, i1 true} + +; test2 +; +; Specifies a vector of , i.e. maximum of 32 x i32 with 2 +; words per 128-bits (unpacked). +; +; void test2(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(2, scalable) +; for (int i=0; i, i.e. maximum of 64 x i32 with 4 +; words per 128-bits (packed). 
+; +; void test3(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2 +define void @test3(i32* %a, i32* %b, i32 %N) { +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 32 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !8 +} + +!8 = !{!8, !9, !10} +!9 = !{!"llvm.loop.vectorize.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.width", !11} +!11 = !{i32 4, i1 true} + +; test4 +; +; Specifies a vector of , i.e. maximum of 64 x i32 with 4 +; words per 128-bits (packed). +; +; void test4(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i, i.e. maximum of 256 x i32. 
+; +; void test5(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(8, scalable) +; for (int i=0; i:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8 +define void @test5(i32* %a, i32* %b, i32 %N) { +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 128 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !16 +} + +!16 = !{!16, !17, !18} +!17 = !{!"llvm.loop.vectorize.enable", i1 true} +!18 = !{!"llvm.loop.vectorize.width", !19} +!19 = !{i32 16, i1 true} Index: llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll @@ -0,0 +1,60 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; test1 +; +; The pragma applied to this loop implies a scalable vector +; be used for vectorization. For fixed vectors the MaxVF=4, otherwise there +; would be a dependence between vector lanes for vectors greater than 128 bits. +; +; void test1(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4, scalable) +; for (int i=0; i +define void @test1(i32* %a, i32* %b, i32 %N) { +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0 +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = !{!"llvm.loop.vectorize.width", !3} +!3 = !{i32 4, i1 true}