Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1010,6 +1010,9 @@
   /// \return the value of vscale to tune the cost model for.
   std::optional<unsigned> getVScaleForTuning() const;
 
+  /// \return true if vscale is known to be a power of 2
+  bool isVScaleKnownToBeAPowerOfTwo() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1766,6 +1769,7 @@
   virtual unsigned getMinVectorRegisterBitWidth() const = 0;
   virtual std::optional<unsigned> getMaxVScale() const = 0;
   virtual std::optional<unsigned> getVScaleForTuning() const = 0;
+  virtual bool isVScaleKnownToBeAPowerOfTwo() const = 0;
   virtual bool
   shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0;
   virtual ElementCount getMinimumVF(unsigned ElemWidth,
@@ -2305,6 +2309,9 @@
   std::optional<unsigned> getVScaleForTuning() const override {
     return Impl.getVScaleForTuning();
   }
+  bool isVScaleKnownToBeAPowerOfTwo() const override {
+    return Impl.isVScaleKnownToBeAPowerOfTwo();
+  }
   bool shouldMaximizeVectorBandwidth(
       TargetTransformInfo::RegisterKind K) const override {
     return Impl.shouldMaximizeVectorBandwidth(K);
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -439,6 +439,7 @@
 
   std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
   std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
+  bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
 
   bool
   shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -714,6 +714,7 @@
 
   std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
   std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
+  bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
 
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -672,6 +672,10 @@
   return TTIImpl->getVScaleForTuning();
 }
 
+bool TargetTransformInfo::isVScaleKnownToBeAPowerOfTwo() const {
+  return TTIImpl->isVScaleKnownToBeAPowerOfTwo();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
     TargetTransformInfo::RegisterKind K) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(K);
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -917,7 +917,7 @@
                               SDValue Chain, SDValue InFlag, SDValue PStateSM,
                               bool Entry) const;
 
-  bool isVScaleKnownToBeAPowerOfTwo() const override;
+  bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
 
   // Normally SVE is only used for byte size vectors that do not fit within a
   // NEON vector. This changes when OverrideNEON is true, allowing SVE to be
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6030,10 +6030,6 @@
   return !Subtarget->useSVEForFixedLengthVectors();
 }
 
-bool AArch64TargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
-  return true;
-}
-
 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
     EVT VT, bool OverrideNEON) const {
   if (!VT.isFixedLengthVector() || !VT.isSimple())
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -131,6 +131,8 @@
     return ST->getVScaleForTuning();
   }
 
+  bool isVScaleKnownToBeAPowerOfTwo() const { return true; }
+
   bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
 
   /// Try to return an estimate cost factor that can be used as a multiplier
Index: llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -227,6 +227,10 @@
     return ST->is64Bit() && !ST->hasVInstructionsI64();
   }
 
+  bool isVScaleKnownToBeAPowerOfTwo() const {
+    return TLI->isVScaleKnownToBeAPowerOfTwo();
+  }
+
   /// \returns How the target needs this vector-predicated operation to be
   /// transformed.
   TargetTransformInfo::VPLegalization
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5149,16 +5149,34 @@
   }
 
   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
+
   // Avoid tail folding if the trip count is known to be a multiple of any VF
-  // we chose.
-  // FIXME: The condition below pessimises the case for fixed-width vectors,
-  // when scalable VFs are also candidates for vectorization.
-  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
-    ElementCount MaxFixedVF = MaxFactors.FixedVF;
-    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
+  // we choose.
+  unsigned MaxPossibleVF = 1;
+  if (MaxFactors.FixedVF.isVector())
+    MaxPossibleVF = MaxFactors.FixedVF.getFixedValue();
+  if (MaxFactors.ScalableVF) {
+    // If vscale is a power of 2 then we only need to look at the maximum
+    // possible value of vscale and determine the maximum possible scalable
+    // VF. Otherwise for arbitrary values of vscale we'd have to ask if for
+    // every possible value of vscale the trip count is a perfect multiple
+    // of the VF.
+    if (TTI.isVScaleKnownToBeAPowerOfTwo()) {
+      std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
+      if (MaxVScale) {
+        unsigned MaxPossibleScalableVF =
+            *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue();
+        if (MaxPossibleScalableVF > MaxPossibleVF)
+          MaxPossibleVF = MaxPossibleScalableVF;
+      }
+    } else
+      MaxPossibleVF = 1; // Stick with tail-folding for now.
+  }
+
+  if (MaxPossibleVF > 1) {
+    assert((UserVF.isNonZero() || isPowerOf2_32(MaxPossibleVF)) &&
            "MaxFixedVF must be a power of 2");
-    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
-                                   : MaxFixedVF.getFixedValue();
+    unsigned MaxVFtimesIC = UserIC ? MaxPossibleVF * UserIC : MaxPossibleVF;
     ScalarEvolution *SE = PSE.getSE();
     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
     const SCEV *ExitCount = SE->getAddExpr(
Index: llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -1,14 +1,6 @@
 ; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
 
-; This test currently fails when the LV calculates a maximums safe
-; distance for scalable vectors, because the code to eliminate the tail is
-; pessimistic when scalable vectors are considered. This will be addressed
-; in a future patch, at which point we should be able to un-XFAIL the
-; test. The expected output is to vectorize this loop without predication
-; (and thus have unpredicated vector store).
-; XFAIL: *
-
-; CHECK: store <4 x i32>
+; CHECK: store <vscale x 4 x i32>
 
 target triple = "aarch64"
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -777,41 +777,32 @@
 define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset_trip1024(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = sub i64 1024, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 1024, [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1024)
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -348,32 +348,31 @@
 define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_add_trip1024(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i64> poison)
-; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 1 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[TMP6]], ptr [[TMP5]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 1 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP5]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]