diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -916,6 +916,9 @@
   /// architectural maximum vector length, and None otherwise.
   Optional<unsigned> getMaxVScale() const;
 
+  /// \return the value of vscale to tune the cost model for.
+  Optional<unsigned> getVScaleForTuning() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1590,6 +1593,7 @@
   virtual TypeSize getRegisterBitWidth(RegisterKind K) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() const = 0;
   virtual Optional<unsigned> getMaxVScale() const = 0;
+  virtual Optional<unsigned> getVScaleForTuning() const = 0;
   virtual bool shouldMaximizeVectorBandwidth() const = 0;
   virtual ElementCount getMinimumVF(unsigned ElemWidth,
                                     bool IsScalable) const = 0;
@@ -2060,6 +2064,9 @@
   Optional<unsigned> getMaxVScale() const override {
     return Impl.getMaxVScale();
   }
+  Optional<unsigned> getVScaleForTuning() const override {
+    return Impl.getVScaleForTuning();
+  }
   bool shouldMaximizeVectorBandwidth() const override {
     return Impl.shouldMaximizeVectorBandwidth();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -399,6 +399,7 @@
   unsigned getMinVectorRegisterBitWidth() const { return 128; }
 
   Optional<unsigned> getMaxVScale() const { return None; }
+  Optional<unsigned> getVScaleForTuning() const { return None; }
 
   bool shouldMaximizeVectorBandwidth() const { return false; }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -665,6 +665,7 @@
   }
 
   Optional<unsigned> getMaxVScale() const { return None; }
+  Optional<unsigned> getVScaleForTuning() const { return None; }
 
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -604,6 +604,10 @@
   return TTIImpl->getMaxVScale();
 }
 
+Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
+  return TTIImpl->getVScaleForTuning();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const {
   return TTIImpl->shouldMaximizeVectorBandwidth();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -125,6 +125,9 @@
     return ST->getMinVectorRegisterBitWidth();
   }
 
+  Optional<unsigned> getVScaleForTuning() const {
+    return ST->getVScaleForTuning();
+  }
+
   /// Try to return an estimate cost factor that can be used as a multiplier
   /// when scalarizing an operation for a vector with ElementCount \p VF.
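For context before the LoopVectorize.cpp changes below: the intended use of the new hook is to scale a scalable VF's known-minimum lane count by the target's tuning value when one is available, and to fall back to the known minimum otherwise. A minimal sketch of that pattern, assuming only the APIs added above (the helper name estimateElementCount is hypothetical, not part of the patch):

    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    // Hypothetical helper: estimate the effective lane count of a VF. For a
    // scalable VF the true count is KnownMin * vscale, so substitute the
    // target's vscale-for-tuning value when the target reports one.
    static unsigned estimateElementCount(ElementCount VF,
                                         const TargetTransformInfo &TTI) {
      unsigned EstimatedVF = VF.getKnownMinValue();
      if (VF.isScalable())
        if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
          EstimatedVF *= VScale.getValue();
      return EstimatedVF;
    }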
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6021,19 +6021,27 @@
     return RTCostA < RTCostB;
   }
 
-  // When set to preferred, for now assume vscale may be larger than 1, so
-  // that scalable vectorization is slightly favorable over fixed-width
-  // vectorization.
+  // Improve estimate for the vector width if it is scalable.
+  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
+  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
+  if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+    if (A.Width.isScalable())
+      EstimatedWidthA *= VScale.getValue();
+    if (B.Width.isScalable())
+      EstimatedWidthB *= VScale.getValue();
+  }
+
+  // When set to preferred, for now assume vscale may be larger than 1 (or the
+  // one being tuned for), so that scalable vectorization is slightly favorable
+  // over fixed-width vectorization.
   if (Hints->isScalableVectorizationPreferred())
     if (A.Width.isScalable() && !B.Width.isScalable())
-      return (CostA * B.Width.getKnownMinValue()) <=
-             (CostB * A.Width.getKnownMinValue());
+      return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
 
   // To avoid the need for FP division:
   //      (CostA / A.Width) < (CostB / B.Width)
   // <=>  (CostA * B.Width) < (CostB * A.Width)
-  return (CostA * B.Width.getKnownMinValue()) <
-         (CostB * A.Width.getKnownMinValue());
+  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
 }
 
 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
@@ -6063,11 +6071,22 @@
 
     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
     VectorizationFactor Candidate(i, C.first);
-    LLVM_DEBUG(
-        dbgs() << "LV: Vector loop of width " << i << " costs: "
-               << (Candidate.Cost / Candidate.Width.getKnownMinValue())
-               << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
-               << ".\n");
+
+#ifndef NDEBUG
+    unsigned AssumedMinimumVscale = 1;
+    if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
+      AssumedMinimumVscale = VScale.getValue();
+    unsigned Width =
+        Candidate.Width.isScalable()
+            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
+            : Candidate.Width.getFixedValue();
+    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+                      << " costs: " << (Candidate.Cost / Width));
+    if (i.isScalable())
+      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+                        << AssumedMinimumVscale << ")");
+    LLVM_DEBUG(dbgs() << ".\n");
+#endif
 
     if (!C.second && !ForceVectorization) {
       LLVM_DEBUG(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -0,0 +1,54 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64 -mattr=+sve -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 -scalable-vectorization=preferred \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+
+; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
+; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+
+; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
+; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+
+; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
+; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+
+; VF-4: <4 x i32>
+; VF-VSCALE4: <vscale x 4 x i32>
+define void @test0(i32* %a, i8* %b, i32* %c) #0 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32
+  %add = add nsw i32 %zext, %0
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -10,7 +10,7 @@
 define void @test0(i32* %a, i8* %b, i32* %c) #0 {
 ; CHECK: LV: Checking a loop in "test0"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -43,7 +43,7 @@
 define void @test1(i32* %a, i8* %b) #0 {
 ; CHECK: LV: Checking a loop in "test1"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -79,7 +79,7 @@
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 2
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
 ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -187,9 +187,9 @@
 ; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
 ; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a more suitable value.
 ; CHECK-DBG: Found feasible scalable VF = vscale x 2
-; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-DBG: LV: Selecting VF: vscale x 2.
 ; CHECK-LABEL: @test4
-; CHECK: <4 x i32>
+; CHECK: <vscale x 2 x i32>
 define void @test4(i32* %a, i32* %b) #0 {
 entry:
   br label %loop
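A worked example of the cross-multiplied comparison in isMoreProfitable above, with hypothetical costs and widths chosen only to show how the tuning value can flip a decision; a standalone sketch, not code from the patch:

    #include <cstdio>

    // Plain unsigned values stand in for InstructionCost and ElementCount.
    int main() {
      unsigned CostA = 3, MinWidthA = 4; // candidate A: VF = vscale x 4
      unsigned CostB = 2, WidthB = 4;    // candidate B: VF = 4 (fixed)

      // Old behaviour, vscale assumed to be 1: per-lane cost 3/4 vs 2/4,
      // i.e. CostA * WidthB < CostB * MinWidthA <=> 12 < 8, so B wins.
      bool AWinsOld = (CostA * WidthB) < (CostB * MinWidthA);

      // With getVScaleForTuning() == 2 (e.g. a 256-bit SVE machine), A's
      // estimated width becomes 8: 12 < 16, so the scalable VF now wins.
      unsigned EstimatedWidthA = MinWidthA * 2;
      bool AWinsNew = (CostA * WidthB) < (CostB * EstimatedWidthA);

      printf("A wins: old=%d, new=%d\n", AWinsOld, AWinsNew); // old=0, new=1
      return 0;
    }

The same effect shows up in the scalable-vectorization-cost-tuning.ll runs above: neoverse-n2 tunes for vscale 1 and keeps the fixed <4 x i32> under -scalable-vectorization=on, while generic and neoverse-v1 tune for vscale 2 and select vscale x 4.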