diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -280,6 +280,7 @@ unsigned MinSVEVectorSizeInBits; unsigned MaxSVEVectorSizeInBits; + unsigned VScaleForTuning = 2; /// TargetTriple - What processor and OS we're targeting. Triple TargetTriple; @@ -655,6 +656,8 @@ } bool useSVEForFixedLengthVectors() const; + + unsigned getVScaleForTuning() const { return VScaleForTuning; } }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -80,8 +80,11 @@ break; case CortexA53: case CortexA55: + PrefFunctionLogAlignment = 4; + break; case CortexA510: PrefFunctionLogAlignment = 4; + VScaleForTuning = 1; break; case CortexA57: MaxInterleaveFactor = 4; @@ -109,6 +112,7 @@ PrefetchDistance = 128; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 4; + VScaleForTuning = 4; break; case AppleA7: case AppleA10: @@ -150,9 +154,15 @@ PrefFunctionLogAlignment = 3; break; case NeoverseN1: + PrefFunctionLogAlignment = 4; + break; case NeoverseN2: + PrefFunctionLogAlignment = 4; + VScaleForTuning = 1; + break; case NeoverseV1: PrefFunctionLogAlignment = 4; + VScaleForTuning = 2; break; case Saphira: MaxInterleaveFactor = 4; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -130,20 +130,11 @@ /// when scalarizing an operation for a vector with ElementCount \p VF. - /// For scalable vectors this currently takes the most pessimistic view based - /// upon the maximum possible value for vscale. + /// For scalable vectors this uses the subtarget's tuning estimate of vscale + /// (getVScaleForTuning()) rather than the maximum possible value for vscale. 
- unsigned getMaxNumElements(ElementCount VF, - const Function *F = nullptr) const { + unsigned getMaxNumElements(ElementCount VF) const { if (!VF.isScalable()) return VF.getFixedValue(); - unsigned MaxNumVScale = 16; - if (F && F->hasFnAttribute(Attribute::VScaleRange)) { - unsigned VScaleMax = - F->getFnAttribute(Attribute::VScaleRange).getVScaleRangeArgs().second; - if (VScaleMax > 0) - MaxNumVScale = VScaleMax; - } - - return MaxNumVScale * VF.getKnownMinValue(); + return VF.getKnownMinValue() * ST->getVScaleForTuning(); } unsigned getMaxInterleaveFactor(unsigned VF); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1632,7 +1632,7 @@ ElementCount LegalVF = LT.second.getVectorElementCount(); InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); - return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction()); + return LT.first * MemOpCost * getMaxNumElements(LegalVF); } bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll @@ -1,32 +1,95 @@ -; Check getIntrinsicInstrCost in BasicTTIImpl.h for masked gather +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -analyze -cost-model < %s | FileCheck %s +; RUN: opt -analyze -cost-model -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-2 +; RUN: opt -analyze -cost-model -mcpu=neoverse-n2 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1 +; RUN: opt -analyze -cost-model -mcpu=cortex-a510 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1 -; RUN: opt -cost-model -analyze 
-mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +target triple="aarch64--linux-gnu" -define void @masked_gathers( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) vscale_range(0, 16) { +define void @masked_gathers( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) #0 { ; CHECK-LABEL: 'masked_gathers' -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4i32 = call @llvm.masked.gather.nxv4i32.nxv4p0i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %res.v4i32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res.v1i128 = call <1 x i128> @llvm.masked.gather.v1i128.v1p0i128 -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call @llvm.masked.gather.nxv4i32.nxv4p0i32( undef, i32 0, %nxv4i1mask, zeroinitializer) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32( undef, i32 0, %nxv8i1mask, zeroinitializer) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64( undef, i32 0, %nxv1i1mask, zeroinitializer) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-2-LABEL: 'masked_gathers' +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call @llvm.masked.gather.nxv4i32.nxv4p0i32( undef, i32 0, %nxv4i1mask, zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32( 
undef, i32 0, %nxv8i1mask, zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64( undef, i32 0, %nxv1i1mask, zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-1-LABEL: 'masked_gathers' +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4i32 = call @llvm.masked.gather.nxv4i32.nxv4p0i32( undef, i32 0, %nxv4i1mask, zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32( undef, i32 0, %nxv8i1mask, zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64( undef, i32 0, %nxv1i1mask, zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; %res.nxv4i32 = call @llvm.masked.gather.nxv4i32( undef, i32 0, %nxv4i1mask, zeroinitializer) %res.nxv8i32 = call @llvm.masked.gather.nxv8i32( undef, i32 0, %nxv8i1mask, zeroinitializer) - %res.v4i32 = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> undef, i32 0, <4 x i1> %v4i1mask, <4 x i32> zeroinitializer) - %res.v1i128 = call <1 x i128> @llvm.masked.gather.v1i128.v1p0i128(<1 x i128*> undef, i32 0, <1 x i1> %v1i1mask, <1 x i128> zeroinitializer) - %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64( undef, i32 0, %nxv1i1mask, zeroinitializer) + %res.nxv1i64 = call @llvm.masked.gather.nxv1i64( undef, i32 0, %nxv1i1mask, zeroinitializer) ret void } -define void @masked_gathers_no_vscale_range() { +define void @masked_gathers_tune_generic( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) #1 { +; CHECK-LABEL: 'masked_gathers_tune_generic' +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call 
@llvm.masked.gather.nxv4i32.nxv4p0i32( undef, i32 0, %nxv4i1mask, zeroinitializer) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32( undef, i32 0, %nxv8i1mask, zeroinitializer) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64( undef, i32 0, %nxv1i1mask, zeroinitializer) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-2-LABEL: 'masked_gathers_tune_generic' +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call @llvm.masked.gather.nxv4i32.nxv4p0i32( undef, i32 0, %nxv4i1mask, zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32( undef, i32 0, %nxv8i1mask, zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64( undef, i32 0, %nxv1i1mask, zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-1-LABEL: 'masked_gathers_tune_generic' +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i32 = call @llvm.masked.gather.nxv4i32.nxv4p0i32( undef, i32 0, %nxv4i1mask, zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32( undef, i32 0, %nxv8i1mask, zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call @llvm.masked.gather.nxv1i64.nxv1p0i64( undef, i32 0, %nxv1i1mask, zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %res.nxv4i32 = call @llvm.masked.gather.nxv4i32( undef, i32 0, %nxv4i1mask, zeroinitializer) + %res.nxv8i32 = call 
@llvm.masked.gather.nxv8i32( undef, i32 0, %nxv8i1mask, zeroinitializer) + %res.nxv1i64 = call @llvm.masked.gather.nxv1i64( undef, i32 0, %nxv1i1mask, zeroinitializer) + ret void +} + +define void @masked_gathers_no_vscale_range() #2 { ; CHECK-LABEL: 'masked_gathers_no_vscale_range' -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4f64 = call @llvm.masked.gather.nxv4f64.nxv4p0f64( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv2f64 = call @llvm.masked.gather.nxv2f64.nxv2p0f64( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8f32 = call @llvm.masked.gather.nxv8f32.nxv8p0f32( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4f32 = call @llvm.masked.gather.nxv4f32.nxv4p0f32( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv2f32 = call @llvm.masked.gather.nxv2f32.nxv2p0f32( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res.nxv16i16 = call @llvm.masked.gather.nxv16i16.nxv16p0i16( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8i16 = call @llvm.masked.gather.nxv8i16.nxv8p0i16( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4i16 = call @llvm.masked.gather.nxv4i16.nxv4p0i16( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f64 = call @llvm.masked.gather.nxv4f64.nxv4p0f64( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f64 = call @llvm.masked.gather.nxv2f64.nxv2p0f64( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8f32 = call 
@llvm.masked.gather.nxv8f32.nxv8p0f32( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f32 = call @llvm.masked.gather.nxv4f32.nxv4p0f32( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f32 = call @llvm.masked.gather.nxv2f32.nxv2p0f32( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv16i16 = call @llvm.masked.gather.nxv16i16.nxv16p0i16( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i16 = call @llvm.masked.gather.nxv8i16.nxv8p0i16( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i16 = call @llvm.masked.gather.nxv4i16.nxv4p0i16( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-2-LABEL: 'masked_gathers_no_vscale_range' +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f64 = call @llvm.masked.gather.nxv4f64.nxv4p0f64( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f64 = call @llvm.masked.gather.nxv2f64.nxv2p0f64( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8f32 = call @llvm.masked.gather.nxv8f32.nxv8p0f32( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4f32 = call @llvm.masked.gather.nxv4f32.nxv4p0f32( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv2f32 = call @llvm.masked.gather.nxv2f32.nxv2p0f32( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res.nxv16i16 = call 
@llvm.masked.gather.nxv16i16.nxv16p0i16( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv8i16 = call @llvm.masked.gather.nxv8i16.nxv8p0i16( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv4i16 = call @llvm.masked.gather.nxv4i16.nxv4p0i16( undef, i32 1, undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-1-LABEL: 'masked_gathers_no_vscale_range' +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4f64 = call @llvm.masked.gather.nxv4f64.nxv4p0f64( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv2f64 = call @llvm.masked.gather.nxv2f64.nxv2p0f64( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8f32 = call @llvm.masked.gather.nxv8f32.nxv8p0f32( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4f32 = call @llvm.masked.gather.nxv4f32.nxv4p0f32( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv2f32 = call @llvm.masked.gather.nxv2f32.nxv2p0f32( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res.nxv16i16 = call @llvm.masked.gather.nxv16i16.nxv16p0i16( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res.nxv8i16 = call @llvm.masked.gather.nxv8i16.nxv8p0i16( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.nxv4i16 = call @llvm.masked.gather.nxv4i16.nxv4p0i16( undef, i32 1, undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
ret void +; %res.nxv4f64 = call @llvm.masked.gather.nxv4f64( undef, i32 1, undef, undef) %res.nxv2f64 = call @llvm.masked.gather.nxv2f64( undef, i32 1, undef, undef) @@ -41,11 +104,13 @@ ret void } +attributes #0 = { "target-features"="+sve" vscale_range(0, 8) } +attributes #1 = { "target-features"="+sve" vscale_range(0, 16) "tune-cpu"="generic" } +attributes #2 = { "target-features"="+sve" } + declare @llvm.masked.gather.nxv4i32(, i32, , ) declare @llvm.masked.gather.nxv8i32(, i32, , ) -declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) -declare <1 x i128> @llvm.masked.gather.v1i128.v1p0i128(<1 x i128*>, i32, <1 x i1>, <1 x i128>) -declare @llvm.masked.gather.nxv1i64.nxv1p0i64(, i32, , ) +declare @llvm.masked.gather.nxv1i64(, i32, , ) declare @llvm.masked.gather.nxv4f64(, i32, , ) declare @llvm.masked.gather.nxv2f64(, i32, , ) declare @llvm.masked.gather.nxv8f32(, i32, , ) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -80,8 +80,8 @@ define void @strict_fp_reductions( %v0, %v1) { ; CHECK-LABEL: 'strict_fp_reductions' -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, %v0) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, %v1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, %v0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, %v1) ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float 
@llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, %v0) ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, %v1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll @@ -1,32 +1,95 @@ -; Check getIntrinsicInstrCost in BasicTTIImpl.h with for masked scatter +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -analyze -cost-model < %s | FileCheck %s +; RUN: opt -analyze -cost-model -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-2 +; RUN: opt -analyze -cost-model -mcpu=neoverse-n2 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1 +; RUN: opt -analyze -cost-model -mcpu=cortex-a510 < %s | FileCheck %s --check-prefix=CHECK-VSCALE-1 -; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +target triple="aarch64--linux-gnu" -define void @masked_scatters( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) vscale_range(0, 16) { +define void @masked_scatters( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) #0 { ; CHECK-LABEL: 'masked_scatters' -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v1i128.v1p0i128 -; CHECK-NEXT: Cost Model: Invalid cost for instruction: 
call void @llvm.masked.scatter.nxv1i64.nxv1p0i64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( undef, undef, i32 0, %nxv4i1mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( undef, undef, i32 0, %nxv8i1mask) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( undef, undef, i32 0, %nxv1i1mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-2-LABEL: 'masked_scatters' +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( undef, undef, i32 0, %nxv4i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( undef, undef, i32 0, %nxv8i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( undef, undef, i32 0, %nxv1i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-1-LABEL: 'masked_scatters' +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( undef, undef, i32 0, %nxv4i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( undef, undef, i32 0, %nxv8i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( undef, undef, i32 0, %nxv1i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; call void @llvm.masked.scatter.nxv4i32( undef, undef, i32 0, %nxv4i1mask) call void @llvm.masked.scatter.nxv8i32( undef, undef, i32 0, %nxv8i1mask) - call void 
@llvm.masked.scatter.v4i32(<4 x i32> undef, <4 x i32*> undef, i32 0, <4 x i1> %v4i1mask) - call void @llvm.masked.scatter.v1i128.v1p0i128(<1 x i128> undef, <1 x i128*> undef, i32 0, <1 x i1> %v1i1mask) - call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( undef, undef, i32 0, %nxv1i1mask) + call void @llvm.masked.scatter.nxv1i64( undef, undef, i32 0, %nxv1i1mask) ret void } -define void @masked_scatters_no_vscale_range() { +define void @masked_scatters_tune_generic( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) #1 { +; CHECK-LABEL: 'masked_scatters_tune_generic' +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( undef, undef, i32 0, %nxv4i1mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( undef, undef, i32 0, %nxv8i1mask) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( undef, undef, i32 0, %nxv1i1mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-2-LABEL: 'masked_scatters_tune_generic' +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( undef, undef, i32 0, %nxv4i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( undef, undef, i32 0, %nxv8i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( undef, undef, i32 0, %nxv1i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-1-LABEL: 'masked_scatters_tune_generic' +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( undef, undef, i32 0, %nxv4i1mask) +; 
CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( undef, undef, i32 0, %nxv8i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( undef, undef, i32 0, %nxv1i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.masked.scatter.nxv4i32( undef, undef, i32 0, %nxv4i1mask) + call void @llvm.masked.scatter.nxv8i32( undef, undef, i32 0, %nxv8i1mask) + call void @llvm.masked.scatter.nxv1i64( undef, undef, i32 0, %nxv1i1mask) + ret void +} + +define void @masked_scatters_no_vscale_range() #2 { ; CHECK-LABEL: 'masked_scatters_no_vscale_range' -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-2-LABEL: 'masked_scatters_no_vscale_range' +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( undef, undef, i32 1, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( undef, undef, i32 1, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( undef, undef, i32 1, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( undef, undef, i32 1, undef) +; 
CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( undef, undef, i32 1, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16( undef, undef, i32 1, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( undef, undef, i32 1, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( undef, undef, i32 1, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-VSCALE-1-LABEL: 'masked_scatters_no_vscale_range' +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( undef, undef, i32 1, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; call void @llvm.masked.scatter.nxv4f64( undef, undef, i32 1, undef) call void @llvm.masked.scatter.nxv2f64( undef, undef, i32 1, undef) @@ -41,11 +104,13 @@ ret void } +attributes #0 = { "target-features"="+sve" vscale_range(0, 8) } +attributes #1 = { "target-features"="+sve" vscale_range(0, 16) "tune-cpu"="generic" } +attributes #2 = { "target-features"="+sve" } + declare void @llvm.masked.scatter.nxv4i32(, , i32, ) declare void @llvm.masked.scatter.nxv8i32(, , i32, ) -declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) -declare void @llvm.masked.scatter.v1i128.v1p0i128(<1 x i128>, <1 x i128*>, i32, <1 x i1>) -declare void @llvm.masked.scatter.nxv1i64.nxv1p0i64(, , i32, ) +declare void @llvm.masked.scatter.nxv1i64(, , i32, ) declare void @llvm.masked.scatter.nxv4f64(, , i32, ) declare void @llvm.masked.scatter.nxv2f64(, , i32, ) declare void @llvm.masked.scatter.nxv8f32(, , i32, ) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll @@ -3,11 +3,14 @@ ; RUN: -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4 ; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \ ; RUN: -scalable-vectorization=on -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8 +; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \ +; RUN: -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 
-mcpu=neoverse-n2 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4-CPU-NEOVERSE-N2 target triple="aarch64-unknown-linux-gnu" -; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 { entry: @@ -28,8 +31,9 @@ } -; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 { entry: