Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -129,13 +129,17 @@ /// Try to return an estimate cost factor that can be used as a multiplier /// when scalarizing an operation for a vector with ElementCount \p VF. /// For scalable vectors this currently takes the most pessimistic view based - /// upon the maximum possible value for vscale. - unsigned getMaxNumElements(ElementCount VF, + /// upon the maximum possible value for vscale, which is determined as follows: + /// 1. If the Function \p F contains a vscale_range attribute then we use the + /// max value from that, otherwise + /// 2. Use the \p DefaultMaxVscale value passed in by the user, which is 16 + /// if unspecified. + unsigned getMaxNumElements(ElementCount VF, unsigned DefaultMaxVscale = 16, const Function *F = nullptr) const { if (!VF.isScalable()) return VF.getFixedValue(); - unsigned MaxNumVScale = 16; + unsigned MaxNumVScale = DefaultMaxVscale; if (F && F->hasFnAttribute(Attribute::VScaleRange)) { unsigned VScaleMax = F->getFnAttribute(Attribute::VScaleRange).getVScaleRangeArgs().second; Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1544,7 +1544,7 @@ ElementCount LegalVF = LT.second.getVectorElementCount(); InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); - return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction()); + return LT.first * MemOpCost * getMaxNumElements(LegalVF, 4, I->getFunction()); } bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { @@ -1969,7 +1969,7 @@ auto *VTy = cast(ValTy); InstructionCost Cost = getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); - Cost *= getMaxNumElements(VTy->getElementCount()); + Cost *= getMaxNumElements(VTy->getElementCount(), 4); return Cost; } Index: llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -80,8 +80,8 @@ define void @strict_fp_reductions( %v0, %v1) { ; CHECK-LABEL: 'strict_fp_reductions' -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, %v0) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, %v1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, %v0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, %v1) ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, %v0) ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, %v1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -380,14 +380,14 @@ define void @masked_gather() { ; CHECK-LABEL: 'masked_gather' -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %NXV4F64 = call @llvm.masked.gather.nxv4f64.nxv4p0f64( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NXV2F64 = call @llvm.masked.gather.nxv2f64.nxv2p0f64( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %NXV8F32 = call @llvm.masked.gather.nxv8f32.nxv8p0f32( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %NXV4F32 = call @llvm.masked.gather.nxv4f32.nxv4p0f32( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NXV2F32 = call @llvm.masked.gather.nxv2f32.nxv2p0f32( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %NXV16I16 = call @llvm.masked.gather.nxv16i16.nxv16p0i16( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %NXV8I16 = call @llvm.masked.gather.nxv8i16.nxv8p0i16( undef, i32 1, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %NXV4I16 = call @llvm.masked.gather.nxv4i16.nxv4p0i16( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV4F64 = call @llvm.masked.gather.nxv4f64.nxv4p0f64( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV2F64 = call @llvm.masked.gather.nxv2f64.nxv2p0f64( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NXV8F32 = call @llvm.masked.gather.nxv8f32.nxv8p0f32( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV4F32 = call @llvm.masked.gather.nxv4f32.nxv4p0f32( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV2F32 = call @llvm.masked.gather.nxv2f32.nxv2p0f32( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %NXV16I16 = call @llvm.masked.gather.nxv16i16.nxv16p0i16( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NXV8I16 = call @llvm.masked.gather.nxv8i16.nxv8p0i16( undef, i32 1, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV4I16 = call @llvm.masked.gather.nxv4i16.nxv4p0i16( undef, i32 1, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %NXV4F64 = call @llvm.masked.gather.nxv4f64( undef, i32 1, undef, undef) @@ -406,14 +406,14 @@ define void @masked_scatter() { ; CHECK-LABEL: 'masked_scatter' -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( undef, undef, i32 1, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0i16( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( undef, undef, i32 1, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( undef, undef, i32 1, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv4f64( undef, undef, i32 1, undef) Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll @@ -6,8 +6,8 @@ target triple="aarch64-unknown-linux-gnu" -; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF4: Found an estimated cost of 32 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 64 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07 define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 { entry: @@ -28,8 +28,8 @@ } -; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF4: Found an estimated cost of 32 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 64 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07 define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 { entry: