diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1990,6 +1990,11 @@
   // Legalize the type.
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // We can lower SDIV/UDIV operations using SVE instructions if the SVE feature
+  // is available, which gives a lower cost.
+  bool OverrideNEON =
+      (supportsScalableVectors() &&
+       ((LT.second == MVT::v2i64) || (LT.second == MVT::v4i32)));
 
   switch (ISD) {
   default:
@@ -2036,10 +2041,19 @@
     if (Ty->isVectorTy()) {
       // On AArch64, vector divisions are not supported natively and are
       // expanded into scalar divisions of each pair of elements.
-      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
-                                     Op1Info, Op2Info);
-      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
-                                     Op1Info, Op2Info);
+      if (OverrideNEON || (isa<ScalableVectorType>(Ty))) {
+        bool IsFloat = Ty->isFPOrFPVectorTy();
+        // Assume that floating point arithmetic operations cost twice as much
+        // as integer operations.
+        InstructionCost OpCost = (IsFloat ? 2 : 1);
+        // Multiply by 2 because it's calculated for both extract and insert.
+        Cost += (LT.first * OpCost * 2);
+      } else {
+        Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
+                                       CostKind, Op1Info, Op2Info);
+        Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
+                                       Op1Info, Op2Info);
+      }
       // TODO: if one of the arguments is scalar, then it's not necessary to
       // double the cost of handling the vector elements.
       Cost += Cost;
@@ -2047,16 +2061,20 @@
     return Cost;
   }
   case ISD::MUL:
-    // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
+    // Since we do not have a MUL.2d instruction, a mul <4 x i64> is expensive
     // as elements are extracted from the vectors and the muls scalarized.
     // As getScalarizationOverhead is a bit too pessimistic, we estimate the
-    // cost for a i64 vector directly here, which is:
+    // cost for a v4i64 vector directly here, which is:
     // - four 2-cost i64 extracts,
     // - two 2-cost i64 inserts, and
     // - two 1-cost muls.
-    // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
-    // LT.first = 2 the cost is 28. If both operands are extensions it will not
-    // need to scalarize so the cost can be cheaper (smull or umull).
+    // So, for a v4i64 with LT.first = 2 the cost is 28.
+    // If both operands are extensions it will not need to scalarize,
+    // so the cost can be cheaper (smull or umull).
+    // However, if SVE is available we can lower v2i64, which gives a lower
+    // cost:
+    if (supportsScalableVectors() && LT.second == MVT::v2i64)
+      return LT.first;
     if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
       return LT.first;
     return LT.first * 14;
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length-div-mul.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length-div-mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length-div-mul.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes='print<cost-model>' 2>&1 -disable-output -mcpu=neoverse-v1 -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+define void @fixed_sdiv() {
+; CHECK-LABEL: 'fixed_sdiv'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %sdiv_v16i8 = sdiv <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %sdiv_v8i16 = sdiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sdiv_v4i32 = sdiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sdiv_v2i64 = sdiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %sdiv_v16i8 = sdiv <16 x i8> undef, undef
+  %sdiv_v8i16 = sdiv <8 x i16> undef, undef
+  %sdiv_v4i32 = sdiv <4 x i32> undef, undef
+  %sdiv_v2i64 = sdiv <2 x i64> undef, undef
+
+  ret void
+}
+
+define void @fixed_udiv() {
+; CHECK-LABEL: 'fixed_udiv'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %udiv_v16i8 = udiv <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %udiv_v8i16 = udiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %udiv_v4i32 = udiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %udiv_v2i64 = udiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %udiv_v16i8 = udiv <16 x i8> undef, undef
+  %udiv_v8i16 = udiv <8 x i16> undef, undef
+  %udiv_v4i32 = udiv <4 x i32> undef, undef
+  %udiv_v2i64 = udiv <2 x i64> undef, undef
+
+  ret void
+}
+
+define void @fixed_mul() {
+; CHECK-LABEL: 'fixed_mul'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_v16i8 = mul <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_v8i16 = mul <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_v4i32 = mul <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_v2i64 = mul <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %mul_v16i8 = mul <16 x i8> undef, undef
+  %mul_v8i16 = mul <8 x i16> undef, undef
+  %mul_v4i32 = mul <4 x i32> undef, undef
+  %mul_v2i64 = mul <2 x i64> undef, undef
+
+  ret void
+}
+
+define void @scalable_sdiv() {
+; CHECK-LABEL: 'scalable_sdiv'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sdiv_nxv16i8 = sdiv <vscale x 16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sdiv_nxv8i16 = sdiv <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sdiv_nxv4i32 = sdiv <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sdiv_nxv2i64 = sdiv <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %sdiv_nxv16i8 = sdiv <vscale x 16 x i8> undef, undef
+  %sdiv_nxv8i16 = sdiv <vscale x 8 x i16> undef, undef
+  %sdiv_nxv4i32 = sdiv <vscale x 4 x i32> undef, undef
+  %sdiv_nxv2i64 = sdiv <vscale x 2 x i64> undef, undef
+
+  ret void
+}
+
+define void @scalable_udiv() {
+; CHECK-LABEL: 'scalable_udiv'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %udiv_nxv16i8 = udiv <vscale x 16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %udiv_nxv8i16 = udiv <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %udiv_nxv4i32 = udiv <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %udiv_nxv2i64 = udiv <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %udiv_nxv16i8 = udiv <vscale x 16 x i8> undef, undef
+  %udiv_nxv8i16 = udiv <vscale x 8 x i16> undef, undef
+  %udiv_nxv4i32 = udiv <vscale x 4 x i32> undef, undef
+  %udiv_nxv2i64 = udiv <vscale x 2 x i64> undef, undef
+
+  ret void
+}
+
+define void @scalable_mul() {
+; CHECK-LABEL: 'scalable_mul'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv16i8 = mul <vscale x 16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv8i16 = mul <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv4i32 = mul <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv2i64 = mul <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %mul_nxv16i8 = mul <vscale x 16 x i8> undef, undef
+  %mul_nxv8i16 = mul <vscale x 8 x i16> undef, undef
+  %mul_nxv4i32 = mul <vscale x 4 x i32> undef, undef
+  %mul_nxv2i64 = mul <vscale x 2 x i64> undef, undef
+
+  ret void
+}