diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1288,15 +1288,11 @@
     case Intrinsic::vector_reduce_fmin:
     case Intrinsic::vector_reduce_umax:
     case Intrinsic::vector_reduce_umin: {
-      if (isa<ScalableVectorType>(RetTy))
-        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
     }
     case Intrinsic::vector_reduce_fadd:
     case Intrinsic::vector_reduce_fmul: {
-      if (isa<ScalableVectorType>(RetTy))
-        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       IntrinsicCostAttributes Attrs(
           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -139,6 +139,14 @@
 
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
+  int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                             bool IsPairwise, bool IsUnsigned,
+                             TTI::TargetCostKind CostKind);
+
+  int getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy,
+                                    bool IsPairwiseForm,
+                                    TTI::TargetCostKind CostKind);
+
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1101,11 +1101,85 @@
   return false;
 }
 
+/// Cost of a min/max reduction over \p Ty.
+///
+/// Fixed-width vectors are forwarded to the generic implementation. For
+/// scalable vectors the result is a cost of 2 for the final horizontal
+/// reduction plus, when legalization splits the type into LT.first parts, one
+/// compare and one select on the legal type for each of the LT.first - 1
+/// additional parts. \p IsPairwise and \p IsUnsigned are not consulted on the
+/// scalable path.
+int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                                           bool IsPairwise, bool IsUnsigned,
+                                           TTI::TargetCostKind CostKind) {
+  if (!isa<ScalableVectorType>(Ty))
+    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
+                                         CostKind);
+  assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
+         "Both vectors need to be scalable");
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  int LegalizationCost = 0;
+  if (LT.first > 1) {
+    // Charge one compare and one select per part beyond the first.
+    Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
+    unsigned CmpOpcode =
+        Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp;
+    LegalizationCost =
+        getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy,
+                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
+        getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy,
+                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
+    LegalizationCost *= LT.first - 1;
+  }
+
+  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
+}
+
+/// Cost of an arithmetic reduction over the scalable vector \p ValTy.
+///
+/// Pairwise reductions are rejected by assertion. ADD, AND, OR, XOR and FADD
+/// cost 2 for the final horizontal reduction plus one \p Opcode instruction on
+/// the legal type for each additional part produced by legalization. Every
+/// other opcode returns a fixed cost of 16 with no legalization term.
+int AArch64TTIImpl::getArithmeticReductionCostSVE(
+    unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm,
+    TTI::TargetCostKind CostKind) {
+  assert(!IsPairwiseForm && "Cannot be pair wise to continue");
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+  int LegalizationCost = 0;
+  if (LT.first > 1) {
+    Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
+    LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
+    LegalizationCost *= LT.first - 1;
+  }
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+  // Add the final reduction cost for the legal horizontal reduction
+  switch (ISD) {
+  case ISD::ADD:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::FADD:
+    return LegalizationCost + 2;
+  default:
+    // Cases not supported by SVE.
+    // TODO: Return an invalid cost once InstructionCost is used here.
+    return 16;
+  }
+}
+
 int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
                                                VectorType *ValTy,
                                                bool IsPairwiseForm,
                                                TTI::TargetCostKind CostKind) {
 
+  if (isa<ScalableVectorType>(ValTy))
+    return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm,
+                                         CostKind);
   if (IsPairwiseForm)
     return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
                                              CostKind);
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll
@@ -0,0 +1,307 @@
+; Check getIntrinsicInstrCost in BasicTTIImpl.h with SVE for the vector.reduce.* intrinsics.
+; Covers both legal and illegal vector sizes.
+; NOTE(review): the fmax/fmin/fadd cases on integer vectors (nxv2i32, nxv8i32) do not match the LangRef, which describes these reductions for floating-point vectors - confirm they are intentional.
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define i8 @add.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'add.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+
+define i8 @mul.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'mul.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r = call i8 @llvm.vector.reduce.mul.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.mul.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+
+define i8 @and.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'and.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+
+define i8 @or.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'or.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.or.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.or.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+
+define i8 @xor.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'xor.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.xor.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.xor.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+
+define i8 @umin.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'umin.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.umin.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.umin.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+
+define float @fmax.f32.nxv2f32(<vscale x 2 x float> %v) {
+; CHECK-LABEL: 'fmax.f32.nxv2f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call float @llvm.vector.reduce.fmax.nxv2f32(<vscale x 2 x float> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+
+  %r = call float @llvm.vector.reduce.fmax.nxv2f32(<vscale x 2 x float> %v)
+  ret float %r
+}
+
+define i32 @fmax.i32.nxv2i32(<vscale x 2 x i32> %v) {
+; CHECK-LABEL: 'fmax.i32.nxv2i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.fmax.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.fmax.nxv2i32(<vscale x 2 x i32> %v)
+  ret i32 %r
+}
+
+define float @fmin.f32.nxv2f32(<vscale x 2 x float> %v) {
+; CHECK-LABEL: 'fmin.f32.nxv2f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call float @llvm.vector.reduce.fmin.nxv2f32(<vscale x 2 x float> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+
+  %r = call float @llvm.vector.reduce.fmin.nxv2f32(<vscale x 2 x float> %v)
+  ret float %r
+}
+
+define i32 @fmin.i32.nxv2i32(<vscale x 2 x i32> %v) {
+; CHECK-LABEL: 'fmin.i32.nxv2i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.fmin.nxv2i32(<vscale x 2 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.fmin.nxv2i32(<vscale x 2 x i32> %v)
+  ret i32 %r
+}
+
+define i8 @umax.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'umax.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.umax.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.umax.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+define i8 @smin.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'smin.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.smin.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.smin.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+define i8 @smax.i8.nxv8i8(<vscale x 8 x i8> %v) {
+; CHECK-LABEL: 'smax.i8.nxv8i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
+
+  %r = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> %v)
+  ret i8 %r
+}
+
+define float @fadda_nxv2f32(float %start, <vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: 'fadda_nxv2f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call float @llvm.vector.reduce.fadd.nxv2f32(float %start, <vscale x 2 x float> %a)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %res
+
+  %res = call float @llvm.vector.reduce.fadd.nxv2f32(float %start, <vscale x 2 x float> %a)
+  ret float %res
+}
+
+define i32 @fadda_nxv2i32(i32 %start, <vscale x 2 x i32> %a) #0 {
+; CHECK-LABEL: 'fadda_nxv2i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call i32 @llvm.vector.reduce.fadd.nxv2i32(i32 %start, <vscale x 2 x i32> %a)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %res
+
+  %res = call i32 @llvm.vector.reduce.fadd.nxv2i32(i32 %start, <vscale x 2 x i32> %a)
+  ret i32 %res
+}
+; Test legalization cost
+
+define i64 @add.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'add.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+
+define i64 @mul.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'mul.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r = call i64 @llvm.vector.reduce.mul.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.mul.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+
+define i64 @and.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'and.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = call i64 @llvm.vector.reduce.and.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.and.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+
+define i64 @or.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'or.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = call i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+
+define i64 @xor.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'xor.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = call i64 @llvm.vector.reduce.xor.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.xor.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+
+define i64 @umin.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'umin.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = call i64 @llvm.vector.reduce.umin.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.umin.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+
+define float @fmax.f32.nxv8f32(<vscale x 8 x float> %v) {
+; CHECK-LABEL: 'fmax.f32.nxv8f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+
+  %r = call float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %v)
+  ret float %r
+}
+
+define i32 @fmax.i32.nxv8i32(<vscale x 8 x i32> %v) {
+; CHECK-LABEL: 'fmax.i32.nxv8i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i32 @llvm.vector.reduce.fmax.nxv8i32(<vscale x 8 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.fmax.nxv8i32(<vscale x 8 x i32> %v)
+  ret i32 %r
+}
+
+define float @fmin.f32.nxv8f32(<vscale x 8 x float> %v) {
+; CHECK-LABEL: 'fmin.f32.nxv8f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+
+  %r = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %v)
+  ret float %r
+}
+
+define i32 @fmin.i32.nxv8i32(<vscale x 8 x i32> %v) {
+; CHECK-LABEL: 'fmin.i32.nxv8i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i32 @llvm.vector.reduce.fmin.nxv8i32(<vscale x 8 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.fmin.nxv8i32(<vscale x 8 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @umax.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'umax.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = call i64 @llvm.vector.reduce.umax.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.umax.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+define i64 @smin.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'smin.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = call i64 @llvm.vector.reduce.smin.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.smin.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+define i64 @smax.i64.nxv8i64(<vscale x 8 x i64> %v) {
+; CHECK-LABEL: 'smax.i64.nxv8i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> %v)
+  ret i64 %r
+}
+
+define float @fadda_nxv8f32(float %start, <vscale x 8 x float> %a) #0 {
+; CHECK-LABEL: 'fadda_nxv8f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call float @llvm.vector.reduce.fadd.nxv8f32(float %start, <vscale x 8 x float> %a)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %res
+
+  %res = call float @llvm.vector.reduce.fadd.nxv8f32(float %start, <vscale x 8 x float> %a)
+  ret float %res
+}
+
+define i32 @fadda_nxv8i32(i32 %start, <vscale x 8 x i32> %a) #0 {
+; CHECK-LABEL: 'fadda_nxv8i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call i32 @llvm.vector.reduce.fadd.nxv8i32(i32 %start, <vscale x 8 x i32> %a)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %res
+
+  %res = call i32 @llvm.vector.reduce.fadd.nxv8i32(i32 %start, <vscale x 8 x i32> %a)
+  ret i32 %res
+}
+
+declare i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.mul.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.or.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.xor.nxv8i8(<vscale x 8 x i8>)
+declare float @llvm.vector.reduce.fmax.nxv2f32(<vscale x 2 x float>)
+declare i32 @llvm.vector.reduce.fmax.nxv2i32(<vscale x 2 x i32>)
+declare float @llvm.vector.reduce.fmin.nxv2f32(<vscale x 2 x float>)
+declare i32 @llvm.vector.reduce.fmin.nxv2i32(<vscale x 2 x i32>)
+declare i8 @llvm.vector.reduce.umin.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.umax.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.smin.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8>)
+declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
+declare i32 @llvm.vector.reduce.fadd.nxv2i32(i32, <vscale x 2 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.mul.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.and.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.xor.nxv8i64(<vscale x 8 x i64>)
+declare float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float>)
+declare float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float>)
+declare i32 @llvm.vector.reduce.fmax.nxv8i32(<vscale x 8 x i32>)
+declare i32 @llvm.vector.reduce.fmin.nxv8i32(<vscale x 8 x i32>)
+declare i64 @llvm.vector.reduce.umin.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.umax.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.smin.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64>)
+declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
+declare i32 @llvm.vector.reduce.fadd.nxv8i32(i32, <vscale x 8 x i32>)