diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1288,15 +1288,11 @@ case Intrinsic::vector_reduce_fmin: case Intrinsic::vector_reduce_umax: case Intrinsic::vector_reduce_umin: { - if (isa<ScalableVectorType>(RetTy)) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I); return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); } case Intrinsic::vector_reduce_fadd: case Intrinsic::vector_reduce_fmul: { - if (isa<ScalableVectorType>(RetTy)) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); IntrinsicCostAttributes Attrs( IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I); return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,14 @@ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + unsigned getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsPairwise, bool IsUnsigned, + TTI::TargetCostKind CostKind); + + int getArithmeticReductionCostScalableVectorType( + unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm, + TTI::TargetCostKind CostKind); + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1101,11 +1101,95 @@ return false; } +unsigned AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, + VectorType *CondTy, + bool IsPairwise, + bool IsUnsigned, + TTI::TargetCostKind CostKind) 
{ + if (!isa<ScalableVectorType>(Ty)) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, + CostKind); + assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) && + "Both vector needs to be scalable"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + EVT VT = LT.second; + unsigned LegalizationCost = 0; + if (LT.first > 1) { + auto *ValVTy = cast<ScalableVectorType>(Ty); + Type *VTy = VT.getTypeForEVT(ValVTy->getContext()); + unsigned CmpOpcode = + Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp; + LegalizationCost = + getCmpSelInstrCost(CmpOpcode, VTy, VTy, CmpInst::BAD_ICMP_PREDICATE, + CostKind) + + getCmpSelInstrCost(Instruction::Select, VTy, VTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + LegalizationCost *= LT.first - 1; + } + + int ISD; + if (Ty->isIntOrIntVectorTy()) { + ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; + } else { + assert(Ty->isFPOrFPVectorTy() && + "Expected float point or integer vector type."); + ISD = ISD::FMINNUM; + } + // Add the final reduction cost for the legal horizontal reduction + switch (ISD) { + case ISD::UMIN: + case ISD::SMIN: + case ISD::FMINNUM: + return LegalizationCost + 2; + default: + llvm_unreachable("ISD does not exist"); + } +} + +int AArch64TTIImpl::getArithmeticReductionCostScalableVectorType( + unsigned Opcode, VectorType *ValTy, bool IsPairwise, + TTI::TargetCostKind CostKind) { + assert(!IsPairwise && "Cannot be pair wise to continue"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + EVT VT = LT.second; + unsigned LegalizationCost = 0; + if (LT.first > 1) { + auto *ValVTy = cast<ScalableVectorType>(ValTy); + Type *SingleOpTy = VT.getTypeForEVT(ValVTy->getContext()); + LegalizationCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); + LegalizationCost *= LT.first - 1; + } + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + // Add the final reduction cost for the legal horizontal reduction + switch (ISD) { + case ISD::ADD: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::FADD: + 
return LegalizationCost + 2; + case ISD::FMUL: + case ISD::MUL: + return LegalizationCost + 16; + default: + // TODO: Replace for invalid when InstructionCost is used + // cases not supported by SVE + return 16; + } +} + int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm, TTI::TargetCostKind CostKind) { + if (isa<ScalableVectorType>(ValTy)) + return getArithmeticReductionCostScalableVectorType( + Opcode, ValTy, IsPairwiseForm, CostKind); if (IsPairwiseForm) return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, CostKind); diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll @@ -0,0 +1,127 @@ +; Check getIntrinsicInstrCost in BasicTTIImpl.h with SVE for vector reduce intrinsics + +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s + + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + +define i64 @add.i64.nxv8i64(<vscale x 8 x i64> %v) { +; CHECK-LABEL: 'add.i64.nxv8i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> %v) + ret i64 %r +} + +define i8 @mul.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'mul.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r = call i8 @llvm.vector.reduce.mul.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r + + %r = call i8 @llvm.vector.reduce.mul.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} + +define i8 @and.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'and.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r + + %r = call i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} + +define i8 @or.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'or.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.or.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r + + %r = call i8 @llvm.vector.reduce.or.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} + +define i8 @xor.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'xor.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.xor.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r + + %r = call i8 @llvm.vector.reduce.xor.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} + +define i8 @umin.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'umin.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.umin.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 
%r + + %r = call i8 @llvm.vector.reduce.umin.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} + +define float @fmax.f32.nxv8f32(<vscale x 8 x float> %v) { +; CHECK-LABEL: 'fmax.f32.nxv8f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r + + %r = call float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %v) + ret float %r +} + +define float @fmin.f32.nxv8f32(<vscale x 8 x float> %v) { +; CHECK-LABEL: 'fmin.f32.nxv8f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r + + %r = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %v) + ret float %r +} + +define i8 @umax.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'umax.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.umax.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r + + %r = call i8 @llvm.vector.reduce.umax.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} +define i8 @smin.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'smin.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.smin.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r + + %r = call i8 @llvm.vector.reduce.smin.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} +define i8 @smax.i8.nxv8i8(<vscale x 8 x i8> %v) { +; CHECK-LABEL: 'smax.i8.nxv8i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r + + %r = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> %v) + ret i8 %r +} + +define float @fadda_nxv8f32(float %start, <vscale x 8 x float> %a) #0 { +; CHECK-LABEL: 'fadda_nxv8f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call 
float @llvm.vector.reduce.fadd.nxv8f32(float %start, <vscale x 8 x float> %a) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %res + + %res = call float @llvm.vector.reduce.fadd.nxv8f32(float %start, <vscale x 8 x float> %a) + ret float %res +} + + +declare i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64>) +declare i8 @llvm.vector.reduce.mul.nxv8i8(<vscale x 8 x i8>) +declare i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8>) +declare i8 @llvm.vector.reduce.or.nxv8i8(<vscale x 8 x i8>) +declare i8 @llvm.vector.reduce.xor.nxv8i8(<vscale x 8 x i8>) +declare float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float>) +declare float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float>) +declare i8 @llvm.vector.reduce.umin.nxv8i8(<vscale x 8 x i8>) +declare i8 @llvm.vector.reduce.umax.nxv8i8(<vscale x 8 x i8>) +declare i8 @llvm.vector.reduce.smin.nxv8i8(<vscale x 8 x i8>) +declare i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8>) +declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)