diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -105,6 +105,16 @@
                                              Optional<FastMathFlags> FMF,
                                              TTI::TargetCostKind CostKind);
 
+  InstructionCost getMinMaxReductionCostScalable(VectorType *Ty,
+                                                 VectorType *CondTy,
+                                                 bool IsUnsigned,
+                                                 TTI::TargetCostKind CostKind);
+
+  InstructionCost
+  getArithmeticReductionCostScalable(unsigned Opcode, VectorType *Ty,
+                                     Optional<FastMathFlags> FMF,
+                                     TTI::TargetCostKind CostKind);
+
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
     if (!ST->hasVInstructions())
       return false;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -288,13 +288,40 @@
   return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
 }
 
+/// Cost of a min/max reduction over a scalable vector type: a fixed cost of 2,
+/// plus one compare and one select on the legalized type for each of the
+/// (LT.first - 1) extra parts when legalization splits \p Ty.
+InstructionCost
+RISCVTTIImpl::getMinMaxReductionCostScalable(VectorType *Ty, VectorType *CondTy,
+                                             bool IsUnsigned,
+                                             TTI::TargetCostKind CostKind) {
+  // TODO: Add the final reduction cost
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  InstructionCost LegalizationCost = 0;
+  assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
+         "Both vectors need to be scalable");
+
+  if (LT.first > 1) {
+    Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
+    unsigned CmpOpcode =
+        Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp;
+    LegalizationCost =
+        getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy,
+                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
+        getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy,
+                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
+    LegalizationCost *= LT.first - 1;
+  }
+
+  return LegalizationCost + 2;
+}
+
 InstructionCost
 RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                      bool IsUnsigned,
                                      TTI::TargetCostKind CostKind) {
-  // FIXME: Only supporting fixed vectors for now.
-  if (!isa<FixedVectorType>(Ty))
-    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
+  if (isa<ScalableVectorType>(Ty))
+    return getMinMaxReductionCostScalable(Ty, CondTy, IsUnsigned, CostKind);
 
   if (!ST->useRVVForFixedLengthVectors())
     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
@@ -310,13 +337,43 @@
   return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
 }
 
+/// Cost of an add/and/or/xor/fadd reduction used for scalable vector types: a
+/// fixed cost of 2, plus one \p Opcode instruction on the legalized type for
+/// each of the (LT.first - 1) extra parts. Other opcodes defer to BaseT.
+InstructionCost RISCVTTIImpl::getArithmeticReductionCostScalable(
+    unsigned Opcode, VectorType *VTy, Optional<FastMathFlags> FMF,
+    TTI::TargetCostKind CostKind) {
+  // TODO: Add the final reduction cost
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VTy);
+  InstructionCost LegalizationCost = 0;
+  if (LT.first > 1) {
+    Type *LegalVTy = EVT(LT.second).getTypeForEVT(VTy->getContext());
+    LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
+    LegalizationCost *= LT.first - 1;
+  }
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  switch (ISD) {
+  default:
+    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
+  case ISD::ADD:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::FADD:
+    return LegalizationCost + 2;
+  }
+}
+
 InstructionCost
 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *VTy,
                                          Optional<FastMathFlags> FMF,
                                          TTI::TargetCostKind CostKind) {
-  // FIXME: Only supporting fixed vectors for now.
-  if (!isa<FixedVectorType>(VTy))
-    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
+
+  if (isa<ScalableVectorType>(VTy))
+    return getArithmeticReductionCostScalable(Opcode, VTy, FMF, CostKind);
 
   // FIXME: Do not support i1 and/or reduction now.
   if (VTy->getElementType()->isIntegerTy(1))
diff --git a/llvm/test/Analysis/CostModel/RISCV/scalable-reduce.ll b/llvm/test/Analysis/CostModel/RISCV/scalable-reduce.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/scalable-reduce.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes='print<cost-model>' 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+experimental-zvfh | FileCheck %s
+
+define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale x 4 x float> %v2, <vscale x 4 x double> %v3) {
+; CHECK-LABEL: 'reductions'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %smin_nxv4i64 = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %umax_nxv4i32 = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
+  %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
+  %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
+  %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
+  %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
+  %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
+  %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
+  %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
+  %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
+  %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
+  %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
+  %smin_nxv4i64 = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v1)
+  %umax_nxv4i32 = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v0)
+  %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
+  %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
+  %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
+
+  %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2)
+  %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3)
+  %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
+  %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
+  %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
+  %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
+
+  ret void
+}
+
+define void @strict_fp_reductions(<vscale x 4 x float> %v0, <vscale x 4 x double> %v1) {
+; CHECK-LABEL: 'strict_fp_reductions'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v0)
+  %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v1)
+
+  ret void
+}
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64>)
+declare i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64>)
+declare i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64>)
+declare i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64>)
+declare i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64>)
+declare i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64>)
+declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64>)
+declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64>)
+declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
+declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
+declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>)
+declare double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double>)
+declare float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float>)
+declare double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double>)
+
+