Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1167,6 +1167,8 @@
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
       bool UseMaskForCond = false, bool UseMaskForGaps = false) const;
 
+  enum class ReductionType : uint8_t { Split, Ordered };
+
   /// Calculate the cost of performing a vector reduction.
   ///
   /// This is the cost of reducing the vector value of type \p Ty to a scalar
@@ -1181,7 +1183,7 @@
   /// (v0, v1, v2, v3)
   /// ((v0+v2), (v1+v3), undef, undef)
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty,
+      unsigned Opcode, VectorType *Ty, ReductionType RedType,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
   InstructionCost getMinMaxReductionCost(
@@ -1645,6 +1647,7 @@
                  bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0;
   virtual InstructionCost
   getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+                             ReductionType RedType,
                              TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
   getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
@@ -2145,8 +2148,9 @@
   }
   InstructionCost
   getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+                             ReductionType RedType,
                              TTI::TargetCostKind CostKind) override {
-    return Impl.getArithmeticReductionCost(Opcode, Ty, CostKind);
+    return Impl.getArithmeticReductionCost(Opcode, Ty, RedType, CostKind);
   }
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                          bool IsUnsigned,
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -622,6 +622,7 @@
   }
 
   InstructionCost getArithmeticReductionCost(unsigned, VectorType *,
+                                             TTI::ReductionType,
                                              TTI::TargetCostKind) const {
     return 1;
   }
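
Note: the new ReductionType parameter makes every caller state whether it wants the cost of a tree-wise ("split") reduction or of a strict in-order one. The standalone sketch below is illustrative only (the function names are invented, not LLVM APIs); it shows the two evaluation orders the enum distinguishes and why they are not interchangeable for floating point:

#include <cstdio>
#include <vector>

// Split (tree-wise): halve the vector each step, log2(N) rounds of adds.
// Assumes a non-empty, power-of-two-sized input. Only valid when FP
// reassociation is allowed (e.g. 'reassoc'/'fast' flags).
float splitReduce(std::vector<float> V) {
  for (size_t Width = V.size() / 2; Width >= 1; Width /= 2)
    for (size_t I = 0; I != Width; ++I)
      V[I] = V[I] + V[I + Width];
  return V[0];
}

// Ordered (strict): accumulate lane by lane from an initial value,
// preserving the source program's rounding behaviour.
float orderedReduce(float InitVal, const std::vector<float> &V) {
  float Acc = InitVal;
  for (float Elt : V)
    Acc = Acc + Elt;
  return Acc;
}

int main() {
  std::vector<float> V = {1e8f, 1.0f, -1e8f, 1.0f};
  std::printf("split:   %f\n", splitReduce(V));         // 2.0
  std::printf("ordered: %f\n", orderedReduce(0.0f, V)); // 1.0
}

The two results differ because the small terms are absorbed at different points, which is exactly why the FP cases in BasicTTIImpl.h below may only use Split when reassociation is allowed.
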
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1624,28 +1624,32 @@
                                         CostKind);
     }
     case Intrinsic::vector_reduce_add:
-      return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
-                                                 CostKind);
+      return thisT()->getArithmeticReductionCost(
+          Instruction::Add, VecOpTy, TTI::ReductionType::Split, CostKind);
     case Intrinsic::vector_reduce_mul:
-      return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
-                                                 CostKind);
+      return thisT()->getArithmeticReductionCost(
+          Instruction::Mul, VecOpTy, TTI::ReductionType::Split, CostKind);
     case Intrinsic::vector_reduce_and:
-      return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
-                                                 CostKind);
+      return thisT()->getArithmeticReductionCost(
+          Instruction::And, VecOpTy, TTI::ReductionType::Split, CostKind);
     case Intrinsic::vector_reduce_or:
-      return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
-                                                 CostKind);
+      return thisT()->getArithmeticReductionCost(
+          Instruction::Or, VecOpTy, TTI::ReductionType::Split, CostKind);
     case Intrinsic::vector_reduce_xor:
-      return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
-                                                 CostKind);
+      return thisT()->getArithmeticReductionCost(
+          Instruction::Xor, VecOpTy, TTI::ReductionType::Split, CostKind);
     case Intrinsic::vector_reduce_fadd:
-      // FIXME: Add new flag for cost of strict reductions.
-      return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
-                                                 CostKind);
+      return thisT()->getArithmeticReductionCost(
+          Instruction::FAdd, VecOpTy,
+          FMF.allowReassoc() ? TTI::ReductionType::Split
+                             : TTI::ReductionType::Ordered,
+          CostKind);
     case Intrinsic::vector_reduce_fmul:
-      // FIXME: Add new flag for cost of strict reductions.
-      return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
-                                                 CostKind);
+      return thisT()->getArithmeticReductionCost(
+          Instruction::FMul, VecOpTy,
+          FMF.allowReassoc() ? TTI::ReductionType::Split
                             : TTI::ReductionType::Ordered,
+          CostKind);
     case Intrinsic::vector_reduce_smax:
     case Intrinsic::vector_reduce_smin:
     case Intrinsic::vector_reduce_fmax:
@@ -1981,8 +1985,8 @@
   ///
   /// The cost model should take into account that the actual length of the
   /// vector is reduced on each iteration.
-  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
-                                             TTI::TargetCostKind CostKind) {
+  InstructionCost getFastReductionCost(unsigned Opcode, VectorType *Ty,
+                                       TTI::TargetCostKind CostKind) {
     Type *ScalarTy = Ty->getElementType();
     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
@@ -2034,6 +2038,44 @@
            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  /// Try to calculate the cost of performing strict (in-order) reductions,
+  /// which involves doing a sequence of floating point additions in lane
+  /// order, starting with an initial value. For example, consider a scalar
+  /// initial value 'InitVal' of type float and a vector of type <4 x float>:
+  ///
+  ///   Vector = <float %v0, float %v1, float %v2, float %v3>
+  ///
+  ///   %add1 = %InitVal + %v0
+  ///   %add2 = %add1 + %v1
+  ///   %add3 = %add2 + %v2
+  ///   %add4 = %add3 + %v3
+  ///
+  /// As a simple estimate we can say the cost of such a reduction is 4 times
+  /// the cost of a scalar FP addition. We can only estimate the costs for
+  /// fixed-width vectors here because for scalable vectors we do not know
+  /// the runtime number of operations.
+  InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
+                                          TTI::TargetCostKind CostKind) {
+    if (isa<ScalableVectorType>(Ty))
+      return InstructionCost::getInvalid();
+
+    auto *VTy = cast<FixedVectorType>(Ty);
+    InstructionCost ExtractCost =
+        getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
+    InstructionCost ArithCost =
+        getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind);
+    ArithCost *= VTy->getNumElements();
+
+    return ExtractCost + ArithCost;
+  }
+
+  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+                                             TTI::ReductionType RedType,
+                                             TTI::TargetCostKind CostKind) {
+    if (RedType == TTI::ReductionType::Ordered)
+      return getOrderedReductionCost(Opcode, Ty, CostKind);
+    return getFastReductionCost(Opcode, Ty, CostKind);
+  }
+
   /// Try to calculate op costs for min/max reduction operations.
   /// \param CondTy Conditional type for the Select instruction.
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
@@ -2100,8 +2142,8 @@
     // Without any native support, this is equivalent to the cost of
     // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
     VectorType *ExtTy = VectorType::get(ResTy, Ty);
-    InstructionCost RedCost =
-        thisT()->getArithmeticReductionCost(Instruction::Add, ExtTy, CostKind);
+    InstructionCost RedCost = thisT()->getArithmeticReductionCost(
+        Instruction::Add, ExtTy, TTI::ReductionType::Split, CostKind);
     InstructionCost MulCost = 0;
     InstructionCost ExtCost = thisT()->getCastInstrCost(
        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
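
Note: the new BaseT estimate is deliberately simple and linear in the element count: extract every lane, then chain one scalar operation per lane. A minimal standalone model (the per-operation costs here are placeholders for what getScalarizationOverhead and getArithmeticInstrCost would return, not real target numbers):

#include <cstdio>

// cost(ordered reduction) = extract overhead + NumElts * scalar op cost,
// mirroring getOrderedReductionCost() above.
int orderedReductionCost(int NumElts, int ExtractCostPerLane, int OpCost) {
  return NumElts * ExtractCostPerLane + NumElts * OpCost;
}

int main() {
  // A 4-lane vector with placeholder costs 1 and 2 -> 4*1 + 4*2 = 12.
  std::printf("%d\n", orderedReductionCost(4, 1, 2));
}
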
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -894,9 +894,10 @@
 }
 
 InstructionCost TargetTransformInfo::getArithmeticReductionCost(
-    unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const {
+    unsigned Opcode, VectorType *Ty, ReductionType RedType,
+    TTI::TargetCostKind CostKind) const {
   InstructionCost Cost =
-      TTIImpl->getArithmeticReductionCost(Opcode, Ty, CostKind);
+      TTIImpl->getArithmeticReductionCost(Opcode, Ty, RedType, CostKind);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -131,6 +131,18 @@
     return BaseT::getMaxVScale();
   }
 
+  /// Try to return an estimate cost factor that can be used as a multiplier
+  /// when scalarizing an operation for a vector with ElementCount \p VF.
+  /// For scalable vectors this currently takes the most pessimistic view
+  /// based upon the maximum possible value for vscale.
+  unsigned getScalarizationCostFactor(ElementCount VF) const {
+    if (!VF.isScalable())
+      return VF.getKnownMinValue();
+    Optional<unsigned> MaxNumVScale = getMaxVScale();
+    assert(MaxNumVScale && "Expected valid max vscale value");
+    return *MaxNumVScale * VF.getKnownMinValue();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
@@ -301,7 +313,7 @@
                                ElementCount VF) const;
 
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty,
+      unsigned Opcode, VectorType *Ty, TTI::ReductionType RedType,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1404,14 +1404,9 @@
     return InstructionCost::getInvalid();
 
   ElementCount LegalVF = LT.second.getVectorElementCount();
-  Optional<unsigned> MaxNumVScale = getMaxVScale();
-  assert(MaxNumVScale && "Expected valid max vscale value");
-
   InstructionCost MemOpCost =
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
-  unsigned MaxNumElementsPerGather =
-      MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
-  return LT.first * MaxNumElementsPerGather * MemOpCost;
+  return LT.first * MemOpCost * getScalarizationCostFactor(LegalVF);
 }
 
 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
@@ -1817,7 +1812,20 @@
 InstructionCost
 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+                                           TTI::ReductionType RedType,
                                            TTI::TargetCostKind CostKind) {
+  if (RedType == TTI::ReductionType::Ordered) {
+    if (!isa<ScalableVectorType>(ValTy))
+      return BaseT::getArithmeticReductionCost(Opcode, ValTy, RedType,
+                                               CostKind);
+
+    auto *VTy = cast<ScalableVectorType>(ValTy);
+    InstructionCost Cost =
+        getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
+    Cost *= getScalarizationCostFactor(VTy->getElementCount());
+    return Cost;
+  }
+
   if (isa<ScalableVectorType>(ValTy))
     return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
 
@@ -1891,7 +1899,7 @@
     }
     break;
   }
-  return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, RedType, CostKind);
 }
 
 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
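
Note: the effect of getScalarizationCostFactor on the new ordered path can be checked against the SVE tests later in this patch. Below is a standalone re-computation; the max-vscale value of 16 and the scalar fadd cost of 2 are assumptions inferred from the expected test output, not values stated anywhere in the patch:

#include <cstdio>

// Mirror of getScalarizationCostFactor(): a fixed-width VF scalarizes to
// exactly VF operations; a scalable VF pessimistically assumes the maximum
// possible vscale.
unsigned scalarizationCostFactor(bool Scalable, unsigned MinElts,
                                 unsigned MaxVScale) {
  return Scalable ? MaxVScale * MinElts : MinElts;
}

int main() {
  unsigned MaxVScale = 16; // assumed AArch64 getMaxVScale() result
  unsigned FAddCost = 2;   // assumed scalar fadd cost
  // <vscale x 4 x float>: 16 * 4 * 2 = 128, the figure expected by
  // sve-intrinsics.ll and sve-strict-fadd-cost.ll below.
  std::printf("%u\n", scalarizationCostFactor(true, 4, MaxVScale) * FAddCost);
}
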
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -212,7 +212,7 @@
   int getInlinerVectorBonusPercent() { return 0; }
 
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty,
+      unsigned Opcode, VectorType *Ty, TTI::ReductionType RedType,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -843,13 +843,14 @@
 
 InstructionCost
 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+                                       TTI::ReductionType RedType,
                                        TTI::TargetCostKind CostKind) {
   EVT OrigTy = TLI->getValueType(DL, Ty);
 
   // Computes cost on targets that have packed math instructions(which support
   // 16-bit types only).
   if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
-    return BaseT::getArithmeticReductionCost(Opcode, Ty, CostKind);
+    return BaseT::getArithmeticReductionCost(Opcode, Ty, RedType, CostKind);
 
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
   return LT.first * getFullRateInstrCost();
 }
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -257,6 +257,7 @@
                                  const Instruction *I = nullptr);
 
   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+                                             TTI::ReductionType RedType,
                                              TTI::TargetCostKind CostKind);
   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                               Type *ResTy, VectorType *ValTy,
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1594,11 +1594,12 @@
 
 InstructionCost
 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+                                       TTI::ReductionType RedType,
                                        TTI::TargetCostKind CostKind) {
   EVT ValVT = TLI->getValueType(DL, ValTy);
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
-    return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
+    return BaseT::getArithmeticReductionCost(Opcode, ValTy, RedType, CostKind);
 
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 
@@ -1610,7 +1611,7 @@
   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
 
-  return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, RedType, CostKind);
 }
 
 InstructionCost
Index: llvm/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -181,7 +181,7 @@
                                       TTI::TargetCostKind CostKind);
 
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty,
+      unsigned Opcode, VectorType *Ty, TTI::ReductionType RedType,
       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
 
   InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3631,6 +3631,7 @@
 
 InstructionCost
 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+                                       TTI::ReductionType RedType,
                                        TTI::TargetCostKind CostKind) {
   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
   // and make it as the cost.
@@ -3702,7 +3703,8 @@
     return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
                             TargetTransformInfo::CastContextHint::None,
                             CostKind) +
-           getArithmeticReductionCost(Opcode, WideVecTy, CostKind);
+           getArithmeticReductionCost(Opcode, WideVecTy,
+                                      TTI::ReductionType::Split, CostKind);
   }
 
   InstructionCost ArithmeticCost = 0;
@@ -3798,7 +3800,8 @@
       if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
         return ArithmeticCost + Entry->Cost;
 
-    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind);
+    return BaseT::getArithmeticReductionCost(
+        Opcode, ValVTy, TTI::ReductionType::Split, CostKind);
   }
 
   unsigned NumVecElts = ValVTy->getNumElements();
@@ -3807,7 +3810,8 @@
   // Special case power of 2 reductions where the scalar type isn't changed
   // by type legalization.
   if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
-    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind);
+    return BaseT::getArithmeticReductionCost(
+        Opcode, ValVTy, TTI::ReductionType::Split, CostKind);
 
   InstructionCost ReductionCost = 0;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7168,8 +7168,13 @@
 
   const RecurrenceDescriptor &RdxDesc =
       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
-  InstructionCost BaseCost =
-      TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy, CostKind);
+
+  if (useOrderedReductions(RdxDesc))
+    return TTI.getArithmeticReductionCost(
+        RdxDesc.getOpcode(), VectorTy, TTI::ReductionType::Ordered, CostKind);
+
+  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
+      RdxDesc.getOpcode(), VectorTy, TTI::ReductionType::Split, CostKind);
 
   // Get the operand that was not the reduction chain and match it to one of the
   // patterns, returning the better cost if it is found.
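
Note: with this change both vectorizers make the same choice that the intrinsic-costing path in BasicTTIImpl.h makes. A condensed model of the selection logic follows (a sketch, not the real interfaces; in LoopVectorize.cpp the decision additionally depends on -enable-strict-reductions and on whether the loop hints allow reordering):

enum class ReductionType { Split, Ordered };

// Intrinsic costing (BasicTTIImpl.h): a vector.reduce.fadd/fmul call
// without reassociation permission must be costed as an in-order chain.
ReductionType typeFromFMF(bool AllowReassoc) {
  return AllowReassoc ? ReductionType::Split : ReductionType::Ordered;
}

// Loop vectorizer (LoopVectorize.cpp): if the strict lowering was chosen
// for this reduction, cost exactly that form; otherwise cost the split one.
ReductionType typeForLoopReduction(bool UseOrderedReductions) {
  return UseOrderedReductions ? ReductionType::Ordered : ReductionType::Split;
}
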
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7879,7 +7879,8 @@
     case RecurKind::FAdd:
     case RecurKind::FMul: {
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
-      VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy);
+      VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+                                                   TTI::ReductionType::Split);
      ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
       break;
     }
Index: llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -0,0 +1,20 @@
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+
+define void @strict_fp_reductions() {
+; CHECK-LABEL: strict_fp_reductions
+; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+  %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+  %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+  %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+  %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+
+  ret void
+}
+
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
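
Note: the fixed-width figures in reduce-fadd.ll are consistent with the BaseT formula (ExtractCost + NumElts * FAddCost) under some assumptions about the AArch64 hooks: extracting lane 0 of each legal 128-bit register is free, every other lane extract costs 3, and a scalar fadd costs 2. These values are inferred from the expected output, not stated in the patch:

#include <cstdio>

// Re-derivation of the reduce-fadd.ll expectations under the assumptions
// above: one fadd per lane, plus a paid extract for every lane that is not
// lane 0 of a legal 128-bit register.
int orderedCost(int NumElts, int EltsPerLegalReg) {
  int NumRegs = NumElts / EltsPerLegalReg;
  int ExtractCost = (NumElts - NumRegs) * 3; // lane-0 extracts are free
  int ArithCost = NumElts * 2;               // scalar fadd cost 2 per lane
  return ExtractCost + ArithCost;
}

int main() {
  std::printf("v4f32: %d\n", orderedCost(4, 4)); // 17
  std::printf("v8f32: %d\n", orderedCost(8, 4)); // 34
  std::printf("v2f64: %d\n", orderedCost(2, 2)); // 7
  std::printf("v4f64: %d\n", orderedCost(4, 2)); // 14
}

The same arithmetic reproduces the 28 expected for fadd double at VF 8 in strict-fadd-cost.ll below.
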
Index: llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
===================================================================
--- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -57,21 +57,32 @@
   %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
   %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
 
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
-  %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2)
-  %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3)
-  %fmin_nxv4f32 = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
-  %fmin_nxv4f64 = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
-  %fmax_nxv4f32 = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
-  %fmax_nxv4f64 = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
+  %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2)
+  %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3)
+  %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
+  %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
+  %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
+  %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
 
   ret void
 }
+
+define void @strict_fp_reductions(<vscale x 4 x float> %v0, <vscale x 4 x double> %v1) {
+; CHECK-LABEL: 'strict_fp_reductions'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1)
+  %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v0)
+  %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v1)
+
+  ret void
+}
+
 declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
 declare i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64>)
 declare i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32>)
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret double %add
+}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=4 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=8 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret double %add
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 1}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}