diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1146,12 +1146,38 @@ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, bool UseMaskForCond = false, bool UseMaskForGaps = false) const; + /// A helper function to determine the type of reduction algorithm used + /// for a given \p Opcode and set of FastMathFlags \p FMF. + static bool requiresOrderedReduction(Optional<FastMathFlags> FMF) { + return FMF != None && !(*FMF).allowReassoc(); + } + /// Calculate the cost of vector reduction intrinsics. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar - /// value using the operation denoted by \p Opcode. + /// value using the operation denoted by \p Opcode. The FastMathFlags + /// parameter \p FMF indicates what type of reduction we are performing: + /// 1. Tree-wise. This is the typical 'fast' reduction performed that + /// involves successively splitting a vector into half and doing the + /// operation on the pair of halves until you have a scalar value. For + /// example: + /// (v0, v1, v2, v3) + /// ((v0+v2), (v1+v3), undef, undef) + /// ((v0+v2+v1+v3), undef, undef, undef) + /// This is the default behaviour for integer operations, whereas for + /// floating point we only do this if \p FMF indicates that + /// reassociation is allowed. + /// 2. Ordered. For a vector with N elements this involves performing N + /// operations in lane order, starting with an initial scalar value, i.e. + /// result = InitVal + v0 + /// result = result + v1 + /// result = result + v2 + /// result = result + v3 + /// This is only the case for FP operations and when reassociation is not + /// allowed. 
+ /// InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; InstructionCost getMinMaxReductionCost( @@ -1618,6 +1644,7 @@ bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; virtual InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) = 0; virtual InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, @@ -2119,8 +2146,9 @@ } InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) override { - return Impl.getArithmeticReductionCost(Opcode, Ty, CostKind); + return Impl.getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); } InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -622,6 +622,7 @@ } InstructionCost getArithmeticReductionCost(unsigned, VectorType *, + Optional<FastMathFlags> FMF, TTI::TargetCostKind) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1658,27 +1658,25 @@ } case Intrinsic::vector_reduce_add: return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, - CostKind); + None, CostKind); case Intrinsic::vector_reduce_mul: return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, - CostKind); + None, CostKind); case Intrinsic::vector_reduce_and: return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, - CostKind); + None, CostKind); case Intrinsic::vector_reduce_or: - return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, + return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None, CostKind); case Intrinsic::vector_reduce_xor: return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, - CostKind); + None, CostKind); case Intrinsic::vector_reduce_fadd: - // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, - CostKind); + FMF, CostKind); case Intrinsic::vector_reduce_fmul: - // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, - CostKind); + FMF, CostKind); case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: case Intrinsic::vector_reduce_fmax: @@ -2014,8 +2012,8 @@ /// /// The cost model should take into account that the actual length of the /// vector is reduced on each iteration. 
- InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - TTI::TargetCostKind CostKind) { + InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, + TTI::TargetCostKind CostKind) { Type *ScalarTy = Ty->getElementType(); unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); if ((Opcode == Instruction::Or || Opcode == Instruction::And) && @@ -2067,6 +2065,47 @@ thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } + /// Try to calculate the cost of performing strict (in-order) reductions, + /// which involves doing a sequence of floating point additions in lane + /// order, starting with an initial value. For example, consider a scalar + /// initial value 'InitVal' of type float and a vector of type <4 x float>: + /// + /// Vector = <float %v0, float %v1, float %v2, float %v3> + /// + /// %add1 = %InitVal + %v0 + /// %add2 = %add1 + %v1 + /// %add3 = %add2 + %v2 + /// %add4 = %add3 + %v3 + /// + /// As a simple estimate we can say the cost of such a reduction is 4 times + /// the cost of a scalar FP addition. We can only estimate the costs for + /// fixed-width vectors here because for scalable vectors we do not know the + /// runtime number of operations. + InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, + TTI::TargetCostKind CostKind) { + // Targets must implement a default value for the scalable case, since + // we don't know how many lanes the vector has. + if (isa<ScalableVectorType>(Ty)) + return InstructionCost::getInvalid(); + + auto *VTy = cast<FixedVectorType>(Ty); + InstructionCost ExtractCost = + getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true); + InstructionCost ArithCost = + getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind); + ArithCost *= VTy->getNumElements(); + + return ExtractCost + ArithCost; + } + + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + Optional<FastMathFlags> FMF, + TTI::TargetCostKind CostKind) { + if (TTI::requiresOrderedReduction(FMF)) + return getOrderedReductionCost(Opcode, Ty, CostKind); + return getTreeReductionCost(Opcode, Ty, CostKind); + } + /// Try to calculate op costs for min/max reduction operations. /// \param CondTy Conditional type for the Select instruction. InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, @@ -2133,8 +2172,8 @@ // Without any native support, this is equivalent to the cost of // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) VectorType *ExtTy = VectorType::get(ResTy, Ty); - InstructionCost RedCost = - thisT()->getArithmeticReductionCost(Instruction::Add, ExtTy, CostKind); + InstructionCost RedCost = thisT()->getArithmeticReductionCost( + Instruction::Add, ExtTy, None, CostKind); InstructionCost MulCost = 0; InstructionCost ExtCost = thisT()->getCastInstrCost( IsUnsigned ? 
Instruction::ZExt : Instruction::SExt, ExtTy, Ty, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -894,9 +894,10 @@ } InstructionCost TargetTransformInfo::getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const { + unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, + TTI::TargetCostKind CostKind) const { InstructionCost Cost = - TTIImpl->getArithmeticReductionCost(Opcode, Ty, CostKind); + TTIImpl->getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -131,6 +131,18 @@ return BaseT::getMaxVScale(); } + /// Try to return an estimate cost factor that can be used as a multiplier + /// when scalarizing an operation for a vector with ElementCount \p VF. + /// For scalable vectors this currently takes the most pessimistic view based + /// upon the maximum possible value for vscale. + unsigned getMaxNumElements(ElementCount VF) const { + if (!VF.isScalable()) + return VF.getFixedValue(); + Optional<unsigned> MaxNumVScale = getMaxVScale(); + assert(MaxNumVScale && "Expected valid max vscale value"); + return *MaxNumVScale * VF.getKnownMinValue(); + } + unsigned getMaxInterleaveFactor(unsigned VF); InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, @@ -305,7 +317,7 @@ ElementCount VF) const; InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1543,14 +1543,9 @@ return InstructionCost::getInvalid(); ElementCount LegalVF = LT.second.getVectorElementCount(); - Optional<unsigned> MaxNumVScale = getMaxVScale(); - assert(MaxNumVScale && "Expected valid max vscale value"); - InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); - unsigned MaxNumElementsPerGather = - MaxNumVScale.getValue() * LegalVF.getKnownMinValue(); - return LT.first * MaxNumElementsPerGather * MemOpCost; + return LT.first * MemOpCost * getMaxNumElements(LegalVF); } bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { @@ -1956,7 +1951,22 @@ InstructionCost AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) { + if (TTI::requiresOrderedReduction(FMF)) { + if (!isa<ScalableVectorType>(ValTy)) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); + + if (Opcode != Instruction::FAdd) + return InstructionCost::getInvalid(); + + auto *VTy = cast<ScalableVectorType>(ValTy); + InstructionCost Cost = + getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); + Cost *= getMaxNumElements(VTy->getElementCount()); + return Cost; + } + if (isa<ScalableVectorType>(ValTy)) return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); @@ -2031,7 +2041,7 @@ } break; } - return BaseT::getArithmeticReductionCost(Opcode, 
ValTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); } InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -212,7 +212,7 @@ int getInlinerVectorBonusPercent() { return 0; } InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -843,13 +843,17 @@ InstructionCost GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) { + if (TTI::requiresOrderedReduction(FMF)) + return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); + EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) - return BaseT::getArithmeticReductionCost(Opcode, Ty, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getFullRateInstrCost(); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -257,6 +257,7 @@ const Instruction *I = nullptr); InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind); InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1594,11 +1594,15 @@ InstructionCost ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) { + if (TTI::requiresOrderedReduction(FMF)) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); + EVT ValVT = TLI->getValueType(DL, ValTy); int ISD = TLI->InstructionOpcodeToISD(Opcode); if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1610,7 +1614,7 @@ if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; - return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); } InstructionCost diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ 
-181,7 +181,7 @@ TTI::TargetCostKind CostKind); InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3746,7 +3746,11 @@ InstructionCost X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) { + if (TTI::requiresOrderedReduction(FMF)) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. @@ -3817,7 +3821,7 @@ return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, TargetTransformInfo::CastContextHint::None, CostKind) + - getArithmeticReductionCost(Opcode, WideVecTy, CostKind); + getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); } InstructionCost ArithmeticCost = 0; @@ -3913,7 +3917,7 @@ if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) return ArithmeticCost + Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); } unsigned NumVecElts = ValVTy->getNumElements(); @@ -3922,7 +3926,7 @@ // Special case power of 2 reductions where the scalar type isn't changed // by type legalization. if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); InstructionCost ReductionCost = 0; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7252,8 +7252,15 @@ const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; - InstructionCost BaseCost = - TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy, CostKind); + + InstructionCost BaseCost = TTI.getArithmeticReductionCost( + RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + + // If we're using ordered reductions then we can just return the base cost + // here, since getArithmeticReductionCost calculates the full ordered + // reduction cost when FP reassociation is not allowed. + if (useOrderedReductions(RdxDesc)) + return BaseCost; // Get the operand that was not the reduction chain and match it to one of the // patterns, returning the better cost if it is found. diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7852,7 +7852,7 @@ InstructionCost TreeCost = V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth); + getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; if (!Cost.isValid()) { LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); @@ -7944,8 +7944,8 @@ private: /// Calculate the cost of a reduction. 
InstructionCost getReductionCost(TargetTransformInfo *TTI, - Value *FirstReducedVal, - unsigned ReduxWidth) { + Value *FirstReducedVal, unsigned ReduxWidth, + FastMathFlags FMF) { Type *ScalarTy = FirstReducedVal->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); InstructionCost VectorCost, ScalarCost; @@ -7958,7 +7958,7 @@ case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy); + VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF); ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); break; } diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll @@ -0,0 +1,20 @@ +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s + +define void @strict_fp_reductions() { +; CHECK-LABEL: strict_fp_reductions +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) + %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + + ret void +} + +declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -58,21 +58,36 @@ %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0) %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2) -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3) - %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2) - %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3) - %fmin_nxv4f32 = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2) - %fmin_nxv4f64 = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3) - %fmax_nxv4f32 = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2) - %fmax_nxv4f64 = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3) + %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2) + %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3) + %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2) + %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3) + %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2) + %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3) ret void } + +define void @strict_fp_reductions(<vscale x 4 x float> %v0, <vscale x 4 x double> %v1) { +; CHECK-LABEL: 'strict_fp_reductions' +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v0) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v1) + %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v0) + %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v1) + %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.0, <vscale x 4 x float> %v0) + %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.0, <vscale x 4 x double> %v1) + + ret void +} + declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) declare i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64>) declare i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32>) @@ -93,6 +108,8 @@ declare i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64>) declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>) declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>) +declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>) +declare double @llvm.vector.reduce.fmul.nxv4f64(double, <vscale x 4 x double>) declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>) declare double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double>) declare float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float>) diff --git 
a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -344,7 +344,7 @@ define void @reduce_fmul(<16 x float> %va) { ; THRU-LABEL: 'reduce_fmul' -; THRU-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) +; THRU-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'reduce_fmul' @@ -352,11 +352,11 @@ ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'reduce_fmul' -; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) +; SIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'reduce_fmul' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v = call float @llvm.vector.reduce.fmul.v16f32(float 42.0, <16 x float> %va) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll b/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fadd.ll @@ -11,59 +11,59 @@ define void @reduce_f64(double %arg) { ; SSE2-LABEL: 'reduce_f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 72 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'reduce_f64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE41-LABEL: 'reduce_f64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; 
SSE42-LABEL: 'reduce_f64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'reduce_f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'reduce_f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) @@ -76,66 +76,66 @@ define void @reduce_f32(float %arg) { ; SSE2-LABEL: 'reduce_f32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float 
@llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'reduce_f32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE41-LABEL: 'reduce_f32' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'reduce_f32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call float 
@llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'reduce_f32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'reduce_f32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for 
instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll @@ -11,59 +11,59 @@ define void @reduce_f64(double %arg) { ; SSE2-LABEL: 'reduce_f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call double 
@llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'reduce_f64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE41-LABEL: 'reduce_f64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) 
-; SSE41-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'reduce_f64'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'reduce_f64'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'reduce_f64'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'reduce_f64'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
@@ -76,66 +76,66 @@
 define void @reduce_f32(float %arg) {
 ; SSE2-LABEL: 'reduce_f32'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'reduce_f32'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE41-LABEL: 'reduce_f32'
-; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'reduce_f32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'reduce_f32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'reduce_f32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'reduce_f32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret double %add
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret double %add
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}