Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1146,12 +1146,42 @@ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, bool UseMaskForCond = false, bool UseMaskForGaps = false) const; + /// A helper function to determine the type of reduction algorithm used + /// for a given \p Opcode and set of FastMathFlags \p FPFlags. + static bool isOrderedReduction(unsigned Opcode, FastMathFlags FPFlags) { + if ((Opcode == Instruction::FAdd || Opcode == Instruction::FMul) && + !FPFlags.allowReassoc()) + return true; + return false; + } + /// Calculate the cost of vector reduction intrinsics. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar - /// value using the operation denoted by \p Opcode. + /// value using the operation denoted by \p Opcode. The \p Opcode and + /// FastMathFlags parameter \p FPFlags together indicate what type of + /// reduction we are performing: + /// 1. Tree-wise. This is the typical 'fast' reduction performed that + /// involves successively splitting a vector into half and doing the + /// operation on the pair of halves until you have a scalar value. For + /// example: + /// (v0, v1, v2, v3) + /// ((v0+v2), (v1+v3), undef, undef) + /// ((v0+v2+v1+v3), undef, undef, undef) + /// This is the default behaviour for integer operations, whereas for + /// floating point we only do this if \p FPFlags indicates that + /// reassociation is allowed. + /// 2. Ordered. For a vector with N elements this involves performing N + /// operations in lane order, starting with an initial scalar value, i.e. + /// result = InitVal + v0 + /// result = result + v1 + /// result = result + v2 + /// result = result + v3 + /// This is only the case for FP operations and when reassociation is not + /// allowed. + /// InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, FastMathFlags FPFlags, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; InstructionCost getMinMaxReductionCost( @@ -1618,6 +1648,7 @@ bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; virtual InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + FastMathFlags FPFlags, TTI::TargetCostKind CostKind) = 0; virtual InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, @@ -2119,8 +2150,9 @@ } InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + FastMathFlags FPFlags, TTI::TargetCostKind CostKind) override { - return Impl.getArithmeticReductionCost(Opcode, Ty, CostKind); + return Impl.getArithmeticReductionCost(Opcode, Ty, FPFlags, CostKind); } InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -622,6 +622,7 @@ } InstructionCost getArithmeticReductionCost(unsigned, VectorType *, + FastMathFlags FPFlags, TTI::TargetCostKind) const { return 1; } Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1654,27 +1654,25 @@ } case Intrinsic::vector_reduce_add: return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, - CostKind); + FastMathFlags(), CostKind); case Intrinsic::vector_reduce_mul: return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, - CostKind); + FastMathFlags(), CostKind); case Intrinsic::vector_reduce_and: return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, - CostKind); + FastMathFlags(), CostKind); case Intrinsic::vector_reduce_or: return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, - CostKind); + FastMathFlags(), CostKind); case Intrinsic::vector_reduce_xor: return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, - CostKind); + FastMathFlags(), CostKind); case Intrinsic::vector_reduce_fadd: - // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, - CostKind); + FMF, CostKind); case Intrinsic::vector_reduce_fmul: - // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, - CostKind); + FMF, CostKind); case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: case Intrinsic::vector_reduce_fmax: @@ -2010,8 +2008,8 @@ /// /// The cost model should take into account that the actual length of the /// vector is reduced on each iteration. - InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - TTI::TargetCostKind CostKind) { + InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, + TTI::TargetCostKind CostKind) { Type *ScalarTy = Ty->getElementType(); unsigned NumVecElts = cast(Ty)->getNumElements(); if ((Opcode == Instruction::Or || Opcode == Instruction::And) && @@ -2063,6 +2061,44 @@ thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } + /// Try to calculate the cost of performing strict (in-order) reductions, + /// which involves doing a sequence of floating point additions in lane + /// order, starting with an initial value. For example, consider a scalar + /// initial value 'InitVal' of type float and a vector of type <4 x float>: + /// + /// Vector = + /// + /// %add1 = %InitVal + %v0 + /// %add2 = %add1 + %v1 + /// %add3 = %add2 + %v2 + /// %add4 = %add3 + %v3 + /// + /// As a simple estimate we can say the cost of such a reduction is 4 times + /// the cost of a scalar FP addition. We can only estimate the costs for + /// fixed-width vectors here because for scalable vectors we do not know the + /// runtime number of operations. + InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, + TTI::TargetCostKind CostKind) { + if (isa(Ty)) + return InstructionCost::getInvalid(); + auto *VTy = cast(Ty); + InstructionCost ExtractCost = + getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true); + InstructionCost ArithCost = + getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind); + ArithCost *= VTy->getNumElements(); + + return ExtractCost + ArithCost; + } + + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + FastMathFlags FPFlags, + TTI::TargetCostKind CostKind) { + if (TTI::isOrderedReduction(Opcode, FPFlags)) + return getOrderedReductionCost(Opcode, Ty, CostKind); + return getTreeReductionCost(Opcode, Ty, CostKind); + } + /// Try to calculate op costs for min/max reduction operations. /// \param CondTy Conditional type for the Select instruction. InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, @@ -2129,8 +2165,8 @@ // Without any native support, this is equivalent to the cost of // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) VectorType *ExtTy = VectorType::get(ResTy, Ty); - InstructionCost RedCost = - thisT()->getArithmeticReductionCost(Instruction::Add, ExtTy, CostKind); + InstructionCost RedCost = thisT()->getArithmeticReductionCost( + Instruction::Add, ExtTy, FastMathFlags(), CostKind); InstructionCost MulCost = 0; InstructionCost ExtCost = thisT()->getCastInstrCost( IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -894,9 +894,10 @@ } InstructionCost TargetTransformInfo::getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const { + unsigned Opcode, VectorType *Ty, FastMathFlags FPFlags, + TTI::TargetCostKind CostKind) const { InstructionCost Cost = - TTIImpl->getArithmeticReductionCost(Opcode, Ty, CostKind); + TTIImpl->getArithmeticReductionCost(Opcode, Ty, FPFlags, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -131,6 +131,18 @@ return BaseT::getMaxVScale(); } + /// Try to return an estimate cost factor that can be used as a multiplier + /// when scalarizing an operation for a vector with ElementCount \p VF. + /// For scalable vectors this currently takes the most pessimistic view based + /// upon the maximum possible value for vscale. + unsigned getScalarizationCostFactor(ElementCount VF) const { + if (!VF.isScalable()) + return VF.getKnownMinValue(); + Optional MaxNumVScale = getMaxVScale(); + assert(MaxNumVScale && "Expected valid max vscale value"); + return *MaxNumVScale * VF.getKnownMinValue(); + } + unsigned getMaxInterleaveFactor(unsigned VF); InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, @@ -305,7 +317,7 @@ ElementCount VF) const; InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, FastMathFlags FPFlags, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1405,14 +1405,9 @@ return InstructionCost::getInvalid(); ElementCount LegalVF = LT.second.getVectorElementCount(); - Optional MaxNumVScale = getMaxVScale(); - assert(MaxNumVScale && "Expected valid max vscale value"); - InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); - unsigned MaxNumElementsPerGather = - MaxNumVScale.getValue() * LegalVF.getKnownMinValue(); - return LT.first * MaxNumElementsPerGather * MemOpCost; + return LT.first * MemOpCost * getScalarizationCostFactor(LegalVF); } bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { @@ -1810,15 +1805,32 @@ InstructionCost AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + FastMathFlags FPFlags, TTI::TargetCostKind CostKind) { + if (TTI::isOrderedReduction(Opcode, FPFlags)) { + if (!isa(ValTy)) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FPFlags, + CostKind); + + if (Opcode != Instruction::FAdd) + return InstructionCost::getInvalid(); + + auto *VTy = cast(ValTy); + InstructionCost Cost = + getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); + Cost *= getScalarizationCostFactor(VTy->getElementCount()); + return Cost; + } + if (isa(ValTy)) return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); - MVT MTy = LT.second; int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + MVT MTy = LT.second; + // Horizontal adds can use the 'addv' instruction. We model the cost of these // instructions as normal vector adds. This is the only arithmetic vector // reduction operation for which we have an instruction. @@ -1884,7 +1896,7 @@ } break; } - return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FPFlags, CostKind); } InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -212,7 +212,7 @@ int getInlinerVectorBonusPercent() { return 0; } InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, FastMathFlags FPFlags, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -843,13 +843,17 @@ InstructionCost GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + FastMathFlags FPFlags, TTI::TargetCostKind CostKind) { + if (TTI::isOrderedReduction(Opcode, FPFlags)) + return BaseT::getArithmeticReductionCost(Opcode, Ty, FPFlags, CostKind); + EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) - return BaseT::getArithmeticReductionCost(Opcode, Ty, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, Ty, FPFlags, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getFullRateInstrCost(); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -257,6 +257,7 @@ const Instruction *I = nullptr); InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + FastMathFlags FPFlags, TTI::TargetCostKind CostKind); InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1594,11 +1594,15 @@ InstructionCost ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + FastMathFlags FPFlags, TTI::TargetCostKind CostKind) { + if (TTI::isOrderedReduction(Opcode, FPFlags)) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FPFlags, CostKind); + EVT ValVT = TLI->getValueType(DL, ValTy); int ISD = TLI->InstructionOpcodeToISD(Opcode); if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FPFlags, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1610,7 +1614,7 @@ if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; - return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FPFlags, CostKind); } InstructionCost Index: llvm/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -181,7 +181,7 @@ TTI::TargetCostKind CostKind); InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, + unsigned Opcode, VectorType *Ty, FastMathFlags FPFlags, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned); Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3749,7 +3749,11 @@ InstructionCost X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + FastMathFlags FPFlags, TTI::TargetCostKind CostKind) { + if (TTI::isOrderedReduction(Opcode, FPFlags)) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FPFlags, CostKind); + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. @@ -3820,7 +3824,7 @@ return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, TargetTransformInfo::CastContextHint::None, CostKind) + - getArithmeticReductionCost(Opcode, WideVecTy, CostKind); + getArithmeticReductionCost(Opcode, WideVecTy, FPFlags, CostKind); } InstructionCost ArithmeticCost = 0; @@ -3916,7 +3920,7 @@ if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) return ArithmeticCost + Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FPFlags, CostKind); } unsigned NumVecElts = ValVTy->getNumElements(); @@ -3925,7 +3929,7 @@ // Special case power of 2 reductions where the scalar type isn't changed // by type legalization. if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FPFlags, CostKind); InstructionCost ReductionCost = 0; Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7182,8 +7182,13 @@ const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[cast(ReductionPhi)]; - InstructionCost BaseCost = - TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy, CostKind); + + if (useOrderedReductions(RdxDesc)) + return TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy, + RdxDesc.getFastMathFlags(), CostKind); + + InstructionCost BaseCost = TTI.getArithmeticReductionCost( + RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); // Get the operand that was not the reduction chain and match it to one of the // patterns, returning the better cost if it is found. Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7809,7 +7809,7 @@ InstructionCost TreeCost = V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth); + getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; if (!Cost.isValid()) { LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); @@ -7896,8 +7896,8 @@ private: /// Calculate the cost of a reduction. InstructionCost getReductionCost(TargetTransformInfo *TTI, - Value *FirstReducedVal, - unsigned ReduxWidth) { + Value *FirstReducedVal, unsigned ReduxWidth, + FastMathFlags FMF) { Type *ScalarTy = FirstReducedVal->getType(); FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); InstructionCost VectorCost, ScalarCost; @@ -7910,7 +7910,7 @@ case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy); + VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF); ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); break; } Index: llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll @@ -0,0 +1,20 @@ +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s + +define void @strict_fp_reductions() { +; CHECK-LABEL: strict_fp_reductions +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) + %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + + ret void +} + +declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) Index: llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -58,21 +58,36 @@ %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32( %v0) %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64( %v1) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, %v2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, %v3) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call float @llvm.vector.reduce.fmin.nxv4f32( %v2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call double @llvm.vector.reduce.fmin.nxv4f64( %v3) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call float @llvm.vector.reduce.fmax.nxv4f32( %v2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call double @llvm.vector.reduce.fmax.nxv4f64( %v3) - %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, %v2) - %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, %v3) - %fmin_nxv4f32 = call float @llvm.vector.reduce.fmin.nxv4f32( %v2) - %fmin_nxv4f64 = call double @llvm.vector.reduce.fmin.nxv4f64( %v3) - %fmax_nxv4f32 = call float @llvm.vector.reduce.fmax.nxv4f32( %v2) - %fmax_nxv4f64 = call double @llvm.vector.reduce.fmax.nxv4f64( %v3) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, %v2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, %v3) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32( %v2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64( %v3) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32( %v2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64( %v3) + %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, %v2) + %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, %v3) + %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32( %v2) + %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64( %v3) + %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32( %v2) + %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64( %v3) ret void } + +define void @strict_fp_reductions( %v0, %v1) { +; CHECK-LABEL: 'strict_fp_reductions' +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, %v0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, %v1) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, %v0) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.000000e+00, %v1) + %fadd_nxv4f32 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, %v0) + %fadd_nxv4f64 = call double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, %v1) + %fmul_nxv4f32 = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.0, %v0) + %fmul_nxv4f64 = call double @llvm.vector.reduce.fmul.nxv4f64(double 0.0, %v1) + + ret void +} + declare i32 @llvm.vector.reduce.add.nxv4i32() declare i64 @llvm.vector.reduce.add.nxv4i64() declare i32 @llvm.vector.reduce.mul.nxv4i32() @@ -93,6 +108,8 @@ declare i64 @llvm.vector.reduce.smax.nxv4i64() declare float @llvm.vector.reduce.fadd.nxv4f32(float, ) declare double @llvm.vector.reduce.fadd.nxv4f64(double, ) +declare float @llvm.vector.reduce.fmul.nxv4f32(float, ) +declare double @llvm.vector.reduce.fmul.nxv4f64(double, ) declare float @llvm.vector.reduce.fmin.nxv4f32() declare double @llvm.vector.reduce.fmin.nxv4f64() declare float @llvm.vector.reduce.fmax.nxv4f32() Index: llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -342,24 +342,24 @@ ret void } -define void @reduce_fmul(<16 x float> %va) { -; THRU-LABEL: 'reduce_fmul' -; THRU-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) +define void @reduce_fmul_fast(<16 x float> %va) { +; THRU-LABEL: 'reduce_fmul_fast' +; THRU-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v = call fast float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; LATE-LABEL: 'reduce_fmul' -; LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) +; LATE-LABEL: 'reduce_fmul_fast' +; LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v = call fast float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; SIZE-LABEL: 'reduce_fmul' -; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) +; SIZE-LABEL: 'reduce_fmul_fast' +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call fast float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; SIZE_LATE-LABEL: 'reduce_fmul' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) +; SIZE_LATE-LABEL: 'reduce_fmul_fast' +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v = call fast float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %v = call float @llvm.vector.reduce.fmul.v16f32(float 42.0, <16 x float> %va) + %v = call fast float @llvm.vector.reduce.fmul.v16f32(float 42.0, <16 x float> %va) ret void } Index: llvm/test/Analysis/CostModel/X86/reduce-fadd.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-fadd.ll +++ llvm/test/Analysis/CostModel/X86/reduce-fadd.ll @@ -11,139 +11,139 @@ define void @reduce_f64(double %arg) { ; SSE2-LABEL: 'reduce_f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'reduce_f64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE41-LABEL: 'reduce_f64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'reduce_f64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'reduce_f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'reduce_f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %V1 = call double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) - %V2 = call double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) - %V4 = call double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) - %V8 = call double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) - %V16 = call double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) + %V1 = call fast double @llvm.vector.reduce.fadd.v1f64(double %arg, <1 x double> undef) + %V2 = call fast double @llvm.vector.reduce.fadd.v2f64(double %arg, <2 x double> undef) + %V4 = call fast double @llvm.vector.reduce.fadd.v4f64(double %arg, <4 x double> undef) + %V8 = call fast double @llvm.vector.reduce.fadd.v8f64(double %arg, <8 x double> undef) + %V16 = call fast double @llvm.vector.reduce.fadd.v16f64(double %arg, <16 x double> undef) ret void } define void @reduce_f32(float %arg) { ; SSE2-LABEL: 'reduce_f32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'reduce_f32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE41-LABEL: 'reduce_f32' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'reduce_f32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'reduce_f32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'reduce_f32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %V1 = call float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) - %V2 = call float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) - %V4 = call float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) - %V8 = call float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) - %V16 = call float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) - %V32 = call float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) + %V1 = call fast float @llvm.vector.reduce.fadd.v1f32(float %arg, <1 x float> undef) + %V2 = call fast float @llvm.vector.reduce.fadd.v2f32(float %arg, <2 x float> undef) + %V4 = call fast float @llvm.vector.reduce.fadd.v4f32(float %arg, <4 x float> undef) + %V8 = call fast float @llvm.vector.reduce.fadd.v8f32(float %arg, <8 x float> undef) + %V16 = call fast float @llvm.vector.reduce.fadd.v16f32(float %arg, <16 x float> undef) + %V32 = call fast float @llvm.vector.reduce.fadd.v32f32(float %arg, <32 x float> undef) ret void } Index: llvm/test/Analysis/CostModel/X86/reduce-fmul.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/reduce-fmul.ll +++ llvm/test/Analysis/CostModel/X86/reduce-fmul.ll @@ -11,139 +11,139 @@ define void @reduce_f64(double %arg) { ; SSE2-LABEL: 'reduce_f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'reduce_f64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE41-LABEL: 'reduce_f64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'reduce_f64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'reduce_f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'reduce_f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) - %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) - %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) - %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) - %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) + %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef) + %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef) + %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef) + %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef) + %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef) ret void } define void @reduce_f32(float %arg) { ; SSE2-LABEL: 'reduce_f32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'reduce_f32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE41-LABEL: 'reduce_f32' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'reduce_f32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'reduce_f32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'reduce_f32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) - %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) - %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) - %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) - %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) - %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) + %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef) + %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef) + %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef) + %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef) + %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef) + %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef) ret void } Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \ +; RUN: -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4 +; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \ +; RUN: -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8 + +target triple="aarch64-unknown-linux-gnu" + +; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07 + +define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %iv + %0 = load float, float* %arrayidx, align 4 + %add = fadd float %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret float %add +} + + +; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07 + +define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds double, double* %a, i64 %iv + %0 = load double, double* %arrayidx, align 4 + %add = fadd double %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret double %add +} Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \ +; RUN: -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4 +; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \ +; RUN: -scalable-vectorization=on -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8 + +target triple="aarch64-unknown-linux-gnu" + +; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07 + +define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %iv + %0 = load float, float* %arrayidx, align 4 + %add = fadd float %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %add +} + + +; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07 + +define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds double, double* %a, i64 %iv + %0 = load double, double* %arrayidx, align 4 + %add = fadd double %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret double %add +} + +attributes #0 = { "target-features"="+sve" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}