Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1193,6 +1193,13 @@
       VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
+  /// Calculate the cost of a call to the llvm.fmuladd intrinsic. This is
+  /// modeled as the cost of a normal fmul instruction plus the cost of an fadd
+  /// reduction.
+  InstructionCost getFMulAddReductionCost(
+      VectorType *Ty, Optional<FastMathFlags> FMF,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   /// Calculate the cost of an extended reduction pattern, similar to
   /// getArithmeticReductionCost of an Add reduction with an extension and
   /// optional multiply. This is the cost of as:
@@ -1662,6 +1669,9 @@
   virtual InstructionCost
   getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) = 0;
+  virtual InstructionCost
+  getFMulAddReductionCost(VectorType *Ty, Optional<FastMathFlags> FMF,
+                          TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost getExtendedAddReductionCost(
       bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;
@@ -2177,6 +2187,11 @@
                                          TTI::TargetCostKind CostKind) override {
     return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
   }
+  InstructionCost
+  getFMulAddReductionCost(VectorType *Ty, Optional<FastMathFlags> FMF,
+                          TTI::TargetCostKind CostKind) override {
+    return Impl.getFMulAddReductionCost(Ty, FMF, CostKind);
+  }
   InstructionCost getExtendedAddReductionCost(
       bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -637,6 +637,11 @@
     return 1;
   }
 
+  InstructionCost getFMulAddReductionCost(VectorType *, Optional<FastMathFlags>,
+                                          TTI::TargetCostKind) const {
+    return 1;
+  }
+
   InstructionCost
   getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy,
                               VectorType *Ty,
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2174,6 +2174,16 @@
            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  InstructionCost getFMulAddReductionCost(VectorType *Ty,
+                                          Optional<FastMathFlags> FMF,
+                                          TTI::TargetCostKind CostKind) {
+    InstructionCost FAddReductionCost = thisT()->getArithmeticReductionCost(
+        Instruction::FAdd, Ty, FMF, CostKind);
+    InstructionCost FMulCost =
+        thisT()->getArithmeticInstrCost(Instruction::FMul, Ty, CostKind);
+    return FMulCost + FAddReductionCost;
+  }
+
   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                               Type *ResTy, VectorType *Ty,
                                               TTI::TargetCostKind CostKind) {
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -917,6 +917,14 @@
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getFMulAddReductionCost(
+    VectorType *Ty, Optional<FastMathFlags> FMF,
+    TTI::TargetCostKind CostKind) const {
+  InstructionCost Cost = TTIImpl->getFMulAddReductionCost(Ty, FMF, CostKind);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getExtendedAddReductionCost(
     bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
     TTI::TargetCostKind CostKind) const {
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1983,6 +1983,7 @@
   case RecurKind::FMax:
   case RecurKind::SelectICmp:
   case RecurKind::SelectFCmp:
+  case RecurKind::FMulAdd:
     return true;
   default:
     return false;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7255,8 +7255,14 @@
   const RecurrenceDescriptor &RdxDesc =
       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
 
-  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
-      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+  InstructionCost BaseCost;
+  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
+    // Recognize a call to the llvm.fmuladd intrinsic.
+    BaseCost = TTI.getFMulAddReductionCost(VectorTy, RdxDesc.getFastMathFlags(),
+                                           CostKind);
+  else
+    BaseCost = TTI.getArithmeticReductionCost(
+        RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
 
   // If we're using ordered reductions then we can just return the base cost
   // here, since getArithmeticReductionCost calculates the full ordered
@@ -7929,6 +7935,12 @@
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
+    // Recognize a call to the llvm.fmuladd intrinsic.
+    if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) {
+      // Detect reduction patterns.
+      if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+        return *RedCost;
+    }
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -395,50 +395,50 @@
 ; CHECK-ORDERED-LABEL: @fmuladd_strict
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
-; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[FMUL:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])
-; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])
-; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])
-; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
 ; CHECK-ORDERED: for.end
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]
 
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict
 ; CHECK-UNORDERED: vector.body
-; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[FMULADD]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])
-; CHECK-UNORDERED: [[FMULADD1]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED: [[FMULADD2]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED: [[FMULADD3]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <4 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <4 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <4 x float> [[FMULADD3]], [[BIN_RDX1]]
-; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
 ; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
@@ -475,50 +475,50 @@
 ; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
-; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])
-; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])
-; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])
-; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
 ; CHECK-ORDERED: for.end
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]
 
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
 ; CHECK-UNORDERED: vector.body
-; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[FMULADD]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])
-; CHECK-UNORDERED: [[FMULADD1]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED: [[FMULADD2]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED: [[FMULADD3]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: call nnan float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <4 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <4 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <4 x float> [[FMULADD3]], [[BIN_RDX1]]
-; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
 ; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -48,3 +48,53 @@
 for.end:
   ret double %add
 }
+
+; CHECK-VF4: Found an estimated cost of 23 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF8: Found an estimated cost of 46 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+
+define float @fmuladd_strict32(float* %a, float* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+; CHECK-VF4: Found an estimated cost of 22 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF8: Found an estimated cost of 44 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+
+define double @fmuladd_strict64(double* %a, double* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds double, double* %b, i64 %iv
+  %1 = load double, double* %arrayidx2, align 4
+  %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret double %muladd
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
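
For reference, the sketch below shows how the hook added in this patch could be queried from C++. It is illustrative only and not part of the patch: the helper name costFMulAddReduction, the <vscale x 8 x float> type, and the nnan flag are assumptions chosen to mirror the scalable test above. With the default BasicTTIImpl implementation the result is getArithmeticInstrCost(FMul) plus getArithmeticReductionCost(FAdd).

// Sketch only (not part of the patch): query the new TTI hook for a
// <vscale x 8 x float> fmuladd reduction, mirroring scalable-strict-fadd.ll.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Hypothetical helper; the name and parameters are illustrative assumptions.
static InstructionCost costFMulAddReduction(const TargetTransformInfo &TTI,
                                            LLVMContext &Ctx) {
  // <vscale x 8 x float>, the type the vectorizer picks in the test above.
  auto *VecTy = VectorType::get(Type::getFloatTy(Ctx),
                                ElementCount::getScalable(8));
  FastMathFlags FMF;
  FMF.setNoNaNs(); // Mirrors the nnan variant of the test.
  // Default lowering: cost(fmul on VecTy) + cost(fadd reduction of VecTy).
  return TTI.getFMulAddReductionCost(VecTy, FMF,
                                     TargetTransformInfo::TCK_RecipThroughput);
}

A target that can do better than this decomposition (for example, one with a fused multiply-accumulate reduction) could override getFMulAddReductionCost rather than relying on the default in BasicTTIImpl.h.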