Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1983,6 +1983,7 @@
   case RecurKind::FMax:
   case RecurKind::SelectICmp:
   case RecurKind::SelectFCmp:
+  case RecurKind::FMulAdd:
     return true;
   default:
     return false;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7261,8 +7261,21 @@
   // If we're using ordered reductions then we can just return the base cost
   // here, since getArithmeticReductionCost calculates the full ordered
   // reduction cost when FP reassociation is not allowed.
-  if (useOrderedReductions(RdxDesc))
-    return BaseCost;
+  if (useOrderedReductions(RdxDesc)) {
+    if (RdxDesc.getRecurrenceKind() != RecurKind::FMulAdd)
+      return BaseCost;
+    // For a call to the llvm.fmuladd intrinsic we need to add the cost of an
+    // fmul instruction to the cost of the fadd reduction.
+    Value *Op2 = I->getOperand(1);
+    TargetTransformInfo::OperandValueProperties Op2VP;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TTI.getOperandInfo(Op2, Op2VP);
+    SmallVector<const Value *, 4> Operands(I->operand_values());
+    InstructionCost FMulCost = TTI.getArithmeticInstrCost(
+        Instruction::FMul, VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
+        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+    return BaseCost + FMulCost;
+  }
 
   // Get the operand that was not the reduction chain and match it to one of the
   // patterns, returning the better cost if it is found.
@@ -7929,6 +7942,12 @@
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
+    // Recognize a call to the llvm.fmuladd intrinsic.
+    if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) {
+      // Detect reduction patterns.
+      if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+        return *RedCost;
+    }
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -395,50 +395,50 @@
 ; CHECK-ORDERED-LABEL: @fmuladd_strict
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
-; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[FMUL:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])
-; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])
-; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])
-; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
 ; CHECK-ORDERED: for.end
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]
 
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict
 ; CHECK-UNORDERED: vector.body
-; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[FMULADD]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])
-; CHECK-UNORDERED: [[FMULADD1]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED: [[FMULADD2]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED: [[FMULADD3]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <4 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <4 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <4 x float> [[FMULADD3]], [[BIN_RDX1]]
-; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
 ; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
@@ -475,50 +475,50 @@
 ; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
-; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])
-; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])
-; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])
-; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
 ; CHECK-ORDERED: for.end
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]
 
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
 ; CHECK-UNORDERED: vector.body
-; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[FMULADD]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])
-; CHECK-UNORDERED: [[FMULADD1]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED: [[FMULADD2]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED: [[FMULADD3]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: call nnan float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <4 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <4 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <4 x float> [[FMULADD3]], [[BIN_RDX1]]
-; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
 ; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -48,3 +48,53 @@
 for.end:
   ret double %add
 }
+
+; CHECK-VF4: Found an estimated cost of 23 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF8: Found an estimated cost of 46 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+
+define float @fmuladd_strict32(float* %a, float* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+; CHECK-VF4: Found an estimated cost of 22 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF8: Found an estimated cost of 44 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+
+define double @fmuladd_strict64(double* %a, double* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds double, double* %b, i64 %iv
+  %1 = load double, double* %arrayidx2, align 4
+  %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret double %muladd
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)