Index: llvm/include/llvm/Analysis/IVDescriptors.h
===================================================================
--- llvm/include/llvm/Analysis/IVDescriptors.h
+++ llvm/include/llvm/Analysis/IVDescriptors.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Casting.h"
@@ -50,6 +51,7 @@
   FMul,       ///< Product of floats.
   FMin,       ///< FP min implemented in terms of select(cmp()).
   FMax,       ///< FP max implemented in terms of select(cmp()).
+  FMulAdd,    ///< Fused multiply-add of floats (a * b + c).
   SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop
               ///< invariant
   SelectFCmp  ///< Integer select(fcmp(),x,y) where one of (x,y) is loop
@@ -260,6 +262,12 @@
   SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi, Loop *L) const;
 
+  /// Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
+  static bool isFMulAddIntrinsic(Instruction *I) {
+    return isa<IntrinsicInst>(I) &&
+           cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd;
+  }
+
 private:
   // The starting value of the recurrence.
   // It does not have to be zero!
Index: llvm/lib/Analysis/IVDescriptors.cpp
===================================================================
--- llvm/lib/Analysis/IVDescriptors.cpp
+++ llvm/lib/Analysis/IVDescriptors.cpp
@@ -81,6 +81,7 @@
   case RecurKind::Mul:
   case RecurKind::FAdd:
   case RecurKind::FMul:
+  case RecurKind::FMulAdd:
     return true;
   }
   return false;
@@ -197,18 +198,27 @@
 // vectorizing floating point operations without unsafe math.
 static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
                                   Instruction *Exit, PHINode *Phi) {
-  // Currently only FAdd is supported
-  if (Kind != RecurKind::FAdd)
+  // Currently only FAdd and FMulAdd are supported.
+  if (Kind != RecurKind::FAdd && Kind != RecurKind::FMulAdd)
     return false;
 
-  if (Exit->getOpcode() != Instruction::FAdd || Exit != ExactFPMathInst)
+  if (Kind == RecurKind::FAdd && Exit->getOpcode() != Instruction::FAdd)
+    return false;
+
+  if (Kind == RecurKind::FMulAdd &&
+      !RecurrenceDescriptor::isFMulAddIntrinsic(Exit))
+    return false;
+
+  if (Exit != ExactFPMathInst)
     return false;
 
   // The only pattern accepted is the one in which the reduction PHI
   // is used as one of the operands of the exit instruction.
-  auto *LHS = Exit->getOperand(0);
-  auto *RHS = Exit->getOperand(1);
-  if (LHS != Phi && RHS != Phi)
+  auto *Op0 = Exit->getOperand(0);
+  auto *Op1 = Exit->getOperand(1);
+  if (Kind == RecurKind::FAdd && Op0 != Phi && Op1 != Phi)
+    return false;
+  if (Kind == RecurKind::FMulAdd && Exit->getOperand(2) != Phi)
     return false;
 
   LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi
@@ -710,6 +720,12 @@
               I->hasNoSignedZeros())) &&
        isFPMinMaxRecurrenceKind(Kind)))
     return isMinMaxPattern(I, Kind, Prev);
+  // If the instruction is a call to llvm.fmuladd, the reduction phi can only
+  // be the final operand.
+  else if (isFMulAddIntrinsic(I) &&
+           (I->getOperand(0) != OrigPhi && I->getOperand(1) != OrigPhi))
+    return InstDesc(Kind == RecurKind::FMulAdd, I,
+                    I->hasAllowReassoc() ? nullptr : I);
   return InstDesc(false, I);
   }
 }
@@ -804,6 +820,11 @@
                       << " PHI." << *Phi << "\n");
     return true;
   }
+  if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC,
+                      DT)) {
+    LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n");
+    return true;
+  }
   // Not a reduction of known type.
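   // Illustrative example (not from the patched sources): an FMulAdd recurrence
   // is only recognized when the loop-carried PHI feeds the addend operand of
   // the call, e.g.
   //   %sum.next = call float @llvm.fmuladd.f32(float %a, float %b, float %sum)
   // A PHI used as one of the multiplicands instead falls through to this point
   // and is not treated as a reduction of known type.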
   return false;
 }
@@ -927,6 +948,7 @@
   case RecurKind::FMul:
     // Multiplying a number by 1 does not change it.
     return ConstantFP::get(Tp, 1.0L);
+  case RecurKind::FMulAdd:
   case RecurKind::FAdd:
     // Adding zero to a number does not change it.
    // FIXME: Ideally we should not need to check FMF for FAdd and should always
@@ -974,6 +996,7 @@
     return Instruction::Xor;
   case RecurKind::FMul:
     return Instruction::FMul;
+  case RecurKind::FMulAdd:
   case RecurKind::FAdd:
     return Instruction::FAdd;
   case RecurKind::SMax:
@@ -1032,6 +1055,10 @@
       return SelectPatternResult::isMinOrMax(
           matchSelectPattern(Cur, LHS, RHS).Flavor);
     }
+    // Recognize a call to the llvm.fmuladd intrinsic.
+    if (isFMulAddIntrinsic(Cur))
+      return true;
+
     return Cur->getOpcode() == RedOp;
   };
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2056,6 +2056,7 @@
   case RecurKind::FMax:
   case RecurKind::SelectICmp:
   case RecurKind::SelectFCmp:
+  case RecurKind::FMulAdd:
     return true;
   default:
     return false;
Index: llvm/lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1053,6 +1053,7 @@
     return Builder.CreateOrReduce(Src);
   case RecurKind::Xor:
     return Builder.CreateXorReduce(Src);
+  case RecurKind::FMulAdd:
   case RecurKind::FAdd:
     return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy),
                                     Src);
@@ -1095,7 +1096,8 @@
 Value *llvm::createOrderedReduction(IRBuilderBase &B,
                                     const RecurrenceDescriptor &Desc,
                                     Value *Src, Value *Start) {
-  assert(Desc.getRecurrenceKind() == RecurKind::FAdd &&
+  assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
+          Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
          "Unexpected reduction kind");
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9606,12 +9606,17 @@
     unsigned FirstOpId;
     assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
            "Only min/max recurrences allowed for inloop reductions");
+    // Recognize a call to the llvm.fmuladd intrinsic.
+    bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
+    assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
+           "Expected instruction to be a call to the llvm.fmuladd intrinsic");
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
       assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
              "Expected to replace a VPWidenSelectSC");
       FirstOpId = 1;
     } else {
-      assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
+      assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
+              (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
              "Expected to replace a VPWidenSC");
       FirstOpId = 0;
     }
@@ -9622,8 +9627,19 @@
     auto *CondOp = CM.foldTailByMasking()
                        ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                        : nullptr;
-    VPReductionRecipe *RedRecipe = new VPReductionRecipe(
-        &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+
+    if (IsFMulAdd) {
+      // If the instruction is a call to the llvm.fmuladd intrinsic then we
+      // need to create an fmul recipe to use as the vector operand for the
+      // fadd reduction.
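+      // For example (illustrative only), a scalar in-loop reduction such as
+      //   %sum.next = call float @llvm.fmuladd.f32(float %a, float %b, float %sum)
+      // is vectorized as a widened fmul of the two multiplicands followed by a
+      // (possibly ordered) fadd reduction of that product into the chain, i.e.
+      // fmuladd(a, b, sum) is handled as sum + (a * b).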
+      VPInstruction *FMulRecipe = new VPInstruction(
+          Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
+      WidenRecipe->getParent()->insert(FMulRecipe,
+                                       WidenRecipe->getIterator());
+      VecOp = FMulRecipe;
+    }
+    VPReductionRecipe *RedRecipe =
+        new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
     WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
     Plan->removeVPValueFor(R);
     Plan->addVPValue(R, RedRecipe);
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8759,6 +8759,8 @@
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
            "We only handle power-of-two reductions for now");
+    assert(RdxKind != RecurKind::FMulAdd &&
+           "A call to the llvm.fmuladd intrinsic is not handled yet");
 
     ++NumVectorInstructions;
     return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -390,6 +390,168 @@
   ret float %rdx
 }
 
+; Test case where the loop has a call to the llvm.fmuladd intrinsic.
+define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 {
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %muladd
+}
+
+; Same as above but where the call to the llvm.fmuladd intrinsic uses a fast-math flag.
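+; (Illustrative note: the point of this variant is that the fast-math flag on
+; the scalar call is expected to be carried over to the widened llvm.fmuladd
+; calls, the reduction intrinsics and the fadds created for the reduction,
+; hence the 'nnan' in the checks below.)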
+define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 {
+; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call nnan float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
 attributes #0 = { vscale_range(0, 16) }
 !0 = distinct !{!0, !3, !6, !8}
 !1 = distinct !{!1, !3, !7, !8}
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -931,6 +931,203 @@
   ret double %res
 }
 
+; Test case where the loop has a call to the llvm.fmuladd intrinsic.
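+; The scalar loop is roughly equivalent to the following C source, compiled
+; with FP contraction enabled (illustrative only; not part of the original
+; test):
+;   float sum = 0.0f;
+;   for (int64_t i = 0; i < n; i++)
+;     sum += a[i] * b[i];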
+define float @fmuladd_strict(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.body:
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX3]], %middle.block ]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %muladd
+}
+
+; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic.
+define float @fmuladd_scalar_vf(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]]
+; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]]
+; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]]
+; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]]
+; CHECK-ORDERED: [[FADD3:%.*]] = fadd float [[FADD2]], [[FMUL3]]
+; CHECK-ORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FMULADD]] = call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+  ret float %muladd
+}
+
+; Test case where the reduction phi is one of the mul operands of the llvm.fmuladd,
+; which isn't vectorized.
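+; (Illustrative note: here the PHI feeds a multiplicand rather than the addend,
+; i.e. the loop computes sum = fmuladd(sum, a[i], b[i]) = sum * a[i] + b[i],
+; which is not a plain sum-of-products reduction, so no FMulAdd recurrence is
+; formed and the loop is left scalar.)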
+define float @fmuladd_phi_is_mul_operand(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %sum.07, float %0, float %1)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
 !0 = distinct !{!0, !5, !9, !11}
 !1 = distinct !{!1, !5, !10, !11}
 !2 = distinct !{!2, !6, !9, !11}
Index: llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1091,6 +1091,73 @@
   ret i8 %ret
 }
 
+; Test case when the loop has a call to the llvm.fmuladd intrinsic.
+define float @reduction_fmuladd(float* %a, float* %b, i64 %n) {
+; CHECK-LABEL: @reduction_fmuladd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT:    [[TMP6]] = fadd float [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[SUM_07]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MULADD_LCSSA]]
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
 !6 = distinct !{!6, !7, !8}
 !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
 !8 = !{!"llvm.loop.vectorize.enable", i1 true}