diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -21,9 +21,12 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" +using namespace llvm::PatternMatch; + namespace llvm { class DemandedBits; @@ -50,6 +53,7 @@ FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). + FMulAdd, ///< Fused multiply-add of floats (a * b + c). SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop ///< invariant SelectFCmp ///< Integer select(fcmp(),x,y) where one of (x,y) is loop @@ -260,6 +264,12 @@ SmallVector getReductionOpChain(PHINode *Phi, Loop *L) const; + /// Returns true if the instruction is a call to the llvm.fmuladd intrinsic. + static bool isFMulAddIntrinsic(Instruction *I) { + return match( + I, m_Intrinsic(m_Value(), m_Value(), m_Value())); + } + private: // The starting value of the recurrence. // It does not have to be zero! diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -81,6 +81,7 @@ case RecurKind::Mul: case RecurKind::FAdd: case RecurKind::FMul: + case RecurKind::FMulAdd: return true; } return false; @@ -197,18 +198,23 @@ // vectorizing floating point operations without unsafe math. 
static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst, Instruction *Exit, PHINode *Phi) { - // Currently only FAdd is supported - if (Kind != RecurKind::FAdd) + // Currently only FAdd and FMulAdd are supported + if (Kind != RecurKind::FAdd && Kind != RecurKind::FMulAdd) return false; - if (Exit->getOpcode() != Instruction::FAdd || Exit != ExactFPMathInst) + // Recognise a call to the llvm.fmuladd intrinsic. + bool IsFMulAdd = RecurrenceDescriptor::isFMulAddIntrinsic(Exit); + + if ((Exit->getOpcode() != Instruction::FAdd && !IsFMulAdd) || + Exit != ExactFPMathInst) return false; // The only pattern accepted is the one in which the reduction PHI // is used as one of the operands of the exit instruction auto *LHS = Exit->getOperand(0); auto *RHS = Exit->getOperand(1); - if (LHS != Phi && RHS != Phi) + if ((!IsFMulAdd && LHS != Phi && RHS != Phi) || + (IsFMulAdd && Exit->getOperand(2) != Phi)) return false; LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi @@ -710,6 +716,10 @@ I->hasNoSignedZeros())) && isFPMinMaxRecurrenceKind(Kind))) return isMinMaxPattern(I, Kind, Prev); + // Recognize a call to the llvm.fmuladd intrinsic. + else if (isFMulAddIntrinsic(I)) + return InstDesc(Kind == RecurKind::FMulAdd, I, + I->hasAllowReassoc() ? nullptr : I); return InstDesc(false, I); } } @@ -804,6 +814,11 @@ << " PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, + DT)) { + LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); + return true; + } // Not a reduction of known type. return false; } @@ -927,6 +942,7 @@ case RecurKind::FMul: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); + case RecurKind::FMulAdd: case RecurKind::FAdd: // Adding zero to a number does not change it. 
// FIXME: Ideally we should not need to check FMF for FAdd and should always @@ -974,6 +990,7 @@ return Instruction::Xor; case RecurKind::FMul: return Instruction::FMul; + case RecurKind::FMulAdd: case RecurKind::FAdd: return Instruction::FAdd; case RecurKind::SMax: @@ -1032,6 +1049,10 @@ return SelectPatternResult::isMinOrMax( matchSelectPattern(Cur, LHS, RHS).Flavor); } + // Recognize a call to the llvm.fmuladd intrinsic. + if (Kind == RecurKind::FMulAdd) + return true; + return Cur->getOpcode() == RedOp; }; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1053,6 +1053,7 @@ return Builder.CreateOrReduce(Src); case RecurKind::Xor: return Builder.CreateXorReduce(Src); + case RecurKind::FMulAdd: case RecurKind::FAdd: return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), Src); @@ -1095,7 +1096,8 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start) { - assert(Desc.getRecurrenceKind() == RecurKind::FAdd && + assert((Desc.getRecurrenceKind() == RecurKind::FAdd || + Desc.getRecurrenceKind() == RecurKind::FMulAdd) && "Unexpected reduction kind"); assert(Src->getType()->isVectorTy() && "Expected a vector type"); assert(!Start->getType()->isVectorTy() && "Expected a scalar type"); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9588,24 +9588,39 @@ unsigned FirstOpId; assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd); + if (IsFMulAdd) + assert( + RecurrenceDescriptor::isFMulAddIntrinsic(R) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); + SmallVector VecOps; if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { - assert((MinVF.isScalar() || isa(WidenRecipe)) && + assert((MinVF.isScalar() || isa(WidenRecipe) || + (IsFMulAdd && isa(WidenRecipe))) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } unsigned VecOpId = R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); + VecOps.push_back(VecOp); auto *CondOp = CM.foldTailByMasking() ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + if (IsFMulAdd) { + // Add the second operand of the llvm.fmuladd intrinsic to the reduction + // recipe. + VPValue *VecOp2 = Plan->getVPValue(R->getOperand(1)); + VecOps.push_back(VecOp2); + } + VPReductionRecipe *RedRecipe = + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOps, CondOp, TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9753,10 +9768,12 @@ void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); Value *PrevInChain = State.get(getChainOp(), 0); + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = (RdxDesc->getRecurrenceKind() == RecurKind::FMulAdd); for (unsigned Part = 0; Part < State.UF; ++Part) { RecurKind Kind = RdxDesc->getRecurrenceKind(); bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); - Value *NewVecOp = State.get(getVecOp(), Part); + Value *NewVecOp = State.get(getVecOp(0), Part); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); VectorType *VecTy = cast(NewVecOp->getType()); @@ -9769,14 +9786,26 @@ } Value *NewRed; Value *NextInChain; + if (IsFMulAdd) { + // If the underlying instruction is a call to the llvm.fmuladd intrinsic, + // we need to create an FMul instruction to use as an operand for + // llvm.vector.reduce.fadd. + FastMathFlags FMF; + if (auto *FPMO = dyn_cast(getUnderlyingInstr())) { + FMF = FPMO->getFastMathFlags(); + State.Builder.setFastMathFlags(FMF); + } + NewVecOp = State.Builder.CreateBinOp(Instruction::FMul, NewVecOp, + State.get(getVecOp(1), Part)); + } if (IsOrdered) { if (State.VF.isVector()) NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, PrevInChain); else NewRed = State.Builder.CreateBinOp( - (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), - PrevInChain, NewVecOp); + (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, + NewVecOp); PrevInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part); @@ -9788,11 +9817,10 @@ NewRed, PrevInChain); } else if (IsOrdered) NextInChain = NewRed; - else { + else NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, + (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, PrevInChain); - } State.set(this, NextInChain, Part); } } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8625,6 +8625,8 @@ assert(VectorizedValue && "Need to have a vectorized tree 
node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + assert(RdxKind != RecurKind::FMulAdd && + "A call to the llvm.fmuladd intrinsic is not handled yet"); return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, ReductionOps.back()); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1341,13 +1341,18 @@ RecurrenceDescriptor *RdxDesc; /// Pointer to the TTI, needed to create the target reduction const TargetTransformInfo *TTI; + /// Number of vector values. + unsigned NumVecElts; public: VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, + SmallVector &VecOps, VPValue *CondOp, const TargetTransformInfo *TTI) - : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}), + : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp}), VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) { + NumVecElts = VecOps.size(); + for (VPValue *V : VecOps) + addOperand(V); if (CondOp) addOperand(CondOp); } @@ -1368,13 +1373,20 @@ VPSlotTracker &SlotTracker) const override; #endif + /// The number of vector values. + unsigned getNumVecElts() const { return NumVecElts; } /// The VPValue of the scalar Chain being accumulated. VPValue *getChainOp() const { return getOperand(0); } - /// The VPValue of the vector value to be reduced. - VPValue *getVecOp() const { return getOperand(1); } + /// The VPValue of the vector value to be reduced for a given index in \p + /// VecOps. + VPValue *getVecOp(unsigned i) const { + assert(i < NumVecElts && "Invalid vector op"); + return getOperand(i + 1); + } /// The VPValue of the condition for the block. VPValue *getCondOp() const { - return getNumOperands() > 2 ? getOperand(2) : nullptr; + return getNumOperands() > NumVecElts + 1 ? 
getOperand(NumVecElts + 1) + : nullptr; } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1189,9 +1189,18 @@ printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; - getVecOp()->printAsOperand(O, SlotTracker); + if (RdxDesc->getRecurrenceKind() == RecurKind::FMulAdd) { + O << " + reduce." << Instruction::getOpcodeName(Instruction::FAdd) << " ("; + O << "fmul("; + getVecOp(0)->printAsOperand(O, SlotTracker); + O << ", "; + getVecOp(1)->printAsOperand(O, SlotTracker); + O << ")"; + } else { + O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) + << " ("; + getVecOp(0)->printAsOperand(O, SlotTracker); + } if (getCondOp()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -390,6 +390,168 @@ ret float %rdx } +; Test case where loop has a call to the llvm.fmuladd intrinsic. 
+define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 { +; CHECK-ORDERED-LABEL: @fmuladd_strict +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ] +; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[FMUL:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]]) +; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]]) +; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]]) +; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]]) +; CHECK-ORDERED: for.end +; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ] +; CHECK-ORDERED: ret float [[RES]] + +; CHECK-UNORDERED-LABEL: @fmuladd_strict +; CHECK-UNORDERED: vector.body +; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ , %vector.ph 
], [ [[FMULADD1:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ] +; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[FMULADD]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]]) +; CHECK-UNORDERED: [[FMULADD1]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]]) +; CHECK-UNORDERED: [[FMULADD2]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]]) +; CHECK-UNORDERED: [[FMULADD3]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]]) +; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd +; CHECK-UNORDERED: middle.block +; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <4 x float> [[FMULADD1]], [[FMULADD]] +; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <4 x float> [[FMULADD2]], [[BIN_RDX]] +; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <4 x float> [[FMULADD3]], [[BIN_RDX1]] +; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]] +; CHECK-UNORDERED: for.body +; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], 
%scalar.ph ], [ [[MULADD:%.*]], %for.body ] +; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float* +; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]]) +; CHECK-UNORDERED: for.end +; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ] +; CHECK-UNORDERED: ret float [[RES]] + +; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict +; CHECK-NOT-VECTORIZED-NOT: vector.body + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv + %1 = load float, float* %arrayidx2, align 4 + %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret float %muladd +} + +; Same as above but where the call to the llvm.fmuladd intrinsic uses a fast-math flag. 
+define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 { +; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ] +; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* +; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]]) +; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]]) +; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]]) +; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]]) +; CHECK-ORDERED: for.end +; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ] +; CHECK-ORDERED: ret float [[RES]] + +; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf +; CHECK-UNORDERED: vector.body +; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] +; 
CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ] +; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: [[FMULADD]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]]) +; CHECK-UNORDERED: [[FMULADD1]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]]) +; CHECK-UNORDERED: [[FMULADD2]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]]) +; CHECK-UNORDERED: [[FMULADD3]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]]) +; CHECK-UNORDERED-NOT: call nnan float @llvm.vector.reduce.fadd +; CHECK-UNORDERED: middle.block +; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <4 x float> [[FMULADD1]], [[FMULADD]] +; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <4 x float> [[FMULADD2]], [[BIN_RDX]] +; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <4 x float> [[FMULADD3]], [[BIN_RDX1]] +; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x 
float> [[BIN_RDX2]] +; CHECK-UNORDERED: for.body +; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] +; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float* +; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]]) +; CHECK-UNORDERED: for.end +; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ] +; CHECK-UNORDERED: ret float [[RES]] + +; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf +; CHECK-NOT-VECTORIZED-NOT: vector.body + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv + %1 = load float, float* %arrayidx2, align 4 + %muladd = tail call nnan float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret float %muladd +} + +declare float @llvm.fmuladd.f32(float, float, float) + attributes #0 = { vscale_range(0, 16) } !0 = distinct !{!0, !3, !6, !8} !1 = distinct !{!1, !3, !7, !8} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -812,6 +812,171 @@ ret double %res } +; Test case where the loop has a call to the llvm.fmuladd intrinsic. 
+define float @fmuladd_strict(float* %a, float* %b, i64 %n) { +; CHECK-ORDERED-LABEL: @fmuladd_strict +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ] +; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* +; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]]) +; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]]) +; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]]) +; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]]) +; CHECK-ORDERED: for.body: +; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ] +; CHECK-ORDERED: [[LOAD:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float* +; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]]) +; CHECK-ORDERED: for.end +; CHECK-ORDERED: [[RES:%.*]] = phi float [ 
[[MULADD]], %for.body ], [ [[RDX3]], %middle.block ] + +; CHECK-UNORDERED-LABEL: @fmuladd_strict +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ , %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] +; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* +; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* +; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]]) +; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float> +; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float> +; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float> +; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]]) +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ] +; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float* +; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]]) +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ] +; CHECK-UNORDERED: ret float [[RES]] + +; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict +; CHECK-NOT-VECTORIZED-NOT: vector.body + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %iv + %0 = load float, 
float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv + %1 = load float, float* %arrayidx2, align 4 + %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret float %muladd +} + +; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic. +define float @fmuladd_scalar_vf(float* %a, float* %b, i64 %n) { +; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf +; CHECK-ORDERED: vector.body: +; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ] +; CHECK-ORDERED: [[LOAD:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD6:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD7:%.*]] = load float, float* +; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]] +; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]] +; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]] +; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]] +; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]] +; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]] +; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]] +; CHECK-ORDERED: [[FADD3:%.*]] = fadd float [[FADD2]], [[FMUL3]] +; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd +; CHECK-ORDERED: scalar.ph +; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD3]], %middle.block ] +; CHECK-ORDERED: for.body +; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ 
[[MULADD:%.*]], %for.body ] +; CHECK-ORDERED: [[LOAD8:%.*]] = load float, float* +; CHECK-ORDERED: [[LOAD9:%.*]] = load float, float* +; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]]) +; CHECK-ORDERED: for.end +; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], %middle.block ] +; CHECK-ORDERED: ret float [[RES]] + +; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf +; CHECK-UNORDERED: vector.body: +; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ] +; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ] +; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, float* +; CHECK-UNORDERED: [[FMULADD]] = call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]]) +; CHECK-UNORDERED: [[FMULADD1]] = call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]]) +; CHECK-UNORDERED: [[FMULADD2]] = call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]]) +; CHECK-UNORDERED: [[FMULADD3]] = call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]]) +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]] +; CHECK-UNORDERED: 
[[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]] +; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]] +; CHECK-UNORDERED: scalar.ph: +; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ] +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ] +; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, float* +; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, float* +; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]]) +; CHECK-UNORDERED: for.end: +; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ] +; CHECK-UNORDERED: ret float [[RES]] + +; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf +; CHECK-NOT-VECTORIZED-NOT: @vector.body + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv + %1 = load float, float* %arrayidx2, align 4 + %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4 + +for.end: + ret float %muladd +} + +declare float @llvm.fmuladd.f32(float, float, float) + !0 = distinct !{!0, !5, !9, !11} !1 = distinct !{!1, !5, !10, !11} !2 = distinct !{!2, !6, !9, !11} diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -1093,6 +1093,73 @@ ret i8 %ret } 
+; Check that a reduction whose update is a call to llvm.fmuladd is vectorized in-loop: a wide fmul feeds llvm.vector.reduce.fadd and the result is added to the scalar phi.
+define float @reduction_fmuladd(float* %a, float* %b, i64 %n) {
+; CHECK-LABEL: @reduction_fmuladd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = fadd float [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[SUM_07]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret float [[MULADD_LCSSA]]
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
 !6 = distinct !{!6, !7, !8}
 !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
 !8 = 
!{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1017,9 +1017,9 @@ { VPValue ChainOp; - VPValue VecOp; + SmallVector VecOps; VPValue CondOp; - VPReductionRecipe Recipe(nullptr, nullptr, &ChainOp, &CondOp, &VecOp, + VPReductionRecipe Recipe(nullptr, nullptr, &ChainOp, VecOps, &CondOp, nullptr); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); @@ -1145,9 +1145,9 @@ LLVMContext C; VPValue ChainOp; - VPValue VecOp; + SmallVector VecOps; VPValue CondOp; - VPReductionRecipe Recipe(nullptr, nullptr, &ChainOp, &CondOp, &VecOp, + VPReductionRecipe Recipe(nullptr, nullptr, &ChainOp, VecOps, &CondOp, nullptr); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe;