Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4291,11 +4291,15 @@
   BasicBlock *Latch = OrigLoop->getLoopLatch();
   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
 
+  bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
+                   useOrderedReductions(RdxDesc);
+
   for (unsigned Part = 0; Part < UF; ++Part) {
+    if (IsOrdered && Part > 0)
+      break;
     Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
     Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
-    if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc) &&
-        State.VF.isVector())
+    if (IsOrdered)
       Val = State.get(State.Plan->getVPValue(LoopVal), UF - 1);
     cast<PHINode>(VecRdxPhi)
         ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
@@ -4388,7 +4392,7 @@
   // terminate on this line. This is the easiest way to ensure we don't
   // accidentally cause an extra step back into the loop while debugging.
   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
-  if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc))
+  if (IsOrdered)
     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
   else {
     // Floating-point operations should have some FMF to enable the reduction.
@@ -4722,8 +4726,14 @@
     }
   }
 
+  bool IsOrdered = State.VF.isVector() &&
+                   Cost->isInLoopReduction(cast<PHINode>(PN)) &&
+                   useOrderedReductions(*RdxDesc);
+
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // This is phase one of vectorizing PHIs.
+    if (Part > 0 && IsOrdered)
+      return;
     Value *EntryPart = PHINode::Create(
         VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
     State.set(PhiR, EntryPart, Part);
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -30,9 +30,6 @@
 ; CHECK-LABEL: @fadd_strict_unroll
 ; CHECK: vector.body:
 ; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
-; CHECK: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
 ; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
 ; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
 ; CHECK: %[[LOAD3:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -30,9 +30,7 @@
 ; CHECK-LABEL: @fadd_strict_unroll
 ; CHECK: vector.body:
 ; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
-; CHECK: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
 ; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
 ; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
 ; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
@@ -61,6 +59,63 @@
   ret float %add
 }
 
+; An additional test for unrolling where we need the last value of the reduction, i.e:
+; float sum = 0, sum2;
+; for(int i=0; i<n; ++i)
+;   sum += ptr[i];
+; sum2 = sum + 42;
+define float @fadd_strict_unroll_last_val(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
+; CHECK-LABEL: @fadd_strict_unroll_last_val
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK: for.body
+; CHECK: %[[SUM_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %bc.merge.rdx, %scalar.ph ]
+; CHECK: %[[LOAD5:.*]] = load float, float*
+; CHECK: %[[FADD]] = fadd float %[[SUM_PHI]], %[[LOAD5]]
+; CHECK: for.cond.cleanup
+; CHECK: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
+; CHECK: store float %[[FADD_42]], float* %b
+; CHECK: for.end
+; CHECK: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %[[SUM_LCSSA]]
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %fadd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %fadd = fadd float %sum, %0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup:
+  %fadd.lcssa = phi float [ %fadd, %for.body ]
+  %fadd2 = fadd float %fadd.lcssa, 4.200000e+01
+  store float %fadd2, float* %b, align 4
+  br label %for.end
+
+for.end:
+  %sum.lcssa = phi float [ %fadd.lcssa, %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+  ret float %sum.lcssa
+}
+
 define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @fadd_strict_interleave
 ; CHECK: entry
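
Note on the expected codegen (a sketch, not part of the patch): with a fixed-width VF of 8 and an unroll factor of 4, an in-order (strict) fadd reduction keeps a single scalar accumulator phi and chains the partial sums through llvm.vector.reduce.fadd, so only the last call feeds back into the phi. A minimal illustration of that shape, with illustrative value names rather than the exact ones the vectorizer emits:

  vector.body:
    %acc = phi float [ 0.000000e+00, %vector.ph ], [ %rdx.part4, %vector.body ]
    ; ... four <8 x float> loads, one per unrolled part ...
    %rdx.part1 = call float @llvm.vector.reduce.fadd.v8f32(float %acc, <8 x float> %load1)
    %rdx.part2 = call float @llvm.vector.reduce.fadd.v8f32(float %rdx.part1, <8 x float> %load2)
    %rdx.part3 = call float @llvm.vector.reduce.fadd.v8f32(float %rdx.part2, <8 x float> %load3)
    %rdx.part4 = call float @llvm.vector.reduce.fadd.v8f32(float %rdx.part3, <8 x float> %load4)

This is the layout the updated CHECK lines verify; before the change, one vec.phi was created per unrolled part even though only the last partial result was ever used.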