Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -553,13 +553,29 @@ if (!Previous || !TheLoop->contains(Previous) || isa(Previous)) return false; - // Ensure every user of the phi node is dominated by the previous value. The - // dominance requirement ensures the loop vectorizer will not need to - // vectorize the initial value prior to the first iteration of the loop. + auto *Cmp = dyn_cast(Latch->getTerminator()->getOperand(0)); + bool LoopTermConditionUsesPhi = false; + if (Cmp && (Cmp->getOperand(0) == Phi || Cmp->getOperand(1) == Phi)) + LoopTermConditionUsesPhi = true; + for (User *U : Phi->users()) - if (auto *I = dyn_cast(U)) + if (auto *I = dyn_cast(U)) { + // Ensure every user of the phi node is dominated by the previous value. + // The dominance requirement ensures the loop vectorizer will not need to + // vectorize the initial value prior to the first iteration of the loop. if (!DT->dominates(Previous, I)) return false; + // When the phi node has users outside the loop, and it is not used in the + // loop terminating condition, the current logic for + // fixFirstOrderRecurrences may generate incorrect code. Specifically, we + // extract the last element from the vectorized phi, which would be the + // update to the phi before exiting the loop. However, what we want is the + // previous phi value before the update (i.e. the second last update + // before end of the vectorized loop). + // See added test cases in first-order-recurrence.ll + if (!LoopTermConditionUsesPhi && !TheLoop->contains(I)) + return false; + } return true; } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4069,7 +4069,12 @@ VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); // Extract the last vector element in the middle block. This will be the - // initial value for the recurrence when jumping to the scalar loop. + // initial value for the recurrence when jumping to the scalar loop. + // FIXME: Note that the last vector element need not always be the correct one: + // consider a loop where we have phi uses outside the loop - we need the + // second last iteration value and not the last one). For now, we avoid + // considering such cases as firstOrderRecurrences (see + // isFirstOrderRecurrence). auto *Extract = Incoming; if (VF > 1) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -235,10 +235,13 @@ } ; CHECK-LABEL: @add_phifail2( -; CHECK: load <16 x i8>, <16 x i8>* -; CHECK: add nuw nsw <16 x i32> -; CHECK: store <16 x i8> +; CHECK-NOT: load <16 x i8>, <16 x i8>* +; CHECK-NOT: add nuw nsw <16 x i32> +; CHECK-NOT: store <16 x i8> ; Function Attrs: nounwind +; FIXME: Currently, if we vectorize this loop, we will generate incorrect code +; if %len evenly divides VF. Vectorized loop code gen returns a_phi = p[len -1], +; whereas it should be the previous value a_phi = p[len -2] define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { entry: br label %for.body Index: test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- test/Transforms/LoopVectorize/first-order-recurrence.ll +++ test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -349,3 +349,53 @@ for.end: ret void } + +; FIXME: we can vectorize this first order recurrence, by generating two +; extracts - one for the phi `val.phi` and other for the phi update `addx`. +; val.phi at end of loop is 94 + x. +; CHECK-LABEL: extract_second_last_iteration +; CHECK-NOT: vector.body +define i32 @extract_second_last_iteration(i32* %cval, i32 %x) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ] + %inc = add i32 %inc.phi, 1 + %bc = zext i32 %inc.phi to i64 + %addx = add i32 %inc.phi, %x + %cmp = icmp eq i32 %inc.phi, 95 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %val.phi +} + +; We can vectorize this loop even though we use the phi outside the loop. +; The phi used outside the loop is used in the loop terminating condition as +; well. So, we generate the correct element at the end of the vector loop (and the +; scalar loop is never run). +; CHECK-LABEL: extract_last_iteration +; CHECK: vector.body: +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: index.next = add i32 %index, 4 +; CHECK: icmp eq i32 %index.next, 96 +; CHECK: for.end: +; CHECK: ret i32 95 +define i32 @extract_last_iteration(i32* %cval, i32 %x) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ] + %inc = add i32 %inc.phi, 1 + %bc = zext i32 %inc.phi to i64 + %addx = add i32 %inc.phi, %x + %cmp = icmp eq i32 %inc.phi, 95 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %inc.phi +}