Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -553,22 +553,13 @@ if (!Previous || !TheLoop->contains(Previous) || isa(Previous)) return false; + // Ensure every user of the phi node is dominated by the previous value. + // The dominance requirement ensures the loop vectorizer will not need to + // vectorize the initial value prior to the first iteration of the loop. for (User *U : Phi->users()) if (auto *I = dyn_cast(U)) { - // Ensure every user of the phi node is dominated by the previous value. - // The dominance requirement ensures the loop vectorizer will not need to - // vectorize the initial value prior to the first iteration of the loop. if (!DT->dominates(Previous, I)) return false; - // When the phi node has users outside the loop, the current logic for - // fixFirstOrderRecurrences may generate incorrect code. Specifically, we - // extract the last element from the vectorized phi, which would be the - // update to the phi before exiting the loop. However, what we want is the - // previous phi value before the update (i.e. the second last update - // before end of the vectorized loop). - // See added test cases in first-order-recurrence.ll - if (!TheLoop->contains(I)) - return false; } return true; Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4070,17 +4070,29 @@ // Extract the last vector element in the middle block. This will be the // initial value for the recurrence when jumping to the scalar loop. - // FIXME: Note that the last vector element need not always be the correct one: - // consider a loop where we have phi uses outside the loop - we need the - // second last iteration value and not the last one). For now, we avoid - // considering such cases as firstOrderRecurrences (see - // isFirstOrderRecurrence). auto *Extract = Incoming; if (VF > 1) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), "vector.recur.extract"); } + // We also need to extract the second last element in the middle block if the + // Phi is used outside the loop. We need to extract the phi itself + // and not the last element (the phi update in the current iteration). This + // will be the value when jumping to the exit block from the LoopMiddleBlock, + // when the scalar loop is not run at all. + auto PhiHasUsersOutsideLoop = false; + for (User *U : Phi->users()) { + if (auto *I = dyn_cast(U)) + if (!OrigLoop->contains(I)) { + PhiHasUsersOutsideLoop = true; + break; + } + } + Value *ExtractForPhiUsedOutsideLoop = nullptr; + if (VF > 1 && PhiHasUsersOutsideLoop) + ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( + Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); @@ -4098,6 +4110,10 @@ // vector recurrence we extracted in the middle block. Since the loop is in // LCSSA form, we just need to find the phi node for the original scalar // recurrence in the exit block, and then add an edge for the middle block. + // Also, choose the correct value of the vector recurrence for fixing up users of + // the recurrence outside the loop. + if (PhiHasUsersOutsideLoop) + Extract = ExtractForPhiUsedOutsideLoop; for (auto &I : *LoopExitBlock) { auto *LCSSAPhi = dyn_cast(&I); if (!LCSSAPhi) Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -234,15 +234,27 @@ br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: @add_phifail2( -; CHECK-NOT: load <16 x i8>, <16 x i8>* -; CHECK-NOT: add nuw nsw <16 x i32> -; CHECK-NOT: store <16 x i8> ; Function Attrs: nounwind -; FIXME: Currently, if we vectorize this loop, we will generate incorrect code -; if %len evenly divides VF. Vectorized loop code gen returns a_phi = p[len -1], -; whereas it should be the previous value a_phi = p[len -2] +; When we vectorize this loop, we generate correct code +; even when %len exactly divides VF (since we extract from the second last index +; and pass this to the for.cond.cleanup block). Vectorized loop returns +; the correct value a_phi = p[len -2] define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +; CHECK-LABEL: @add_phifail2( +; CHECK: vector.body: +; CHECK: %wide.load = load <16 x i8>, <16 x i8>* +; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32> +; CHECK: add nuw nsw <16 x i32> +; CHECK: store <16 x i8> +; CHECK: add i64 %index, 16 +; CHECK: icmp eq i64 %index.next, %n.vec +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15 +; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14 +; CHECK: for.cond.cleanup: +; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; CHECK: %ret = trunc i32 %a_phi.lcssa to i8 +; CHECK: ret i8 %ret entry: br label %for.body Index: test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- test/Transforms/LoopVectorize/first-order-recurrence.ll +++ test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -350,11 +350,25 @@ ret void } -; FIXME: we can vectorize this first order recurrence, by generating two -; extracts - one for the phi `val.phi` and other for the phi update `addx`. -; val.phi at end of loop is 94 + x. -; CHECK-LABEL: extract_second_last_iteration -; CHECK-NOT: vector.body +; We vectorize this first order recurrence, by generating two +; extracts for the phi `val.phi` - one at the last index and +; another at the second last index. We need these 2 extracts because +; the first order recurrence phi is used outside the loop, so we require the phi +; itself and not its update (addx). +; UNROLL-NO-IC-LABEL: extract_second_last_iteration +; UNROLL-NO-IC-CHECK: vector.body +; UNROLL-NO-IC-CHECK: %vector.recur = phi <4 x i32> [ , %vector.ph ], [ %[[L1], %vector.body ] +; UNROLL-NO-IC-CHECK: %step.add = add <4 x i32> %vec.ind, +; UNROLL-NO-IC-CHECK: %[[L2]] = add <4 x i32> %vec.ind, %broadcast.splat +; UNROLL-NO-IC-CHECK: %[[L1]] = add <4 x i32> %step.add, %broadcast.splat +; UNROLL-NO-IC-CHECK: %index.next = add i32 %index, 8 +; UNROLL-NO-IC-CHECK: icmp eq i32 %index.next, 96 +; UNROLL-NO-IC-CHECK: middle.block +; UNROLL-NO-IC-CHECK: icmp eq i32 96, 96 +; UNROLL-NO-IC-CHECK: %vector.recur.extract = extractelement <4 x i32> %[[L1], i32 3 +; UNROLL-NO-IC-CHECK: %vector.recur.extract.for.phi = extractelement <4 x i32> %[[L1], i32 2 +; UNROLL-NO-IC-CHECK: for.end +; UNROLL-NO-IC-CHECK: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] define i32 @extract_second_last_iteration(i32* %cval, i32 %x) { entry: br label %for.body