Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1566,14 +1566,14 @@ /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. - bool requiresScalarEpilogue() const { + bool requiresScalarEpilogue(ElementCount VF) const { if (!isScalarEpilogueAllowed()) return false; // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return true; - return InterleaveInfo.requiresScalarEpilogue(); + return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -3181,18 +3181,13 @@ // unroll factor (number of SIMD instructions). Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - // There are two cases where we need to ensure (at least) the last iteration - // runs in the scalar remainder loop. Thus, if the step evenly divides - // the trip count, we set the remainder to be equal to the step. If the step - // does not evenly divide the trip count, no adjustment is necessary since - // there will already be scalar iterations. Note that the minimum iterations - // check ensures that N >= Step. The cases are: - // 1) If there is a non-reversed interleaved group that may speculatively - // access memory out-of-bounds. - // 2) If any instruction may follow a conditionally taken exit. That is, if - // the loop contains multiple exiting blocks, or a single exiting block - // which is not the latch. - if (VF.isVector() && Cost->requiresScalarEpilogue()) { + // There are cases where we *must* run at least one iteration in the remainder + // loop. See the cost model for when this can happen. 
If the step evenly + // divides the trip count, we set the remainder to be equal to the step. If + // the step does not evenly divide the trip count, no adjustment is necessary + // since there will already be scalar iterations. Note that the minimum + // iterations check ensures that N >= Step. + if (Cost->requiresScalarEpilogue(VF)) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -3246,8 +3241,8 @@ // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); @@ -8314,8 +8309,8 @@ // Generate code to check if the loop's trip count is less than VF * UF of the // main vector loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? + ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), @@ -8458,8 +8453,8 @@ // Generate code to check if the loop's trip count is less than VF * UF of the // vector epilogue loop. - auto P = - Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
+ ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, Index: llvm/test/Transforms/LoopVectorize/unroll_nonlatch.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/unroll_nonlatch.ll +++ llvm/test/Transforms/LoopVectorize/unroll_nonlatch.ll @@ -2,11 +2,11 @@ ; RUN: opt %s -S -loop-vectorize -force-vector-interleave=2 | FileCheck %s ; Demonstrate a case where we unroll a loop, but don't vectorize it. -; This currently reveals a miscompile. The original loop runs stores in -; the latch block on iterations 0 to 1022, and exits when %indvars.iv = 1023. -; Currently, the unrolled loop produced by the vectorizer runs the iteration -; where %indvar.iv = 1023 in the vector.body loop before exiting. This results -; in an out of bounds access.. +; The original loop runs stores in the latch block on iterations 0 to 1022, +; and exits when %indvars.iv = 1023. (That is, it actually runs the stores +; for an odd number of iterations.) If we unroll by two in the "vector.body" +; loop, we must exit to the epilogue on the iteration with %indvars.iv = 1022 to +; avoid an out-of-bounds access. 
define void @test(double* %data) { ; CHECK-LABEL: @test( @@ -31,13 +31,13 @@ ; CHECK-NEXT: store double [[TMP8]], double* [[TMP4]], align 8 ; CHECK-NEXT: store double [[TMP9]], double* [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1022 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1022 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1022, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_LATCH:%.*]] ]