Thread the vectorization factor into requiresScalarEpilogue().

requiresScalarEpilogue() now takes the ElementCount VF. The VF.isVector()
test moves out of the remainder computation and into the cost-model query,
where it guards only the InterleaveInfo.requiresScalarEpilogue() term.
Previously the caller-side VF.isVector() guard skipped the remainder
adjustment whenever VF was scalar, including for loops that exit from a block
other than the latch. The three trip-count checks that choose between
ICMP_ULE and ICMP_ULT pass VF to the updated query.

Adds llvm/test/Transforms/LoopVectorize/unroll_nonlatch.ll, which interleaves
by 2 (without vectorizing) a 1024-iteration loop whose exit is not in the
latch: the unrolled loop stops at index 1022 and the scalar loop resumes
from there.

Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1566,14 +1566,14 @@
 
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
-  bool requiresScalarEpilogue() const {
+  bool requiresScalarEpilogue(ElementCount VF) const {
     if (!isScalarEpilogueAllowed())
       return false;
     // If we might exit from anywhere but the latch, must run the exiting
     // iteration in scalar form.
     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
       return true;
-    return InterleaveInfo.requiresScalarEpilogue();
+    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
   }
 
   /// Returns true if a scalar epilogue is not allowed due to optsize or a
@@ -3183,18 +3183,13 @@
   // unroll factor (number of SIMD instructions).
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
 
-  // There are two cases where we need to ensure (at least) the last iteration
-  // runs in the scalar remainder loop. Thus, if the step evenly divides
-  // the trip count, we set the remainder to be equal to the step. If the step
-  // does not evenly divide the trip count, no adjustment is necessary since
-  // there will already be scalar iterations. Note that the minimum iterations
-  // check ensures that N >= Step. The cases are:
-  // 1) If there is a non-reversed interleaved group that may speculatively
-  // access memory out-of-bounds.
-  // 2) If any instruction may follow a conditionally taken exit. That is, if
-  // the loop contains multiple exiting blocks, or a single exiting block
-  // which is not the latch.
-  if (VF.isVector() && Cost->requiresScalarEpilogue()) {
+  // There are cases where we *must* run at least one iteration in the remainder
+  // loop. See the cost model for when this can happen. If the step evenly
+  // divides the trip count, we set the remainder to be equal to the step. If
+  // the step does not evenly divide the trip count, no adjustment is necessary
+  // since there will already be scalar iterations. Note that the minimum
+  // iterations check ensures that N >= Step.
+  if (Cost->requiresScalarEpilogue(VF)) {
     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
     R = Builder.CreateSelect(IsZero, Step, R);
   }
@@ -3248,8 +3243,8 @@
   // vector trip count is zero. This check also covers the case where adding one
   // to the backedge-taken count overflowed leading to an incorrect trip count
   // of zero. In this case we will also jump to the scalar loop.
-  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
-                                          : ICmpInst::ICMP_ULT;
+  auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
+                                            : ICmpInst::ICMP_ULT;
 
   // If tail is to be folded, vector loop takes care of all iterations.
   Value *CheckMinIters = Builder.getFalse();
@@ -8324,7 +8319,7 @@
   // Generate code to check if the loop's trip count is less than VF * UF of the
   // main vector loop.
   auto P =
-      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+      Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
 
   Value *CheckMinIters = Builder.CreateICmp(
       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
@@ -8468,7 +8463,7 @@
   // Generate code to check if the loop's trip count is less than VF * UF of the
   // vector epilogue loop.
   auto P =
-      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+      Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
 
   Value *CheckMinIters = Builder.CreateICmp(
       P, Count,
Index: llvm/test/Transforms/LoopVectorize/unroll_nonlatch.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/unroll_nonlatch.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -S -loop-vectorize -force-vector-interleave=2 | FileCheck %s
+
+; Demonstrate a case where we unroll a loop, but don't vectorize it. This
+; still requires a scalar epilogue to run the final iteration. (Tail folding
+; would also be legal, but isn't yet implemented for this case.)
+
+define void @test(double* %data) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDUCTION]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDUCTION1]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[DATA:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[DATA]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP5]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fneg double [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = fneg double [[TMP7]]
+; CHECK-NEXT: store double [[TMP8]], double* [[TMP4]], align 8
+; CHECK-NEXT: store double [[TMP9]], double* [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1022
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1022
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1022, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_LATCH:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_LATCH]]
+; CHECK: for.latch:
+; CHECK-NEXT: [[T15:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[T16:%.*]] = or i64 [[T15]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[DATA]], i64 [[T16]]
+; CHECK-NEXT: [[T17:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[T17]]
+; CHECK-NEXT: store double [[FNEG]], double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.latch ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.end, label %for.latch
+
+for.latch:
+  %t15 = shl nuw nsw i64 %indvars.iv, 1
+  %t16 = or i64 %t15, 1
+  %arrayidx = getelementptr inbounds double, double* %data, i64 %t16
+  %t17 = load double, double* %arrayidx, align 8
+  %fneg = fneg double %t17
+  store double %fneg, double* %arrayidx, align 8
+  br label %for.body
+
+for.end:
+  ret void
+}