diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -357,7 +357,7 @@
   /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if
   /// epilogue vectorization is not supported for the loop.
   VectorizationFactor
-  selectEpilogueVectorizationFactor(const ElementCount MaxVF);
+  selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC);
 
 protected:
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5630,7 +5630,7 @@
 }
 
 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
-    const ElementCount MainLoopVF) {
+    const ElementCount MainLoopVF, unsigned IC) {
   VectorizationFactor Result = VectorizationFactor::Disabled();
   if (!EnableEpilogueVectorization) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
@@ -5686,6 +5686,9 @@
     EstimatedRuntimeVF *= *VScale;
   }
 
+  ScalarEvolution &SE = *PSE.getSE();
+  Type *TCType = Legal->getWidestInductionType();
+  const SCEV *RemainingIterations = nullptr;
   for (auto &NextVF : ProfitableVFs) {
     // Skip candidate VFs without a corresponding VPlan.
     if (!hasPlanWithVF(NextVF.Width))
@@ -5698,6 +5701,22 @@
         ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
       continue;
 
+    // If NextVF is greater than the number of remaining iterations, the
+    // epilogue loop would be dead. Skip such factors.
+    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
+      // TODO: extend to support scalable VFs.
+      if (!RemainingIterations) {
+        const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
+        RemainingIterations = SE.getURemExpr(
+            TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
+      }
+      if (SE.isKnownPredicate(
+              CmpInst::ICMP_UGT,
+              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+              RemainingIterations))
+        continue;
+    }
+
     if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
       Result = NextVF;
   }
@@ -10470,7 +10489,7 @@
 
       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
-          LVP.selectEpilogueVectorizationFactor(VF.Width);
+          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
       if (EpilogueVF.Width.isVector()) {
 
         // The first pass vectorizes the main loop and creates a scalar epilogue
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -23,10 +23,10 @@
 ; Function Attrs: nounwind uwtable
 define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) {
 ; AVX512-LABEL: @foo1(
-; AVX512-NEXT:  iter.check:
+; AVX512-NEXT:  entry:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX1]], 0
 ; AVX512-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
 ; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
@@ -164,11 +164,11 @@
 
 define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
 ; AVX512-LABEL: @foo2(
-; AVX512-NEXT:  iter.check:
+; AVX512-NEXT:  entry:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
 ; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
 ; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
@@ -311,11 +311,11 @@
 
 define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) {
 ; AVX512-LABEL: @foo3(
-; AVX512-NEXT:  iter.check:
+; AVX512-NEXT:  entry:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
 ; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
 ; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
@@ -445,11 +445,11 @@
 
 define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
 ; AVX512-LABEL: @foo2_addrspace(
-; AVX512-NEXT:  iter.check:
+; AVX512-NEXT:  entry:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
 ; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
 ; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
@@ -578,11 +578,11 @@
 
 define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noalias %out, ptr noalias %trigger, ptr noalias %index) {
 ; AVX512-LABEL: @foo2_addrspace2(
-; AVX512-NEXT:  iter.check:
+; AVX512-NEXT:  entry:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
 ; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
 ; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
@@ -711,11 +711,11 @@
 
 define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) {
 ; AVX512-LABEL: @foo2_addrspace3(
-; AVX512-NEXT:  iter.check:
+; AVX512-NEXT:  entry:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
 ; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
 ; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
@@ -4,13 +4,11 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
 target triple = "x86_64-unknown-linux-gnu"
 
-; TODO: Make sure selected VF for the epilog loop doesn't exceed remaining TC.
+; Make sure selected VF for the epilog loop doesn't exceed remaining TC.
 define void @test_tc_17_no_epilogue_vectorization(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-LABEL: @test_tc_17_no_epilogue_vectorization(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -27,39 +25,19 @@
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 17, 16
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX2]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP8]], align 64
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
-; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 17, 16
-; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]]
 ; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64
 ; CHECK-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]]
 ; CHECK-NEXT:    store i8 [[VAL]], ptr [[STADDR]], align 64
 ; CHECK-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17
-; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -99,12 +77,12 @@
 ; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 18, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
@@ -113,18 +91,18 @@
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX2]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP8]], align 64
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x i8>, ptr [[TMP8]], align 64
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
-; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    store <2 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 18
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 18, 16
+; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 18, 18
 ; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 18, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
@@ -134,7 +112,7 @@
 ; CHECK-NEXT:    store i8 [[VAL]], ptr [[STADDR]], align 64
 ; CHECK-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 18
-; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -174,12 +152,12 @@
 ; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 19, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
@@ -188,18 +166,18 @@
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX2]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP8]], align 64
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x i8>, ptr [[TMP8]], align 64
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
-; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    store <2 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 18
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 19, 16
+; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 19, 18
 ; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 18, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
@@ -209,7 +187,7 @@
 ; CHECK-NEXT:    store i8 [[VAL]], ptr [[STADDR]], align 64
 ; CHECK-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 19
-; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -247,7 +225,7 @@
 ; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 20
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -262,7 +240,7 @@
 ; CHECK-NEXT:    store i8 [[VAL]], ptr [[STADDR]], align 64
 ; CHECK-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 20
-; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -301,7 +279,7 @@
 ; CHECK-NEXT:    store <8 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -315,7 +293,7 @@
 ; CHECK-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 32
-; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -8,25 +8,17 @@
 ; the vector loop was dead code leaving only a scalar remainder.
 define zeroext i8 @sum() {
 ; CHECK-LABEL: @sum(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [128 x i8], ptr @bytes, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [128 x i8], ptr @bytes, i64 0, i64 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <64 x i8>, ptr [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 64
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <64 x i8>, ptr [[TMP2]], align 16
-; CHECK-NEXT:    [[TMP4]] = add <64 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
-; CHECK-NEXT:    ret i8 [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 64
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <64 x i8>, ptr [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = add <64 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <64 x i8> [[WIDE_LOAD2]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add nuw i64 0, 128
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <64 x i8> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
+; CHECK-NEXT:    ret i8 [[TMP4]]
 ;
 entry:
   br label %for.body
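
Note (standalone illustration, not part of the patch): for fixed-width VFs and a compile-time-known trip count, the new skip rule reduces to comparing a candidate epilogue VF against TC % (MainLoopVF * IC); the patch itself reasons symbolically via SCEV and only skips a candidate when the ICMP_UGT predicate is provably true. A minimal C++ sketch of that arithmetic, with a hypothetical helper name (isViableEpilogueVF) and trip counts taken from the tests above:

  #include <cstdint>
  #include <cstdio>

  // Hypothetical helper mirroring the new skip condition: a fixed-width
  // epilogue VF is only kept if it does not exceed the iterations left
  // over after the main vector loop, i.e. TC % (MainLoopVF * IC).
  static bool isViableEpilogueVF(uint64_t TC, uint64_t MainLoopVF,
                                 uint64_t IC, uint64_t EpilogueVF) {
    uint64_t Remaining = TC % (MainLoopVF * IC);
    return EpilogueVF <= Remaining;
  }

  int main() {
    // test_tc_17_no_epilogue_vectorization: TC=17, main VF=16, IC=1
    // leaves one remaining iteration, so epilogue VF=8 would be dead
    // code and is now rejected.
    std::printf("%d\n", isViableEpilogueVF(17, 16, 1, 8)); // prints 0
    // TC=18 leaves two iterations, so VF=2 survives (hence the updated
    // <2 x i8> checks in limit-vf-by-tripcount.ll above).
    std::printf("%d\n", isViableEpilogueVF(18, 16, 1, 2)); // prints 1
    return 0;
  }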