diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3185,8 +3185,7 @@ return BCResumeVal; } -DenseMap -InnerLoopVectorizer::createInductionResumeValues( +void InnerLoopVectorizer::createInductionResumeValues( std::pair AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && @@ -5491,14 +5490,6 @@ return false; } - // Induction variables that are widened require special handling that is - // currently not supported. - if (any_of(Legal->getInductionVars(), [&](auto &Entry) { - return !(this->isScalarAfterVectorization(Entry.first, VF) || - this->isProfitableToScalarize(Entry.first, VF)); - })) - return false; - // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. @@ -7904,15 +7895,20 @@ PhisInBlock.push_back(&Phi); for (PHINode *Phi : PhisInBlock) { + Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); + + if (!any_of(Phi->blocks(), [&](BasicBlock *IncB) { + return EPI.EpilogueIterationCountCheck == IncB; + })) + continue; Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); if (EPI.SCEVSafetyCheck) Phi->removeIncomingValue(EPI.SCEVSafetyCheck); if (EPI.MemSafetyCheck) Phi->removeIncomingValue(EPI.MemSafetyCheck); - Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); } // Generate a resume induction for the vector epilogue and put it in the @@ -10486,8 +10482,7 @@ VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); - // Ensure that the start values for any VPReductionPHIRecipes are - // updated before vectorising the epilogue loop. + // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated before vectorizing the epilogue loop. for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast(&R)) { if (auto *Resume = MainILV.getReductionResumeValue( @@ -10496,6 +10491,23 @@ ReductionPhi->setOperand(0, StartVal); } } + + PHINode *IndPhi = nullptr; + const InductionDescriptor *ID; + if (auto *Ind = dyn_cast(&R)) { + IndPhi = cast(Ind->getUnderlyingValue()); + ID = &Ind->getInductionDescriptor(); + } else if (auto *Ind = + dyn_cast(&R)) { + IndPhi = Ind->getPHINode(); + ID = &Ind->getInductionDescriptor(); + } + if (!IndPhi) + continue; + R.setOperand( + 0, BestEpiPlan.getOrAddExternalDef( + MainILV.createInductionResumeValue( + IndPhi, *ID, {EPI.MainLoopIterationCountCheck}))); } LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -11,11 +11,12 @@ ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 10000 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0 @@ -47,40 +48,41 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10001, 10000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 +; CHECK-NEXT: [[IND_END6:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 ; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 +; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX7]], 0 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX7]], 1 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP8]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP9]], i32 1 +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX9]], 0 +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX9]], 1 +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP10]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP11]], i32 1 ; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i32 0 ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1 -; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT10]], 10000 +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 10000 ; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 10001, 10000 -; CHECK-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 10001, 10000 +; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi ptr [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ne ptr [[PTR_IV]], null ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I_I]]) ; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 1 @@ -114,8 +116,10 @@ define void @test_widen_induction(i32 %init, ptr %A, i32 %N, i32 %step) { ; CHECK-LABEL: @test_widen_induction( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -136,17 +140,40 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[BC_RESUME_VAL]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <2 x i64> [[VEC_IND5]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i64> [[VEC_IND5]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_1]] ; CHECK-NEXT: store i64 [[IV_1]], ptr [[GEP_A]], align 4 ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -167,10 +194,12 @@ define void @test_widen_extended_induction(ptr %dst) { ; CHECK-LABEL: @test_widen_extended_induction( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -191,22 +220,47 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 10000 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 10000, 10000 -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX8:%.*]] = trunc i32 [[INDEX4]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = add i8 [[OFFSET_IDX8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = zext i8 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: store <2 x i8> [[VEC_IND5]], ptr [[TMP12]], align 1 +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i8> [[VEC_IND5]], +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 10000 +; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i32 10000, 10000 +; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL2]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64 ; CHECK-NEXT: [[ARRAYIDX1449:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[IV_EXT]] ; CHECK-NEXT: store i8 [[IV]], ptr [[ARRAYIDX1449]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 ; CHECK-NEXT: [[IV_NEXT_EXT:%.*]] = zext i8 [[IV_NEXT]] to i32 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT_EXT]], 10000 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -229,8 +283,10 @@ define void @test_widen_truncated_induction(ptr %A) { ; CHECK-LABEL: @test_widen_truncated_induction( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -248,21 +304,45 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[TMP7]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <2 x i8> [[VEC_IND5]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i8> [[VEC_IND5]], +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 10000 +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 10000, 10000 +; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: store i8 [[IV_TRUNC]], ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 10000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -7,13 +7,16 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { ; CHECK-LABEL: @conversion_cost1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3 -; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]] -; CHECK: .lr.ph.preheader: +; CHECK-NEXT: br i1 [[TMP1]], label [[ITER_CHECK:%.*]], label [[DOT_CRIT_EDGE:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -4 ; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 32 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 32 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] @@ -21,32 +24,64 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <32 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <32 x i8>* -; CHECK-NEXT: store <32 x i8> [[VEC_IND1]], <32 x i8>* [[TMP8]], align 1 +; CHECK-NEXT: store <32 x i8> [[VEC_IND]], <32 x i8>* [[TMP8]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <32 x i8> [[VEC_IND1]], +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i8> [[VEC_IND]], ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[DOTLR_PH_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 16 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP4]], 16 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF2]] +; CHECK-NEXT: [[IND_END4:%.*]] = add i64 3, [[N_VEC3]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i8> [[DOTSPLAT]], +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND9:%.*]] = phi <16 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX11:%.*]] = add i64 3, [[INDEX8]] +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX11]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[VEC_IND9]], <16 x i8>* [[TMP14]], align 1 +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX8]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT10]] = add <16 x i8> [[VEC_IND9]], +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV]] to i8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i8 [[TMP10]], i8* [[TMP11]], align 1 +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV]] to i8 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[TMP16]], i8* [[TMP17]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: @@ -92,37 +127,37 @@ ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <2 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 9, [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 6 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], -; CHECK-NEXT: [[TMP16:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], -; CHECK-NEXT: [[TMP17:%.*]] = sitofp <2 x i64> [[TMP13]] to <2 x float> -; CHECK-NEXT: [[TMP18:%.*]] = sitofp <2 x i64> [[TMP14]] to <2 x float> -; CHECK-NEXT: [[TMP19:%.*]] = sitofp <2 x i64> [[TMP15]] to <2 x float> -; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i64> [[TMP16]] to <2 x float> -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP13:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float> +; CHECK-NEXT: [[TMP14:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float> +; CHECK-NEXT: [[TMP15:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float> +; CHECK-NEXT: [[TMP16:%.*]] = sitofp <2 x i64> [[TMP12]] to <2 x float> +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP13]], <2 x float>* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP14]], <2 x float>* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 4 ; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP17]], <2 x float>* [[TMP26]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 2 +; CHECK-NEXT: store <2 x float> [[TMP15]], <2 x float>* [[TMP26]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 6 ; CHECK-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP18]], <2 x float>* [[TMP28]], align 4 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 4 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP19]], <2 x float>* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 6 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP20]], <2 x float>* [[TMP32]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP16]], <2 x float>* [[TMP28]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]], -; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -138,7 +173,7 @@ ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -170,11 +170,11 @@ define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2( -; AVX512-NEXT: entry: +; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -317,11 +317,11 @@ define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) { ; AVX512-LABEL: @foo3( -; AVX512-NEXT: entry: +; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -451,11 +451,11 @@ define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2_addrspace( -; AVX512-NEXT: entry: +; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -584,11 +584,11 @@ define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace2( -; AVX512-NEXT: entry: +; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -717,11 +717,11 @@ define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace3( -; AVX512-NEXT: entry: +; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ITER_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -855,16 +855,16 @@ ; AVX512-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64 ; AVX512-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 [[IDX_EXT]] ; AVX512-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0 -; AVX512-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] -; AVX512: for.body.lr.ph: +; AVX512-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] +; AVX512: iter.check: ; AVX512-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]] ; AVX512-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64 ; AVX512-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2 ; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4 ; AVX512-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 ; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 32 -; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2 ; AVX512-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4 @@ -889,13 +889,16 @@ ; AVX512-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] ; AVX512-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX512: vector.main.loop.iter.check: +; AVX512-NEXT: [[MIN_ITERS_CHECK13:%.*]] = icmp ult i64 [[TMP3]], 16 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK13]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16 ; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] ; AVX512-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]] ; AVX512-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 16 -; AVX512-NEXT: [[IND_END13:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]] +; AVX512-NEXT: [[IND_END15:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]] ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] @@ -910,33 +913,73 @@ ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP14]], i32 4, <16 x i1> ), !alias.scope !17, !noalias !19 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP14]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP20]], i32 4, <16 x i1> ), !alias.scope !17, !noalias !19 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD14]], <16 x float*> [[TMP20]], i32 4, <16 x i1> ), !alias.scope !17, !noalias !19 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX512-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 256 ; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]] -; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi float* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ] -; AVX512-NEXT: [[BC_RESUME_VAL14:%.*]] = phi float* [ [[IND_END13]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX512: vec.epilog.iter.check: +; AVX512-NEXT: [[TMP22:%.*]] = mul i64 [[N_VEC]], 16 +; AVX512-NEXT: [[IND_END23:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP22]] +; AVX512-NEXT: [[IND_END20:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]] +; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX512: vec.epilog.ph: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi float* [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL16:%.*]] = phi float* [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[N_MOD_VF17:%.*]] = urem i64 [[TMP3]], 8 +; AVX512-NEXT: [[N_VEC18:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF17]] +; AVX512-NEXT: [[IND_END19:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC18]] +; AVX512-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC18]], 16 +; AVX512-NEXT: [[IND_END22:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP23]] +; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AVX512: vec.epilog.vector.body: +; AVX512-NEXT: [[POINTER_PHI28:%.*]] = phi float* [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_PH]] ], [ [[PTR_IND29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP24:%.*]] = add i64 [[INDEX26]], 0 +; AVX512-NEXT: [[NEXT_GEP27:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP24]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[POINTER_PHI28]], <8 x i64> +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP27]], i64 [[IDXPROM]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP26]], i32 0 +; AVX512-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <8 x float>* +; AVX512-NEXT: [[WIDE_LOAD30:%.*]] = load <8 x float>, <8 x float>* [[TMP28]], align 4, !alias.scope !23 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> [[WIDE_LOAD30]], <8 x float*> [[TMP25]], i32 4, <8 x i1> ), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[NEXT_GEP27]], i32 0 +; AVX512-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <8 x float>* +; AVX512-NEXT: [[WIDE_LOAD31:%.*]] = load <8 x float>, <8 x float>* [[TMP30]], align 4, !alias.scope !30 +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, <8 x float*> [[TMP25]], i64 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> [[WIDE_LOAD31]], <8 x float*> [[TMP31]], i32 4, <8 x i1> ), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[INDEX_NEXT32]] = add nuw i64 [[INDEX26]], 8 +; AVX512-NEXT: [[PTR_IND29]] = getelementptr float, float* [[POINTER_PHI28]], i64 128 +; AVX512-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT32]], [[N_VEC18]] +; AVX512-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; AVX512: vec.epilog.middle.block: +; AVX512-NEXT: [[CMP_N25:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC18]] +; AVX512-NEXT: br i1 [[CMP_N25]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512: vec.epilog.scalar.ph: +; AVX512-NEXT: [[BC_RESUME_VAL21:%.*]] = phi float* [ [[IND_END19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END20]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL24:%.*]] = phi float* [ [[IND_END22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[ITER_CHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[BC_RESUME_VAL14]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[BC_RESUME_VAL21]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[BC_RESUME_VAL24]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; AVX512-NEXT: store float [[TMP22]], float* [[DEST_ADDR_011]], align 4 -; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[PTR_ADDR_012]], align 4 +; AVX512-NEXT: [[TMP33:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AVX512-NEXT: store float [[TMP33]], float* [[DEST_ADDR_011]], align 4 +; AVX512-NEXT: [[TMP34:%.*]] = load float, float* [[PTR_ADDR_012]], align 4 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1 -; AVX512-NEXT: store float [[TMP23]], float* [[ARRAYIDX5]], align 4 +; AVX512-NEXT: store float [[TMP34]], float* [[ARRAYIDX5]], align 4 ; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1 ; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16 ; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]] -; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts ; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s @@ -8,6 +9,56 @@ ; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate. define signext i32 @f2(i8* noalias %A, i32 signext %n) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <2 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <2 x i8>* +; CHECK-NEXT: store <2 x i8> [[TMP4]], <2 x i8>* [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP8]], 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[INC_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV_NEXT]], [[FOR_BODY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INC_LCSSA_WIDE]] to i32 +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; entry: %cmp1 = icmp sgt i32 %n, 0 br i1 %cmp1, label %for.body.preheader, label %for.end @@ -36,33 +87,3 @@ %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.end.loopexit ] ret i32 %i.0.lcssa } - -; Currently we cannot handle widended/truncated inductions. -; CHECK: LV: Checking a loop in 'f3' -; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate. - -define void @f3(i8* noalias %A, i32 signext %n) { -entry: - %cmp1 = icmp sgt i32 %n, 0 - br i1 %cmp1, label %for.body.preheader, label %for.end - -for.body.preheader: ; preds = %entry - %wide.trip.count = zext i32 %n to i64 - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] - %0 = trunc i64 %indvars.iv to i32 - %conv = trunc i32 %0 to i8 - %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv - store i8 %conv, i8* %arrayidx, align 1 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body, label %for.end.loopexit - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -208,7 +208,7 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END4:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] @@ -245,11 +245,11 @@ ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[I_014]], -1 ; CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP36]], [[N]] ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 @@ -472,8 +472,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(i8* %dst) { ; CHECK-LABEL: @induction_resume_value_requires_non_trivial_scev_expansion( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: +; CHECK-NEXT: br label [[ITER_CHECK:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[INDUCTION_IV:%.*]] = phi i8 [ [[INDUCTION_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ -56, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[INDVAR:%.*]] = phi i8 [ [[INDVAR_NEXT:%.*]], [[OUTER_LATCH]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH]] ] @@ -483,16 +483,18 @@ ; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[OUTER_IV]] to i32 ; CHECK-NEXT: [[ADD]] = add i32 [[P2]], [[TRUNC_IV]] ; CHECK-NEXT: [[TRUNC_ADD:%.*]] = trunc i32 [[ADD]] to i8 -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = mul i8 84, [[INDUCTION_IV]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[INDUCTION_IV]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> , [[DOTSPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = mul i8 [[INDUCTION_IV]], 4 -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP3]], i32 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT2]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[IND_END:%.*]] = mul i8 84, [[INDUCTION_IV]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -504,35 +506,68 @@ ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; CHECK-NEXT: store <4 x i8> [[VEC_IND]], <4 x i8>* [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT3]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT2]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 84, 84 -; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 85, [[MIDDLE_BLOCK]] ], [ 1, [[OUTER_HEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_HEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]] +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]] +; CHECK-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i32 0 +; CHECK-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT11:%.*]] = insertelement <4 x i8> poison, i8 [[INDUCTION_IV]], i32 0 +; CHECK-NEXT: [[DOTSPLAT12:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT11]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i8> , [[DOTSPLAT12]] +; CHECK-NEXT: [[INDUCTION13:%.*]] = add <4 x i8> [[DOTSPLAT10]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = mul i8 [[INDUCTION_IV]], 4 +; CHECK-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <4 x i8> poison, i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT14]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND16:%.*]] = phi <4 x i8> [ [[INDUCTION13]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = add i64 1, [[INDEX8]] +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX18]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[VEC_IND16]], <4 x i8>* [[TMP14]], align 1 +; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX8]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT17]] = add <4 x i8> [[VEC_IND16]], [[DOTSPLAT15]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 84 +; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 84, 84 +; CHECK-NEXT: br i1 [[CMP_N7]], label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, [[VEC_EPILOG_ITER_CHECK]] ], [ 1, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ] -; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ] ; CHECK-NEXT: [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]] ; CHECK-NEXT: store i8 [[IV_2]], i8* [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV]], 83 -; CHECK-NEXT: br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: outer.latch: ; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 ; CHECK-NEXT: [[INDVAR_NEXT]] = add i8 [[INDVAR]], 1 ; CHECK-NEXT: [[INDUCTION_IV_NEXT]] = add i8 [[INDUCTION_IV]], [[TMP1]] -; CHECK-NEXT: br label [[OUTER_HEADER]] +; CHECK-NEXT: br label [[ITER_CHECK]] ; ; CHECK-PROFITABLE-BY-DEFAULT-LABEL: @induction_resume_value_requires_non_trivial_scev_expansion( ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: entry: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK-PROFITABLE-BY-DEFAULT: outer.header: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[ITER_CHECK:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: iter.check: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION_IV:%.*]] = phi i8 [ [[INDUCTION_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ -56, [[ENTRY:%.*]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDVAR:%.*]] = phi i8 [ [[INDVAR_NEXT:%.*]], [[OUTER_LATCH]] ], [ 0, [[ENTRY]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH]] ] @@ -542,16 +577,18 @@ ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[OUTER_IV]] to i32 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ADD]] = add i32 [[P2]], [[TRUNC_IV]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TRUNC_ADD:%.*]] = trunc i32 [[ADD]] to i8 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vector.main.loop.iter.check: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: vector.ph: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IND_END:%.*]] = mul i8 84, [[INDUCTION_IV]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[INDUCTION_IV]], i32 0 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = mul <4 x i8> , [[DOTSPLAT]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION:%.*]] = add <4 x i8> zeroinitializer, [[TMP2]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP3:%.*]] = mul i8 [[INDUCTION_IV]], 4 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP3]], i32 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT2]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP3]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IND_END:%.*]] = mul i8 84, [[INDUCTION_IV]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: vector.body: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -563,30 +600,63 @@ ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[VEC_IND]], <4 x i8>* [[TMP7]], align 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT3]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT2]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-PROFITABLE-BY-DEFAULT: middle.block: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 84, 84 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]] -; CHECK-PROFITABLE-BY-DEFAULT: scalar.ph: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 85, [[MIDDLE_BLOCK]] ], [ 1, [[OUTER_HEADER]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_HEADER]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.iter.check: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.ph: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT9]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT11:%.*]] = insertelement <2 x i8> poison, i8 [[INDUCTION_IV]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT12:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT11]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = mul <2 x i8> , [[DOTSPLAT12]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION13:%.*]] = add <2 x i8> [[DOTSPLAT10]], [[TMP9]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP10:%.*]] = mul i8 [[INDUCTION_IV]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT14]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.vector.body: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND16:%.*]] = phi <2 x i8> [ [[INDUCTION13]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX18:%.*]] = add i64 1, [[INDEX8]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX18]], 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP11]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i8>* +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[VEC_IND16]], <2 x i8>* [[TMP14]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX8]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT17]] = add <2 x i8> [[VEC_IND16]], [[DOTSPLAT15]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 84 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.middle.block: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N7:%.*]] = icmp eq i64 84, 84 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N7]], label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.scalar.ph: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, [[VEC_EPILOG_ITER_CHECK]] ], [ 1, [[ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[INNER:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: inner: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i8 [[IV_2]], i8* [[GEP_DST]], align 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV]], 83 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EC]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-PROFITABLE-BY-DEFAULT: outer.latch: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDVAR_NEXT]] = add i8 [[INDVAR]], 1 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION_IV_NEXT]] = add i8 [[INDUCTION_IV]], [[TMP1]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[OUTER_HEADER]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[ITER_CHECK]] ; entry: br label %outer.header