diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -877,6 +877,20 @@ // Get the previous value. The previous value comes from the latch edge while // the initial value comes form the preheader edge. auto *Previous = dyn_cast(Phi->getIncomingValueForBlock(Latch)); + + // If Previous is a phi in the header, go through incoming values from the + // latch until we find a non-phi value. Use this as the new Previous, all uses + // in the header will be dominated by the original phi, but need to be moved + // after the non-phi previous value. + SmallPtrSet SeenPhis; + while (auto *PrevPhi = dyn_cast_or_null(Previous)) { + if (PrevPhi->getParent() != Phi->getParent()) + return false; + if (!SeenPhis.insert(PrevPhi).second) + return false; + Previous = dyn_cast(PrevPhi->getIncomingValueForBlock(Latch)); + } + if (!Previous || !TheLoop->contains(Previous) || isa(Previous) || SinkAfter.count(Previous)) // Cannot rely on dominance due to motion. return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8873,13 +8873,16 @@ CM.isInLoopReduction(Phi), CM.useOrderedReductions(RdxDesc)); } else { + recordRecipeOf(Phi); PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } // Record the incoming value from the backedge, so we can add the incoming // value from the backedge after all recipes have been created. - recordRecipeOf(cast( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + auto *Inc = cast( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + if (Ingredient2Recipe[Inc] == nullptr) + recordRecipeOf(Inc); PhisToFix.push_back(PhiRecipe); } else { // TODO: record backedge value for remaining pointer induction phis. @@ -9257,6 +9260,9 @@ continue; VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); + while (auto *PrevPhi = + dyn_cast(PrevRecipe)) + PrevRecipe = PrevPhi->getBackedgeRecipe(); VPBasicBlock *InsertBlock = PrevRecipe->getParent(); auto *Region = GetReplicateRegion(PrevRecipe); if (Region) @@ -9265,7 +9271,6 @@ Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); else Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); - auto *RecurSplice = cast( Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, {RecurPhi, RecurPhi->getBackedgeValue()})); diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -2,7 +2,29 @@ define void @test_chained_first_order_recurrences_1(i16* %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_1 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* +; CHECK-NEXT: store <4 x i16> [[TMP6]], <4 x i16>* [[TMP7]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 ; entry: br label %loop @@ -25,7 +47,29 @@ define void @test_chained_first_order_recurrences_2(i16* %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_2 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* +; CHECK-NEXT: store <4 x i16> [[TMP6]], <4 x i16>* [[TMP7]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: br label %loop @@ -48,7 +92,34 @@ define void @test_chained_first_order_recurrences_3(i16* %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_3 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP5:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* +; CHECK-NEXT: store <4 x i16> [[TMP8]], <4 x i16>* [[TMP9]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: br label %loop @@ -137,7 +208,31 @@ define void @test_chained_first_order_recurrence_sink_users_1(double* %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>* +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, <4 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> , [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP2]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 999, 996 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -1810,17 +1810,42 @@ ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; CHECK: for.cond1.preheader: ; CHECK-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; CHECK-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1:%.*]], [[FOR_COND_CLEANUP3]] ] +; CHECK-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[I_016]], 2147483644 +; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[I_016]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i64 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i64 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[I_016]], [[FOR_COND1_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND1:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret i32 [[E_1]] +; CHECK-NEXT: ret i32 [[E_1_LCSSA]] ; CHECK: for.cond1: -; CHECK-NEXT: [[E_1]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; CHECK-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1830,17 +1855,43 @@ ; UNROLL-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; UNROLL: for.cond1.preheader: ; UNROLL-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; UNROLL-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 8 +; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL: vector.ph: +; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[I_016]], 2147483640 +; UNROLL-NEXT: [[IND_END:%.*]] = and i32 [[I_016]], 7 +; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0 +; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; UNROLL-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] +; UNROLL: vector.body: +; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], +; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; UNROLL-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; UNROLL: middle.block: +; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3 +; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 2 +; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; UNROLL: scalar.ph: +; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[I_016]], [[FOR_COND1_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL: for.cond.cleanup: -; UNROLL-NEXT: ret i32 [[E_1]] +; UNROLL-NEXT: ret i32 [[E_1_LCSSA]] ; UNROLL: for.cond1: -; UNROLL-NEXT: [[E_1]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; UNROLL-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP19:![0-9]+]] ; UNROLL: for.cond.cleanup3: +; UNROLL-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1851,18 +1902,57 @@ ; UNROLL-NO-IC: for.cond1.preheader: ; UNROLL-NO-IC-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; UNROLL-NO-IC-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 8 +; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC: vector.ph: +; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8 +; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] +; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i32 0 +; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], -2 +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -3 +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], -4 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], -5 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], -6 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -7 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2 +; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; UNROLL-NO-IC: scalar.ph: +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-IC: for.cond.cleanup: ; UNROLL-NO-IC-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; UNROLL-NO-IC: for.cond1: -; UNROLL-NO-IC-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NO-IC-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NO-IC-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; UNROLL-NO-IC-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP19:![0-9]+]] ; UNROLL-NO-IC: for.cond.cleanup3: -; UNROLL-NO-IC-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ] +; UNROLL-NO-IC-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1873,18 +1963,40 @@ ; UNROLL-NO-VF: for.cond1.preheader: ; UNROLL-NO-VF-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; UNROLL-NO-VF-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 2 +; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF: vector.ph: +; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2 +; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] +; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] +; UNROLL-NO-VF: vector.body: +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[INDUCTION1:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] +; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[INDUCTION1]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; UNROLL-NO-VF: scalar.ph: +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[INDUCTION1]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: ; UNROLL-NO-VF-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; UNROLL-NO-VF: for.cond1: -; UNROLL-NO-VF-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-VF-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NO-VF-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-VF-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NO-VF-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP18:![0-9]+]] ; UNROLL-NO-VF: for.cond.cleanup3: -; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ] +; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[INDUCTION]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1895,18 +2007,51 @@ ; SINK-AFTER: for.cond1.preheader: ; SINK-AFTER-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; SINK-AFTER-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 4 +; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SINK-AFTER: vector.ph: +; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4 +; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] +; SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; SINK-AFTER-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 +; SINK-AFTER-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i32 0 +; SINK-AFTER-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; SINK-AFTER-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] +; SINK-AFTER: vector.body: +; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] +; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; SINK-AFTER-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], -1 +; SINK-AFTER-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], -2 +; SINK-AFTER-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -3 +; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> +; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; SINK-AFTER-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SINK-AFTER-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2 +; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; SINK-AFTER: scalar.ph: +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; SINK-AFTER-NEXT: br label [[FOR_COND1:%.*]] ; SINK-AFTER: for.cond.cleanup: ; SINK-AFTER-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; SINK-AFTER-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; SINK-AFTER: for.cond1: -; SINK-AFTER-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; SINK-AFTER-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; SINK-AFTER-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP19:![0-9]+]] ; SINK-AFTER: for.cond.cleanup3: -; SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ] +; SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1917,18 +2062,51 @@ ; NO-SINK-AFTER: for.cond1.preheader: ; NO-SINK-AFTER-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; NO-SINK-AFTER-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; NO-SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 4 +; NO-SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-SINK-AFTER: vector.ph: +; NO-SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4 +; NO-SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] +; NO-SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; NO-SINK-AFTER-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 +; NO-SINK-AFTER-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i32 0 +; NO-SINK-AFTER-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; NO-SINK-AFTER-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; NO-SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-SINK-AFTER: vector.body: +; NO-SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ] +; NO-SINK-AFTER-NEXT: [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] +; NO-SINK-AFTER-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; NO-SINK-AFTER-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], -1 +; NO-SINK-AFTER-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], -2 +; NO-SINK-AFTER-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -3 +; NO-SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> +; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; NO-SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; NO-SINK-AFTER-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; NO-SINK-AFTER-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; NO-SINK-AFTER: middle.block: +; NO-SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; NO-SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 +; NO-SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2 +; NO-SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; NO-SINK-AFTER: scalar.ph: +; NO-SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; NO-SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; NO-SINK-AFTER-NEXT: br label [[FOR_COND1:%.*]] ; NO-SINK-AFTER: for.cond.cleanup: ; NO-SINK-AFTER-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; NO-SINK-AFTER-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; NO-SINK-AFTER: for.cond1: -; NO-SINK-AFTER-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; NO-SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; NO-SINK-AFTER-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; NO-SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; NO-SINK-AFTER-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; NO-SINK-AFTER-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; NO-SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; NO-SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP19:![0-9]+]] ; NO-SINK-AFTER: for.cond.cleanup3: -; NO-SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ] +; NO-SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; NO-SINK-AFTER-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; NO-SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; NO-SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]