diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -180,15 +180,18 @@ DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr); - /// Returns true if Phi is a first-order recurrence. A first-order recurrence + /// Returns true if Phi is a fixed-order recurrence. A fixed-order recurrence /// is a non-reduction recurrence relation in which the value of the - /// recurrence in the current loop iteration equals a value defined in the - /// previous iteration. \p SinkAfter includes pairs of instructions where the - /// first will be rescheduled to appear after the second if/when the loop is - /// vectorized. It may be augmented with additional pairs if needed in order - /// to handle Phi as a first-order recurrence. + /// recurrence in the current loop iteration equals a value defined in a + /// previous iteration (e.g. if the value is defined in the previous + /// iteration, we refer to it as a first-order recurrence; if it is defined in + /// the iteration before the previous, we refer to it as a second-order + /// recurrence, and so on). \p SinkAfter includes pairs of instructions where + /// the first will be rescheduled to appear after the second if/when the loop + /// is vectorized. It may be augmented with additional pairs if needed in + /// order to handle Phi as a first-order recurrence. static bool - isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, + isFixedOrderRecurrence(PHINode *Phi, Loop *TheLoop, MapVector<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT); diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -291,10 +291,10 @@ /// Returns the induction variables found in the loop. const InductionList &getInductionVars() const { return Inductions; } - /// Return the first-order recurrences found in the loop. - RecurrenceSet &getFirstOrderRecurrences() { return FirstOrderRecurrences; } + /// Return the fixed-order recurrences found in the loop. + RecurrenceSet &getFixedOrderRecurrences() { return FixedOrderRecurrences; } - /// Return the set of instructions to sink to handle first-order recurrences. + /// Return the set of instructions to sink to handle fixed-order recurrences. MapVector<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; } /// Returns the widest induction type. @@ -332,8 +332,8 @@ /// Returns True if PN is a reduction variable in this loop. bool isReductionVariable(PHINode *PN) const { return Reductions.count(PN); } - /// Returns True if Phi is a first-order recurrence in this loop. - bool isFirstOrderRecurrence(const PHINode *Phi) const; + /// Returns True if Phi is a fixed-order recurrence in this loop. + bool isFixedOrderRecurrence(const PHINode *Phi) const; /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. @@ -515,11 +515,11 @@ /// loop body. SmallPtrSet<Instruction *, 4> InductionCastsToIgnore; - /// Holds the phi nodes that are first-order recurrences. - RecurrenceSet FirstOrderRecurrences; + /// Holds the phi nodes that are fixed-order recurrences. + RecurrenceSet FixedOrderRecurrences; /// Holds instructions that need to sink past other instructions to handle - /// first-order recurrences.
+ /// fixed-order recurrences. MapVector<Instruction *, Instruction *> SinkAfter; /// Holds the widest induction type encountered. diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -921,7 +921,7 @@ return false; } -bool RecurrenceDescriptor::isFirstOrderRecurrence( +bool RecurrenceDescriptor::isFixedOrderRecurrence( PHINode *Phi, Loop *TheLoop, MapVector<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) { @@ -945,6 +945,20 @@ // Get the previous value. The previous value comes from the latch edge while // the initial value comes from the preheader edge. auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch)); + + // If Previous is a phi in the header, go through incoming values from the + // latch until we find a non-phi value. Use this as the new Previous; all uses + // in the header will be dominated by the original phi, but need to be moved + // after the non-phi previous value. + SmallPtrSet<PHINode *, 4> SeenPhis; + while (auto *PrevPhi = dyn_cast_or_null<PHINode>(Previous)) { + if (PrevPhi->getParent() != Phi->getParent()) + return false; + if (!SeenPhis.insert(PrevPhi).second) + return false; + Previous = dyn_cast<Instruction>(PrevPhi->getIncomingValueForBlock(Latch)); + } + if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous) || SinkAfter.count(Previous)) // Cannot rely on dominance due to motion. return false; @@ -986,7 +1000,7 @@ return false; // Avoid sinking an instruction multiple times (if multiple operands are - // first order recurrences) by sinking once - after the latest 'previous' + // fixed order recurrences) by sinking once - after the latest 'previous' // instruction. auto It = SinkAfter.find(SinkCandidate); if (It != SinkAfter.end()) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -102,7 +102,7 @@ "\nsimple Use tail-folding for simple loops (not reductions or " "recurrences)" "\nreductions Use tail-folding for loops containing reductions" - "\nrecurrences Use tail-folding for loops containing first order " + "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences"), cl::location(TailFoldingKindLoc)); @@ -3044,7 +3044,7 @@ TailFoldingKind Required; // Defaults to 0. if (LVL->getReductionVars().size()) Required.add(TailFoldingKind::TFReductions); - if (LVL->getFirstOrderRecurrences().size()) + if (LVL->getFixedOrderRecurrences().size()) Required.add(TailFoldingKind::TFRecurrences); if (!Required) Required.add(TailFoldingKind::TFSimple); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -677,7 +677,7 @@ // Non-header phi nodes that have outside uses can be vectorized. Add // them to the list of allowed exits. // Unsafe cyclic dependencies with header phis are identified during - // legalization for reduction, induction and first order + // legalization for reduction, induction and fixed order // recurrences. AllowedExit.insert(&I); continue; @@ -710,7 +710,7 @@ // 3. Non-Phis with outside uses when SCEV predicates cannot be used // outside the loop - see call to hasOutsideLoopUser in the non-phi // handling below - // 4.
FirstOrderRecurrence phis that can possibly be handled by + // 4. FixedOrderRecurrence phis that can possibly be handled by // extraction. // By recording these, we can then reason about ways to vectorize each // of these NotAllowedExit. @@ -721,10 +721,10 @@ continue; } - if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, + if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, SinkAfter, DT)) { AllowedExit.insert(Phi); - FirstOrderRecurrences.insert(Phi); + FixedOrderRecurrences.insert(Phi); continue; } @@ -894,12 +894,12 @@ } } - // For first order recurrences, we use the previous value (incoming value from + // For fixed order recurrences, we use the previous value (incoming value from // the latch) to check if it dominates all users of the recurrence. Bail out // if we have to sink such an instruction for another recurrence, as the // dominance requirement may not hold after sinking. BasicBlock *LoopLatch = TheLoop->getLoopLatch(); - if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) { + if (any_of(FixedOrderRecurrences, [LoopLatch, this](const PHINode *Phi) { Instruction *V = cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch)); return SinkAfter.find(V) != SinkAfter.end(); @@ -1080,9 +1080,9 @@ return isInductionPhi(V) || isCastedInductionVariable(V); } -bool LoopVectorizationLegality::isFirstOrderRecurrence( +bool LoopVectorizationLegality::isFixedOrderRecurrence( const PHINode *Phi) const { - return FirstOrderRecurrences.count(Phi); + return FixedOrderRecurrences.count(Phi); } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -552,7 +552,7 @@ /// Create the exit value of first order recurrences in the middle block and /// update their users. - void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, + void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State); /// Create code for the loop exit value of the reduction. @@ -3705,11 +3705,11 @@ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) fixReduction(ReductionPhi, State); else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) - fixFirstOrderRecurrence(FOR, State); + fixFixedOrderRecurrence(FOR, State); } } -void InnerLoopVectorizer::fixFirstOrderRecurrence( +void InnerLoopVectorizer::fixFixedOrderRecurrence( VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the @@ -4002,7 +4002,7 @@ // We know that the loop is in LCSSA form. We need to update the PHI nodes // in the exit blocks. See comment on analogous loop in - // fixFirstOrderRecurrence for a more complete explaination of the logic. + // fixFixedOrderRecurrence for a more complete explanation of the logic. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { @@ -4738,7 +4738,7 @@ // First order recurrence Phi's should typically be considered // non-uniform. auto *OP = dyn_cast<Instruction>(OV); - if (OP && Legal->isFirstOrderRecurrence(OP)) + if (OP && Legal->isFixedOrderRecurrence(OP)) continue; // If all the users of the operand are uniform, then add the // operand into the uniform worklist.
@@ -5427,7 +5427,7 @@ // Cross iteration phis such as reductions need special handling and are // currently unsupported. if (any_of(L.getHeader()->phis(), - [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) + [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; // Phis with uses outside of the loop require special handling and are @@ -7032,7 +7032,7 @@ // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) + if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) return TTI.getShuffleCost( TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); @@ -8509,13 +8509,18 @@ if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Operands, Plan); + + // Always record recipes for header phis. Later first-order recurrence phis + // can have earlier phis as incoming values. + recordRecipeOf(Phi); + if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) return toVPRecipeResult(Recipe); VPHeaderPHIRecipe *PhiRecipe = nullptr; assert((Legal->isReductionVariable(Phi) || - Legal->isFirstOrderRecurrence(Phi)) && - "can only widen reductions and first-order recurrences here"); + Legal->isFixedOrderRecurrence(Phi)) && + "can only widen reductions and fixed-order recurrences here"); VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = @@ -8526,13 +8531,21 @@ CM.isInLoopReduction(Phi), CM.useOrderedReductions(RdxDesc)); } else { + // TODO: Currently fixed-order recurrences are modeled as chains of + // first-order recurrences. If there are no users of the intermediate + // recurrences in the chain, the fixed order recurrence should be modeled + // directly, enabling more efficient codegen. PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } // Record the incoming value from the backedge, so we can add the incoming // value from the backedge after all recipes have been created. - recordRecipeOf(cast<Instruction>( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + auto *Inc = cast<Instruction>( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + auto RecipeIter = Ingredient2Recipe.find(Inc); + if (RecipeIter == Ingredient2Recipe.end()) + recordRecipeOf(Inc); + PhisToFix.push_back(PhiRecipe); return toVPRecipeResult(PhiRecipe); } @@ -8597,7 +8610,7 @@ assert( SinkTarget != FirstInst && "Must find a live instruction (at least the one feeding the " - "first-order recurrence PHI) before reaching beginning of the block"); + "fixed-order recurrence PHI) before reaching beginning of the block"); SinkTarget = SinkTarget->getPrevNode(); assert(SinkTarget != P.first && "sink source equals target, no sinking required"); @@ -8982,7 +8995,7 @@ RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a - // first-order recurrence. + // fixed-order recurrence. for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); @@ -8990,6 +9003,11 @@ continue; VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); + // Fixed-order recurrences do not contain cycles, so this loop is guaranteed + // to terminate.
+ while (auto *PrevPhi = + dyn_cast(PrevRecipe)) + PrevRecipe = PrevPhi->getBackedgeRecipe(); VPBasicBlock *InsertBlock = PrevRecipe->getParent(); auto *Region = GetReplicateRegion(PrevRecipe); if (Region) diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -0,0 +1,108 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -S %s 2>&1 | FileCheck %s + +define void @test_chained_first_order_recurrences_1(i16* %ptr) { +; CHECK-LABEL: 'test_chained_first_order_recurrences_1' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.1> = phi ir<22>, ir<%for.1.next> +; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.2> = phi ir<33>, vp<%8> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%2>, ir<0>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr ir<%ptr>, vp<%5> +; CHECK-NEXT: WIDEN ir<%for.1.next> = load ir<%gep.ptr> +; CHECK-NEXT: EMIT vp<%8> = first-order splice ir<%for.1> ir<%for.1.next> +; CHECK-NEXT: EMIT vp<%9> = first-order splice ir<%for.2> vp<%8> +; CHECK-NEXT: WIDEN ir<%add> = add vp<%8>, vp<%9> +; CHECK-NEXT: WIDEN store ir<%gep.ptr>, ir<%add> +; CHECK-NEXT: EMIT vp<%11> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%11> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] + %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add nuw nsw i64 %iv, 1 + %gep.ptr = getelementptr inbounds i16, i16* %ptr, i64 %iv + %for.1.next = load i16, i16* %gep.ptr, align 2 + %add = add i16 %for.1, %for.2 + store i16 %add, i16* %gep.ptr + %exitcond.not = icmp eq i64 %iv.next, 1000 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +define void @test_chained_first_order_recurrences_3(i16* %ptr) { +; CHECK-LABEL: 'test_chained_first_order_recurrences_3' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.1> = phi ir<22>, ir<%for.1.next> +; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.2> = phi ir<33>, vp<%9> +; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.3> = phi ir<33>, vp<%10> +; CHECK-NEXT: vp<%6> = SCALAR-STEPS vp<%2>, ir<0>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr ir<%ptr>, vp<%6> +; CHECK-NEXT: WIDEN ir<%for.1.next> = load ir<%gep.ptr> +; CHECK-NEXT: EMIT vp<%9> = first-order splice ir<%for.1> ir<%for.1.next> +; CHECK-NEXT: EMIT vp<%10> = first-order splice ir<%for.2> vp<%9> +; CHECK-NEXT: EMIT vp<%11> = first-order splice ir<%for.3> vp<%10> 
+; CHECK-NEXT: WIDEN ir<%add.1> = add vp<%9>, vp<%10> +; CHECK-NEXT: WIDEN ir<%add.2> = add ir<%add.1>, vp<%11> +; CHECK-NEXT: WIDEN store ir<%gep.ptr>, ir<%add.2> +; CHECK-NEXT: EMIT vp<%14> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%14> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK-NOT: vector.body: +; +entry: + br label %loop + +loop: + %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] + %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] + %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add nuw nsw i64 %iv, 1 + %gep.ptr = getelementptr inbounds i16, i16* %ptr, i64 %iv + %for.1.next = load i16, i16* %gep.ptr, align 2 + %add.1 = add i16 %for.1, %for.2 + %add.2 = add i16 %add.1, %for.3 + store i16 %add.2, i16* %gep.ptr + %exitcond.not = icmp eq i64 %iv.next, 1000 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -2,7 +2,27 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_1 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 ; entry: br label %loop @@ -25,7 +45,27 @@ define void @test_chained_first_order_recurrences_2(ptr %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_2 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; 
CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: br label %loop @@ -48,7 +88,32 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_3 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP5:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]] +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], 
i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: br label %loop @@ -134,7 +199,32 @@ define void @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_3_reordered_1 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP5:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]] +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: br label %loop @@ -159,7 +249,32 @@ define void @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_3_reordered_2 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP5:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = 
shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]] +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: br label %loop @@ -184,7 +299,32 @@ define void @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrences_3_for2_no_other_uses -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP5:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]] +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: br label %loop @@ -209,7 +349,31 @@ define void @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %ptr) { ; 
CHECK-LABEL: @test_chained_first_order_recurrences_3_for1_for2_no_other_uses -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ , %vector.ph ], [ [[TMP5:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP6]], +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: br label %loop @@ -233,7 +397,29 @@ define void @test_chained_first_order_recurrence_sink_users_1(double* %ptr) { ; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1 -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ , %vector.ph ], [ [[TMP4:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> , [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[TMP2]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 
[[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 999, 996 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 ; entry: br label %loop @@ -282,8 +468,27 @@ define void @test_first_order_recurrences_and_induction(ptr %ptr) { ; CHECK-LABEL: @test_first_order_recurrences_and_induction( -; CHECK-NOT: vector.body: -; +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP1]], +; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 +; CHECK-NEXT: br i1 [[CMP_N]] + entry: br label %loop @@ -306,7 +511,26 @@ ; flipped. 
define void @test_first_order_recurrences_and_induction2(ptr %ptr) { ; CHECK-LABEL: @test_first_order_recurrences_and_induction2( -; CHECK-NOT: vector.body: +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND]], %vector.body ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP1]], +; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 +; CHECK-NEXT: br i1 [[CMP_N]] ; entry: br label %loop @@ -328,7 +552,28 @@ define void @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) { ; CHECK-LABEL: @test_first_order_recurrences_and_pointer_induction1( -; CHECK-NOT: vector.body +; CHECK: vector.ph: +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 4000 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ , %vector.ph ], [ [[TMP0:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[TMP0]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[CMP_N]], ; entry: br label %loop @@ -352,7 +597,28 @@ ; of phis flipped. 
define void @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) { ; CHECK-LABEL: @test_first_order_recurrences_and_pointer_induction2( -; CHECK-NOT: vector.body +; CHECK: vector.ph: +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 4000 +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ , %vector.ph ], [ [[TMP0:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[TMP0]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[CMP_N]], ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -933,18 +933,48 @@ ; UNROLL-NO-IC: for.cond1.preheader: ; UNROLL-NO-IC-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; UNROLL-NO-IC-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 8 +; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC: vector.ph: +; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8 +; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] +; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i32 0 +; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], +; 
UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> +; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; UNROLL-NO-IC: middle.block: +; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2 +; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; UNROLL-NO-IC: scalar.ph: +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-IC: for.cond.cleanup: ; UNROLL-NO-IC-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; UNROLL-NO-IC: for.cond1: -; UNROLL-NO-IC-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NO-IC-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NO-IC-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; UNROLL-NO-IC-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-IC: for.cond.cleanup3: -; UNROLL-NO-IC-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ] +; UNROLL-NO-IC-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -955,18 +985,40 @@ ; UNROLL-NO-VF: for.cond1.preheader: ; UNROLL-NO-VF-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; UNROLL-NO-VF-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 2 +; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF: vector.ph: +; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2 +; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] +; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] +; UNROLL-NO-VF: vector.body: +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi 
i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[INDUCTION1:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] +; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[INDUCTION1]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; UNROLL-NO-VF: middle.block: +; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; UNROLL-NO-VF: scalar.ph: +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[INDUCTION1]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: ; UNROLL-NO-VF-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; UNROLL-NO-VF: for.cond1: -; UNROLL-NO-VF-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-VF-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NO-VF-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-VF-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NO-VF-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-VF: for.cond.cleanup3: -; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ] +; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[INDUCTION]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -977,18 +1029,46 @@ ; SINK-AFTER: for.cond1.preheader: ; SINK-AFTER-NEXT: [[I_016:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; SINK-AFTER-NEXT: [[E_015:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ [[E_1_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ] +; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I_016]], 4 +; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SINK-AFTER: vector.ph: +; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4 +; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] +; SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; SINK-AFTER-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 +; SINK-AFTER-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i32 0 +; SINK-AFTER-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> 
[[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; SINK-AFTER-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] +; SINK-AFTER: vector.body: +; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> +; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SINK-AFTER: middle.block: +; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2 +; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; SINK-AFTER: scalar.ph: +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; SINK-AFTER-NEXT: br label [[FOR_COND1:%.*]] ; SINK-AFTER: for.cond.cleanup: ; SINK-AFTER-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; SINK-AFTER-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; SINK-AFTER: for.cond1: -; SINK-AFTER-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; SINK-AFTER-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; SINK-AFTER-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]] +; SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; SINK-AFTER: for.cond.cleanup3: -; SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ] +; SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
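
Editorial note, not part of the patch: the loops exercised by the tests above are chains of header phis such as %for.1/%for.2/%for.3, where %for.1 carries the value from one iteration back (a first-order recurrence) and %for.2 the value from two iterations back (a second-order recurrence); the patch models the higher orders as chains of first-order recurrence phis. A minimal C++ sketch of the scalar loop corresponding to test_chained_first_order_recurrences_1 (the function name and the trip-count parameter are assumptions for illustration, not taken from the patch):

#include <cstddef>

// Illustrative scalar equivalent of the IR test: for1 holds ptr[i-1] and
// for2 holds ptr[i-2], i.e. values defined one and two iterations earlier.
void chained_first_order_recurrences(short *ptr, std::size_t n) {
  short for1 = 22; // start value of the first-order recurrence (%for.1)
  short for2 = 33; // start value of the second-order recurrence (%for.2)
  for (std::size_t i = 0; i < n; ++i) {
    short cur = ptr[i];    // loaded before the store, like %for.1.next
    ptr[i] = for1 + for2;  // uses values from iterations i-1 and i-2
    for2 = for1;           // shift the chain: %for.2 takes %for.1's old value
    for1 = cur;
  }
}

Each link of such a chain becomes one FIRST-ORDER-RECURRENCE-PHI plus one first-order splice (shufflevector) in the vectorized body, which is why test_chained_first_order_recurrences_3 shows three recurrence phis and three splices in the checks above.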