diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -256,7 +256,8 @@
   /// InductionList saves induction variables and maps them to the
   /// induction descriptor.
-  using InductionList = MapVector<PHINode *, InductionDescriptor>;
+  using InductionList =
+      MapVector<PHINode *, std::unique_ptr<InductionDescriptor>>;
 
   /// RecurrenceSet contains the phi nodes that are recurrences other than
   /// inductions and reductions.
@@ -402,6 +403,13 @@
   DominatorTree *getDominatorTree() const { return DT; }
 
+  /// Updates the vectorization state by adding \p Phi to the inductions list.
+  /// This can set \p Phi as the main induction of the loop if \p Phi is a
+  /// better choice for the main induction than the existing one.
+  const InductionDescriptor &
+  addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
+                  SmallPtrSetImpl<Instruction *> &AllowedExit);
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -457,11 +465,15 @@
                             SmallPtrSetImpl<const Instruction *> &MaskedOp,
                             SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const;
 
-  /// Updates the vectorization state by adding \p Phi to the inductions list.
-  /// This can set \p Phi as the main induction of the loop if \p Phi is a
-  /// better choice for the main induction than the existing one.
-  void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
-                       SmallPtrSetImpl<Instruction *> &AllowedExit);
+  /// If an access has a symbolic stride, this maps the pointer value to
+  /// the stride symbol.
+  const ValueToValueMap *getSymbolicStrides() const {
+    // FIXME: Currently, the set of symbolic strides is sometimes queried
+    // before it's collected. This happens from canVectorizeWithIfConvert,
+    // when the pointer is checked to reference consecutive elements suitable
+    // for a masked access.
+    return LAI ? &LAI->getSymbolicStrides() : nullptr;
+  }
 
   /// The loop that we evaluate.
   Loop *TheLoop;
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -972,7 +972,6 @@
   // not need to vectorize the initial value prior to the first iteration of the
   // loop.
   // TODO: Consider extending this sinking to handle memory instructions.
-  SmallPtrSet<Instruction *, 8> Seen;
   BasicBlock *PhiBB = Phi->getParent();
   SmallVector<Instruction *, 8> WorkList;
@@ -988,7 +987,6 @@
       return true;
 
     if (SinkCandidate->getParent() != PhiBB ||
-        SinkCandidate->mayHaveSideEffects() ||
         SinkCandidate->mayReadFromMemory() || SinkCandidate->isTerminator())
       return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -554,10 +554,11 @@
   return Result;
 }
 
-void LoopVectorizationLegality::addInductionPhi(
+const InductionDescriptor &LoopVectorizationLegality::addInductionPhi(
     PHINode *Phi, const InductionDescriptor &ID,
     SmallPtrSetImpl<Instruction *> &AllowedExit) {
-  Inductions[Phi] = ID;
+  std::unique_ptr<InductionDescriptor> IDPtr(new InductionDescriptor(ID));
+  Inductions[Phi] = std::move(IDPtr);
 
   // In case this induction also comes with casts that we know we can ignore
   // in the vectorized loop body, record them here. All casts could be recorded
@@ -604,6 +605,7 @@
   }
 
   LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
+  return *Inductions[Phi];
 }
 
 bool LoopVectorizationLegality::setupOuterLoopInductions() {
@@ -1033,7 +1035,7 @@
   // Exact FP induction vars, which we cannot vectorize.
   if (!EnableStrictReductions ||
       any_of(getInductionVars(), [&](auto &Induction) -> bool {
-        InductionDescriptor IndDesc = Induction.second;
+        InductionDescriptor IndDesc = *Induction.second;
         return IndDesc.getExactFPMathInst();
       }))
     return false;
@@ -1080,7 +1082,7 @@
 LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
   if (!isInductionPhi(Phi))
     return nullptr;
-  auto &ID = getInductionVars().find(Phi)->second;
+  auto &ID = *getInductionVars().find(Phi)->second;
   if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
       ID.getKind() == InductionDescriptor::IK_FpInduction)
     return &ID;
@@ -1091,7 +1093,7 @@
 LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
   if (!isInductionPhi(Phi))
     return nullptr;
-  auto &ID = getInductionVars().find(Phi)->second;
+  auto &ID = *getInductionVars().find(Phi)->second;
   if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
     return &ID;
   return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3232,7 +3232,7 @@
   // start value.
   for (const auto &InductionEntry : Legal->getInductionVars()) {
     PHINode *OrigPhi = InductionEntry.first;
-    const InductionDescriptor &II = InductionEntry.second;
+    const InductionDescriptor &II = *InductionEntry.second;
     PHINode *BCResumeVal = createInductionResumeValue(
         OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
@@ -3737,7 +3737,7 @@
 
   // Fix-up external users of the induction variables.
   for (const auto &Entry : Legal->getInductionVars())
-    fixupIVUsers(Entry.first, Entry.second,
+    fixupIVUsers(Entry.first, *Entry.second,
                  getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
                  IVEndValues[Entry.first], LoopMiddleBlock,
                  VectorLoop->getHeader(), Plan);
@@ -4389,7 +4389,7 @@
   // load/store instruction \p I.
   auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                             Instruction *I) {
-    return Induction.second.getKind() ==
+    return Induction.second->getKind() ==
                InductionDescriptor::IK_PtrInduction &&
            (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
            Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
@@ -7453,7 +7453,7 @@
   // Ignore type-casting instructions we identified during induction
   // detection.
   for (const auto &Induction : Legal->getInductionVars()) {
-    const InductionDescriptor &IndDes = Induction.second;
+    const InductionDescriptor &IndDes = *Induction.second;
     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
@@ -9037,7 +9037,13 @@
 
   // Sink users of fixed-order recurrence past the recipe defining the previous
   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
-  if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
+  if (!VPlanTransforms::adjustFixedOrderRecurrences(
+          *Plan, Builder, OrigLoop, PSE,
+          [this](PHINode *PN,
+                 const InductionDescriptor &ID) -> const InductionDescriptor & {
+            SmallPtrSet<Instruction *, 8> AllowedExit;
+            return Legal->addInductionPhi(PN, ID, AllowedExit);
+          }))
     return std::nullopt;
 
   // Interleave memory: for each Interleave Group we marked earlier as relevant
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1100,6 +1100,11 @@
     return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
            B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC;
   }
+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && R->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
+           R->getVPDefID() <= VPRecipeBase::VPLastPHISC;
+  }
 
   /// Generate the phi nodes.
   void execute(VPTransformState &State) override = 0;
@@ -2253,6 +2258,8 @@
   /// Values used outside the plan.
   MapVector<PHINode *, VPLiveOut *> LiveOuts;
 
+  SmallVector<std::unique_ptr<InductionDescriptor>> ExtraIndDescs;
+
 public:
   VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
     if (Entry)
@@ -2416,6 +2423,11 @@
     return LiveOuts;
   }
 
+  InductionDescriptor &addInductionDescriptor() {
+    ExtraIndDescs.emplace_back(new InductionDescriptor());
+    return *ExtraIndDescs.back();
+  }
+
 private:
   /// Add to the given dominator tree the header block and every new basic block
   /// that was created between it and the latch block, inclusive.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -80,7 +80,11 @@
   /// \returns true if all users of fixed-order recurrences could be re-arranged
   /// as needed or false if it is not possible. In the latter case, \p Plan is
   /// not valid.
-  static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
+  static bool adjustFixedOrderRecurrences(
+      VPlan &Plan, VPBuilder &Builder, Loop *L, PredicatedScalarEvolution &PSE,
+      function_ref<const InductionDescriptor &(PHINode *,
+                                               const InductionDescriptor &)>
+          AddInduction);
 
   /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
   /// resulting plan to \p BestVF and \p BestUF.
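Note: the new adjustFixedOrderRecurrences signature threads the induction-registration step through a callback, so the VPlan transform never has to depend on LoopVectorizationLegality directly. A minimal sketch of how a caller drives it, mirroring the LoopVectorize.cpp hunk above (the inline size of the AllowedExit set and the reconstructed function_ref type are assumptions based on the declarations in this patch):

    // Sketch: wire legality's addInductionPhi into the VPlan transform.
    // The callback must return a descriptor reference that stays valid
    // after the call, which is why Inductions now stores
    // std::unique_ptr<InductionDescriptor> instead of values.
    bool Ok = VPlanTransforms::adjustFixedOrderRecurrences(
        *Plan, Builder, OrigLoop, PSE,
        [this](PHINode *PN,
               const InductionDescriptor &ID) -> const InductionDescriptor & {
          SmallPtrSet<Instruction *, 8> AllowedExit; // assumed inline size
          return Legal->addInductionPhi(PN, ID, AllowedExit);
        });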
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -679,6 +679,9 @@
         properlyDominates(Previous, SinkCandidate, VPDT))
       return true;
 
+    if (SinkCandidate->mayHaveSideEffects())
+      return false;
+
     WorkList.push_back(SinkCandidate);
     return true;
   };
@@ -712,8 +715,11 @@
   return true;
 }
 
-bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
-                                                  VPBuilder &Builder) {
+bool VPlanTransforms::adjustFixedOrderRecurrences(
+    VPlan &Plan, VPBuilder &Builder, Loop *L, PredicatedScalarEvolution &PSE,
+    function_ref<const InductionDescriptor &(PHINode *,
+                                             const InductionDescriptor &)>
+        AddInduction) {
   VPDominatorTree VPDT;
   VPDT.recalculate(Plan);
@@ -723,6 +729,9 @@
     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
       RecurrencePhis.push_back(FOR);
 
+  SmallVector<VPFirstOrderRecurrencePHIRecipe *> IllegalFORs;
+  VPBasicBlock *VectorHeader =
+      Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock();
   for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
     SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
     VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -735,8 +744,10 @@
       Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
     }
 
-    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
-      return false;
+    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT)) {
+      IllegalFORs.push_back(FOR);
+      continue;
+    }
 
     // Introduce a recipe to combine the incoming and previous values of a
     // fixed-order recurrence.
@@ -755,5 +766,37 @@
     // all users.
     RecurSplice->setOperand(0, FOR);
   }
+
+  for (auto *FOR : IllegalFORs) {
+    PHINode *Phi = cast<PHINode>(FOR->getUnderlyingValue());
+    InductionDescriptor TmpID;
+    if (!InductionDescriptor::isInductionPHI(Phi, L, PSE, TmpID, true))
+      return false;
+
+    const InductionDescriptor &ID = AddInduction(Phi, TmpID);
+
+    SmallVector<VPUser *> WorkList;
+    WorkList.push_back(FOR);
+    WorkList.push_back(FOR->getBackedgeValue()->getDefiningRecipe());
+
+    while (!WorkList.empty()) {
+      VPUser *R = WorkList.pop_back_val();
+      if (isa<VPLiveOut>(R))
+        return false;
+      if (isa<VPHeaderPHIRecipe>(R))
+        continue;
+      for (auto *V : cast<VPRecipeBase>(R)->definedValues())
+        for (auto *U : V->users())
+          WorkList.push_back(U);
+    }
+
+    VPValue *Start = Plan.getVPValueOrAddLiveIn(ID.getStartValue());
+    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, ID.getStep(),
+                                                           *PSE.getSE());
+    auto *Ind = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, ID);
+    Ind->insertBefore(&*VectorHeader->getFirstNonPhi());
+    FOR->replaceAllUsesWith(Ind);
+    FOR->eraseFromParent();
+  }
   return true;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -26,9 +26,7 @@
 }
 
 ; CHECK-LABEL: PR33193
-; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
 ; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
-; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
 %struct.a = type { i32, i8 }
 define void @PR33193(ptr %a, i64 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -3487,12 +3487,12 @@
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[T]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[T]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[LEN]] to i8
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i8 [[T]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[TMP7]], [[T]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]]
@@ -3555,11 +3555,11 @@
 ; IND-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; IND:       vector.scevcheck:
 ; IND-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
-; IND-NEXT:    [[TMP2:%.*]] = xor i8 [[T]], -1
-; IND-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]]
+; IND-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[T]]
+; IND-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; IND-NEXT:    [[TMP4:%.*]] = trunc i32 [[LEN]] to i8
-; IND-NEXT:    [[TMP5:%.*]] = add i8 [[TMP4]], [[T]]
-; IND-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]]
+; IND-NEXT:    [[TMP5:%.*]] = xor i8 [[T]], -1
+; IND-NEXT:    [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]]
 ; IND-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255
 ; IND-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; IND-NEXT:    [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
@@ -3621,11 +3621,11 @@
 ; UNROLL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; UNROLL:       vector.scevcheck:
 ; UNROLL-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
-; UNROLL-NEXT:    [[TMP2:%.*]] = xor i8 [[T]], -1
-; UNROLL-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]]
+; UNROLL-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[T]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; UNROLL-NEXT:    [[TMP4:%.*]] = trunc i32 [[LEN]] to i8
-; UNROLL-NEXT:    [[TMP5:%.*]] = add i8 [[TMP4]], [[T]]
-; UNROLL-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]]
+; UNROLL-NEXT:    [[TMP5:%.*]] = xor i8 [[T]], -1
+; UNROLL-NEXT:    [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]]
 ; UNROLL-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255
 ; UNROLL-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
@@ -3692,12 +3692,12 @@
 ; UNROLL-NO-IC:       vector.scevcheck:
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i8 [[T]], [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[T]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = trunc i32 [[LEN]] to i8
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i8 [[T]], [[TMP6]]
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[TMP7]], [[T]]
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]]
@@ -3765,11 +3765,11 @@
 ; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; INTERLEAVE:       vector.scevcheck:
 ; INTERLEAVE-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = xor i8 [[T]], -1
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[T]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = trunc i32 [[LEN]] to i8
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i8 [[TMP4]], [[T]]
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = xor i8 [[T]], -1
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]]
 ; INTERLEAVE-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; INTERLEAVE-NEXT:    [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
@@ -3868,12 +3868,12 @@
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[T]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[T]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[LEN]] to i8
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i8 [[T]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[TMP7]], [[T]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]]
@@ -3884,7 +3884,7 @@
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
-; CHECK-NEXT:    [[IND_END1:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -3907,13 +3907,13 @@
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
 ; CHECK-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
@@ -3939,11 +3939,11 @@
 ; IND-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; IND:       vector.scevcheck:
 ; IND-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
-; IND-NEXT:    [[TMP2:%.*]] = xor i8 [[T]], -1
-; IND-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]]
+; IND-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[T]]
+; IND-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; IND-NEXT:    [[TMP4:%.*]] = trunc i32 [[LEN]] to i8
-; IND-NEXT:    [[TMP5:%.*]] = add i8 [[TMP4]], [[T]]
-; IND-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]]
+; IND-NEXT:    [[TMP5:%.*]] = xor i8 [[T]], -1
+; IND-NEXT:    [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]]
 ; IND-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255
 ; IND-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; IND-NEXT:    [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
@@ -3953,7 +3953,7 @@
 ; IND-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
 ; IND-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
 ; IND-NEXT:    [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; IND-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL5]], 2
+; IND-NEXT:    [[IND_END2:%.*]] = shl i32 [[EXT_MUL5]], 2
 ; IND-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; IND-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -3975,13 +3975,13 @@
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
 ; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ]
-; IND-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
-; IND-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; IND-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; IND-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
 ; IND-NEXT:    br label [[LOOP:%.*]]
 ; IND:       loop:
 ; IND-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; IND-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; IND-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; IND-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; IND-NEXT:    [[TMP13:%.*]] = sext i8 [[IDX]] to i64
 ; IND-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
 ; IND-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
@@ -4008,11 +4008,11 @@
 ; UNROLL-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; UNROLL:       vector.scevcheck:
 ; UNROLL-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
-; UNROLL-NEXT:    [[TMP2:%.*]] = xor i8 [[T]], -1
-; UNROLL-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]]
+; UNROLL-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[T]]
+; UNROLL-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; UNROLL-NEXT:    [[TMP4:%.*]] = trunc i32 [[LEN]] to i8
-; UNROLL-NEXT:    [[TMP5:%.*]] = add i8 [[TMP4]], [[T]]
-; UNROLL-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]]
+; UNROLL-NEXT:    [[TMP5:%.*]] = xor i8 [[T]], -1
+; UNROLL-NEXT:    [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]]
 ; UNROLL-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255
 ; UNROLL-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
@@ -4022,7 +4022,7 @@
 ; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
 ; UNROLL-NEXT:    [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; UNROLL-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
+; UNROLL-NEXT:    [[IND_END2:%.*]] = shl i32 [[EXT_MUL6]], 2
 ; UNROLL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; UNROLL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -4047,13 +4047,13 @@
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
 ; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ]
-; UNROLL-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
-; UNROLL-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; UNROLL-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; UNROLL-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
 ; UNROLL-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL:       loop:
 ; UNROLL-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; UNROLL-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; UNROLL-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; UNROLL-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; UNROLL-NEXT:    [[TMP14:%.*]] = sext i8 [[IDX]] to i64
 ; UNROLL-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
 ; UNROLL-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
@@ -4082,12 +4082,12 @@
 ; UNROLL-NO-IC:       vector.scevcheck:
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i8 [[T]], [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[T]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = trunc i32 [[LEN]] to i8
 ; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i8 [[T]], [[TMP6]]
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[TMP7]], [[T]]
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]]
@@ -4098,7 +4098,7 @@
 ; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
 ; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
-; UNROLL-NO-IC-NEXT:    [[IND_END1:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
+; UNROLL-NO-IC-NEXT:    [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -4126,13 +4126,13 @@
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
 ; UNROLL-NO-IC-NEXT:    [[IDX_INC]] = add i8 [[IDX]], 1
@@ -4158,11 +4158,11 @@
 ; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; INTERLEAVE:       vector.scevcheck:
 ; INTERLEAVE-NEXT:    [[TMP1:%.*]] = trunc i32 [[LEN]] to i8
-; INTERLEAVE-NEXT:    [[TMP2:%.*]] = xor i8 [[T]], -1
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[T]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]]
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = trunc i32 [[LEN]] to i8
-; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i8 [[TMP4]], [[T]]
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = xor i8 [[T]], -1
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]]
 ; INTERLEAVE-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
 ; INTERLEAVE-NEXT:    [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]]
@@ -4172,7 +4172,7 @@
 ; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
 ; INTERLEAVE-NEXT:    [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]]
 ; INTERLEAVE-NEXT:    [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; INTERLEAVE-NEXT:    [[IND_END1:%.*]] = shl i32 [[EXT_MUL6]], 2
+; INTERLEAVE-NEXT:    [[IND_END2:%.*]] = shl i32 [[EXT_MUL6]], 2
 ; INTERLEAVE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
 ; INTERLEAVE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 4, i32 8, i32 12>
@@ -4197,13 +4197,13 @@
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ]
-; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
-; INTERLEAVE-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ]
 ; INTERLEAVE-NEXT:    br label [[LOOP:%.*]]
 ; INTERLEAVE:       loop:
 ; INTERLEAVE-NEXT:    [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; INTERLEAVE-NEXT:    [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; INTERLEAVE-NEXT:    [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = sext i8 [[IDX]] to i64
 ; INTERLEAVE-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
 ; INTERLEAVE-NEXT:    store i32 [[SPHI]], ptr [[PTR]], align 4
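Note: the induction.ll loops updated above have roughly the following shape (a loosely reconstructed C analogue of the CHECK lines; names follow the IR values T, LEN, IDX, IDX_B, SPHI, MUL, and details such as signedness are guesses). SPHI carries a value recomputed from the narrower i8 counter, so without the runtime predicates in vector.scevcheck it is not provably an induction and was previously treated as a fixed-order recurrence; its store user cannot be sunk, which is exactly the case the new IllegalFORs path handles by re-proving SPHI as the induction {4*ext(T),+,4} under those wrap checks.

    // Hedged illustration, not copied from the test file.
    void f(int *a, signed char t, int len) {
      signed char idx = t;   // IDX, an i8 induction starting at T
      int idx_b = 0;         // IDX_B, counts iterations up to LEN
      int sphi = 4 * (int)t; // SPHI starts at EXT_MUL
      do {
        a[idx] = sphi;       // store of the recurrence value (cannot sink)
        idx += 1;            // IDX_INC
        idx_b += 1;          // IDX_B_INC
        sphi = 4 * (int)idx; // MUL: {4*t,+,4} only if idx does not wrap,
                             // which vector.scevcheck verifies at runtime
      } while (idx_b < len);
    }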