diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1906,9 +1906,11 @@

   /// Collect the instructions that are scalar after vectorization. An
   /// instruction is scalar if it is known to be uniform or will be scalarized
-  /// during vectorization. Non-uniform scalarized instructions will be
-  /// represented by VF values in the vectorized loop, each corresponding to an
-  /// iteration of the original scalar loop.
+  /// during vectorization. collectLoopScalars should only add non-uniform nodes
+  /// to the list if they are used by a load/store instruction that is marked as
+  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
+  /// VF values in the vectorized loop, each corresponding to an iteration of
+  /// the original scalar loop.
   void collectLoopScalars(ElementCount VF);

   /// Keeps cost model vectorization decision and cost for instructions.
@@ -4855,36 +4857,10 @@
       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
       unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();

-      bool NeedsVectorIndex = !IsUniform && VF.isScalable();
-      Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
-      if (NeedsVectorIndex) {
-        Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
-        UnitStepVec = Builder.CreateStepVector(VecIVTy);
-        PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
-      }
-
       for (unsigned Part = 0; Part < UF; ++Part) {
         Value *PartStart = createStepForVF(Builder, PtrInd->getType(), VF, Part);
-
-        if (NeedsVectorIndex) {
-          // Here we cache the whole vector, which means we can support the
-          // extraction of any lane. However, in some cases the extractelement
-          // instruction that is generated for scalar uses of this vector (e.g.
-          // a load instruction) is not folded away. Therefore we still
-          // calculate values for the first n lanes to avoid redundant moves
-          // (when extracting the 0th element) and to produce scalar code (i.e.
-          // additional add/gep instructions instead of expensive extractelement
-          // instructions) when extracting higher-order elements.
-          Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
-          Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
-          Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
-          Value *SclrGep =
-              emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
-          SclrGep->setName("next.gep");
-          State.set(PhiR, SclrGep, Part);
-        }
-
         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
           Value *Idx = Builder.CreateAdd(
               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
@@ -5220,38 +5196,12 @@
            !TheLoop->isLoopInvariant(V);
   };

-  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
-    if (!isa<PHINode>(Ptr) ||
-        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
-      return false;
-    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
-    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
-      return false;
-    return isScalarUse(MemAccess, Ptr);
-  };
-
   // A helper that evaluates a memory access's use of a pointer. If the
   // pointer is actually the pointer induction of a loop, it is being
   // inserted into Worklist. If the use will be a scalar use, and the
   // pointer is only used by memory accesses, we place the pointer in
   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
-    if (isScalarPtrInduction(MemAccess, Ptr)) {
-      Worklist.insert(cast<PHINode>(Ptr));
-      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
-                        << "\n");
-
-      Instruction *Update = cast<Instruction>(
-          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-
-      // If there is more than one user of Update (Ptr), we shouldn't assume it
-      // will be scalar after vectorisation as other users of the instruction
-      // may require widening. Otherwise, add it to ScalarPtrs.
-      if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
-        ScalarPtrs.insert(Update);
-        return;
-      }
-    }
     // We only care about bitcast and getelementptr instructions contained in
     // the loop.
     if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5343,11 +5293,22 @@
     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
       continue;

+    // Returns true if \p Indvar is a pointer induction that is used directly by
+    // load/store instruction \p I.
+    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
+                                              Instruction *I) {
+      return Induction.second.getKind() ==
+                 InductionDescriptor::IK_PtrInduction &&
+             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
+    };
+
     // Determine if all users of the induction variable are scalar after
     // vectorization.
     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+             IsDirectLoadStoreFromPtrIndvar(Ind, I);
     });
     if (!ScalarInd)
       continue;
@@ -5357,7 +5318,8 @@
     auto ScalarIndUpdate =
         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
           auto *I = cast<Instruction>(U);
-          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
         });
     if (!ScalarIndUpdate)
       continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -90,7 +90,7 @@
 ;
 ; Same as predicate_store except we use a pointer PHI to maintain the address
 ;
-; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
+; CHECK: Found scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
 ; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -41,35 +41,39 @@
 ; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = add 
i64 [[INDEX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = add zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add [[DOTSPLAT]], [[TMP7]] -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, [[NEXT_GEP4]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to * -; CHECK-NEXT: store [[TMP11]], * [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP15]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to * -; CHECK-NEXT: store [[TMP16]], * [[TMP17]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP12:%.*]] = add [[DOTSPLAT]], [[TMP11]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, [[TMP13]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to * +; CHECK-NEXT: store [[TMP14]], * [[TMP16]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP19]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP18]] to * +; CHECK-NEXT: store [[TMP20]], * [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP9]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -119,32 +123,66 @@ define void @pointer_induction(i8* noalias %start, i64 %N) { ; CHECK-LABEL: @pointer_induction( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8* [[START:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[START:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8* [[START]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = add [[DOTSPLAT]], [[TMP6]] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP12]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, [[NEXT_GEP]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq [[TMP13]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: 
[[TMP10:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP11:%.*]] = add [[DOTSPLAT]], [[TMP10]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP16]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, [[TMP12]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq [[TMP17]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP8]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[PTR_PHI]], align 1 +; CHECK-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1 +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i8* [[PTR_PHI_NEXT]], [[START]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDEX]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: end: +; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -125,25 +125,26 @@ define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 { ; CHECK-LABEL: @pointer_iv_mixed( ; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ %a, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP7:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT]], [[TMP4]] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* %a, [[TMP5]] -; CHECK-NEXT: 
[[NEXT_GEP4:%.*]] = getelementptr i32, i32* %a, i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[NEXT_GEP4]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7]] = add [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[NEXT_GEP6]] to * -; CHECK-NEXT: store [[NEXT_GEP]], * [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}} -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP9:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]] +; CHECK-NEXT: [[BC:%.*]] = bitcast [[TMP7]] to *> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement *> [[BC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP8]], align 8 +; CHECK-NEXT: [[TMP9]] = add [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[NEXT_GEP]] to * +; CHECK-NEXT: store [[TMP7]], * [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}} +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP5]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]] entry: br label %for.body @@ -166,7 +167,51 @@ ret i32 %tmp5 } +define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(i16* %ptr) #0 { +; CHECK-LABEL: @phi_used_in_vector_compare_and_scalar_indvar_update_and_store( +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i16* [ %ptr, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, i16* [[POINTER_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[TMP5]], zeroinitializer +; CHECK-NEXT: [[BC:%.*]] = bitcast [[TMP5]] to *> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement *> [[BC]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0nxv2i16( zeroinitializer, * [[TMP7]], i32 2, [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}} +; CHECK-NEXT: [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP9:![0-9]+]] +entry: + br label %for.body + 
+for.body: ; preds = %if.end, %entry + %iv = phi i64 [ %inc, %if.end ], [ 0, %entry ] + %iv.ptr = phi i16* [ %incdec.iv.ptr, %if.end ], [ %ptr, %entry ] + %cmp.i = icmp ne i16* %iv.ptr, null + br i1 %cmp.i, label %if.end.sink.split, label %if.end + +if.end.sink.split: ; preds = %for.body + store i16 0, i16* %iv.ptr, align 2 + br label %if.end + +if.end: ; preds = %if.end.sink.split, %for.body + %incdec.iv.ptr = getelementptr inbounds i16, i16* %iv.ptr, i64 1 + %inc = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp ult i64 %inc, 1024 + br i1 %exitcond.not, label %for.body, label %for.cond.preheader, !llvm.loop !6 + +for.cond.preheader: ; preds = %if.end, %for.cond.preheader + %iv.ptr.1.lcssa = phi i16* [ %incdec.iv.ptr, %if.end ] + ret void +} + attributes #0 = { vscale_range(0, 16) } + !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.width", i32 4} diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll --- a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -399,23 +399,13 @@ ; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ] ; CHECK: LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ] ; CHECK: vector.body +; CHECK: %pointer.phi = phi i32* [ %a, %vector.ph ], [ %ptr.ind, %vector.body ] ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %next.gep = getelementptr i32, i32* %a, i64 %index -; CHECK: %[[I1:.+]] = or i64 %index, 1 -; CHECK: %next.gep10 = getelementptr i32, i32* %a, i64 %[[I1]] -; CHECK: %[[I2:.+]] = or i64 %index, 2 -; CHECK: %next.gep11 = getelementptr i32, i32* %a, i64 %[[I2]] -; CHECK: %[[I3:.+]] = or i64 %index, 3 -; CHECK: %next.gep12 = getelementptr i32, i32* %a, i64 %[[I3]] -; CHECK: %[[V0:.+]] = insertelement <4 x i32*> poison, i32* %next.gep, i32 0 -; CHECK: %[[V1:.+]] = insertelement <4 x i32*> %[[V0]], i32* %next.gep10, i32 1 -; CHECK: %[[V2:.+]] = insertelement <4 x i32*> %[[V1]], i32* %next.gep11, i32 2 -; CHECK: %[[V3:.+]] = insertelement <4 x i32*> %[[V2]], i32* %next.gep12, i32 3 -; CHECK-NOT: getelementptr -; CHECK: %next.gep13 = getelementptr i32*, i32** %b, i64 %index -; CHECK-NOT: getelementptr -; CHECK: %[[B0:.+]] = bitcast i32** %next.gep13 to <4 x i32*>* -; CHECK: store <4 x i32*> %[[V3]], <4 x i32*>* %[[B0]], align 8 +; CHECK: %[[PTRVEC:.+]] = getelementptr i32, i32* %pointer.phi, <4 x i64> +; CHECK: %next.gep = getelementptr i32*, i32** %b, i64 %index +; CHECK: %[[NEXTGEPBC:.+]] = bitcast i32** %next.gep to <4 x i32*>* +; CHECK: store <4 x i32*> %[[PTRVEC]], <4 x i32*>* %[[NEXTGEPBC]], align 8 +; CHECK: %ptr.ind = getelementptr i32, i32* %pointer.phi, i64 4 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body ; define i32 @pointer_iv_mixed(i32* %a, i32** %b, i64 %n) { diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -21,13 +21,13 @@ ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ null, [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] -; CHECK-NEXT: [[INDEX:%.*]] = 
phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 -1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP5]], i32 -3 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], -1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i64 -1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 -3 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> @@ -36,35 +36,43 @@ ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i64 -1 ; CHECK-NEXT: store i8 95, i8* [[TMP11]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP9]], i32 1 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] -; CHECK: pred.store.if2: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 1 -; CHECK-NEXT: store i8 95, i8* [[TMP13]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] -; CHECK: pred.store.continue3: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; CHECK: pred.store.if4: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK: pred.store.if5: +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], -1 +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* null, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP2]], i64 -1 ; CHECK-NEXT: store i8 95, i8* [[TMP15]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] -; CHECK: pred.store.continue5: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; CHECK: pred.store.if6: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 3 -; CHECK-NEXT: store i8 95, i8* [[TMP17]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] -; CHECK: pred.store.continue7: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.continue6: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK: 
pred.store.if7: +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], -1 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* null, i64 [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP3]], i64 -1 +; CHECK-NEXT: store i8 95, i8* [[TMP19]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.continue8: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.if9: +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], -1 +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* null, i64 [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP4]], i64 -1 +; CHECK-NEXT: store i8 95, i8* [[TMP23]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.continue10: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 -4 -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -78,8 +86,8 @@ ; CHECK: for.body: ; CHECK-NEXT: [[C_05:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[C_05]], i64 -1 -; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP19]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP25]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: store i8 95, i8* [[INCDEC_PTR]], align 1 @@ -134,35 +142,27 @@ ; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8*> poison, i8* [[NEXT_GEP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x 
i8*> [[TMP6]], i8* [[NEXT_GEP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8*> [[TMP7]], i8* [[NEXT_GEP6]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8*> [[TMP8]], i8* [[NEXT_GEP7]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP9]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to <4 x i8*>* -; CHECK-NEXT: store <4 x i8*> [[TMP10]], <4 x i8*>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> [[TMP15]], <4 x i8>* [[TMP16]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to <4 x i8*>* +; CHECK-NEXT: store <4 x i8*> [[TMP3]], <4 x i8*>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8*> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP9]], <4 x i8>* [[TMP10]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 4 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]